351 files changed, 16551 insertions, 5621 deletions
diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE/bug-report-feature-request.md
index 70e1bba67..5706243bb 100644
--- a/.github/ISSUE_TEMPLATE.md
+++ b/.github/ISSUE_TEMPLATE/bug-report-feature-request.md
@@ -1,4 +1,13 @@
-<!--
+---
+name: Bug Report / Feature Request
+about: Tech support does not belong here. You should only file an issue here if you think you have experienced an actual bug with yuzu or you are requesting a feature you believe would make yuzu better.
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+<!---
 Please keep in mind yuzu is EXPERIMENTAL SOFTWARE.
 
 Please read the FAQ:
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 000000000..52faafad3
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1,8 @@
+blank_issues_enabled: false
+contact_links:
+  - name: yuzu Discord
+    url: https://discord.com/invite/u77vRWY
+    about: If you are experiencing an issue with yuzu, and you need tech support, or if you have a general question, try asking in the official yuzu Discord linked here. Piracy is not allowed.
+  - name: Community forums
+    url: https://community.citra-emu.org
+    about: This is an alternative place for tech support, however helpers there are not as active.
diff --git a/.gitmodules b/.gitmodules
index 2ec9dda62..79028bbb5 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -13,6 +13,9 @@
 [submodule "soundtouch"]
     path = externals/soundtouch
     url = https://github.com/citra-emu/ext-soundtouch.git
+[submodule "libressl"]
+    path = externals/libressl
+    url = https://github.com/citra-emu/ext-libressl-portable.git
 [submodule "discord-rpc"]
     path = externals/discord-rpc
     url = https://github.com/discordapp/discord-rpc.git
@@ -31,3 +34,9 @@
 [submodule "xbyak"]
     path = externals/xbyak
     url = https://github.com/herumi/xbyak.git
+[submodule "externals/libusb"]
+	path = externals/libusb
+	url = https://github.com/ameerj/libusb
+[submodule "opus"]
+	path = externals/opus/opus
+	url = https://github.com/xiph/opus.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 61321bf0a..1c0e49c03 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.11)
+cmake_minimum_required(VERSION 3.15)
 
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules")
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/externals/cmake-modules")
@@ -13,7 +13,7 @@ project(yuzu)
 option(ENABLE_SDL2 "Enable the SDL2 frontend" ON)
 
 option(ENABLE_QT "Enable the Qt frontend" ON)
-CMAKE_DEPENDENT_OPTION(YUZU_USE_BUNDLED_QT "Download bundled Qt binaries" OFF "ENABLE_QT;MSVC" OFF)
+CMAKE_DEPENDENT_OPTION(YUZU_USE_BUNDLED_QT "Download bundled Qt binaries" ON "ENABLE_QT;MSVC" OFF)
 
 option(ENABLE_WEB_SERVICE "Enable web services (telemetry, etc.)" ON)
 
@@ -151,14 +151,11 @@ macro(yuzu_find_packages)
     #    Cmake Pkg Prefix  Version     Conan Pkg
         "Boost             1.71        boost/1.72.0"
         "Catch2            2.11        catch2/2.11.0"
-        "fmt               6.2         fmt/6.2.0"
-        "OpenSSL           1.1         openssl/1.1.1f"
+        "fmt               7.0         fmt/7.0.1"
     # can't use until https://github.com/bincrafters/community/issues/1173
         #"libzip            1.5         libzip/1.5.2@bincrafters/stable"
         "lz4               1.8         lz4/1.9.2"
         "nlohmann_json     3.7         nlohmann_json/3.7.3"
-    # we need to be careful as the version check might be broken https://github.com/xiph/opus/issues/110
-        "opus              1.3         opus/1.3.1"
         "ZLIB              1.2         zlib/1.2.11"
         "zstd              1.4         zstd/1.4.4"
     )
@@ -214,7 +211,10 @@ if(ENABLE_QT)
 
         set(QT_PREFIX_HINT HINTS "${QT_PREFIX}")
     endif()
-    find_package(Qt5 5.9 COMPONENTS Widgets OpenGL ${QT_PREFIX_HINT})
+    find_package(Qt5 5.9 COMPONENTS Widgets ${QT_PREFIX_HINT})
+    if (YUZU_USE_QT_WEB_ENGINE)
+        find_package(Qt5 COMPONENTS WebEngineCore WebEngineWidgets)
+    endif()
     if (NOT Qt5_FOUND)
         list(APPEND CONAN_REQUIRED_LIBS "qt/5.14.1@bincrafters/stable")
     endif()
@@ -287,7 +287,7 @@ if (CONAN_REQUIRED_LIBS)
     if(ENABLE_QT)
         list(APPEND CMAKE_MODULE_PATH "${CONAN_QT_ROOT_RELEASE}")
         list(APPEND CMAKE_PREFIX_PATH "${CONAN_QT_ROOT_RELEASE}")
-        find_package(Qt5 5.9 REQUIRED COMPONENTS Widgets OpenGL)
+        find_package(Qt5 5.9 REQUIRED COMPONENTS Widgets)
         if (YUZU_USE_QT_WEB_ENGINE)
             find_package(Qt5 REQUIRED COMPONENTS WebEngineCore WebEngineWidgets)
         endif()
@@ -312,15 +312,6 @@ elseif (TARGET Boost::boost)
     add_library(boost ALIAS Boost::boost)
 endif()
 
-if (NOT TARGET OpenSSL::SSL)
-    set_target_properties(OpenSSL::OpenSSL PROPERTIES IMPORTED_GLOBAL TRUE)
-    add_library(OpenSSL::SSL ALIAS OpenSSL::OpenSSL)
-endif()
-if (NOT TARGET OpenSSL::Crypto)
-    set_target_properties(OpenSSL::OpenSSL PROPERTIES IMPORTED_GLOBAL TRUE)
-    add_library(OpenSSL::Crypto ALIAS OpenSSL::OpenSSL)
-endif()
-
 if (TARGET sdl2::sdl2)
     # imported from the conan generated sdl2Config.cmake
     set_target_properties(sdl2::sdl2 PROPERTIES IMPORTED_GLOBAL TRUE)
@@ -338,6 +329,15 @@ elseif(SDL2_FOUND)
     target_link_libraries(SDL2 INTERFACE "${SDL2_LIBRARIES}")
 endif()
 
+# Ensure libusb is properly configured (based on dolphin libusb include)
+include(FindPkgConfig)
+find_package(LibUSB)
+if (NOT LIBUSB_FOUND)
+    add_subdirectory(externals/libusb)
+    set(LIBUSB_LIBRARIES usb)
+endif()
+
+
 # Prefer the -pthread flag on Linux.
 set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
diff --git a/CMakeModules/CopyYuzuQt5Deps.cmake b/CMakeModules/CopyYuzuQt5Deps.cmake
index 2598b9b60..59343b1ca 100644
--- a/CMakeModules/CopyYuzuQt5Deps.cmake
+++ b/CMakeModules/CopyYuzuQt5Deps.cmake
@@ -15,7 +15,6 @@ function(copy_yuzu_Qt5_deps target_dir)
         icuuc*.dll
         Qt5Core$<$<CONFIG:Debug>:d>.*
         Qt5Gui$<$<CONFIG:Debug>:d>.*
-        Qt5OpenGL$<$<CONFIG:Debug>:d>.*
         Qt5Widgets$<$<CONFIG:Debug>:d>.*
     )
 
diff --git a/CMakeModules/GenerateSCMRev.cmake b/CMakeModules/GenerateSCMRev.cmake
index 83e4e9df2..311ba1c2e 100644
--- a/CMakeModules/GenerateSCMRev.cmake
+++ b/CMakeModules/GenerateSCMRev.cmake
@@ -51,6 +51,8 @@ endif()
 # The variable SRC_DIR must be passed into the script (since it uses the current build directory for all values of CMAKE_*_DIR)
 set(VIDEO_CORE "${SRC_DIR}/src/video_core")
 set(HASH_FILES
+    "${VIDEO_CORE}/renderer_opengl/gl_arb_decompiler.cpp"
+    "${VIDEO_CORE}/renderer_opengl/gl_arb_decompiler.h"
     "${VIDEO_CORE}/renderer_opengl/gl_shader_cache.cpp"
     "${VIDEO_CORE}/renderer_opengl/gl_shader_cache.h"
     "${VIDEO_CORE}/renderer_opengl/gl_shader_decompiler.cpp"
diff --git a/dist/qt_themes/qdarkstyle/style.qss b/dist/qt_themes/qdarkstyle/style.qss
index 7d088a719..2d5c9761f 100644
--- a/dist/qt_themes/qdarkstyle/style.qss
+++ b/dist/qt_themes/qdarkstyle/style.qss
@@ -673,10 +673,6 @@ QTabWidget::pane {
     border-bottom-left-radius: 2px;
 }
 
-QTabWidget::tab-bar {
-    overflow: visible;
-}
-
 QTabBar {
     qproperty-drawBase: 0;
     border-radius: 3px;
diff --git a/dist/yuzu.manifest b/dist/yuzu.manifest
index fd30b656f..038edff23 100644
--- a/dist/yuzu.manifest
+++ b/dist/yuzu.manifest
@@ -1,24 +1,58 @@
 <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
-<assembly xmlns="urn:schemas-microsoft-com:asm.v1" manifestVersion="1.0">
- <trustInfo xmlns="urn:schemas-microsoft-com:asm.v3">
-  <security>
-   <requestedPrivileges>
-    <requestedExecutionLevel level="asInvoker" uiAccess="false"/>
-   </requestedPrivileges>
-  </security>
- </trustInfo>
- <application xmlns="urn:schemas-microsoft-com:asm.v3">
-  <windowsSettings>
-   <dpiAware xmlns="http://schemas.microsoft.com/SMI/2005/WindowsSettings">True/PM</dpiAware>
-   <longPathAware xmlns="http://schemas.microsoft.com/SMI/2016/WindowsSettings">true</longPathAware>
-  </windowsSettings>
- </application>
- <compatibility xmlns="urn:schemas-microsoft-com:compatibility.v1">
-  <application>
-   <supportedOS Id="{35138b9a-5d96-4fbd-8e2d-a2440225f93a}"/>
-   <supportedOS Id="{4a2f28e3-53b9-4441-ba9c-d69d4a4a6e38}"/>
-   <supportedOS Id="{1f676c76-80e1-4239-95bb-83d0f6d0da78}"/>
-   <supportedOS Id="{8e0f7a12-bfb3-4fe8-b9a5-48fd50a15a9a}"/>
-  </application>
- </compatibility>
-</assembly>
-\ No newline at end of file
+<assembly manifestVersion="1.0"
+    xmlns="urn:schemas-microsoft-com:asm.v1"
+    xmlns:asmv3="urn:schemas-microsoft-com:asm.v3">
+  <asmv3:application>
+    <asmv3:windowsSettings>
+      <!-- Windows 7/8/8.1/10 -->
+      <dpiAware
+        xmlns="http://schemas.microsoft.com/SMI/2005/WindowsSettings">
+        true/pm
+      </dpiAware>
+      <!-- Windows 10, version 1607 or later -->
+      <dpiAwareness
+        xmlns="http://schemas.microsoft.com/SMI/2016/WindowsSettings">
+        PerMonitorV2
+      </dpiAwareness>
+      <!-- Windows 10, version 1703 or later -->
+      <gdiScaling
+          xmlns="http://schemas.microsoft.com/SMI/2017/WindowsSettings">
+        true
+      </gdiScaling>
+      <ws2:longPathAware
+          xmlns:ws2="http://schemas.microsoft.com/SMI/2016/WindowsSettings">
+        true
+      </ws2:longPathAware>
+    </asmv3:windowsSettings>
+  </asmv3:application>
+  <compatibility
+      xmlns="urn:schemas-microsoft-com:compatibility.v1">
+    <application>
+      <!-- Windows 10 -->
+      <supportedOS Id="{8e0f7a12-bfb3-4fe8-b9a5-48fd50a15a9a}"/>
+      <!-- Windows 8.1 -->
+      <supportedOS Id="{1f676c76-80e1-4239-95bb-83d0f6d0da78}"/>
+      <!-- Windows 8 -->
+      <supportedOS Id="{4a2f28e3-53b9-4441-ba9c-d69d4a4a6e38}"/>
+      <!-- Windows 7 -->
+      <supportedOS Id="{35138b9a-5d96-4fbd-8e2d-a2440225f93a}"/>
+    </application>
+  </compatibility>
+  <trustInfo
+      xmlns="urn:schemas-microsoft-com:asm.v3">
+    <security>
+      <requestedPrivileges>
+        <!--
+          UAC settings:
+          - app should run at same integrity level as calling process
+          - app does not need to manipulate windows belonging to
+            higher-integrity-level processes
+          -->
+        <requestedExecutionLevel
+            level="asInvoker"
+            uiAccess="false"
+        />
+      </requestedPrivileges>
+    </security>
+  </trustInfo>
+</assembly>
diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt
index df7a5e0a9..d1dcc403b 100644
--- a/externals/CMakeLists.txt
+++ b/externals/CMakeLists.txt
@@ -4,6 +4,13 @@ list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/CMakeModules")
 list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/externals/find-modules")
 include(DownloadExternals)
 
+# xbyak
+if (ARCHITECTURE_x86 OR ARCHITECTURE_x86_64)
+    add_library(xbyak INTERFACE)
+    target_include_directories(xbyak SYSTEM INTERFACE ./xbyak/xbyak)
+    target_compile_definitions(xbyak INTERFACE XBYAK_NO_OP_NAMES)
+endif()
+
 # Catch
 add_library(catch-single-include INTERFACE)
 target_include_directories(catch-single-include INTERFACE catch/single_include)
@@ -66,6 +73,15 @@ if (NOT LIBZIP_FOUND)
 endif()
 
 if (ENABLE_WEB_SERVICE)
+    # LibreSSL
+    set(LIBRESSL_SKIP_INSTALL ON CACHE BOOL "")
+    add_subdirectory(libressl EXCLUDE_FROM_ALL)
+    target_include_directories(ssl INTERFACE ./libressl/include)
+    target_compile_definitions(ssl PRIVATE -DHAVE_INET_NTOP)
+    get_directory_property(OPENSSL_LIBRARIES
+        DIRECTORY libressl
+        DEFINITION OPENSSL_LIBS)
+
     # lurlparser
     add_subdirectory(lurlparser EXCLUDE_FROM_ALL)
 
@@ -73,13 +89,8 @@ if (ENABLE_WEB_SERVICE)
     add_library(httplib INTERFACE)
     target_include_directories(httplib INTERFACE ./httplib)
     target_compile_definitions(httplib INTERFACE -DCPPHTTPLIB_OPENSSL_SUPPORT)
-    target_link_libraries(httplib INTERFACE OpenSSL::SSL OpenSSL::Crypto)
+    target_link_libraries(httplib INTERFACE ${OPENSSL_LIBRARIES})
 endif()
 
-if (NOT TARGET xbyak)
-    if (ARCHITECTURE_x86 OR ARCHITECTURE_x86_64)
-        add_library(xbyak INTERFACE)
-        target_include_directories(xbyak SYSTEM INTERFACE ./xbyak/xbyak)
-        target_compile_definitions(xbyak INTERFACE XBYAK_NO_OP_NAMES)
-    endif()
-endif()
+# Opus
+add_subdirectory(opus)
diff --git a/externals/Vulkan-Headers b/externals/Vulkan-Headers
-Subproject 9250d5ae8f50202005233dc0512a1d460c8b483
+Subproject 8188e3fbbc105591064093440f88081fb957d4f
diff --git a/externals/dynarmic b/externals/dynarmic
-Subproject e7166e8ba74d7b9c85e87afc0aaf667e7e84cfe
+Subproject 4f967387c07365b7ea35d2fa3e19b7df8872a09
diff --git a/externals/find-modules/FindLibUSB.cmake b/externals/find-modules/FindLibUSB.cmake
new file mode 100644
index 000000000..dec0b98b0
--- /dev/null
+++ b/externals/find-modules/FindLibUSB.cmake
@@ -0,0 +1,43 @@
+# - Find libusb-1.0 library
+# This module defines
+#  LIBUSB_INCLUDE_DIR, where to find libusb.h
+#  LIBUSB_LIBRARIES, the libraries needed to use libusb-1.0.
+#  LIBUSB_FOUND, If false, do not try to use libusb-1.0.
+#
+# Copyright (c) 2009, Michal Cihar, <michal@cihar.com>
+#
+# vim: expandtab sw=4 ts=4 sts=4:
+
+if(ANDROID) # no lookup is attempted on Android; report "not found" straight away
+    set(LIBUSB_FOUND FALSE CACHE INTERNAL "libusb-1.0 found")
+    message(STATUS "libusb-1.0 not found.")
+elseif(NOT LIBUSB_FOUND)
+    pkg_check_modules(LIBUSB_PKG libusb-1.0) # pkg-config results seed the search paths below
+
+    find_path(LIBUSB_INCLUDE_DIR NAMES libusb.h
+        PATHS
+        ${LIBUSB_PKG_INCLUDE_DIRS}
+        /usr/include/libusb-1.0
+        /usr/include
+        /usr/local/include/libusb-1.0
+        /usr/local/include
+    )
+
+    find_library(LIBUSB_LIBRARIES NAMES usb-1.0 usb
+        PATHS
+        ${LIBUSB_PKG_LIBRARY_DIRS}
+        /usr/lib
+        /usr/local/lib
+    )
+
+    if(NOT LIBUSB_INCLUDE_DIR OR NOT LIBUSB_LIBRARIES) # header or library missing
+        set(LIBUSB_FOUND FALSE CACHE INTERNAL "libusb-1.0 found")
+        message(STATUS "libusb-1.0 not found.")
+    else()
+        set(LIBUSB_FOUND TRUE CACHE INTERNAL "libusb-1.0 found")
+        message(STATUS "Found libusb-1.0: ${LIBUSB_INCLUDE_DIR}, ${LIBUSB_LIBRARIES}")
+    endif()
+
+    mark_as_advanced(LIBUSB_INCLUDE_DIR LIBUSB_LIBRARIES)
+endif()
+
diff --git a/externals/libressl b/externals/libressl
new file mode 160000
+Subproject 7d01cb01cb1a926ecb4c9c98b107ef3c26f59df
diff --git a/externals/libusb b/externals/libusb
new file mode 160000
+Subproject 3406d72cda879f8792a88bf5f6bd0b7a65636f7
diff --git a/externals/opus/CMakeLists.txt b/externals/opus/CMakeLists.txt
new file mode 100644
index 000000000..94a86551f
--- /dev/null
+++ b/externals/opus/CMakeLists.txt
@@ -0,0 +1,254 @@
+cmake_minimum_required(VERSION 3.8)
+
+project(opus)
+
+option(OPUS_STACK_PROTECTOR "Use stack protection" OFF)
+option(OPUS_USE_ALLOCA "Use alloca for stack arrays (on non-C99 compilers)" OFF)
+option(OPUS_CUSTOM_MODES "Enable non-Opus modes, e.g. 44.1 kHz & 2^n frames" OFF)
+option(OPUS_FIXED_POINT "Compile as fixed-point (for machines without a fast enough FPU)" OFF)
+option(OPUS_ENABLE_FLOAT_API "Compile with the floating point API (for machines with float library)" ON)
+
+include(opus/opus_functions.cmake)
+
+if(OPUS_STACK_PROTECTOR)
+    if(NOT MSVC) # /GS is on by default on MSVC
+        check_and_set_flag(STACK_PROTECTION_STRONG -fstack-protector-strong)
+    endif()
+else()
+    if(MSVC)
+        check_and_set_flag(BUFFER_SECURITY_CHECK /GS-)
+    endif()
+endif()
+
+add_library(opus STATIC
+    # CELT sources
+    opus/celt/bands.c
+    opus/celt/celt.c
+    opus/celt/celt_decoder.c
+    opus/celt/celt_encoder.c
+    opus/celt/celt_lpc.c
+    opus/celt/cwrs.c
+    opus/celt/entcode.c
+    opus/celt/entdec.c
+    opus/celt/entenc.c
+    opus/celt/kiss_fft.c
+    opus/celt/laplace.c
+    opus/celt/mathops.c
+    opus/celt/mdct.c
+    opus/celt/modes.c
+    opus/celt/pitch.c
+    opus/celt/quant_bands.c
+    opus/celt/rate.c
+    opus/celt/vq.c
+
+    # SILK sources
+    opus/silk/A2NLSF.c
+    opus/silk/CNG.c
+    opus/silk/HP_variable_cutoff.c
+    opus/silk/LPC_analysis_filter.c
+    opus/silk/LPC_fit.c
+    opus/silk/LPC_inv_pred_gain.c
+    opus/silk/LP_variable_cutoff.c
+    opus/silk/NLSF2A.c
+    opus/silk/NLSF_VQ.c
+    opus/silk/NLSF_VQ_weights_laroia.c
+    opus/silk/NLSF_decode.c
+    opus/silk/NLSF_del_dec_quant.c
+    opus/silk/NLSF_encode.c
+    opus/silk/NLSF_stabilize.c
+    opus/silk/NLSF_unpack.c
+    opus/silk/NSQ.c
+    opus/silk/NSQ_del_dec.c
+    opus/silk/PLC.c
+    opus/silk/VAD.c
+    opus/silk/VQ_WMat_EC.c
+    opus/silk/ana_filt_bank_1.c
+    opus/silk/biquad_alt.c
+    opus/silk/bwexpander.c
+    opus/silk/bwexpander_32.c
+    opus/silk/check_control_input.c
+    opus/silk/code_signs.c
+    opus/silk/control_SNR.c
+    opus/silk/control_audio_bandwidth.c
+    opus/silk/control_codec.c
+    opus/silk/dec_API.c
+    opus/silk/decode_core.c
+    opus/silk/decode_frame.c
+    opus/silk/decode_indices.c
+    opus/silk/decode_parameters.c
+    opus/silk/decode_pitch.c
+    opus/silk/decode_pulses.c
+    opus/silk/decoder_set_fs.c
+    opus/silk/enc_API.c
+    opus/silk/encode_indices.c
+    opus/silk/encode_pulses.c
+    opus/silk/gain_quant.c
+    opus/silk/init_decoder.c
+    opus/silk/init_encoder.c
+    opus/silk/inner_prod_aligned.c
+    opus/silk/interpolate.c
+    opus/silk/lin2log.c
+    opus/silk/log2lin.c
+    opus/silk/pitch_est_tables.c
+    opus/silk/process_NLSFs.c
+    opus/silk/quant_LTP_gains.c
+    opus/silk/resampler.c
+    opus/silk/resampler_down2.c
+    opus/silk/resampler_down2_3.c
+    opus/silk/resampler_private_AR2.c
+    opus/silk/resampler_private_IIR_FIR.c
+    opus/silk/resampler_private_down_FIR.c
+    opus/silk/resampler_private_up2_HQ.c
+    opus/silk/resampler_rom.c
+    opus/silk/shell_coder.c
+    opus/silk/sigm_Q15.c
+    opus/silk/sort.c
+    opus/silk/stereo_LR_to_MS.c
+    opus/silk/stereo_MS_to_LR.c
+    opus/silk/stereo_decode_pred.c
+    opus/silk/stereo_encode_pred.c
+    opus/silk/stereo_find_predictor.c
+    opus/silk/stereo_quant_pred.c
+    opus/silk/sum_sqr_shift.c
+    opus/silk/table_LSF_cos.c
+    opus/silk/tables_LTP.c
+    opus/silk/tables_NLSF_CB_NB_MB.c
+    opus/silk/tables_NLSF_CB_WB.c
+    opus/silk/tables_gain.c
+    opus/silk/tables_other.c
+    opus/silk/tables_pitch_lag.c
+    opus/silk/tables_pulses_per_block.c
+
+    # Opus sources
+    opus/src/analysis.c
+    opus/src/mapping_matrix.c
+    opus/src/mlp.c
+    opus/src/mlp_data.c
+    opus/src/opus.c
+    opus/src/opus_decoder.c
+    opus/src/opus_encoder.c
+    opus/src/opus_multistream.c
+    opus/src/opus_multistream_decoder.c
+    opus/src/opus_multistream_encoder.c
+    opus/src/opus_projection_decoder.c
+    opus/src/opus_projection_encoder.c
+    opus/src/repacketizer.c
+)
+
+if (DEBUG)
+    target_sources(opus PRIVATE opus/silk/debug.c)
+endif()
+
+if (OPUS_FIXED_POINT)
+    target_sources(opus PRIVATE
+        opus/silk/fixed/LTP_analysis_filter_FIX.c
+        opus/silk/fixed/LTP_scale_ctrl_FIX.c
+        opus/silk/fixed/apply_sine_window_FIX.c
+        opus/silk/fixed/autocorr_FIX.c
+        opus/silk/fixed/burg_modified_FIX.c
+        opus/silk/fixed/corrMatrix_FIX.c
+        opus/silk/fixed/encode_frame_FIX.c
+        opus/silk/fixed/find_LPC_FIX.c
+        opus/silk/fixed/find_LTP_FIX.c
+        opus/silk/fixed/find_pitch_lags_FIX.c
+        opus/silk/fixed/find_pred_coefs_FIX.c
+        opus/silk/fixed/k2a_FIX.c
+        opus/silk/fixed/k2a_Q16_FIX.c
+        opus/silk/fixed/noise_shape_analysis_FIX.c
+        opus/silk/fixed/pitch_analysis_core_FIX.c
+        opus/silk/fixed/prefilter_FIX.c
+        opus/silk/fixed/process_gains_FIX.c
+        opus/silk/fixed/regularize_correlations_FIX.c
+        opus/silk/fixed/residual_energy16_FIX.c
+        opus/silk/fixed/residual_energy_FIX.c
+        opus/silk/fixed/schur64_FIX.c
+        opus/silk/fixed/schur_FIX.c
+        opus/silk/fixed/solve_LS_FIX.c
+        opus/silk/fixed/vector_ops_FIX.c
+        opus/silk/fixed/warped_autocorrelation_FIX.c
+    )
+else()
+    target_sources(opus PRIVATE
+        opus/silk/float/LPC_analysis_filter_FLP.c
+        opus/silk/float/LPC_inv_pred_gain_FLP.c
+        opus/silk/float/LTP_analysis_filter_FLP.c
+        opus/silk/float/LTP_scale_ctrl_FLP.c
+        opus/silk/float/apply_sine_window_FLP.c
+        opus/silk/float/autocorrelation_FLP.c
+        opus/silk/float/burg_modified_FLP.c
+        opus/silk/float/bwexpander_FLP.c
+        opus/silk/float/corrMatrix_FLP.c
+        opus/silk/float/encode_frame_FLP.c
+        opus/silk/float/energy_FLP.c
+        opus/silk/float/find_LPC_FLP.c
+        opus/silk/float/find_LTP_FLP.c
+        opus/silk/float/find_pitch_lags_FLP.c
+        opus/silk/float/find_pred_coefs_FLP.c
+        opus/silk/float/inner_product_FLP.c
+        opus/silk/float/k2a_FLP.c
+        opus/silk/float/noise_shape_analysis_FLP.c
+        opus/silk/float/pitch_analysis_core_FLP.c
+        opus/silk/float/process_gains_FLP.c
+        opus/silk/float/regularize_correlations_FLP.c
+        opus/silk/float/residual_energy_FLP.c
+        opus/silk/float/scale_copy_vector_FLP.c
+        opus/silk/float/scale_vector_FLP.c
+        opus/silk/float/schur_FLP.c
+        opus/silk/float/sort_FLP.c
+        opus/silk/float/warped_autocorrelation_FLP.c
+        opus/silk/float/wrappers_FLP.c
+    )
+endif()
+
+target_compile_definitions(opus PRIVATE OPUS_BUILD ENABLE_HARDENING)
+
+if(NOT MSVC)
+    if(MINGW)
+        target_compile_definitions(opus PRIVATE _FORTIFY_SOURCE=0)
+    else()
+        target_compile_definitions(opus PRIVATE _FORTIFY_SOURCE=2)
+    endif()
+endif()
+
+# It is strongly recommended to define one of the following:
+#   VAR_ARRAYS: use C99 variable-length arrays for stack allocation
+#   USE_ALLOCA: use alloca() for stack allocation
+# If neither is defined, the fallback is a non-threadsafe global array.
+if(OPUS_USE_ALLOCA OR MSVC)
+    target_compile_definitions(opus PRIVATE USE_ALLOCA)
+else()
+    target_compile_definitions(opus PRIVATE VAR_ARRAYS)
+endif()
+
+if(OPUS_CUSTOM_MODES)
+    target_compile_definitions(opus PRIVATE CUSTOM_MODES)
+endif()
+
+if(NOT OPUS_ENABLE_FLOAT_API)
+    target_compile_definitions(opus PRIVATE DISABLE_FLOAT_API)
+endif()
+
+target_compile_definitions(opus
+PUBLIC
+    -DOPUS_VERSION="\\"1.3.1\\""
+
+PRIVATE
+    # Use C99 intrinsics to speed up float-to-int conversion
+    HAVE_LRINTF
+)
+
+if (OPUS_FIXED_POINT) # same option that selects the silk/fixed sources above
+    target_compile_definitions(opus PRIVATE -DFIXED_POINT=1 -DDISABLE_FLOAT_API)
+endif()
+
+target_include_directories(opus
+PUBLIC
+    opus/include
+
+PRIVATE
+    opus/celt
+    opus/silk
+    opus/silk/fixed
+    opus/silk/float
+    opus/src
+)
diff --git a/externals/opus/opus b/externals/opus/opus
new file mode 160000
+Subproject ad8fe90db79b7d2a135e3dfd2ed6631b0c5662a
diff --git a/externals/sirit b/externals/sirit
-Subproject a62c5bbc100a5e5a31ea0ccc4a78d8fa6a4167c
+Subproject eefca56afd49379bdebc97ded8b480839f93088
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 3a57356ab..1e977e8a8 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -62,6 +62,10 @@ else()
         -Wno-unused-parameter
     )
 
+    if (ARCHITECTURE_x86_64)
+        add_compile_options("-mcx16")
+    endif()
+
     if (APPLE AND CMAKE_CXX_COMPILER_ID STREQUAL Clang)
         add_compile_options("-stdlib=libc++")
     endif()
diff --git a/src/audio_core/audio_renderer.cpp b/src/audio_core/audio_renderer.cpp
index 50846a854..d64452617 100644
--- a/src/audio_core/audio_renderer.cpp
+++ b/src/audio_core/audio_renderer.cpp
@@ -180,11 +180,12 @@ ResultVal<std::vector<u8>> AudioRenderer::UpdateAudioRenderer(const std::vector<
 
     // Copy output header
     UpdateDataHeader response_data{worker_params};
-    std::vector<u8> output_params(response_data.total_size);
     if (behavior_info.IsElapsedFrameCountSupported()) {
-        response_data.frame_count = 0x10;
-        response_data.total_size += 0x10;
+        response_data.render_info = sizeof(RendererInfo);
+        response_data.total_size += sizeof(RendererInfo);
     }
+
+    std::vector<u8> output_params(response_data.total_size);
     std::memcpy(output_params.data(), &response_data, sizeof(UpdateDataHeader));
 
     // Copy output memory pool entries
@@ -219,6 +220,17 @@ ResultVal<std::vector<u8>> AudioRenderer::UpdateAudioRenderer(const std::vector<
         return Audren::ERR_INVALID_PARAMETERS;
     }
 
+    if (behavior_info.IsElapsedFrameCountSupported()) {
+        const std::size_t renderer_info_offset{
+            sizeof(UpdateDataHeader) + response_data.memory_pools_size + response_data.voices_size +
+            response_data.effects_size + response_data.sinks_size +
+            response_data.performance_manager_size + response_data.behavior_size};
+        RendererInfo renderer_info{};
+        renderer_info.elasped_frame_count = elapsed_frame_count;
+        std::memcpy(output_params.data() + renderer_info_offset, &renderer_info,
+                    sizeof(RendererInfo));
+    }
+
     return MakeResult(output_params);
 }
 
@@ -447,6 +459,7 @@ void AudioRenderer::QueueMixedBuffer(Buffer::Tag tag) {
         }
     }
     audio_out->QueueBuffer(stream, tag, std::move(buffer));
+    elapsed_frame_count++;
 }
 
 void AudioRenderer::ReleaseAndQueueBuffers() {
diff --git a/src/audio_core/audio_renderer.h b/src/audio_core/audio_renderer.h
index 1f9114c07..f0b691a86 100644
--- a/src/audio_core/audio_renderer.h
+++ b/src/audio_core/audio_renderer.h
@@ -196,6 +196,12 @@ struct EffectOutStatus {
 };
 static_assert(sizeof(EffectOutStatus) == 0x10, "EffectOutStatus is an invalid size");
 
+struct RendererInfo {
+    u64_le elasped_frame_count{};
+    INSERT_PADDING_WORDS(2);
+};
+static_assert(sizeof(RendererInfo) == 0x10, "RendererInfo is an invalid size");
+
 struct UpdateDataHeader {
     UpdateDataHeader() {}
 
@@ -209,7 +215,7 @@ struct UpdateDataHeader {
         mixes_size = 0x0;
         sinks_size = config.sink_count * 0x20;
         performance_manager_size = 0x10;
-        frame_count = 0;
+        render_info = 0;
         total_size = sizeof(UpdateDataHeader) + behavior_size + memory_pools_size + voices_size +
                      effects_size + sinks_size + performance_manager_size;
     }
@@ -223,8 +229,8 @@ struct UpdateDataHeader {
     u32_le mixes_size{};
     u32_le sinks_size{};
     u32_le performance_manager_size{};
-    INSERT_PADDING_WORDS(1);
-    u32_le frame_count{};
+    u32_le splitter_size{};
+    u32_le render_info{};
     INSERT_PADDING_WORDS(4);
     u32_le total_size{};
 };
@@ -258,6 +264,7 @@ private:
     std::unique_ptr<AudioOut> audio_out;
     StreamPtr stream;
     Core::Memory::Memory& memory;
+    std::size_t elapsed_frame_count{};
 };
 
 } // namespace AudioCore
diff --git a/src/audio_core/cubeb_sink.cpp b/src/audio_core/cubeb_sink.cpp
index c4e0e30fe..41bf5cd4d 100644
--- a/src/audio_core/cubeb_sink.cpp
+++ b/src/audio_core/cubeb_sink.cpp
@@ -193,7 +193,7 @@ long CubebSinkStream::DataCallback(cubeb_stream* stream, void* user_data, const
     const std::size_t samples_to_write = num_channels * num_frames;
     std::size_t samples_written;
 
-    if (Settings::values.enable_audio_stretching) {
+    if (Settings::values.enable_audio_stretching.GetValue()) {
         const std::vector<s16> in{impl->queue.Pop()};
         const std::size_t num_in{in.size() / num_channels};
         s16* const out{reinterpret_cast<s16*>(buffer)};
diff --git a/src/audio_core/stream.cpp b/src/audio_core/stream.cpp
index 4ca98f8ea..aab3e979a 100644
--- a/src/audio_core/stream.cpp
+++ b/src/audio_core/stream.cpp
@@ -38,7 +38,7 @@ Stream::Stream(Core::Timing::CoreTiming& core_timing, u32 sample_rate, Format fo
       sink_stream{sink_stream}, core_timing{core_timing}, name{std::move(name_)} {
 
     release_event = Core::Timing::CreateEvent(
-        name, [this](u64 userdata, s64 cycles_late) { ReleaseActiveBuffer(); });
+        name, [this](u64 userdata, s64 cycles_late) { ReleaseActiveBuffer(cycles_late); });
 }
 
 void Stream::Play() {
@@ -59,15 +59,15 @@ Stream::State Stream::GetState() const {
     return state;
 }
 
-s64 Stream::GetBufferReleaseCycles(const Buffer& buffer) const {
+s64 Stream::GetBufferReleaseNS(const Buffer& buffer) const {
     const std::size_t num_samples{buffer.GetSamples().size() / GetNumChannels()};
-    const auto us =
-        std::chrono::microseconds((static_cast<u64>(num_samples) * 1000000) / sample_rate);
-    return Core::Timing::usToCycles(us);
+    const auto ns =
+        std::chrono::nanoseconds((static_cast<u64>(num_samples) * 1000000000ULL) / sample_rate);
+    return ns.count();
 }
 
 static void VolumeAdjustSamples(std::vector<s16>& samples, float game_volume) {
-    const float volume{std::clamp(Settings::values.volume - (1.0f - game_volume), 0.0f, 1.0f)};
+    const float volume{std::clamp(Settings::Volume() - (1.0f - game_volume), 0.0f, 1.0f)};
 
     if (volume == 1.0f) {
         return;
@@ -80,7 +80,7 @@ static void VolumeAdjustSamples(std::vector<s16>& samples, float game_volume) {
     }
 }
 
-void Stream::PlayNextBuffer() {
+void Stream::PlayNextBuffer(s64 cycles_late) {
     if (!IsPlaying()) {
         // Ensure we are in playing state before playing the next buffer
         sink_stream.Flush();
@@ -105,14 +105,17 @@ void Stream::PlayNextBuffer() {
 
     sink_stream.EnqueueSamples(GetNumChannels(), active_buffer->GetSamples());
 
-    core_timing.ScheduleEvent(GetBufferReleaseCycles(*active_buffer), release_event, {});
+    core_timing.ScheduleEvent(
+        GetBufferReleaseNS(*active_buffer) -
+            (Settings::values.enable_audio_stretching.GetValue() ? 0 : cycles_late),
+        release_event, {});
 }
 
-void Stream::ReleaseActiveBuffer() {
+void Stream::ReleaseActiveBuffer(s64 cycles_late) {
     ASSERT(active_buffer);
     released_buffers.push(std::move(active_buffer));
     release_callback();
-    PlayNextBuffer();
+    PlayNextBuffer(cycles_late);
 }
 
 bool Stream::QueueBuffer(BufferPtr&& buffer) {
diff --git a/src/audio_core/stream.h b/src/audio_core/stream.h
index 1708a4d98..524376257 100644
--- a/src/audio_core/stream.h
+++ b/src/audio_core/stream.h
@@ -90,13 +90,16 @@ public:
 
 private:
     /// Plays the next queued buffer in the audio stream, starting playback if necessary
-    void PlayNextBuffer();
+    void PlayNextBuffer(s64 cycles_late = 0);
 
     /// Releases the actively playing buffer, signalling that it has been completed
-    void ReleaseActiveBuffer();
+    void ReleaseActiveBuffer(s64 cycles_late = 0);
 
     /// Gets the number of core cycles when the specified buffer will be released
-    s64 GetBufferReleaseCycles(const Buffer& buffer) const;
+    s64 GetBufferReleaseNS(const Buffer& buffer) const;
+
+    /// Gets the number of core cycles when the specified buffer will be released
+    s64 GetBufferReleaseNSHostTiming(const Buffer& buffer) const;
 
     u32 sample_rate;                  ///< Sample rate of the stream
     Format format;                    ///< Format of the stream
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index 24b7a083c..d120c8d3d 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -32,6 +32,8 @@ add_custom_command(OUTPUT scm_rev.cpp
     DEPENDS
       # WARNING! It was too much work to try and make a common location for this list,
       # so if you need to change it, please update CMakeModules/GenerateSCMRev.cmake as well
+      "${VIDEO_CORE}/renderer_opengl/gl_arb_decompiler.cpp"
+      "${VIDEO_CORE}/renderer_opengl/gl_arb_decompiler.h"
       "${VIDEO_CORE}/renderer_opengl/gl_shader_cache.cpp"
       "${VIDEO_CORE}/renderer_opengl/gl_shader_cache.h"
       "${VIDEO_CORE}/renderer_opengl/gl_shader_decompiler.cpp"
@@ -96,6 +98,8 @@ add_library(common STATIC
     algorithm.h
     alignment.h
     assert.h
+    atomic_ops.cpp
+    atomic_ops.h
     detached_tasks.cpp
     detached_tasks.h
     bit_field.h
@@ -108,6 +112,8 @@ add_library(common STATIC
     common_types.h
     dynamic_library.cpp
     dynamic_library.h
+    fiber.cpp
+    fiber.h
     file_util.cpp
     file_util.h
     hash.h
@@ -141,6 +147,8 @@ add_library(common STATIC
     scm_rev.cpp
     scm_rev.h
     scope_exit.h
+    spin_lock.cpp
+    spin_lock.h
     string_util.cpp
     string_util.h
     swap.h
@@ -161,6 +169,8 @@ add_library(common STATIC
     vector_math.h
     virtual_buffer.cpp
     virtual_buffer.h
+    wall_clock.cpp
+    wall_clock.h
     web_result.h
     zstd_compression.cpp
     zstd_compression.h
@@ -171,12 +181,15 @@ if(ARCHITECTURE_x86_64)
         PRIVATE
             x64/cpu_detect.cpp
             x64/cpu_detect.h
+            x64/native_clock.cpp
+            x64/native_clock.h
             x64/xbyak_abi.h
             x64/xbyak_util.h
     )
 endif()
 
 create_target_directory_groups(common)
+find_package(Boost 1.71 COMPONENTS context headers REQUIRED) # context: fcontext used by fiber.cpp
 
-target_link_libraries(common PUBLIC Boost::boost fmt::fmt microprofile)
+target_link_libraries(common PUBLIC ${Boost_LIBRARIES} fmt::fmt microprofile)
 target_link_libraries(common PRIVATE lz4::lz4 zstd::zstd xbyak)
diff --git a/src/common/atomic_ops.cpp b/src/common/atomic_ops.cpp
new file mode 100644
index 000000000..1098e21ff
--- /dev/null
+++ b/src/common/atomic_ops.cpp
@@ -0,0 +1,70 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <cstring>
+
+#include "common/atomic_ops.h"
+
+#if _MSC_VER
+#include <intrin.h>
+#endif
+
+namespace Common {
+
+// Every overload stores `value` to `*pointer` only if it holds `expected`; true means it did.
+#if _MSC_VER
+bool AtomicCompareAndSwap(u8 volatile* pointer, u8 value, u8 expected) {
+    const auto target = reinterpret_cast<volatile char*>(pointer);
+    return static_cast<u8>(_InterlockedCompareExchange8(target, value, expected)) == expected;
+}
+
+bool AtomicCompareAndSwap(u16 volatile* pointer, u16 value, u16 expected) {
+    const auto target = reinterpret_cast<volatile short*>(pointer);
+    return static_cast<u16>(_InterlockedCompareExchange16(target, value, expected)) == expected;
+}
+
+bool AtomicCompareAndSwap(u32 volatile* pointer, u32 value, u32 expected) {
+    const auto target = reinterpret_cast<volatile long*>(pointer);
+    return static_cast<u32>(_InterlockedCompareExchange(target, value, expected)) == expected;
+}
+
+bool AtomicCompareAndSwap(u64 volatile* pointer, u64 value, u64 expected) {
+    const auto target = reinterpret_cast<volatile __int64*>(pointer);
+    return static_cast<u64>(_InterlockedCompareExchange64(target, value, expected)) == expected;
+}
+
+bool AtomicCompareAndSwap(u64 volatile* pointer, u128 value, u128 expected) {
+    // The intrinsic takes the new value as (high, low) halves and the comparand by pointer.
+    const auto target = reinterpret_cast<volatile __int64*>(pointer);
+    const auto comparand = reinterpret_cast<__int64*>(expected.data());
+    return _InterlockedCompareExchange128(target, value[1], value[0], comparand) != 0;
+}
+
+#else
+bool AtomicCompareAndSwap(u8 volatile* pointer, u8 value, u8 expected) {
+    return __sync_bool_compare_and_swap(pointer, expected, value);
+}
+
+bool AtomicCompareAndSwap(u16 volatile* pointer, u16 value, u16 expected) {
+    return __sync_bool_compare_and_swap(pointer, expected, value);
+}
+
+bool AtomicCompareAndSwap(u32 volatile* pointer, u32 value, u32 expected) {
+    return __sync_bool_compare_and_swap(pointer, expected, value);
+}
+
+bool AtomicCompareAndSwap(u64 volatile* pointer, u64 value, u64 expected) {
+    return __sync_bool_compare_and_swap(pointer, expected, value);
+}
+
+bool AtomicCompareAndSwap(u64 volatile* pointer, u128 value, u128 expected) {
+    unsigned __int128 desired, comparand;
+    std::memcpy(&desired, value.data(), sizeof(u128));
+    std::memcpy(&comparand, expected.data(), sizeof(u128));
+    const auto target = reinterpret_cast<volatile unsigned __int128*>(pointer);
+    return __sync_bool_compare_and_swap(target, comparand, desired);
+}
+
+#endif
+
+} // namespace Common
diff --git a/src/common/atomic_ops.h b/src/common/atomic_ops.h
new file mode 100644
index 000000000..e6181d521
--- /dev/null
+++ b/src/common/atomic_ops.h
@@ -0,0 +1,17 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "common/common_types.h"
+
+namespace Common {
+// Compare-and-swap: stores `value` to `*pointer` if it equals `expected`; returns true on swap.
+bool AtomicCompareAndSwap(u8 volatile* pointer, u8 value, u8 expected);
+bool AtomicCompareAndSwap(u16 volatile* pointer, u16 value, u16 expected);
+bool AtomicCompareAndSwap(u32 volatile* pointer, u32 value, u32 expected);
+bool AtomicCompareAndSwap(u64 volatile* pointer, u64 value, u64 expected);
+bool AtomicCompareAndSwap(u64 volatile* pointer, u128 value, u128 expected); // 128-bit wide
+
+} // namespace Common
diff --git a/src/common/fiber.cpp b/src/common/fiber.cpp
new file mode 100644
index 000000000..1c1d09ccb
--- /dev/null
+++ b/src/common/fiber.cpp
@@ -0,0 +1,222 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/assert.h"
+#include "common/fiber.h"
+#if defined(_WIN32) || defined(WIN32)
+#include <windows.h>
+#else
+#include <boost/context/detail/fcontext.hpp>
+#endif
+
+namespace Common {
+
+constexpr std::size_t default_stack_size = 256 * 1024; // 256 KiB, used for every fiber stack
+
+#if defined(_WIN32) || defined(WIN32)
+
+struct Fiber::FiberImpl { // Win32 backend state
+    LPVOID handle = nullptr;        // fiber from CreateFiber() or ConvertThreadToFiber()
+    LPVOID rewind_handle = nullptr; // fiber created by Rewind(); becomes `handle` in OnRewind()
+};
+
+void Fiber::Start() { // body of a fiber created by the entry-point constructor
+    ASSERT(previous_fiber != nullptr);
+    previous_fiber->guard.unlock(); // the fiber that yielded to us is no longer running
+    previous_fiber.reset();
+    entry_point(start_parameter);
+    UNREACHABLE(); // entry_point is expected never to return
+}
+
+void Fiber::OnRewind() { // runs on the fresh fiber created by Rewind()
+    ASSERT(impl->handle != nullptr);
+    DeleteFiber(impl->handle); // drop the previous fiber context
+    impl->handle = impl->rewind_handle; // the rewind fiber becomes this Fiber's context
+    impl->rewind_handle = nullptr;
+    rewind_point(rewind_parameter);
+    UNREACHABLE(); // rewind_point is expected never to return
+}
+
+void Fiber::FiberStartFunc(void* fiber_parameter) {
+    // CreateFiber trampoline: the parameter is the Fiber registered by the constructor.
+    static_cast<Fiber*>(fiber_parameter)->Start();
+}
+
+void Fiber::RewindStartFunc(void* fiber_parameter) {
+    // CreateFiber trampoline used by Rewind(): the parameter is the Fiber being rewound.
+    static_cast<Fiber*>(fiber_parameter)->OnRewind();
+}
+
+Fiber::Fiber(std::function<void(void*)>&& entry_point_func, void* start_parameter)
+    : entry_point{std::move(entry_point_func)}, start_parameter{start_parameter},
+      impl{std::make_unique<FiberImpl>()} {
+    impl->handle = CreateFiber(default_stack_size, &FiberStartFunc, this);
+}
+
+Fiber::Fiber() : impl{std::make_unique<FiberImpl>()} {} // used by ThreadToFiber()
+
+Fiber::~Fiber() {
+    // A fiber already released through Exit() is skipped entirely.
+    if (!released) {
+        // Probe the guard: failing to take it means the fiber is still running somewhere.
+        const bool acquired = guard.try_lock();
+        ASSERT_MSG(acquired, "Destroying a fiber that's still running");
+        if (acquired) {
+            guard.unlock();
+        }
+        DeleteFiber(impl->handle);
+    }
+}
+
+void Fiber::Exit() {
+    // Only a fiber obtained from ThreadToFiber() may exit; other fibers are left untouched.
+    ASSERT_MSG(is_thread_fiber, "Exitting non main thread fiber");
+    if (is_thread_fiber) {
+        ConvertFiberToThread();
+        guard.unlock();
+        released = true;
+    }
+}
+
+void Fiber::SetRewindPoint(std::function<void(void*)>&& rewind_func, void* start_parameter) {
+    rewind_point = std::move(rewind_func); // invoked by OnRewind() after a Rewind()
+    rewind_parameter = start_parameter;    // argument handed to rewind_point
+}
+
+void Fiber::Rewind() { // restarts execution at rewind_point on a newly created fiber
+    ASSERT(rewind_point);
+    ASSERT(impl->rewind_handle == nullptr);
+    impl->rewind_handle = CreateFiber(default_stack_size, &RewindStartFunc, this);
+    SwitchToFiber(impl->rewind_handle); // continues in RewindStartFunc -> OnRewind()
+}
+
+void Fiber::YieldTo(std::shared_ptr<Fiber>& from, std::shared_ptr<Fiber>& to) {
+    ASSERT_MSG(from != nullptr, "Yielding fiber is null!");
+    ASSERT_MSG(to != nullptr, "Next fiber is null!");
+    to->guard.lock(); // spins while `to` is still running elsewhere
+    to->previous_fiber = from; // `to` unlocks `from` right after the switch
+    SwitchToFiber(to->impl->handle);
+    ASSERT(from->previous_fiber != nullptr); // execution resumes here once `from` is yielded to
+    from->previous_fiber->guard.unlock();
+    from->previous_fiber.reset();
+}
+
+std::shared_ptr<Fiber> Fiber::ThreadToFiber() { // wraps the calling thread in a Fiber
+    std::shared_ptr<Fiber> thread_fiber{new Fiber()};
+    thread_fiber->guard.lock(); // held from the start: this fiber is the one running
+    thread_fiber->impl->handle = ConvertThreadToFiber(nullptr);
+    thread_fiber->is_thread_fiber = true;
+    return thread_fiber;
+}
+
+#else
+
+struct Fiber::FiberImpl { // boost::context backend state
+    alignas(64) std::array<u8, default_stack_size> stack;
+    alignas(64) std::array<u8, default_stack_size> rewind_stack;
+    u8* stack_limit;        // start of the stack buffer currently in use
+    u8* rewind_stack_limit; // start of the buffer the next Rewind() will run on
+    boost::context::detail::fcontext_t context;        // context used to enter/resume the fiber
+    boost::context::detail::fcontext_t rewind_context; // set by Rewind(), consumed by OnRewind()
+};
+
+void Fiber::Start(boost::context::detail::transfer_t& transfer) {
+    ASSERT(previous_fiber != nullptr);
+    previous_fiber->impl->context = transfer.fctx; // remember where the yielding fiber resumes
+    previous_fiber->guard.unlock();
+    previous_fiber.reset();
+    entry_point(start_parameter);
+    UNREACHABLE(); // entry_point is expected never to return
+}
+
+void Fiber::OnRewind([[maybe_unused]] boost::context::detail::transfer_t& transfer) {
+    ASSERT(impl->context != nullptr);
+    impl->context = impl->rewind_context;
+    impl->rewind_context = nullptr;
+    u8* tmp = impl->stack_limit; // swap the two buffers: we now run on the former rewind stack
+    impl->stack_limit = impl->rewind_stack_limit;
+    impl->rewind_stack_limit = tmp;
+    rewind_point(rewind_parameter);
+    UNREACHABLE(); // rewind_point is expected never to return
+}
+
+void Fiber::FiberStartFunc(boost::context::detail::transfer_t transfer) {
+    // make_fcontext trampoline: transfer.data is the Fiber handed to jump_fcontext.
+    static_cast<Fiber*>(transfer.data)->Start(transfer);
+}
+
+void Fiber::RewindStartFunc(boost::context::detail::transfer_t transfer) {
+    // make_fcontext trampoline used by Rewind(): transfer.data is the Fiber being rewound.
+    static_cast<Fiber*>(transfer.data)->OnRewind(transfer);
+}
+
+Fiber::Fiber(std::function<void(void*)>&& entry_point_func, void* start_parameter)
+    : entry_point{std::move(entry_point_func)}, start_parameter{start_parameter},
+      impl{std::make_unique<FiberImpl>()} {
+    impl->stack_limit = impl->stack.data();
+    impl->rewind_stack_limit = impl->rewind_stack.data();
+    u8* const stack_top = impl->stack_limit + default_stack_size; // end of the stack buffer
+    impl->context =
+        boost::context::detail::make_fcontext(stack_top, impl->stack.size(), FiberStartFunc);
+}
+
+void Fiber::SetRewindPoint(std::function<void(void*)>&& rewind_func, void* start_parameter) {
+    rewind_point = std::move(rewind_func); // invoked by OnRewind() after a Rewind()
+    rewind_parameter = start_parameter;    // argument handed to rewind_point
+}
+
+Fiber::Fiber() : impl{std::make_unique<FiberImpl>()} {} // used by ThreadToFiber()
+
+Fiber::~Fiber() {
+    // A fiber already released through Exit() is skipped entirely.
+    if (!released) {
+        // Probe the guard: failing to take it means the fiber is still running somewhere.
+        const bool acquired = guard.try_lock();
+        ASSERT_MSG(acquired, "Destroying a fiber that's still running");
+        if (acquired) {
+            guard.unlock();
+        }
+    }
+}
+
+void Fiber::Exit() {
+    // Only a fiber obtained from ThreadToFiber() may exit; other fibers are left untouched.
+    ASSERT_MSG(is_thread_fiber, "Exitting non main thread fiber");
+    if (is_thread_fiber) {
+        // Release the guard and flag the fiber so that the destructor skips its running check.
+        guard.unlock();
+        released = true;
+    }
+}
+
+void Fiber::Rewind() { // restarts execution at rewind_point on the spare stack
+    ASSERT(rewind_point);
+    ASSERT(impl->rewind_context == nullptr);
+    u8* stack_base = impl->rewind_stack_limit + default_stack_size; // end of the spare buffer
+    impl->rewind_context =
+        boost::context::detail::make_fcontext(stack_base, impl->stack.size(), RewindStartFunc);
+    boost::context::detail::jump_fcontext(impl->rewind_context, this); // enters RewindStartFunc
+}
+
+void Fiber::YieldTo(std::shared_ptr<Fiber>& from, std::shared_ptr<Fiber>& to) {
+    ASSERT_MSG(from != nullptr, "Yielding fiber is null!");
+    ASSERT_MSG(to != nullptr, "Next fiber is null!");
+    to->guard.lock(); // spins while `to` is still running elsewhere
+    to->previous_fiber = from; // `to` unlocks `from` right after the switch
+    auto transfer = boost::context::detail::jump_fcontext(to->impl->context, to.get());
+    ASSERT(from->previous_fiber != nullptr); // execution resumes here once `from` is yielded to
+    from->previous_fiber->impl->context = transfer.fctx; // save where that fiber left off
+    from->previous_fiber->guard.unlock();
+    from->previous_fiber.reset();
+}
+
+std::shared_ptr<Fiber> Fiber::ThreadToFiber() { // wraps the calling thread in a Fiber
+    std::shared_ptr<Fiber> thread_fiber{new Fiber()};
+    thread_fiber->guard.lock(); // held from the start: this fiber is the one running
+    thread_fiber->is_thread_fiber = true;
+    return thread_fiber;
+}
+
+#endif
+} // namespace Common
diff --git a/src/common/fiber.h b/src/common/fiber.h
new file mode 100644
index 000000000..dafc1100e
--- /dev/null
+++ b/src/common/fiber.h
@@ -0,0 +1,92 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <functional>
+#include <memory>
+
+#include "common/common_types.h"
+#include "common/spin_lock.h"
+
+#if !defined(_WIN32) && !defined(WIN32)
+namespace boost::context::detail {
+struct transfer_t;
+}
+#endif
+
+namespace Common {
+
+/**
+ * Fiber class
+ * A fiber is a userspace thread with its own context. Fibers can be used to
+ * implement coroutines, emulated threading systems and certain asynchronous
+ * patterns.
+ *
+ * This class implements fibers at a low level, thus allowing greater freedom
+ * to implement such patterns. The class is 'threadsafe': only one thread can run
+ * a given fiber at a time, and a thread that yields to a running fiber spins
+ * until that fiber yields. WARNING: exchanging two running fibers between
+ * threads will cause a deadlock. To prevent this, each thread should have an
+ * intermediary fiber: switch to the intermediary fiber of the current thread
+ * first, and from it switch to the expected fiber. This way two fibers can be
+ * exchanged between two different threads.
+ */
+class Fiber {
+public:
+    Fiber(std::function<void(void*)>&& entry_point_func, void* start_parameter);
+    ~Fiber();
+
+    Fiber(const Fiber&) = delete;
+    Fiber& operator=(const Fiber&) = delete;
+
+    Fiber(Fiber&&) = default; // NOTE(review): likely deleted in practice (SpinLock member)
+    Fiber& operator=(Fiber&&) = default;
+
+    /// Yields control from Fiber 'from' to Fiber 'to'
+    /// Fiber 'from' must be the currently running fiber.
+    static void YieldTo(std::shared_ptr<Fiber>& from, std::shared_ptr<Fiber>& to);
+    static std::shared_ptr<Fiber> ThreadToFiber(); // wraps the calling thread in a Fiber
+
+    void SetRewindPoint(std::function<void(void*)>&& rewind_func, void* start_parameter);
+
+    void Rewind(); // restarts the fiber at the function given to SetRewindPoint()
+
+    /// Only call from main thread's fiber
+    void Exit();
+
+    /// Changes the start parameter of the fiber. Has no effect if the fiber already started
+    void SetStartParameter(void* new_parameter) {
+        start_parameter = new_parameter;
+    }
+
+private:
+    Fiber(); // creates the backend state only; used by ThreadToFiber()
+
+#if defined(_WIN32) || defined(WIN32)
+    void OnRewind();
+    void Start();
+    static void FiberStartFunc(void* fiber_parameter);
+    static void RewindStartFunc(void* fiber_parameter);
+#else
+    void OnRewind(boost::context::detail::transfer_t& transfer);
+    void Start(boost::context::detail::transfer_t& transfer);
+    static void FiberStartFunc(boost::context::detail::transfer_t transfer);
+    static void RewindStartFunc(boost::context::detail::transfer_t transfer);
+#endif
+
+    struct FiberImpl; // platform-specific state, defined in fiber.cpp
+
+    SpinLock guard{};                        // held while this fiber is running
+    std::function<void(void*)> entry_point;  // executed when the fiber starts
+    std::function<void(void*)> rewind_point; // executed after Rewind()
+    void* rewind_parameter{};                // argument passed to rewind_point
+    void* start_parameter{};                 // argument passed to entry_point
+    std::shared_ptr<Fiber> previous_fiber;   // fiber that yielded to this one; reset after switch
+    std::unique_ptr<FiberImpl> impl;
+    bool is_thread_fiber{}; // true for fibers created by ThreadToFiber()
+    bool released{};        // set by Exit(); makes the destructor skip its checks
+};
+
+} // namespace Common
diff --git a/src/common/memory_detect.cpp b/src/common/memory_detect.cpp
index 3fdc309a2..8cff6ec37 100644
--- a/src/common/memory_detect.cpp
+++ b/src/common/memory_detect.cpp
@@ -9,10 +9,12 @@
 // clang-format on
 #else
 #include <sys/types.h>
-#ifdef __APPLE__
+#if defined(__APPLE__) || defined(__FreeBSD__)
 #include <sys/sysctl.h>
-#else
+#elif defined(__linux__)
 #include <sys/sysinfo.h>
+#else
+#include <unistd.h>
 #endif
 #endif
 
@@ -38,15 +40,26 @@ static MemoryInfo Detect() {
     // hw and vm are defined in sysctl.h
     // https://github.com/apple/darwin-xnu/blob/master/bsd/sys/sysctl.h#L471
     // sysctlbyname(const char *, void *, size_t *, void *, size_t);
-    sysctlbyname("hw.memsize", &ramsize, &sizeof_ramsize, NULL, 0);
-    sysctlbyname("vm.swapusage", &vmusage, &sizeof_vmusage, NULL, 0);
+    sysctlbyname("hw.memsize", &ramsize, &sizeof_ramsize, nullptr, 0);
+    sysctlbyname("vm.swapusage", &vmusage, &sizeof_vmusage, nullptr, 0);
     mem_info.TotalPhysicalMemory = ramsize;
     mem_info.TotalSwapMemory = vmusage.xsu_total;
-#else
+#elif defined(__FreeBSD__)
+    u_long physmem, swap_total;
+    std::size_t sizeof_u_long = sizeof(u_long);
+    // sysctlbyname(const char *, void *, size_t *, const void *, size_t);
+    sysctlbyname("hw.physmem", &physmem, &sizeof_u_long, nullptr, 0);
+    sysctlbyname("vm.swap_total", &swap_total, &sizeof_u_long, nullptr, 0);
+    mem_info.TotalPhysicalMemory = physmem;
+    mem_info.TotalSwapMemory = swap_total;
+#elif defined(__linux__)
     struct sysinfo meminfo;
     sysinfo(&meminfo);
     mem_info.TotalPhysicalMemory = meminfo.totalram;
     mem_info.TotalSwapMemory = meminfo.totalswap;
+#else
+    mem_info.TotalPhysicalMemory = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGE_SIZE);
+    mem_info.TotalSwapMemory = 0;
 #endif
 
     return mem_info;
diff --git a/src/common/spin_lock.cpp b/src/common/spin_lock.cpp
new file mode 100644
index 000000000..c1524220f
--- /dev/null
+++ b/src/common/spin_lock.cpp
@@ -0,0 +1,54 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/spin_lock.h"
+
+#if _MSC_VER
+#include <intrin.h>
+#if _M_AMD64
+#define __x86_64__ 1
+#endif
+#if _M_ARM64
+#define __aarch64__ 1
+#endif
+#else
+#if __x86_64__
+#include <xmmintrin.h>
+#endif
+#endif
+
+namespace {
+
+void ThreadPause() { // spin-wait hint; compiles to an empty function on other architectures
+#if __x86_64__
+    _mm_pause();
+#elif __aarch64__ && _MSC_VER
+    __yield();
+#elif __aarch64__
+    asm("yield");
+#endif
+}
+
+} // Anonymous namespace
+
+namespace Common {
+
+void SpinLock::lock() {
+    // Keep retrying, with a CPU pause hint in between, until the flag could be taken.
+    while (!try_lock()) {
+        ThreadPause();
+    }
+}
+
+void SpinLock::unlock() {
+    // Clearing the flag lets the next test_and_set observe the lock as free.
+    lck.clear(std::memory_order_release);
+}
+
+bool SpinLock::try_lock() {
+    // test_and_set returns the previous state: false means the flag was clear and is now ours.
+    return !lck.test_and_set(std::memory_order_acquire);
+}
+
+} // namespace Common
diff --git a/src/common/spin_lock.h b/src/common/spin_lock.h
new file mode 100644
index 000000000..1df5528c4
--- /dev/null
+++ b/src/common/spin_lock.h
@@ -0,0 +1,26 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <atomic>
+
+namespace Common {
+
+/**
+ * SpinLock class
+ * A lock similar to a mutex that makes a thread spin-wait instead of calling the
+ * supervisor. Should be used on short sequences of code.
+ */
+class SpinLock {
+public:
+    void lock();     // spins until the lock is acquired
+    void unlock();   // releases the lock
+    bool try_lock(); // single attempt; returns true if the lock was acquired
+
+private:
+    std::atomic_flag lck = ATOMIC_FLAG_INIT; // set while the lock is held
+};
+
+} // namespace Common
diff --git a/src/common/telemetry.cpp b/src/common/telemetry.cpp
index 200c6489a..16d42facd 100644
--- a/src/common/telemetry.cpp
+++ b/src/common/telemetry.cpp
@@ -60,6 +60,7 @@ void AppendCPUInfo(FieldCollection& fc) {
     fc.AddField(FieldType::UserSystem, "CPU_Extension_x64_AES", Common::GetCPUCaps().aes);
     fc.AddField(FieldType::UserSystem, "CPU_Extension_x64_AVX", Common::GetCPUCaps().avx);
     fc.AddField(FieldType::UserSystem, "CPU_Extension_x64_AVX2", Common::GetCPUCaps().avx2);
+    fc.AddField(FieldType::UserSystem, "CPU_Extension_x64_AVX512", Common::GetCPUCaps().avx512);
     fc.AddField(FieldType::UserSystem, "CPU_Extension_x64_BMI1", Common::GetCPUCaps().bmi1);
     fc.AddField(FieldType::UserSystem, "CPU_Extension_x64_BMI2", Common::GetCPUCaps().bmi2);
     fc.AddField(FieldType::UserSystem, "CPU_Extension_x64_FMA", Common::GetCPUCaps().fma);
diff --git a/src/common/thread.cpp b/src/common/thread.cpp
index 0cd2d10bf..8e5935e6a 100644
--- a/src/common/thread.cpp
+++ b/src/common/thread.cpp
@@ -25,6 +25,52 @@
 
 namespace Common {
 
+#ifdef _WIN32
+
+void SetCurrentThreadPriority(ThreadPriority new_priority) {
+    // Translate the portable level into its Win32 counterpart. Normal, as well as any
+    // value outside the enum, keeps the THREAD_PRIORITY_NORMAL default.
+    int win32_priority = THREAD_PRIORITY_NORMAL;
+    switch (new_priority) {
+    case ThreadPriority::Low:
+        win32_priority = THREAD_PRIORITY_BELOW_NORMAL;
+        break;
+    case ThreadPriority::High:
+        win32_priority = THREAD_PRIORITY_ABOVE_NORMAL;
+        break;
+    case ThreadPriority::VeryHigh:
+        win32_priority = THREAD_PRIORITY_HIGHEST;
+        break;
+    case ThreadPriority::Normal:
+    default:
+        break;
+    }
+
+    // Apply it to the calling thread.
+    SetThreadPriority(GetCurrentThread(), win32_priority);
+}
+
+#else
+
+void SetCurrentThreadPriority(ThreadPriority new_priority) {
+    // Scale the level (1 to 4, in quarters) onto the host's SCHED_OTHER priority range.
+    const s32 highest = sched_get_priority_max(SCHED_OTHER);
+    const s32 lowest = sched_get_priority_min(SCHED_OTHER);
+    const u32 step = static_cast<u32>(new_priority) + 1;
+
+    // The range is walked from `lowest` towards `highest`, whichever is numerically larger.
+    struct sched_param schedule;
+    if (highest <= lowest) {
+        schedule.sched_priority = lowest - ((lowest - highest) * step) / 4;
+    } else {
+        schedule.sched_priority = lowest + ((highest - lowest) * step) / 4;
+    }
+
+    pthread_setschedparam(pthread_self(), SCHED_OTHER, &schedule);
+}
+
+#endif
+
 #ifdef _MSC_VER
 
 // Sets the debugger-visible name of the current thread.
@@ -70,6 +116,12 @@ void SetCurrentThreadName(const char* name) {
 }
 #endif
 
+#if defined(_WIN32)
+void SetCurrentThreadName(const char* name) {
+    // Intentionally a no-op: thread names are not set on MinGW builds
+}
+#endif
+
 #endif
 
 } // namespace Common
diff --git a/src/common/thread.h b/src/common/thread.h
index 2fc071685..52b359413 100644
--- a/src/common/thread.h
+++ b/src/common/thread.h
@@ -9,6 +9,7 @@
 #include <cstddef>
 #include <mutex>
 #include <thread>
+#include "common/common_types.h"
 
 namespace Common {
 
@@ -28,8 +29,7 @@ public:
         is_set = false;
     }
 
-    template <class Duration>
-    bool WaitFor(const std::chrono::duration<Duration>& time) {
+    bool WaitFor(const std::chrono::nanoseconds& time) {
         std::unique_lock lk{mutex};
         if (!condvar.wait_for(lk, time, [this] { return is_set; }))
             return false;
@@ -86,6 +86,15 @@ private:
     std::size_t generation = 0; // Incremented once each time the barrier is used
 };
 
+enum class ThreadPriority : u32 { // host thread priority levels, lowest first
+    Low = 0,
+    Normal = 1,
+    High = 2,
+    VeryHigh = 3,
+};
+
+void SetCurrentThreadPriority(ThreadPriority new_priority); // applies to the calling thread
+
 void SetCurrentThreadName(const char* name);
 
 } // namespace Common
diff --git a/src/common/uint128.cpp b/src/common/uint128.cpp
index 32bf56730..16bf7c828 100644
--- a/src/common/uint128.cpp
+++ b/src/common/uint128.cpp
@@ -6,12 +6,38 @@
 #include <intrin.h>
 
 #pragma intrinsic(_umul128)
+#pragma intrinsic(_udiv128)
 #endif
 #include <cstring>
 #include "common/uint128.h"
 
 namespace Common {
 
+#ifdef _MSC_VER
+
+u64 MultiplyAndDivide64(u64 a, u64 b, u64 d) { // returns (a * b) / d using a 128-bit product
+    u128 r{};
+    r[0] = _umul128(a, b, &r[1]); // r[0] = low 64 bits, r[1] = high 64 bits
+    u64 remainder;
+#if _MSC_VER < 1923
+    return udiv128(r[1], r[0], d, &remainder); // NOTE(review): udiv128 is not declared in view
+#else
+    return _udiv128(r[1], r[0], d, &remainder);
+#endif
+}
+
+#else
+
+u64 MultiplyAndDivide64(u64 a, u64 b, u64 d) {
+    // Returns (a * b) / d. The full 128-bit product is formed before dividing: splitting both
+    // operands by d first needs (a % d) * (b % d), which wraps in 64 bits once d exceeds 2^32.
+    // As before, the quotient is truncated to its low 64 bits.
+    const unsigned __int128 product = static_cast<unsigned __int128>(a) * b;
+    return static_cast<u64>(product / d);
+}
+
+#endif
+
 u128 Multiply64Into128(u64 a, u64 b) {
     u128 result;
 #ifdef _MSC_VER
diff --git a/src/common/uint128.h b/src/common/uint128.h
index a3be2a2cb..503cd2d0c 100644
--- a/src/common/uint128.h
+++ b/src/common/uint128.h
@@ -9,6 +9,9 @@
 
 namespace Common {
 
+// This function multiplies 2 u64 values and divides the product by a u64 value.
+u64 MultiplyAndDivide64(u64 a, u64 b, u64 d);
+
 // This function multiplies 2 u64 values and produces a u128 value;
 u128 Multiply64Into128(u64 a, u64 b);
 
diff --git a/src/common/wall_clock.cpp b/src/common/wall_clock.cpp
new file mode 100644
index 000000000..3afbdb898
--- /dev/null
+++ b/src/common/wall_clock.cpp
@@ -0,0 +1,91 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/uint128.h"
+#include "common/wall_clock.h"
+
+#ifdef ARCHITECTURE_x86_64
+#include "common/x64/cpu_detect.h"
+#include "common/x64/native_clock.h"
+#endif
+
+namespace Common {
+
+using base_timer = std::chrono::steady_clock;
+using base_time_point = std::chrono::time_point<base_timer>;
+
+class StandardWallClock : public WallClock { // portable clock built on std::chrono::steady_clock
+public:
+    StandardWallClock(u64 emulated_cpu_frequency, u64 emulated_clock_frequency)
+        : WallClock(emulated_cpu_frequency, emulated_clock_frequency, false),
+          start_time{base_timer::now()} {}
+
+    std::chrono::nanoseconds GetTimeNS() override {
+        return ElapsedAs<std::chrono::nanoseconds>();
+    }
+
+    std::chrono::microseconds GetTimeUS() override {
+        return ElapsedAs<std::chrono::microseconds>();
+    }
+
+    std::chrono::milliseconds GetTimeMS() override {
+        return ElapsedAs<std::chrono::milliseconds>();
+    }
+
+    u64 GetClockCycles() override {
+        return ScaleElapsedNS(emulated_clock_frequency);
+    }
+
+    u64 GetCPUCycles() override {
+        return ScaleElapsedNS(emulated_cpu_frequency);
+    }
+
+    void Pause(bool is_paused) override {
+        // Do nothing in this clock type.
+    }
+
+private:
+    /// Time elapsed since construction, truncated to the requested duration type.
+    template <typename Duration>
+    Duration ElapsedAs() const {
+        return std::chrono::duration_cast<Duration>(base_timer::now() - start_time);
+    }
+
+    /// Converts the elapsed nanoseconds into ticks of a clock with `frequency` ticks per second.
+    u64 ScaleElapsedNS(u64 frequency) {
+        const u128 product = Common::Multiply64Into128(GetTimeNS().count(), frequency);
+        return Common::Divide128On32(product, 1000000000).first;
+    }
+
+    base_time_point start_time;
+};
+
+#ifdef ARCHITECTURE_x86_64
+
+std::unique_ptr<WallClock> CreateBestMatchingClock(u32 emulated_cpu_frequency,
+                                                   u32 emulated_clock_frequency) {
+    // The TSC only serves as a clock when the CPU reports it as invariant and a non-zero
+    // frequency could be measured for it.
+    const bool has_invariant_tsc = GetCPUCaps().invariant_tsc;
+    const u64 rtsc_frequency = has_invariant_tsc ? EstimateRDTSCFrequency() : 0;
+
+    if (rtsc_frequency != 0) {
+        return std::make_unique<X64::NativeClock>(emulated_cpu_frequency, emulated_clock_frequency,
+                                                  rtsc_frequency);
+    }
+
+    // Otherwise fall back to the portable std::chrono based clock.
+    return std::make_unique<StandardWallClock>(emulated_cpu_frequency, emulated_clock_frequency);
+}
+
+#else
+// Builds without ARCHITECTURE_x86_64 have no native clock: always use the std::chrono one.
+std::unique_ptr<WallClock> CreateBestMatchingClock(u32 emulated_cpu_frequency,
+                                                   u32 emulated_clock_frequency) {
+    return std::make_unique<StandardWallClock>(emulated_cpu_frequency, emulated_clock_frequency);
+}
+
+#endif
+
+} // namespace Common
diff --git a/src/common/wall_clock.h b/src/common/wall_clock.h
new file mode 100644
index 000000000..367d72134
--- /dev/null
+++ b/src/common/wall_clock.h
@@ -0,0 +1,59 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <chrono>
+#include <memory>
+
+#include "common/common_types.h"
+
+namespace Common {
+
+/// Abstract host time source, reported as durations and as emulated cycle counts.
+class WallClock {
+public:
+    /// Virtual so that clocks owned through std::unique_ptr<WallClock> are destroyed correctly
+    virtual ~WallClock() = default;
+
+    /// Returns current wall time in nanoseconds
+    virtual std::chrono::nanoseconds GetTimeNS() = 0;
+
+    /// Returns current wall time in microseconds
+    virtual std::chrono::microseconds GetTimeUS() = 0;
+
+    /// Returns current wall time in milliseconds
+    virtual std::chrono::milliseconds GetTimeMS() = 0;
+
+    /// Returns current wall time in emulated clock cycles
+    virtual u64 GetClockCycles() = 0;
+
+    /// Returns current wall time in emulated cpu cycles
+    virtual u64 GetCPUCycles() = 0;
+
+    /// Notifies the clock of a pause state change; implementations may ignore it
+    virtual void Pause(bool is_paused) = 0;
+
+    /// Tells if the wall clock uses the host CPU's hardware clock
+    bool IsNative() const {
+        return is_native;
+    }
+
+protected:
+    WallClock(u64 emulated_cpu_frequency, u64 emulated_clock_frequency, bool is_native)
+        : emulated_cpu_frequency{emulated_cpu_frequency},
+          emulated_clock_frequency{emulated_clock_frequency}, is_native{is_native} {}
+
+    u64 emulated_cpu_frequency;
+    u64 emulated_clock_frequency;
+
+private:
+    bool is_native;
+};
+
+/// Creates the native clock when the host supports it, or a std::chrono based clock otherwise
+std::unique_ptr<WallClock> CreateBestMatchingClock(u32 emulated_cpu_frequency,
+                                                   u32 emulated_clock_frequency);
+
+} // namespace Common
diff --git a/src/common/x64/cpu_detect.cpp b/src/common/x64/cpu_detect.cpp
index c9349a6b4..fccd2eee5 100644
--- a/src/common/x64/cpu_detect.cpp
+++ b/src/common/x64/cpu_detect.cpp
@@ -62,6 +62,17 @@ static CPUCaps Detect() {
     std::memcpy(&caps.brand_string[0], &cpu_id[1], sizeof(int));
     std::memcpy(&caps.brand_string[4], &cpu_id[3], sizeof(int));
     std::memcpy(&caps.brand_string[8], &cpu_id[2], sizeof(int));
+    if (cpu_id[1] == 0x756e6547 && cpu_id[2] == 0x6c65746e && cpu_id[3] == 0x49656e69)
+        caps.manufacturer = Manufacturer::Intel;
+    else if (cpu_id[1] == 0x68747541 && cpu_id[2] == 0x444d4163 && cpu_id[3] == 0x69746e65)
+        caps.manufacturer = Manufacturer::AMD;
+    else if (cpu_id[1] == 0x6f677948 && cpu_id[2] == 0x656e6975 && cpu_id[3] == 0x6e65476e)
+        caps.manufacturer = Manufacturer::Hygon;
+    else
+        caps.manufacturer = Manufacturer::Unknown;
+
+    u32 family = {};
+    u32 model = {};
 
     __cpuid(cpu_id, 0x80000000);
 
@@ -73,6 +84,14 @@ static CPUCaps Detect() {
     // Detect family and other miscellaneous features
     if (max_std_fn >= 1) {
         __cpuid(cpu_id, 0x00000001);
+        family = (cpu_id[0] >> 8) & 0xf;
+        model = (cpu_id[0] >> 4) & 0xf;
+        if (family == 0xf) {
+            family += (cpu_id[0] >> 20) & 0xff;
+        }
+        if (family >= 6) {
+            model += ((cpu_id[0] >> 16) & 0xf) << 4;
+        }
 
         if ((cpu_id[3] >> 25) & 1)
             caps.sse = true;
@@ -110,6 +129,11 @@ static CPUCaps Detect() {
                 caps.bmi1 = true;
             if ((cpu_id[1] >> 8) & 1)
                 caps.bmi2 = true;
+            // Checks for AVX512F, AVX512CD, AVX512VL, AVX512DQ, AVX512BW (Intel Skylake-X/SP)
+            if ((cpu_id[1] >> 16) & 1 && (cpu_id[1] >> 28) & 1 && (cpu_id[1] >> 31) & 1 &&
+                (cpu_id[1] >> 17) & 1 && (cpu_id[1] >> 30) & 1) {
+                caps.avx512 = caps.avx2;
+            }
         }
     }
 
@@ -130,6 +154,20 @@ static CPUCaps Detect() {
             caps.fma4 = true;
     }
 
+    if (max_ex_fn >= 0x80000007) {
+        __cpuid(cpu_id, 0x80000007);
+        if (cpu_id[3] & (1 << 8)) {
+            caps.invariant_tsc = true;
+        }
+    }
+
+    if (max_std_fn >= 0x16) {
+        __cpuid(cpu_id, 0x16);
+        caps.base_frequency = cpu_id[0];
+        caps.max_frequency = cpu_id[1];
+        caps.bus_frequency = cpu_id[2];
+    }
+
     return caps;
 }
 
diff --git a/src/common/x64/cpu_detect.h b/src/common/x64/cpu_detect.h
index 20f2ba234..e3b63302e 100644
--- a/src/common/x64/cpu_detect.h
+++ b/src/common/x64/cpu_detect.h
@@ -6,8 +6,16 @@
 
 namespace Common {
 
+enum class Manufacturer : u32 { // CPU vendor, derived from the CPUID vendor string
+    Intel = 0,   // "GenuineIntel"
+    AMD = 1,     // "AuthenticAMD"
+    Hygon = 2,   // "HygonGenuine"
+    Unknown = 3, // any other vendor string
+};
+
 /// x86/x64 CPU capabilities that may be detected by this module
 struct CPUCaps {
+    Manufacturer manufacturer;
     char cpu_string[0x21];
     char brand_string[0x41];
     bool sse;
@@ -19,11 +27,16 @@ struct CPUCaps {
     bool lzcnt;
     bool avx;
     bool avx2;
+    bool avx512;
     bool bmi1;
     bool bmi2;
     bool fma;
     bool fma4;
     bool aes;
+    bool invariant_tsc;
+    u32 base_frequency;
+    u32 max_frequency;
+    u32 bus_frequency;
 };
 
 /**
diff --git a/src/common/x64/native_clock.cpp b/src/common/x64/native_clock.cpp
new file mode 100644
index 000000000..424b39b1f
--- /dev/null
+++ b/src/common/x64/native_clock.cpp
@@ -0,0 +1,103 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <chrono>
+#include <mutex>
+#include <thread>
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#else
+#include <x86intrin.h>
+#endif
+
+#include "common/uint128.h"
+#include "common/x64/native_clock.h"
+
+namespace Common {
+
+u64 EstimateRDTSCFrequency() {
+    const auto milli_10 = std::chrono::milliseconds{10};
+    // get current time
+    _mm_mfence();
+    const u64 tscStart = __rdtsc();
+    const auto startTime = std::chrono::high_resolution_clock::now();
+    // wait roughly 3 seconds
+    while (true) {
+        auto milli = std::chrono::duration_cast<std::chrono::milliseconds>(
+            std::chrono::high_resolution_clock::now() - startTime);
+        if (milli.count() >= 3000)
+            break;
+        std::this_thread::sleep_for(milli_10);
+    }
+    const auto endTime = std::chrono::high_resolution_clock::now();
+    _mm_mfence();
+    const u64 tscEnd = __rdtsc();
+    // calculate difference
+    const u64 timer_diff =
+        std::chrono::duration_cast<std::chrono::nanoseconds>(endTime - startTime).count();
+    const u64 tsc_diff = tscEnd - tscStart;
+    const u64 tsc_freq = MultiplyAndDivide64(tsc_diff, 1000000000ULL, timer_diff);
+    return tsc_freq;
+}
+
+namespace X64 {
+NativeClock::NativeClock(u64 emulated_cpu_frequency, u64 emulated_clock_frequency,
+                         u64 rtsc_frequency)
+    : WallClock(emulated_cpu_frequency, emulated_clock_frequency, true), rtsc_frequency{
+                                                                             rtsc_frequency} {
+    _mm_mfence();
+    last_measure = __rdtsc();
+    accumulated_ticks = 0U;
+}
+
+u64 NativeClock::GetRTSC() {
+    std::scoped_lock scope{rtsc_serialize};
+    _mm_mfence();
+    const u64 current_measure = __rdtsc();
+    u64 diff = current_measure - last_measure;
+    diff = diff & ~static_cast<u64>(static_cast<s64>(diff) >> 63); // max(diff, 0)
+    if (current_measure > last_measure) {
+        last_measure = current_measure;
+    }
+    accumulated_ticks += diff;
+    /// The clock cannot be more precise than the guest timer, remove the lower bits
+    return accumulated_ticks & inaccuracy_mask;
+}
+
+void NativeClock::Pause(bool is_paused) {
+    if (!is_paused) {
+        _mm_mfence();
+        last_measure = __rdtsc();
+    }
+}
+
+std::chrono::nanoseconds NativeClock::GetTimeNS() {
+    const u64 rtsc_value = GetRTSC();
+    return std::chrono::nanoseconds{MultiplyAndDivide64(rtsc_value, 1000000000, rtsc_frequency)};
+}
+
+std::chrono::microseconds NativeClock::GetTimeUS() {
+    const u64 rtsc_value = GetRTSC();
+    return std::chrono::microseconds{MultiplyAndDivide64(rtsc_value, 1000000, rtsc_frequency)};
+}
+
+std::chrono::milliseconds NativeClock::GetTimeMS() {
+    const u64 rtsc_value = GetRTSC();
+    return std::chrono::milliseconds{MultiplyAndDivide64(rtsc_value, 1000, rtsc_frequency)};
+}
+
+u64 NativeClock::GetClockCycles() {
+    const u64 rtsc_value = GetRTSC();
+    return MultiplyAndDivide64(rtsc_value, emulated_clock_frequency, rtsc_frequency);
+}
+
+u64 NativeClock::GetCPUCycles() {
+    const u64 rtsc_value = GetRTSC();
+    return MultiplyAndDivide64(rtsc_value, emulated_cpu_frequency, rtsc_frequency);
+}
+
+} // namespace X64
+
+} // namespace Common
diff --git a/src/common/x64/native_clock.h b/src/common/x64/native_clock.h
new file mode 100644
index 000000000..891a3bbfd
--- /dev/null
+++ b/src/common/x64/native_clock.h
@@ -0,0 +1,48 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <optional>
+
+#include "common/spin_lock.h"
+#include "common/wall_clock.h"
+
+namespace Common {
+
+namespace X64 {
+class NativeClock : public WallClock {
+public:
+    NativeClock(u64 emulated_cpu_frequency, u64 emulated_clock_frequency, u64 rtsc_frequency);
+
+    std::chrono::nanoseconds GetTimeNS() override;
+
+    std::chrono::microseconds GetTimeUS() override;
+
+    std::chrono::milliseconds GetTimeMS() override;
+
+    u64 GetClockCycles() override;
+
+    u64 GetCPUCycles() override;
+
+    void Pause(bool is_paused) override;
+
+private:
+    u64 GetRTSC();
+
+    /// Value used to reduce the native clock's accuracy, as some apps rely on
+    /// undefined behavior where the level of accuracy in the clock shouldn't
+    /// be higher.
+    static constexpr u64 inaccuracy_mask = ~(0x400 - 1);
+
+    SpinLock rtsc_serialize{};
+    u64 last_measure{};
+    u64 accumulated_ticks{};
+    u64 rtsc_frequency;
+};
+} // namespace X64
+
+u64 EstimateRDTSCFrequency();
+
+} // namespace Common
diff --git a/src/common/x64/xbyak_abi.h b/src/common/x64/xbyak_abi.h
index 794da8a52..a5f5d4fc1 100644
--- a/src/common/x64/xbyak_abi.h
+++ b/src/common/x64/xbyak_abi.h
@@ -11,7 +11,7 @@
 
 namespace Common::X64 {
 
-inline int RegToIndex(const Xbyak::Reg& reg) {
+inline std::size_t RegToIndex(const Xbyak::Reg& reg) {
     using Kind = Xbyak::Reg::Kind;
     ASSERT_MSG((reg.getKind() & (Kind::REG | Kind::XMM)) != 0,
                "RegSet only support GPRs and XMM registers.");
@@ -19,17 +19,17 @@ inline int RegToIndex(const Xbyak::Reg& reg) {
     return reg.getIdx() + (reg.getKind() == Kind::REG ? 0 : 16);
 }
 
-inline Xbyak::Reg64 IndexToReg64(int reg_index) {
+inline Xbyak::Reg64 IndexToReg64(std::size_t reg_index) {
     ASSERT(reg_index < 16);
-    return Xbyak::Reg64(reg_index);
+    return Xbyak::Reg64(static_cast<int>(reg_index));
 }
 
-inline Xbyak::Xmm IndexToXmm(int reg_index) {
+inline Xbyak::Xmm IndexToXmm(std::size_t reg_index) {
     ASSERT(reg_index >= 16 && reg_index < 32);
-    return Xbyak::Xmm(reg_index - 16);
+    return Xbyak::Xmm(static_cast<int>(reg_index - 16));
 }
 
-inline Xbyak::Reg IndexToReg(int reg_index) {
+inline Xbyak::Reg IndexToReg(std::size_t reg_index) {
     if (reg_index < 16) {
         return IndexToReg64(reg_index);
     } else {
@@ -151,9 +151,13 @@ constexpr size_t ABI_SHADOW_SPACE = 0;
 
 #endif
 
-inline void ABI_CalculateFrameSize(std::bitset<32> regs, size_t rsp_alignment,
-                                   size_t needed_frame_size, s32* out_subtraction,
-                                   s32* out_xmm_offset) {
+struct ABIFrameInfo {
+    s32 subtraction;
+    s32 xmm_offset;
+};
+
+inline ABIFrameInfo ABI_CalculateFrameSize(std::bitset<32> regs, size_t rsp_alignment,
+                                           size_t needed_frame_size) {
     const auto count = (regs & ABI_ALL_GPRS).count();
     rsp_alignment -= count * 8;
     size_t subtraction = 0;
@@ -170,33 +174,28 @@ inline void ABI_CalculateFrameSize(std::bitset<32> regs, size_t rsp_alignment,
     rsp_alignment -= subtraction;
     subtraction += rsp_alignment & 0xF;
 
-    *out_subtraction = (s32)subtraction;
-    *out_xmm_offset = (s32)(subtraction - xmm_base_subtraction);
+    return ABIFrameInfo{static_cast<s32>(subtraction),
+                        static_cast<s32>(subtraction - xmm_base_subtraction)};
 }
 
 inline size_t ABI_PushRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::bitset<32> regs,
                                               size_t rsp_alignment, size_t needed_frame_size = 0) {
-    s32 subtraction, xmm_offset;
-    ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset);
+    auto frame_info = ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size);
+
     for (std::size_t i = 0; i < regs.size(); ++i) {
         if (regs[i] && ABI_ALL_GPRS[i]) {
-            code.push(IndexToReg64(static_cast<int>(i)));
+            code.push(IndexToReg64(i));
         }
     }
-    if (subtraction != 0) {
-        code.sub(code.rsp, subtraction);
-    }
 
-    for (int i = 0; i < regs.count(); i++) {
-        if (regs.test(i) & ABI_ALL_GPRS.test(i)) {
-            code.push(IndexToReg64(i));
-        }
+    if (frame_info.subtraction != 0) {
+        code.sub(code.rsp, frame_info.subtraction);
     }
 
     for (std::size_t i = 0; i < regs.size(); ++i) {
         if (regs[i] && ABI_ALL_XMMS[i]) {
-            code.movaps(code.xword[code.rsp + xmm_offset], IndexToXmm(static_cast<int>(i)));
-            xmm_offset += 0x10;
+            code.movaps(code.xword[code.rsp + frame_info.xmm_offset], IndexToXmm(i));
+            frame_info.xmm_offset += 0x10;
         }
     }
 
@@ -205,59 +204,23 @@ inline size_t ABI_PushRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::b
 
 inline void ABI_PopRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::bitset<32> regs,
                                            size_t rsp_alignment, size_t needed_frame_size = 0) {
-    s32 subtraction, xmm_offset;
-    ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset);
+    auto frame_info = ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size);
 
     for (std::size_t i = 0; i < regs.size(); ++i) {
         if (regs[i] && ABI_ALL_XMMS[i]) {
-            code.movaps(IndexToXmm(static_cast<int>(i)), code.xword[code.rsp + xmm_offset]);
-            xmm_offset += 0x10;
+            code.movaps(IndexToXmm(i), code.xword[code.rsp + frame_info.xmm_offset]);
+            frame_info.xmm_offset += 0x10;
         }
     }
 
-    if (subtraction != 0) {
-        code.add(code.rsp, subtraction);
+    if (frame_info.subtraction != 0) {
+        code.add(code.rsp, frame_info.subtraction);
     }
 
     // GPRs need to be popped in reverse order
-    for (int i = 15; i >= 0; i--) {
-        if (regs[i]) {
-            code.pop(IndexToReg64(i));
-        }
-    }
-}
-
-inline size_t ABI_PushRegistersAndAdjustStackGPS(Xbyak::CodeGenerator& code, std::bitset<32> regs,
-                                                 size_t rsp_alignment,
-                                                 size_t needed_frame_size = 0) {
-    s32 subtraction, xmm_offset;
-    ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset);
-
-    for (std::size_t i = 0; i < regs.size(); ++i) {
+    for (std::size_t j = 0; j < regs.size(); ++j) {
+        const std::size_t i = regs.size() - j - 1;
         if (regs[i] && ABI_ALL_GPRS[i]) {
-            code.push(IndexToReg64(static_cast<int>(i)));
-        }
-    }
-
-    if (subtraction != 0) {
-        code.sub(code.rsp, subtraction);
-    }
-
-    return ABI_SHADOW_SPACE;
-}
-
-inline void ABI_PopRegistersAndAdjustStackGPS(Xbyak::CodeGenerator& code, std::bitset<32> regs,
-                                              size_t rsp_alignment, size_t needed_frame_size = 0) {
-    s32 subtraction, xmm_offset;
-    ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset);
-
-    if (subtraction != 0) {
-        code.add(code.rsp, subtraction);
-    }
-
-    // GPRs need to be popped in reverse order
-    for (int i = 15; i >= 0; i--) {
-        if (regs[i]) {
             code.pop(IndexToReg64(i));
         }
     }
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 47418006b..d1f173f42 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -7,6 +7,16 @@ endif()
 add_library(core STATIC
     arm/arm_interface.h
     arm/arm_interface.cpp
+    arm/cpu_interrupt_handler.cpp
+    arm/cpu_interrupt_handler.h
+    arm/dynarmic/arm_dynarmic_32.cpp
+    arm/dynarmic/arm_dynarmic_32.h
+    arm/dynarmic/arm_dynarmic_64.cpp
+    arm/dynarmic/arm_dynarmic_64.h
+    arm/dynarmic/arm_dynarmic_cp15.cpp
+    arm/dynarmic/arm_dynarmic_cp15.h
+    arm/dynarmic/arm_exclusive_monitor.cpp
+    arm/dynarmic/arm_exclusive_monitor.h
     arm/exclusive_monitor.cpp
     arm/exclusive_monitor.h
     arm/unicorn/arm_unicorn.cpp
@@ -15,8 +25,6 @@ add_library(core STATIC
     constants.h
     core.cpp
     core.h
-    core_manager.cpp
-    core_manager.h
     core_timing.cpp
     core_timing.h
     core_timing_util.cpp
@@ -606,11 +614,11 @@ endif()
 create_target_directory_groups(core)
 
 target_link_libraries(core PUBLIC common PRIVATE audio_core video_core)
-target_link_libraries(core PUBLIC Boost::boost PRIVATE fmt::fmt nlohmann_json::nlohmann_json mbedtls Opus::Opus unicorn)
+target_link_libraries(core PUBLIC Boost::boost PRIVATE fmt::fmt nlohmann_json::nlohmann_json mbedtls opus unicorn zip)
 
 if (YUZU_ENABLE_BOXCAT)
     target_compile_definitions(core PRIVATE -DYUZU_ENABLE_BOXCAT)
-    target_link_libraries(core PRIVATE httplib nlohmann_json::nlohmann_json zip)
+    target_link_libraries(core PRIVATE httplib nlohmann_json::nlohmann_json)
 endif()
 
 if (ENABLE_WEB_SERVICE)
diff --git a/src/core/arm/arm_interface.cpp b/src/core/arm/arm_interface.cpp
index d079a1bc8..d2295ed90 100644
--- a/src/core/arm/arm_interface.cpp
+++ b/src/core/arm/arm_interface.cpp
@@ -139,6 +139,63 @@ std::optional<std::string> GetSymbolName(const Symbols& symbols, VAddr func_addr
 
 constexpr u64 SEGMENT_BASE = 0x7100000000ull;
 
+std::vector<ARM_Interface::BacktraceEntry> ARM_Interface::GetBacktraceFromContext(
+    System& system, const ThreadContext64& ctx) {
+    std::vector<BacktraceEntry> out;
+    auto& memory = system.Memory();
+
+    auto fp = ctx.cpu_registers[29];
+    auto lr = ctx.cpu_registers[30];
+    while (true) {
+        out.push_back({"", 0, lr, 0});
+        if (!fp) {
+            break;
+        }
+        lr = memory.Read64(fp + 8) - 4;
+        fp = memory.Read64(fp);
+    }
+
+    std::map<VAddr, std::string> modules;
+    auto& loader{system.GetAppLoader()};
+    if (loader.ReadNSOModules(modules) != Loader::ResultStatus::Success) {
+        return {};
+    }
+
+    std::map<std::string, Symbols> symbols;
+    for (const auto& module : modules) {
+        symbols.insert_or_assign(module.second, GetSymbols(module.first, memory));
+    }
+
+    for (auto& entry : out) {
+        VAddr base = 0;
+        for (auto iter = modules.rbegin(); iter != modules.rend(); ++iter) {
+            const auto& module{*iter};
+            if (entry.original_address >= module.first) {
+                entry.module = module.second;
+                base = module.first;
+                break;
+            }
+        }
+
+        entry.offset = entry.original_address - base;
+        entry.address = SEGMENT_BASE + entry.offset;
+
+        if (entry.module.empty())
+            entry.module = "unknown";
+
+        const auto symbol_set = symbols.find(entry.module);
+        if (symbol_set != symbols.end()) {
+            const auto symbol = GetSymbolName(symbol_set->second, entry.offset);
+            if (symbol.has_value()) {
+                // TODO(DarkLordZach): Add demangling of symbol names.
+                entry.name = *symbol;
+            }
+        }
+    }
+
+    return out;
+}
+
 std::vector<ARM_Interface::BacktraceEntry> ARM_Interface::GetBacktrace() const {
     std::vector<BacktraceEntry> out;
     auto& memory = system.Memory();
diff --git a/src/core/arm/arm_interface.h b/src/core/arm/arm_interface.h
index cb2e640e2..1f24051e4 100644
--- a/src/core/arm/arm_interface.h
+++ b/src/core/arm/arm_interface.h
@@ -7,6 +7,7 @@
 #include <array>
 #include <vector>
 #include "common/common_types.h"
+#include "core/hardware_properties.h"
 
 namespace Common {
 struct PageTable;
@@ -18,25 +19,29 @@ enum class VMAPermission : u8;
 
 namespace Core {
 class System;
+class CPUInterruptHandler;
+
+using CPUInterrupts = std::array<CPUInterruptHandler, Core::Hardware::NUM_CPU_CORES>;
 
 /// Generic ARMv8 CPU interface
 class ARM_Interface : NonCopyable {
 public:
-    explicit ARM_Interface(System& system_) : system{system_} {}
+    explicit ARM_Interface(System& system_, CPUInterrupts& interrupt_handlers, bool uses_wall_clock)
+        : system{system_}, interrupt_handlers{interrupt_handlers}, uses_wall_clock{
+                                                                       uses_wall_clock} {}
     virtual ~ARM_Interface() = default;
 
     struct ThreadContext32 {
         std::array<u32, 16> cpu_registers{};
+        std::array<u32, 64> extension_registers{};
         u32 cpsr{};
-        std::array<u8, 4> padding{};
-        std::array<u64, 32> fprs{};
         u32 fpscr{};
         u32 fpexc{};
         u32 tpidr{};
     };
     // Internally within the kernel, it expects the AArch32 version of the
-    // thread context to be 344 bytes in size.
+    // thread context to be 336 bytes in size.
-    static_assert(sizeof(ThreadContext32) == 0x158);
+    static_assert(sizeof(ThreadContext32) == 0x150);
 
     struct ThreadContext64 {
         std::array<u64, 31> cpu_registers{};
@@ -143,6 +148,8 @@ public:
      */
     virtual void SetTPIDR_EL0(u64 value) = 0;
 
+    virtual void ChangeProcessorID(std::size_t new_core_id) = 0;
+
     virtual void SaveContext(ThreadContext32& ctx) = 0;
     virtual void SaveContext(ThreadContext64& ctx) = 0;
     virtual void LoadContext(const ThreadContext32& ctx) = 0;
@@ -162,6 +169,9 @@ public:
         std::string name;
     };
 
+    static std::vector<BacktraceEntry> GetBacktraceFromContext(System& system,
+                                                               const ThreadContext64& ctx);
+
     std::vector<BacktraceEntry> GetBacktrace() const;
 
     /// fp (= r29) points to the last frame record.
@@ -175,6 +185,8 @@ public:
 protected:
     /// System context that this ARM interface is running under.
     System& system;
+    CPUInterrupts& interrupt_handlers;
+    bool uses_wall_clock;
 };
 
 } // namespace Core
diff --git a/src/core/arm/cpu_interrupt_handler.cpp b/src/core/arm/cpu_interrupt_handler.cpp
new file mode 100644
index 000000000..df0350881
--- /dev/null
+++ b/src/core/arm/cpu_interrupt_handler.cpp
@@ -0,0 +1,27 @@
+// Copyright 2020 yuzu emulator team
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/thread.h"
+#include "core/arm/cpu_interrupt_handler.h"
+
+namespace Core {
+
+CPUInterruptHandler::CPUInterruptHandler() : is_interrupted{} {
+    interrupt_event = std::make_unique<Common::Event>();
+}
+
+CPUInterruptHandler::~CPUInterruptHandler() = default;
+
+void CPUInterruptHandler::SetInterrupt(bool is_interrupted_) {
+    if (is_interrupted_) {
+        interrupt_event->Set();
+    }
+    this->is_interrupted = is_interrupted_;
+}
+
+void CPUInterruptHandler::AwaitInterrupt() {
+    interrupt_event->Wait();
+}
+
+} // namespace Core
diff --git a/src/core/arm/cpu_interrupt_handler.h b/src/core/arm/cpu_interrupt_handler.h
new file mode 100644
index 000000000..3d062d326
--- /dev/null
+++ b/src/core/arm/cpu_interrupt_handler.h
@@ -0,0 +1,39 @@
+// Copyright 2020 yuzu emulator team
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+
+namespace Common {
+class Event;
+}
+
+namespace Core {
+
+class CPUInterruptHandler {
+public:
+    CPUInterruptHandler();
+    ~CPUInterruptHandler();
+
+    CPUInterruptHandler(const CPUInterruptHandler&) = delete;
+    CPUInterruptHandler& operator=(const CPUInterruptHandler&) = delete;
+
+    CPUInterruptHandler(CPUInterruptHandler&&) = default;
+    CPUInterruptHandler& operator=(CPUInterruptHandler&&) = default;
+
+    bool IsInterrupted() const {
+        return is_interrupted;
+    }
+
+    void SetInterrupt(bool is_interrupted);
+
+    void AwaitInterrupt();
+
+private:
+    bool is_interrupted{};
+    std::unique_ptr<Common::Event> interrupt_event;
+};
+
+} // namespace Core
diff --git a/src/core/arm/dynarmic/arm_dynarmic_32.cpp b/src/core/arm/dynarmic/arm_dynarmic_32.cpp
index 9bc86e3b9..0d4ab95b7 100644
--- a/src/core/arm/dynarmic/arm_dynarmic_32.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic_32.cpp
@@ -7,15 +7,17 @@
 #include <dynarmic/A32/a32.h>
 #include <dynarmic/A32/config.h>
 #include <dynarmic/A32/context.h>
-#include "common/microprofile.h"
+#include "common/logging/log.h"
+#include "common/page_table.h"
+#include "core/arm/cpu_interrupt_handler.h"
 #include "core/arm/dynarmic/arm_dynarmic_32.h"
-#include "core/arm/dynarmic/arm_dynarmic_64.h"
 #include "core/arm/dynarmic/arm_dynarmic_cp15.h"
+#include "core/arm/dynarmic/arm_exclusive_monitor.h"
 #include "core/core.h"
-#include "core/core_manager.h"
 #include "core/core_timing.h"
 #include "core/hle/kernel/svc.h"
 #include "core/memory.h"
+#include "core/settings.h"
 
 namespace Core {
 
@@ -49,8 +51,22 @@ public:
         parent.system.Memory().Write64(vaddr, value);
     }
 
+    bool MemoryWriteExclusive8(u32 vaddr, u8 value, u8 expected) override {
+        return parent.system.Memory().WriteExclusive8(vaddr, value, expected);
+    }
+    bool MemoryWriteExclusive16(u32 vaddr, u16 value, u16 expected) override {
+        return parent.system.Memory().WriteExclusive16(vaddr, value, expected);
+    }
+    bool MemoryWriteExclusive32(u32 vaddr, u32 value, u32 expected) override {
+        return parent.system.Memory().WriteExclusive32(vaddr, value, expected);
+    }
+    bool MemoryWriteExclusive64(u32 vaddr, u64 value, u64 expected) override {
+        return parent.system.Memory().WriteExclusive64(vaddr, value, expected);
+    }
+
     void InterpreterFallback(u32 pc, std::size_t num_instructions) override {
-        UNIMPLEMENTED();
+        UNIMPLEMENTED_MSG("This should never happen, pc = {:08X}, code = {:08X}", pc,
+                          MemoryReadCode(pc));
     }
 
     void ExceptionRaised(u32 pc, Dynarmic::A32::Exception exception) override {
@@ -61,7 +77,7 @@ public:
         case Dynarmic::A32::Exception::Breakpoint:
             break;
         }
-        LOG_CRITICAL(HW_GPU, "ExceptionRaised(exception = {}, pc = {:08X}, code = {:08X})",
+        LOG_CRITICAL(Core_ARM, "ExceptionRaised(exception = {}, pc = {:08X}, code = {:08X})",
                      static_cast<std::size_t>(exception), pc, MemoryReadCode(pc));
         UNIMPLEMENTED();
     }
@@ -71,26 +87,36 @@ public:
     }
 
     void AddTicks(u64 ticks) override {
+        if (parent.uses_wall_clock) {
+            return;
+        }
         // Divide the number of ticks by the amount of CPU cores. TODO(Subv): This yields only a
         // rough approximation of the amount of executed ticks in the system, it may be thrown off
         // if not all cores are doing a similar amount of work. Instead of doing this, we should
         // device a way so that timing is consistent across all cores without increasing the ticks 4
         // times.
-        u64 amortized_ticks = (ticks - num_interpreted_instructions) / Core::NUM_CPU_CORES;
+        u64 amortized_ticks =
+            (ticks - num_interpreted_instructions) / Core::Hardware::NUM_CPU_CORES;
         // Always execute at least one tick.
         amortized_ticks = std::max<u64>(amortized_ticks, 1);
 
         parent.system.CoreTiming().AddTicks(amortized_ticks);
         num_interpreted_instructions = 0;
     }
+
     u64 GetTicksRemaining() override {
-        return std::max(parent.system.CoreTiming().GetDowncount(), {});
+        if (parent.uses_wall_clock) {
+            if (!parent.interrupt_handlers[parent.core_index].IsInterrupted()) {
+                return minimum_run_cycles;
+            }
+            return 0U;
+        }
+        return std::max<s64>(parent.system.CoreTiming().GetDowncount(), 0);
     }
 
     ARM_Dynarmic_32& parent;
     std::size_t num_interpreted_instructions{};
-    u64 tpidrro_el0{};
-    u64 tpidr_el0{};
+    static constexpr u64 minimum_run_cycles = 1000U;
 };
 
 std::shared_ptr<Dynarmic::A32::Jit> ARM_Dynarmic_32::MakeJit(Common::PageTable& page_table,
@@ -99,26 +125,46 @@ std::shared_ptr<Dynarmic::A32::Jit> ARM_Dynarmic_32::MakeJit(Common::PageTable&
     config.callbacks = cb.get();
     // TODO(bunnei): Implement page table for 32-bit
     // config.page_table = &page_table.pointers;
-    config.coprocessors[15] = std::make_shared<DynarmicCP15>((u32*)&CP15_regs[0]);
+    config.coprocessors[15] = cp15;
     config.define_unpredictable_behaviour = true;
+    static constexpr std::size_t PAGE_BITS = 12;
+    static constexpr std::size_t NUM_PAGE_TABLE_ENTRIES = 1 << (32 - PAGE_BITS);
+    config.page_table = reinterpret_cast<std::array<std::uint8_t*, NUM_PAGE_TABLE_ENTRIES>*>(
+        page_table.pointers.data());
+    config.absolute_offset_page_table = true;
+    config.detect_misaligned_access_via_page_table = 16 | 32 | 64 | 128;
+    config.only_detect_misalignment_via_page_table_on_page_boundary = true;
+
+    // Multi-process state
+    config.processor_id = core_index;
+    config.global_monitor = &exclusive_monitor.monitor;
+
+    // Timing
+    config.wall_clock_cntpct = uses_wall_clock;
+
+    // Optimizations
+    if (Settings::values.disable_cpu_opt) {
+        config.enable_optimizations = false;
+        config.enable_fast_dispatch = false;
+    }
+
     return std::make_unique<Dynarmic::A32::Jit>(config);
 }
 
-MICROPROFILE_DEFINE(ARM_Jit_Dynarmic_32, "ARM JIT", "Dynarmic", MP_RGB(255, 64, 64));
-
 void ARM_Dynarmic_32::Run() {
-    MICROPROFILE_SCOPE(ARM_Jit_Dynarmic_32);
     jit->Run();
 }
 
 void ARM_Dynarmic_32::Step() {
-    cb->InterpreterFallback(jit->Regs()[15], 1);
+    jit->Step();
 }
 
-ARM_Dynarmic_32::ARM_Dynarmic_32(System& system, ExclusiveMonitor& exclusive_monitor,
+ARM_Dynarmic_32::ARM_Dynarmic_32(System& system, CPUInterrupts& interrupt_handlers,
+                                 bool uses_wall_clock, ExclusiveMonitor& exclusive_monitor,
                                  std::size_t core_index)
-    : ARM_Interface{system},
-      cb(std::make_unique<DynarmicCallbacks32>(*this)), core_index{core_index},
+    : ARM_Interface{system, interrupt_handlers, uses_wall_clock},
+      cb(std::make_unique<DynarmicCallbacks32>(*this)),
+      cp15(std::make_shared<DynarmicCP15>(*this)), core_index{core_index},
       exclusive_monitor{dynamic_cast<DynarmicExclusiveMonitor&>(exclusive_monitor)} {}
 
 ARM_Dynarmic_32::~ARM_Dynarmic_32() = default;
@@ -154,32 +200,40 @@ void ARM_Dynarmic_32::SetPSTATE(u32 cpsr) {
 }
 
 u64 ARM_Dynarmic_32::GetTlsAddress() const {
-    return CP15_regs[static_cast<std::size_t>(CP15Register::CP15_THREAD_URO)];
+    return cp15->uro;
 }
 
 void ARM_Dynarmic_32::SetTlsAddress(VAddr address) {
-    CP15_regs[static_cast<std::size_t>(CP15Register::CP15_THREAD_URO)] = static_cast<u32>(address);
+    cp15->uro = static_cast<u32>(address);
 }
 
 u64 ARM_Dynarmic_32::GetTPIDR_EL0() const {
-    return cb->tpidr_el0;
+    return cp15->uprw;
 }
 
 void ARM_Dynarmic_32::SetTPIDR_EL0(u64 value) {
-    cb->tpidr_el0 = value;
+    cp15->uprw = static_cast<u32>(value);
+}
+
+void ARM_Dynarmic_32::ChangeProcessorID(std::size_t new_core_id) {
+    jit->ChangeProcessorID(new_core_id);
 }
 
 void ARM_Dynarmic_32::SaveContext(ThreadContext32& ctx) {
     Dynarmic::A32::Context context;
     jit->SaveContext(context);
     ctx.cpu_registers = context.Regs();
+    ctx.extension_registers = context.ExtRegs();
     ctx.cpsr = context.Cpsr();
+    ctx.fpscr = context.Fpscr();
 }
 
 void ARM_Dynarmic_32::LoadContext(const ThreadContext32& ctx) {
     Dynarmic::A32::Context context;
     context.Regs() = ctx.cpu_registers;
+    context.ExtRegs() = ctx.extension_registers;
     context.SetCpsr(ctx.cpsr);
+    context.SetFpscr(ctx.fpscr);
     jit->LoadContext(context);
 }
 
@@ -188,10 +242,15 @@ void ARM_Dynarmic_32::PrepareReschedule() {
 }
 
 void ARM_Dynarmic_32::ClearInstructionCache() {
+    if (!jit) {
+        return;
+    }
     jit->ClearCache();
 }
 
-void ARM_Dynarmic_32::ClearExclusiveState() {}
+void ARM_Dynarmic_32::ClearExclusiveState() {
+    jit->ClearExclusiveState();
+}
 
 void ARM_Dynarmic_32::PageTableChanged(Common::PageTable& page_table,
                                        std::size_t new_address_space_size_in_bits) {
diff --git a/src/core/arm/dynarmic/arm_dynarmic_32.h b/src/core/arm/dynarmic/arm_dynarmic_32.h
index 8ba9cea8f..2bab31b92 100644
--- a/src/core/arm/dynarmic/arm_dynarmic_32.h
+++ b/src/core/arm/dynarmic/arm_dynarmic_32.h
@@ -9,7 +9,7 @@
 
 #include <dynarmic/A32/a32.h>
 #include <dynarmic/A64/a64.h>
-#include <dynarmic/A64/exclusive_monitor.h>
+#include <dynarmic/exclusive_monitor.h>
 #include "common/common_types.h"
 #include "common/hash.h"
 #include "core/arm/arm_interface.h"
@@ -21,13 +21,16 @@ class Memory;
 
 namespace Core {
 
+class CPUInterruptHandler;
 class DynarmicCallbacks32;
+class DynarmicCP15;
 class DynarmicExclusiveMonitor;
 class System;
 
 class ARM_Dynarmic_32 final : public ARM_Interface {
 public:
-    ARM_Dynarmic_32(System& system, ExclusiveMonitor& exclusive_monitor, std::size_t core_index);
+    ARM_Dynarmic_32(System& system, CPUInterrupts& interrupt_handlers, bool uses_wall_clock,
+                    ExclusiveMonitor& exclusive_monitor, std::size_t core_index);
     ~ARM_Dynarmic_32() override;
 
     void SetPC(u64 pc) override;
@@ -44,6 +47,7 @@ public:
     void SetTlsAddress(VAddr address) override;
     void SetTPIDR_EL0(u64 value) override;
     u64 GetTPIDR_EL0() const override;
+    void ChangeProcessorID(std::size_t new_core_id) override;
 
     void SaveContext(ThreadContext32& ctx) override;
     void SaveContext(ThreadContext64& ctx) override {}
@@ -66,12 +70,14 @@ private:
         std::unordered_map<JitCacheKey, std::shared_ptr<Dynarmic::A32::Jit>, Common::PairHash>;
 
     friend class DynarmicCallbacks32;
+    friend class DynarmicCP15;
+
     std::unique_ptr<DynarmicCallbacks32> cb;
     JitCacheType jit_cache;
     std::shared_ptr<Dynarmic::A32::Jit> jit;
+    std::shared_ptr<DynarmicCP15> cp15;
     std::size_t core_index;
     DynarmicExclusiveMonitor& exclusive_monitor;
-    std::array<u32, 84> CP15_regs{};
 };
 
 } // namespace Core
diff --git a/src/core/arm/dynarmic/arm_dynarmic_64.cpp b/src/core/arm/dynarmic/arm_dynarmic_64.cpp
index 337b97be9..790981034 100644
--- a/src/core/arm/dynarmic/arm_dynarmic_64.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic_64.cpp
@@ -7,11 +7,11 @@
 #include <dynarmic/A64/a64.h>
 #include <dynarmic/A64/config.h>
 #include "common/logging/log.h"
-#include "common/microprofile.h"
 #include "common/page_table.h"
+#include "core/arm/cpu_interrupt_handler.h"
 #include "core/arm/dynarmic/arm_dynarmic_64.h"
+#include "core/arm/dynarmic/arm_exclusive_monitor.h"
 #include "core/core.h"
-#include "core/core_manager.h"
 #include "core/core_timing.h"
 #include "core/core_timing_util.h"
 #include "core/gdbstub/gdbstub.h"
@@ -65,6 +65,22 @@ public:
         memory.Write64(vaddr + 8, value[1]);
     }
 
+    bool MemoryWriteExclusive8(u64 vaddr, std::uint8_t value, std::uint8_t expected) override {
+        return parent.system.Memory().WriteExclusive8(vaddr, value, expected);
+    }
+    bool MemoryWriteExclusive16(u64 vaddr, std::uint16_t value, std::uint16_t expected) override {
+        return parent.system.Memory().WriteExclusive16(vaddr, value, expected);
+    }
+    bool MemoryWriteExclusive32(u64 vaddr, std::uint32_t value, std::uint32_t expected) override {
+        return parent.system.Memory().WriteExclusive32(vaddr, value, expected);
+    }
+    bool MemoryWriteExclusive64(u64 vaddr, std::uint64_t value, std::uint64_t expected) override {
+        return parent.system.Memory().WriteExclusive64(vaddr, value, expected);
+    }
+    bool MemoryWriteExclusive128(u64 vaddr, Vector value, Vector expected) override {
+        return parent.system.Memory().WriteExclusive128(vaddr, value, expected);
+    }
+
     void InterpreterFallback(u64 pc, std::size_t num_instructions) override {
         LOG_INFO(Core_ARM, "Unicorn fallback @ 0x{:X} for {} instructions (instr = {:08X})", pc,
                  num_instructions, MemoryReadCode(pc));
@@ -98,8 +114,8 @@ public:
             }
             [[fallthrough]];
         default:
-            ASSERT_MSG(false, "ExceptionRaised(exception = {}, pc = {:X})",
-                       static_cast<std::size_t>(exception), pc);
+            ASSERT_MSG(false, "ExceptionRaised(exception = {}, pc = {:08X}, code = {:08X})",
+                       static_cast<std::size_t>(exception), pc, MemoryReadCode(pc));
         }
     }
 
@@ -108,29 +124,42 @@ public:
     }
 
     void AddTicks(u64 ticks) override {
+        if (parent.uses_wall_clock) {
+            return;
+        }
         // Divide the number of ticks by the amount of CPU cores. TODO(Subv): This yields only a
         // rough approximation of the amount of executed ticks in the system, it may be thrown off
         // if not all cores are doing a similar amount of work. Instead of doing this, we should
         // device a way so that timing is consistent across all cores without increasing the ticks 4
         // times.
-        u64 amortized_ticks = (ticks - num_interpreted_instructions) / Core::NUM_CPU_CORES;
+        u64 amortized_ticks =
+            (ticks - num_interpreted_instructions) / Core::Hardware::NUM_CPU_CORES;
         // Always execute at least one tick.
         amortized_ticks = std::max<u64>(amortized_ticks, 1);
 
         parent.system.CoreTiming().AddTicks(amortized_ticks);
         num_interpreted_instructions = 0;
     }
+
     u64 GetTicksRemaining() override {
-        return std::max(parent.system.CoreTiming().GetDowncount(), s64{0});
+        if (parent.uses_wall_clock) { // Wall-clock mode: the downcount is not consulted.
+            if (!parent.interrupt_handlers[parent.core_index].IsInterrupted()) {
+                return minimum_run_cycles; // Not interrupted: report a fixed cycle budget.
+            }
+            return 0U; // Interrupted: report that no cycles remain.
+        }
+        return std::max<s64>(parent.system.CoreTiming().GetDowncount(), 0);
     }
+
     u64 GetCNTPCT() override {
-        return Timing::CpuCyclesToClockCycles(parent.system.CoreTiming().GetTicks());
+        return parent.system.CoreTiming().GetClockTicks();
     }
 
     ARM_Dynarmic_64& parent;
     std::size_t num_interpreted_instructions = 0;
     u64 tpidrro_el0 = 0;
     u64 tpidr_el0 = 0;
+    static constexpr u64 minimum_run_cycles = 1000U;
 };
 
 std::shared_ptr<Dynarmic::A64::Jit> ARM_Dynarmic_64::MakeJit(Common::PageTable& page_table,
@@ -168,14 +197,13 @@ std::shared_ptr<Dynarmic::A64::Jit> ARM_Dynarmic_64::MakeJit(Common::PageTable&
         config.enable_fast_dispatch = false;
     }
 
+    // Timing
+    config.wall_clock_cntpct = uses_wall_clock;
+
     return std::make_shared<Dynarmic::A64::Jit>(config);
 }
 
-MICROPROFILE_DEFINE(ARM_Jit_Dynarmic_64, "ARM JIT", "Dynarmic", MP_RGB(255, 64, 64));
-
 void ARM_Dynarmic_64::Run() {
-    MICROPROFILE_SCOPE(ARM_Jit_Dynarmic_64);
-
     jit->Run();
 }
 
@@ -183,11 +211,16 @@ void ARM_Dynarmic_64::Step() {
     cb->InterpreterFallback(jit->GetPC(), 1);
 }
 
-ARM_Dynarmic_64::ARM_Dynarmic_64(System& system, ExclusiveMonitor& exclusive_monitor,
+ARM_Dynarmic_64::ARM_Dynarmic_64(System& system, CPUInterrupts& interrupt_handlers,
+                                 bool uses_wall_clock, ExclusiveMonitor& exclusive_monitor,
                                  std::size_t core_index)
-    : ARM_Interface{system}, cb(std::make_unique<DynarmicCallbacks64>(*this)),
-      inner_unicorn{system, ARM_Unicorn::Arch::AArch64}, core_index{core_index},
-      exclusive_monitor{dynamic_cast<DynarmicExclusiveMonitor&>(exclusive_monitor)} {}
+    : ARM_Interface{system, interrupt_handlers, uses_wall_clock},
+      cb(std::make_unique<DynarmicCallbacks64>(*this)), inner_unicorn{system, interrupt_handlers,
+                                                                      uses_wall_clock,
+                                                                      ARM_Unicorn::Arch::AArch64,
+                                                                      core_index},
+      core_index{core_index}, exclusive_monitor{
+                                  dynamic_cast<DynarmicExclusiveMonitor&>(exclusive_monitor)} {}
 
 ARM_Dynarmic_64::~ARM_Dynarmic_64() = default;
 
@@ -239,6 +272,10 @@ void ARM_Dynarmic_64::SetTPIDR_EL0(u64 value) {
     cb->tpidr_el0 = value;
 }
 
+void ARM_Dynarmic_64::ChangeProcessorID(std::size_t new_core_id) {
+    jit->ChangeProcessorID(new_core_id);
+}
+
 void ARM_Dynarmic_64::SaveContext(ThreadContext64& ctx) {
     ctx.cpu_registers = jit->GetRegisters();
     ctx.sp = jit->GetSP();
@@ -266,6 +303,9 @@ void ARM_Dynarmic_64::PrepareReschedule() {
 }
 
 void ARM_Dynarmic_64::ClearInstructionCache() {
+    if (!jit) {
+        return;
+    }
     jit->ClearCache();
 }
 
@@ -285,44 +325,4 @@ void ARM_Dynarmic_64::PageTableChanged(Common::PageTable& page_table,
     jit_cache.emplace(key, jit);
 }
 
-DynarmicExclusiveMonitor::DynarmicExclusiveMonitor(Memory::Memory& memory, std::size_t core_count)
-    : monitor(core_count), memory{memory} {}
-
-DynarmicExclusiveMonitor::~DynarmicExclusiveMonitor() = default;
-
-void DynarmicExclusiveMonitor::SetExclusive(std::size_t core_index, VAddr addr) {
-    // Size doesn't actually matter.
-    monitor.Mark(core_index, addr, 16);
-}
-
-void DynarmicExclusiveMonitor::ClearExclusive() {
-    monitor.Clear();
-}
-
-bool DynarmicExclusiveMonitor::ExclusiveWrite8(std::size_t core_index, VAddr vaddr, u8 value) {
-    return monitor.DoExclusiveOperation(core_index, vaddr, 1, [&] { memory.Write8(vaddr, value); });
-}
-
-bool DynarmicExclusiveMonitor::ExclusiveWrite16(std::size_t core_index, VAddr vaddr, u16 value) {
-    return monitor.DoExclusiveOperation(core_index, vaddr, 2,
-                                        [&] { memory.Write16(vaddr, value); });
-}
-
-bool DynarmicExclusiveMonitor::ExclusiveWrite32(std::size_t core_index, VAddr vaddr, u32 value) {
-    return monitor.DoExclusiveOperation(core_index, vaddr, 4,
-                                        [&] { memory.Write32(vaddr, value); });
-}
-
-bool DynarmicExclusiveMonitor::ExclusiveWrite64(std::size_t core_index, VAddr vaddr, u64 value) {
-    return monitor.DoExclusiveOperation(core_index, vaddr, 8,
-                                        [&] { memory.Write64(vaddr, value); });
-}
-
-bool DynarmicExclusiveMonitor::ExclusiveWrite128(std::size_t core_index, VAddr vaddr, u128 value) {
-    return monitor.DoExclusiveOperation(core_index, vaddr, 16, [&] {
-        memory.Write64(vaddr + 0, value[0]);
-        memory.Write64(vaddr + 8, value[1]);
-    });
-}
-
 } // namespace Core
diff --git a/src/core/arm/dynarmic/arm_dynarmic_64.h b/src/core/arm/dynarmic/arm_dynarmic_64.h
index 647cecaf0..403c55961 100644
--- a/src/core/arm/dynarmic/arm_dynarmic_64.h
+++ b/src/core/arm/dynarmic/arm_dynarmic_64.h
@@ -8,7 +8,6 @@
 #include <unordered_map>
 
 #include <dynarmic/A64/a64.h>
-#include <dynarmic/A64/exclusive_monitor.h>
 #include "common/common_types.h"
 #include "common/hash.h"
 #include "core/arm/arm_interface.h"
@@ -22,12 +21,14 @@ class Memory;
 namespace Core {
 
 class DynarmicCallbacks64;
+class CPUInterruptHandler;
 class DynarmicExclusiveMonitor;
 class System;
 
 class ARM_Dynarmic_64 final : public ARM_Interface {
 public:
-    ARM_Dynarmic_64(System& system, ExclusiveMonitor& exclusive_monitor, std::size_t core_index);
+    ARM_Dynarmic_64(System& system, CPUInterrupts& interrupt_handlers, bool uses_wall_clock,
+                    ExclusiveMonitor& exclusive_monitor, std::size_t core_index);
     ~ARM_Dynarmic_64() override;
 
     void SetPC(u64 pc) override;
@@ -44,6 +45,7 @@ public:
     void SetTlsAddress(VAddr address) override;
     void SetTPIDR_EL0(u64 value) override;
     u64 GetTPIDR_EL0() const override;
+    void ChangeProcessorID(std::size_t new_core_id) override;
 
     void SaveContext(ThreadContext32& ctx) override {}
     void SaveContext(ThreadContext64& ctx) override;
@@ -75,24 +77,4 @@ private:
     DynarmicExclusiveMonitor& exclusive_monitor;
 };
 
-class DynarmicExclusiveMonitor final : public ExclusiveMonitor {
-public:
-    explicit DynarmicExclusiveMonitor(Memory::Memory& memory, std::size_t core_count);
-    ~DynarmicExclusiveMonitor() override;
-
-    void SetExclusive(std::size_t core_index, VAddr addr) override;
-    void ClearExclusive() override;
-
-    bool ExclusiveWrite8(std::size_t core_index, VAddr vaddr, u8 value) override;
-    bool ExclusiveWrite16(std::size_t core_index, VAddr vaddr, u16 value) override;
-    bool ExclusiveWrite32(std::size_t core_index, VAddr vaddr, u32 value) override;
-    bool ExclusiveWrite64(std::size_t core_index, VAddr vaddr, u64 value) override;
-    bool ExclusiveWrite128(std::size_t core_index, VAddr vaddr, u128 value) override;
-
-private:
-    friend class ARM_Dynarmic_64;
-    Dynarmic::A64::ExclusiveMonitor monitor;
-    Core::Memory::Memory& memory;
-};
-
 } // namespace Core
diff --git a/src/core/arm/dynarmic/arm_dynarmic_cp15.cpp b/src/core/arm/dynarmic/arm_dynarmic_cp15.cpp
index 3fdcdebde..54556e0f9 100644
--- a/src/core/arm/dynarmic/arm_dynarmic_cp15.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic_cp15.cpp
@@ -2,79 +2,132 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <fmt/format.h>
+#include "common/logging/log.h"
+#include "core/arm/dynarmic/arm_dynarmic_32.h"
 #include "core/arm/dynarmic/arm_dynarmic_cp15.h"
+#include "core/core.h"
+#include "core/core_timing.h"
+#include "core/core_timing_util.h"
 
 using Callback = Dynarmic::A32::Coprocessor::Callback;
 using CallbackOrAccessOneWord = Dynarmic::A32::Coprocessor::CallbackOrAccessOneWord;
 using CallbackOrAccessTwoWords = Dynarmic::A32::Coprocessor::CallbackOrAccessTwoWords;
 
+template <>
+struct fmt::formatter<Dynarmic::A32::CoprocReg> { // Renders a CoprocReg as "cp<index>".
+    constexpr auto parse(format_parse_context& ctx) {
+        return ctx.begin();
+    }
+    template <typename FormatContext>
+    auto format(const Dynarmic::A32::CoprocReg& reg, FormatContext& ctx) const {
+        return fmt::format_to(ctx.out(), "cp{}", static_cast<std::size_t>(reg));
+    }
+};
+
+namespace Core {
+
+static u32 dummy_value;
+
 std::optional<Callback> DynarmicCP15::CompileInternalOperation(bool two, unsigned opc1,
                                                                CoprocReg CRd, CoprocReg CRn,
                                                                CoprocReg CRm, unsigned opc2) {
+    LOG_CRITICAL(Core_ARM, "CP15: cdp{} p15, {}, {}, {}, {}, {}", two ? "2" : "", opc1, CRd, CRn,
+                 CRm, opc2);
     return {};
 }
 
 CallbackOrAccessOneWord DynarmicCP15::CompileSendOneWord(bool two, unsigned opc1, CoprocReg CRn,
                                                          CoprocReg CRm, unsigned opc2) {
-    // TODO(merry): Privileged CP15 registers
-
     if (!two && CRn == CoprocReg::C7 && opc1 == 0 && CRm == CoprocReg::C5 && opc2 == 4) {
+        // CP15_FLUSH_PREFETCH_BUFFER
         // This is a dummy write, we ignore the value written here.
-        return &CP15[static_cast<std::size_t>(CP15Register::CP15_FLUSH_PREFETCH_BUFFER)];
+        return &dummy_value;
     }
 
     if (!two && CRn == CoprocReg::C7 && opc1 == 0 && CRm == CoprocReg::C10) {
         switch (opc2) {
         case 4:
+            // CP15_DATA_SYNC_BARRIER
             // This is a dummy write, we ignore the value written here.
-            return &CP15[static_cast<std::size_t>(CP15Register::CP15_DATA_SYNC_BARRIER)];
+            return &dummy_value;
         case 5:
+            // CP15_DATA_MEMORY_BARRIER
             // This is a dummy write, we ignore the value written here.
-            return &CP15[static_cast<std::size_t>(CP15Register::CP15_DATA_MEMORY_BARRIER)];
-        default:
-            return {};
+            return &dummy_value;
         }
     }
 
     if (!two && CRn == CoprocReg::C13 && opc1 == 0 && CRm == CoprocReg::C0 && opc2 == 2) {
-        return &CP15[static_cast<std::size_t>(CP15Register::CP15_THREAD_UPRW)];
+        // CP15_THREAD_UPRW
+        return &uprw;
     }
 
+    LOG_CRITICAL(Core_ARM, "CP15: mcr{} p15, {}, <Rt>, {}, {}, {}", two ? "2" : "", opc1, CRn, CRm,
+                 opc2);
     return {};
 }
 
 CallbackOrAccessTwoWords DynarmicCP15::CompileSendTwoWords(bool two, unsigned opc, CoprocReg CRm) {
+    LOG_CRITICAL(Core_ARM, "CP15: mcrr{} p15, {}, <Rt>, <Rt2>, {}", two ? "2" : "", opc, CRm);
     return {};
 }
 
 CallbackOrAccessOneWord DynarmicCP15::CompileGetOneWord(bool two, unsigned opc1, CoprocReg CRn,
                                                         CoprocReg CRm, unsigned opc2) {
-    // TODO(merry): Privileged CP15 registers
-
     if (!two && CRn == CoprocReg::C13 && opc1 == 0 && CRm == CoprocReg::C0) {
         switch (opc2) {
         case 2:
-            return &CP15[static_cast<std::size_t>(CP15Register::CP15_THREAD_UPRW)];
+            // CP15_THREAD_UPRW
+            return &uprw;
         case 3:
-            return &CP15[static_cast<std::size_t>(CP15Register::CP15_THREAD_URO)];
-        default:
-            return {};
+            // CP15_THREAD_URO
+            return &uro;
         }
     }
 
+    LOG_CRITICAL(Core_ARM, "CP15: mrc{} p15, {}, <Rt>, {}, {}, {}", two ? "2" : "", opc1, CRn, CRm,
+                 opc2);
     return {};
 }
 
 CallbackOrAccessTwoWords DynarmicCP15::CompileGetTwoWords(bool two, unsigned opc, CoprocReg CRm) {
+    if (!two && opc == 0 && CRm == CoprocReg::C14) {
+        // CNTPCT: the callback receives the owning ARM_Dynarmic_32 as its user argument.
+        const auto callback = static_cast<u64 (*)(Dynarmic::A32::Jit*, void*, u32, u32)>(
+            [](Dynarmic::A32::Jit*, void* arg, u32, u32) -> u64 {
+                ARM_Dynarmic_32& parent = *static_cast<ARM_Dynarmic_32*>(arg);
+                return parent.system.CoreTiming().GetClockTicks();
+            });
+        return Dynarmic::A32::Coprocessor::Callback{callback, static_cast<void*>(&parent)};
+    }
+
+    LOG_CRITICAL(Core_ARM, "CP15: mrrc{} p15, {}, <Rt>, <Rt2>, {}", two ? "2" : "", opc, CRm);
     return {};
 }
 
 std::optional<Callback> DynarmicCP15::CompileLoadWords(bool two, bool long_transfer, CoprocReg CRd,
                                                        std::optional<u8> option) {
+    if (option) { // No coprocessor load is handled; log it as the LDC instruction it is.
+        LOG_CRITICAL(Core_ARM, "CP15: ldc{}{} p15, {}, [...], {}", two ? "2" : "",
+                     long_transfer ? "l" : "", CRd, *option);
+    } else {
+        LOG_CRITICAL(Core_ARM, "CP15: ldc{}{} p15, {}, [...]", two ? "2" : "",
+                     long_transfer ? "l" : "", CRd);
+    }
     return {};
 }
 
 std::optional<Callback> DynarmicCP15::CompileStoreWords(bool two, bool long_transfer, CoprocReg CRd,
                                                         std::optional<u8> option) {
+    if (option) { // No coprocessor store is handled; log it as the STC instruction it is.
+        LOG_CRITICAL(Core_ARM, "CP15: stc{}{} p15, {}, [...], {}", two ? "2" : "",
+                     long_transfer ? "l" : "", CRd, *option);
+    } else {
+        LOG_CRITICAL(Core_ARM, "CP15: stc{}{} p15, {}, [...]", two ? "2" : "",
+                     long_transfer ? "l" : "", CRd);
+    }
     return {};
 }
+
+} // namespace Core
diff --git a/src/core/arm/dynarmic/arm_dynarmic_cp15.h b/src/core/arm/dynarmic/arm_dynarmic_cp15.h
index 07bcde5f9..7356d252e 100644
--- a/src/core/arm/dynarmic/arm_dynarmic_cp15.h
+++ b/src/core/arm/dynarmic/arm_dynarmic_cp15.h
@@ -10,128 +10,15 @@
 #include <dynarmic/A32/coprocessor.h>
 #include "common/common_types.h"
 
-enum class CP15Register {
-    // c0 - Information registers
-    CP15_MAIN_ID,
-    CP15_CACHE_TYPE,
-    CP15_TCM_STATUS,
-    CP15_TLB_TYPE,
-    CP15_CPU_ID,
-    CP15_PROCESSOR_FEATURE_0,
-    CP15_PROCESSOR_FEATURE_1,
-    CP15_DEBUG_FEATURE_0,
-    CP15_AUXILIARY_FEATURE_0,
-    CP15_MEMORY_MODEL_FEATURE_0,
-    CP15_MEMORY_MODEL_FEATURE_1,
-    CP15_MEMORY_MODEL_FEATURE_2,
-    CP15_MEMORY_MODEL_FEATURE_3,
-    CP15_ISA_FEATURE_0,
-    CP15_ISA_FEATURE_1,
-    CP15_ISA_FEATURE_2,
-    CP15_ISA_FEATURE_3,
-    CP15_ISA_FEATURE_4,
+namespace Core {
 
-    // c1 - Control registers
-    CP15_CONTROL,
-    CP15_AUXILIARY_CONTROL,
-    CP15_COPROCESSOR_ACCESS_CONTROL,
-
-    // c2 - Translation table registers
-    CP15_TRANSLATION_BASE_TABLE_0,
-    CP15_TRANSLATION_BASE_TABLE_1,
-    CP15_TRANSLATION_BASE_CONTROL,
-    CP15_DOMAIN_ACCESS_CONTROL,
-    CP15_RESERVED,
-
-    // c5 - Fault status registers
-    CP15_FAULT_STATUS,
-    CP15_INSTR_FAULT_STATUS,
-    CP15_COMBINED_DATA_FSR = CP15_FAULT_STATUS,
-    CP15_INST_FSR,
-
-    // c6 - Fault Address registers
-    CP15_FAULT_ADDRESS,
-    CP15_COMBINED_DATA_FAR = CP15_FAULT_ADDRESS,
-    CP15_WFAR,
-    CP15_IFAR,
-
-    // c7 - Cache operation registers
-    CP15_WAIT_FOR_INTERRUPT,
-    CP15_PHYS_ADDRESS,
-    CP15_INVALIDATE_INSTR_CACHE,
-    CP15_INVALIDATE_INSTR_CACHE_USING_MVA,
-    CP15_INVALIDATE_INSTR_CACHE_USING_INDEX,
-    CP15_FLUSH_PREFETCH_BUFFER,
-    CP15_FLUSH_BRANCH_TARGET_CACHE,
-    CP15_FLUSH_BRANCH_TARGET_CACHE_ENTRY,
-    CP15_INVALIDATE_DATA_CACHE,
-    CP15_INVALIDATE_DATA_CACHE_LINE_USING_MVA,
-    CP15_INVALIDATE_DATA_CACHE_LINE_USING_INDEX,
-    CP15_INVALIDATE_DATA_AND_INSTR_CACHE,
-    CP15_CLEAN_DATA_CACHE,
-    CP15_CLEAN_DATA_CACHE_LINE_USING_MVA,
-    CP15_CLEAN_DATA_CACHE_LINE_USING_INDEX,
-    CP15_DATA_SYNC_BARRIER,
-    CP15_DATA_MEMORY_BARRIER,
-    CP15_CLEAN_AND_INVALIDATE_DATA_CACHE,
-    CP15_CLEAN_AND_INVALIDATE_DATA_CACHE_LINE_USING_MVA,
-    CP15_CLEAN_AND_INVALIDATE_DATA_CACHE_LINE_USING_INDEX,
-
-    // c8 - TLB operations
-    CP15_INVALIDATE_ITLB,
-    CP15_INVALIDATE_ITLB_SINGLE_ENTRY,
-    CP15_INVALIDATE_ITLB_ENTRY_ON_ASID_MATCH,
-    CP15_INVALIDATE_ITLB_ENTRY_ON_MVA,
-    CP15_INVALIDATE_DTLB,
-    CP15_INVALIDATE_DTLB_SINGLE_ENTRY,
-    CP15_INVALIDATE_DTLB_ENTRY_ON_ASID_MATCH,
-    CP15_INVALIDATE_DTLB_ENTRY_ON_MVA,
-    CP15_INVALIDATE_UTLB,
-    CP15_INVALIDATE_UTLB_SINGLE_ENTRY,
-    CP15_INVALIDATE_UTLB_ENTRY_ON_ASID_MATCH,
-    CP15_INVALIDATE_UTLB_ENTRY_ON_MVA,
-
-    // c9 - Data cache lockdown register
-    CP15_DATA_CACHE_LOCKDOWN,
-
-    // c10 - TLB/Memory map registers
-    CP15_TLB_LOCKDOWN,
-    CP15_PRIMARY_REGION_REMAP,
-    CP15_NORMAL_REGION_REMAP,
-
-    // c13 - Thread related registers
-    CP15_PID,
-    CP15_CONTEXT_ID,
-    CP15_THREAD_UPRW, // Thread ID register - User/Privileged Read/Write
-    CP15_THREAD_URO,  // Thread ID register - User Read Only (Privileged R/W)
-    CP15_THREAD_PRW,  // Thread ID register - Privileged R/W only.
-
-    // c15 - Performance and TLB lockdown registers
-    CP15_PERFORMANCE_MONITOR_CONTROL,
-    CP15_CYCLE_COUNTER,
-    CP15_COUNT_0,
-    CP15_COUNT_1,
-    CP15_READ_MAIN_TLB_LOCKDOWN_ENTRY,
-    CP15_WRITE_MAIN_TLB_LOCKDOWN_ENTRY,
-    CP15_MAIN_TLB_LOCKDOWN_VIRT_ADDRESS,
-    CP15_MAIN_TLB_LOCKDOWN_PHYS_ADDRESS,
-    CP15_MAIN_TLB_LOCKDOWN_ATTRIBUTE,
-    CP15_TLB_DEBUG_CONTROL,
-
-    // Skyeye defined
-    CP15_TLB_FAULT_ADDR,
-    CP15_TLB_FAULT_STATUS,
-
-    // Not an actual register.
-    // All registers should be defined above this.
-    CP15_REGISTER_COUNT,
-};
+class ARM_Dynarmic_32;
 
 class DynarmicCP15 final : public Dynarmic::A32::Coprocessor {
 public:
     using CoprocReg = Dynarmic::A32::CoprocReg;
 
-    explicit DynarmicCP15(u32* cp15) : CP15(cp15){};
+    explicit DynarmicCP15(ARM_Dynarmic_32& parent) : parent(parent) {}
 
     std::optional<Callback> CompileInternalOperation(bool two, unsigned opc1, CoprocReg CRd,
                                                      CoprocReg CRn, CoprocReg CRm,
@@ -147,6 +34,9 @@ public:
     std::optional<Callback> CompileStoreWords(bool two, bool long_transfer, CoprocReg CRd,
                                               std::optional<u8> option) override;
 
-private:
-    u32* CP15{};
+    ARM_Dynarmic_32& parent;
+    u32 uprw{}; // Storage handed out for CP15_THREAD_UPRW accesses; zero until written.
+    u32 uro{};  // Storage handed out for CP15_THREAD_URO reads; zero until written.
 };
+
+} // namespace Core
diff --git a/src/core/arm/dynarmic/arm_exclusive_monitor.cpp b/src/core/arm/dynarmic/arm_exclusive_monitor.cpp
new file mode 100644
index 000000000..4e209f6a5
--- /dev/null
+++ b/src/core/arm/dynarmic/arm_exclusive_monitor.cpp
@@ -0,0 +1,76 @@
+// Copyright 2018 yuzu emulator team
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <cinttypes>
+#include <memory>
+#include "core/arm/dynarmic/arm_exclusive_monitor.h"
+#include "core/memory.h"
+
+namespace Core {
+
+DynarmicExclusiveMonitor::DynarmicExclusiveMonitor(Memory::Memory& memory, std::size_t core_count)
+    : monitor(core_count), memory{memory} {}
+
+DynarmicExclusiveMonitor::~DynarmicExclusiveMonitor() = default;
+
+u8 DynarmicExclusiveMonitor::ExclusiveRead8(std::size_t core_index, VAddr addr) {
+    return monitor.ReadAndMark<u8>(core_index, addr, [&]() -> u8 { return memory.Read8(addr); });
+}
+
+u16 DynarmicExclusiveMonitor::ExclusiveRead16(std::size_t core_index, VAddr addr) {
+    return monitor.ReadAndMark<u16>(core_index, addr, [&]() -> u16 { return memory.Read16(addr); });
+}
+
+u32 DynarmicExclusiveMonitor::ExclusiveRead32(std::size_t core_index, VAddr addr) {
+    return monitor.ReadAndMark<u32>(core_index, addr, [&]() -> u32 { return memory.Read32(addr); });
+}
+
+u64 DynarmicExclusiveMonitor::ExclusiveRead64(std::size_t core_index, VAddr addr) {
+    return monitor.ReadAndMark<u64>(core_index, addr, [&]() -> u64 { return memory.Read64(addr); });
+}
+
+u128 DynarmicExclusiveMonitor::ExclusiveRead128(std::size_t core_index, VAddr addr) {
+    return monitor.ReadAndMark<u128>(core_index, addr, [&]() -> u128 {
+        u128 result;
+        result[0] = memory.Read64(addr);
+        result[1] = memory.Read64(addr + 8);
+        return result;
+    });
+}
+
+void DynarmicExclusiveMonitor::ClearExclusive() {
+    monitor.Clear();
+}
+
+bool DynarmicExclusiveMonitor::ExclusiveWrite8(std::size_t core_index, VAddr vaddr, u8 value) {
+    return monitor.DoExclusiveOperation<u8>(core_index, vaddr, [&](u8 expected) -> bool {
+        return memory.WriteExclusive8(vaddr, value, expected);
+    });
+}
+
+bool DynarmicExclusiveMonitor::ExclusiveWrite16(std::size_t core_index, VAddr vaddr, u16 value) {
+    return monitor.DoExclusiveOperation<u16>(core_index, vaddr, [&](u16 expected) -> bool {
+        return memory.WriteExclusive16(vaddr, value, expected);
+    });
+}
+
+bool DynarmicExclusiveMonitor::ExclusiveWrite32(std::size_t core_index, VAddr vaddr, u32 value) {
+    return monitor.DoExclusiveOperation<u32>(core_index, vaddr, [&](u32 expected) -> bool {
+        return memory.WriteExclusive32(vaddr, value, expected);
+    });
+}
+
+bool DynarmicExclusiveMonitor::ExclusiveWrite64(std::size_t core_index, VAddr vaddr, u64 value) {
+    return monitor.DoExclusiveOperation<u64>(core_index, vaddr, [&](u64 expected) -> bool {
+        return memory.WriteExclusive64(vaddr, value, expected);
+    });
+}
+
+bool DynarmicExclusiveMonitor::ExclusiveWrite128(std::size_t core_index, VAddr vaddr, u128 value) {
+    return monitor.DoExclusiveOperation<u128>(core_index, vaddr, [&](u128 expected) -> bool {
+        return memory.WriteExclusive128(vaddr, value, expected);
+    });
+}
+
+} // namespace Core
diff --git a/src/core/arm/dynarmic/arm_exclusive_monitor.h b/src/core/arm/dynarmic/arm_exclusive_monitor.h
new file mode 100644
index 000000000..964f4a55d
--- /dev/null
+++ b/src/core/arm/dynarmic/arm_exclusive_monitor.h
@@ -0,0 +1,48 @@
+// Copyright 2020 yuzu emulator team
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include <unordered_map>
+
+#include <dynarmic/exclusive_monitor.h>
+
+#include "common/common_types.h"
+#include "core/arm/dynarmic/arm_dynarmic_32.h"
+#include "core/arm/dynarmic/arm_dynarmic_64.h"
+#include "core/arm/exclusive_monitor.h"
+
+namespace Core::Memory {
+class Memory;
+}
+
+namespace Core {
+
+class DynarmicExclusiveMonitor final : public ExclusiveMonitor {
+public:
+    explicit DynarmicExclusiveMonitor(Memory::Memory& memory, std::size_t core_count);
+    ~DynarmicExclusiveMonitor() override;
+
+    u8 ExclusiveRead8(std::size_t core_index, VAddr addr) override;
+    u16 ExclusiveRead16(std::size_t core_index, VAddr addr) override;
+    u32 ExclusiveRead32(std::size_t core_index, VAddr addr) override;
+    u64 ExclusiveRead64(std::size_t core_index, VAddr addr) override;
+    u128 ExclusiveRead128(std::size_t core_index, VAddr addr) override;
+    void ClearExclusive() override;
+
+    bool ExclusiveWrite8(std::size_t core_index, VAddr vaddr, u8 value) override;
+    bool ExclusiveWrite16(std::size_t core_index, VAddr vaddr, u16 value) override;
+    bool ExclusiveWrite32(std::size_t core_index, VAddr vaddr, u32 value) override;
+    bool ExclusiveWrite64(std::size_t core_index, VAddr vaddr, u64 value) override;
+    bool ExclusiveWrite128(std::size_t core_index, VAddr vaddr, u128 value) override;
+
+private:
+    friend class ARM_Dynarmic_32;
+    friend class ARM_Dynarmic_64;
+    Dynarmic::ExclusiveMonitor monitor;
+    Core::Memory::Memory& memory;
+};
+
+} // namespace Core
diff --git a/src/core/arm/exclusive_monitor.cpp b/src/core/arm/exclusive_monitor.cpp
index b32401e0b..d8cba369d 100644
--- a/src/core/arm/exclusive_monitor.cpp
+++ b/src/core/arm/exclusive_monitor.cpp
@@ -3,7 +3,7 @@
 // Refer to the license.txt file included.
 
 #ifdef ARCHITECTURE_x86_64
-#include "core/arm/dynarmic/arm_dynarmic_64.h"
+#include "core/arm/dynarmic/arm_exclusive_monitor.h"
 #endif
 #include "core/arm/exclusive_monitor.h"
 #include "core/memory.h"
diff --git a/src/core/arm/exclusive_monitor.h b/src/core/arm/exclusive_monitor.h
index ccd73b80f..62f6e6023 100644
--- a/src/core/arm/exclusive_monitor.h
+++ b/src/core/arm/exclusive_monitor.h
@@ -18,7 +18,11 @@ class ExclusiveMonitor {
 public:
     virtual ~ExclusiveMonitor();
 
-    virtual void SetExclusive(std::size_t core_index, VAddr addr) = 0;
+    virtual u8 ExclusiveRead8(std::size_t core_index, VAddr addr) = 0;
+    virtual u16 ExclusiveRead16(std::size_t core_index, VAddr addr) = 0;
+    virtual u32 ExclusiveRead32(std::size_t core_index, VAddr addr) = 0;
+    virtual u64 ExclusiveRead64(std::size_t core_index, VAddr addr) = 0;
+    virtual u128 ExclusiveRead128(std::size_t core_index, VAddr addr) = 0;
     virtual void ClearExclusive() = 0;
 
     virtual bool ExclusiveWrite8(std::size_t core_index, VAddr vaddr, u8 value) = 0;
diff --git a/src/core/arm/unicorn/arm_unicorn.cpp b/src/core/arm/unicorn/arm_unicorn.cpp
index e40e9626a..1df3f3ed1 100644
--- a/src/core/arm/unicorn/arm_unicorn.cpp
+++ b/src/core/arm/unicorn/arm_unicorn.cpp
@@ -6,6 +6,7 @@
 #include <unicorn/arm64.h>
 #include "common/assert.h"
 #include "common/microprofile.h"
+#include "core/arm/cpu_interrupt_handler.h"
 #include "core/arm/unicorn/arm_unicorn.h"
 #include "core/core.h"
 #include "core/core_timing.h"
@@ -62,7 +63,9 @@ static bool UnmappedMemoryHook(uc_engine* uc, uc_mem_type type, u64 addr, int si
     return false;
 }
 
-ARM_Unicorn::ARM_Unicorn(System& system, Arch architecture) : ARM_Interface{system} {
+ARM_Unicorn::ARM_Unicorn(System& system, CPUInterrupts& interrupt_handlers, bool uses_wall_clock,
+                         Arch architecture, std::size_t core_index)
+    : ARM_Interface{system, interrupt_handlers, uses_wall_clock}, core_index{core_index} {
     const auto arch = architecture == Arch::AArch32 ? UC_ARCH_ARM : UC_ARCH_ARM64;
     CHECKED(uc_open(arch, UC_MODE_ARM, &uc));
 
@@ -156,12 +159,20 @@ void ARM_Unicorn::SetTPIDR_EL0(u64 value) {
     CHECKED(uc_reg_write(uc, UC_ARM64_REG_TPIDR_EL0, &value));
 }
 
+void ARM_Unicorn::ChangeProcessorID(std::size_t new_core_id) {
+    core_index = new_core_id;
+}
+
 void ARM_Unicorn::Run() {
     if (GDBStub::IsServerEnabled()) {
         ExecuteInstructions(std::max(4000000U, 0U));
     } else {
-        ExecuteInstructions(
-            std::max(std::size_t(system.CoreTiming().GetDowncount()), std::size_t{0}));
+        while (true) { // Execute in bursts of 10 instructions until this core is interrupted.
+            if (interrupt_handlers[core_index].IsInterrupted()) {
+                return;
+            }
+            ExecuteInstructions(10);
+        }
     }
 }
@@ -183,8 +194,6 @@ void ARM_Unicorn::ExecuteInstructions(std::size_t num_instructions) {
                            UC_PROT_READ | UC_PROT_WRITE | UC_PROT_EXEC, page_buffer.data()));
     CHECKED(uc_emu_start(uc, GetPC(), 1ULL << 63, 0, num_instructions));
     CHECKED(uc_mem_unmap(uc, map_addr, page_buffer.size()));
-
-    system.CoreTiming().AddTicks(num_instructions);
     if (GDBStub::IsServerEnabled()) {
         if (last_bkpt_hit && last_bkpt.type == GDBStub::BreakpointType::Execute) {
             uc_reg_write(uc, UC_ARM64_REG_PC, &last_bkpt.address);
diff --git a/src/core/arm/unicorn/arm_unicorn.h b/src/core/arm/unicorn/arm_unicorn.h
index 725c65085..810aff311 100644
--- a/src/core/arm/unicorn/arm_unicorn.h
+++ b/src/core/arm/unicorn/arm_unicorn.h
@@ -20,7 +20,8 @@ public:
         AArch64, // 64-bit ARM
     };
 
-    explicit ARM_Unicorn(System& system, Arch architecture);
+    explicit ARM_Unicorn(System& system, CPUInterrupts& interrupt_handlers, bool uses_wall_clock,
+                         Arch architecture, std::size_t core_index);
     ~ARM_Unicorn() override;
 
     void SetPC(u64 pc) override;
@@ -35,6 +36,7 @@ public:
     void SetTlsAddress(VAddr address) override;
     void SetTPIDR_EL0(u64 value) override;
     u64 GetTPIDR_EL0() const override;
+    void ChangeProcessorID(std::size_t new_core_id) override;
     void PrepareReschedule() override;
     void ClearExclusiveState() override;
     void ExecuteInstructions(std::size_t num_instructions);
@@ -55,6 +57,7 @@ private:
     uc_engine* uc{};
     GDBStub::BreakpointAddress last_bkpt{};
     bool last_bkpt_hit = false;
+    std::size_t core_index;
 };
 
 } // namespace Core
diff --git a/src/core/core.cpp b/src/core/core.cpp
index f9f8a3000..69a1aa0a5 100644
--- a/src/core/core.cpp
+++ b/src/core/core.cpp
@@ -8,10 +8,10 @@
 
 #include "common/file_util.h"
 #include "common/logging/log.h"
+#include "common/microprofile.h"
 #include "common/string_util.h"
 #include "core/arm/exclusive_monitor.h"
 #include "core/core.h"
-#include "core/core_manager.h"
 #include "core/core_timing.h"
 #include "core/cpu_manager.h"
 #include "core/device_memory.h"
@@ -51,6 +51,11 @@
 #include "video_core/renderer_base.h"
 #include "video_core/video_core.h"
 
+MICROPROFILE_DEFINE(ARM_Jit_Dynarmic_CPU0, "ARM JIT", "Dynarmic CPU 0", MP_RGB(255, 64, 64));
+MICROPROFILE_DEFINE(ARM_Jit_Dynarmic_CPU1, "ARM JIT", "Dynarmic CPU 1", MP_RGB(255, 64, 64));
+MICROPROFILE_DEFINE(ARM_Jit_Dynarmic_CPU2, "ARM JIT", "Dynarmic CPU 2", MP_RGB(255, 64, 64));
+MICROPROFILE_DEFINE(ARM_Jit_Dynarmic_CPU3, "ARM JIT", "Dynarmic CPU 3", MP_RGB(255, 64, 64));
+
 namespace Core {
 
 namespace {
@@ -117,23 +122,22 @@ struct System::Impl {
         : kernel{system}, fs_controller{system}, memory{system},
           cpu_manager{system}, reporter{system}, applet_manager{system} {}
 
-    CoreManager& CurrentCoreManager() {
-        return cpu_manager.GetCurrentCoreManager();
-    }
+    ResultStatus Run() {
+        status = ResultStatus::Success;
 
-    Kernel::PhysicalCore& CurrentPhysicalCore() {
-        const auto index = cpu_manager.GetActiveCoreIndex();
-        return kernel.PhysicalCore(index);
-    }
+        kernel.Suspend(false);
+        core_timing.SyncPause(false);
+        cpu_manager.Pause(false);
 
-    Kernel::PhysicalCore& GetPhysicalCore(std::size_t index) {
-        return kernel.PhysicalCore(index);
+        return status;
     }
 
-    ResultStatus RunLoop(bool tight_loop) {
+    ResultStatus Pause() {
         status = ResultStatus::Success;
 
-        cpu_manager.RunLoop(tight_loop);
+        core_timing.SyncPause(true);
+        kernel.Suspend(true);
+        cpu_manager.Pause(true);
 
         return status;
     }
@@ -143,14 +147,22 @@ struct System::Impl {
 
         device_memory = std::make_unique<Core::DeviceMemory>(system);
 
-        core_timing.Initialize();
+        is_multicore = Settings::values.use_multi_core.GetValue();
+        is_async_gpu = is_multicore || Settings::values.use_asynchronous_gpu_emulation.GetValue();
+
+        kernel.SetMulticore(is_multicore);
+        cpu_manager.SetMulticore(is_multicore);
+        cpu_manager.SetAsyncGpu(is_async_gpu);
+        core_timing.SetMulticore(is_multicore);
+
+        core_timing.Initialize([&system]() { system.RegisterHostThread(); });
         kernel.Initialize();
         cpu_manager.Initialize();
 
         const auto current_time = std::chrono::duration_cast<std::chrono::seconds>(
             std::chrono::system_clock::now().time_since_epoch());
         Settings::values.custom_rtc_differential =
-            Settings::values.custom_rtc.value_or(current_time) - current_time;
+            Settings::values.custom_rtc.GetValue().value_or(current_time) - current_time;
 
         // Create a default fs if one doesn't already exist.
         if (virtual_filesystem == nullptr)
@@ -180,6 +192,11 @@ struct System::Impl {
         is_powered_on = true;
         exit_lock = false;
 
+        microprofile_dynarmic[0] = MICROPROFILE_TOKEN(ARM_Jit_Dynarmic_CPU0);
+        microprofile_dynarmic[1] = MICROPROFILE_TOKEN(ARM_Jit_Dynarmic_CPU1);
+        microprofile_dynarmic[2] = MICROPROFILE_TOKEN(ARM_Jit_Dynarmic_CPU2);
+        microprofile_dynarmic[3] = MICROPROFILE_TOKEN(ARM_Jit_Dynarmic_CPU3);
+
         LOG_DEBUG(Core, "Initialized OK");
 
         return ResultStatus::Success;
@@ -277,8 +294,6 @@ struct System::Impl {
         service_manager.reset();
         cheat_engine.reset();
         telemetry_session.reset();
-        perf_stats.reset();
-        gpu_core.reset();
         device_memory.reset();
 
         // Close all CPU/threading state
@@ -290,6 +305,8 @@ struct System::Impl {
 
         // Close app loader
         app_loader.reset();
+        gpu_core.reset();
+        perf_stats.reset();
 
         // Clear all applets
         applet_manager.ClearAll();
@@ -382,25 +399,35 @@ struct System::Impl {
 
     std::unique_ptr<Core::PerfStats> perf_stats;
     Core::FrameLimiter frame_limiter;
+
+    bool is_multicore{};
+    bool is_async_gpu{};
+
+    std::array<u64, Core::Hardware::NUM_CPU_CORES> dynarmic_ticks{};
+    std::array<MicroProfileToken, Core::Hardware::NUM_CPU_CORES> microprofile_dynarmic{};
 };
 
 System::System() : impl{std::make_unique<Impl>(*this)} {}
 System::~System() = default;
 
-CoreManager& System::CurrentCoreManager() {
-    return impl->CurrentCoreManager();
+CpuManager& System::GetCpuManager() {
+    return impl->cpu_manager;
+}
+
+const CpuManager& System::GetCpuManager() const {
+    return impl->cpu_manager;
 }
 
-const CoreManager& System::CurrentCoreManager() const {
-    return impl->CurrentCoreManager();
+System::ResultStatus System::Run() {
+    return impl->Run();
 }
 
-System::ResultStatus System::RunLoop(bool tight_loop) {
-    return impl->RunLoop(tight_loop);
+System::ResultStatus System::Pause() {
+    return impl->Pause();
 }
 
 System::ResultStatus System::SingleStep() {
-    return RunLoop(false);
+    return ResultStatus::Success;
 }
 
 void System::InvalidateCpuInstructionCaches() {
@@ -416,7 +443,7 @@ bool System::IsPoweredOn() const {
 }
 
 void System::PrepareReschedule() {
-    impl->CurrentPhysicalCore().Stop();
+    // Deprecated, does nothing, kept for backward compatibility.
 }
 
 void System::PrepareReschedule(const u32 core_index) {
@@ -436,31 +463,41 @@ const TelemetrySession& System::TelemetrySession() const {
 }
 
 ARM_Interface& System::CurrentArmInterface() {
-    return impl->CurrentPhysicalCore().ArmInterface();
+    return impl->kernel.CurrentScheduler().GetCurrentThread()->ArmInterface();
 }
 
 const ARM_Interface& System::CurrentArmInterface() const {
-    return impl->CurrentPhysicalCore().ArmInterface();
+    return impl->kernel.CurrentScheduler().GetCurrentThread()->ArmInterface();
 }
 
 std::size_t System::CurrentCoreIndex() const {
-    return impl->cpu_manager.GetActiveCoreIndex();
+    std::size_t core = impl->kernel.GetCurrentHostThreadID();
+    ASSERT(core < Core::Hardware::NUM_CPU_CORES);
+    return core;
 }
 
 Kernel::Scheduler& System::CurrentScheduler() {
-    return impl->CurrentPhysicalCore().Scheduler();
+    return impl->kernel.CurrentScheduler();
 }
 
 const Kernel::Scheduler& System::CurrentScheduler() const {
-    return impl->CurrentPhysicalCore().Scheduler();
+    return impl->kernel.CurrentScheduler();
+}
+
+Kernel::PhysicalCore& System::CurrentPhysicalCore() {
+    return impl->kernel.CurrentPhysicalCore();
+}
+
+const Kernel::PhysicalCore& System::CurrentPhysicalCore() const {
+    return impl->kernel.CurrentPhysicalCore();
 }
 
 Kernel::Scheduler& System::Scheduler(std::size_t core_index) {
-    return impl->GetPhysicalCore(core_index).Scheduler();
+    return impl->kernel.Scheduler(core_index);
 }
 
 const Kernel::Scheduler& System::Scheduler(std::size_t core_index) const {
-    return impl->GetPhysicalCore(core_index).Scheduler();
+    return impl->kernel.Scheduler(core_index);
 }
 
 /// Gets the global scheduler
@@ -490,20 +527,15 @@ const Kernel::Process* System::CurrentProcess() const {
 }
 
 ARM_Interface& System::ArmInterface(std::size_t core_index) {
-    return impl->GetPhysicalCore(core_index).ArmInterface();
+    auto* thread = impl->kernel.Scheduler(core_index).GetCurrentThread();
+    ASSERT(thread && !thread->IsHLEThread());
+    return thread->ArmInterface();
 }
 
 const ARM_Interface& System::ArmInterface(std::size_t core_index) const {
-    return impl->GetPhysicalCore(core_index).ArmInterface();
-}
-
-CoreManager& System::GetCoreManager(std::size_t core_index) {
-    return impl->cpu_manager.GetCoreManager(core_index);
-}
-
-const CoreManager& System::GetCoreManager(std::size_t core_index) const {
-    ASSERT(core_index < NUM_CPU_CORES);
-    return impl->cpu_manager.GetCoreManager(core_index);
+    auto* thread = impl->kernel.Scheduler(core_index).GetCurrentThread();
+    ASSERT(thread && !thread->IsHLEThread());
+    return thread->ArmInterface();
 }
 
 ExclusiveMonitor& System::Monitor() {
@@ -722,4 +754,18 @@ void System::RegisterHostThread() {
     impl->kernel.RegisterHostThread();
 }
 
+void System::EnterDynarmicProfile() {
+    const std::size_t core = CurrentCoreIndex(); // Asserts core < NUM_CPU_CORES before indexing.
+    impl->dynarmic_ticks[core] = MicroProfileEnter(impl->microprofile_dynarmic[core]);
+}
+
+void System::ExitDynarmicProfile() {
+    const std::size_t core = CurrentCoreIndex(); // Asserts core < NUM_CPU_CORES before indexing.
+    MicroProfileLeave(impl->microprofile_dynarmic[core], impl->dynarmic_ticks[core]);
+}
+
+bool System::IsMulticore() const {
+    return impl->is_multicore;
+}
+
 } // namespace Core
diff --git a/src/core/core.h b/src/core/core.h
index acc53d6a1..5c6cfbffe 100644
--- a/src/core/core.h
+++ b/src/core/core.h
@@ -27,6 +27,7 @@ class VfsFilesystem;
 namespace Kernel {
 class GlobalScheduler;
 class KernelCore;
+class PhysicalCore;
 class Process;
 class Scheduler;
 } // namespace Kernel
@@ -90,7 +91,7 @@ class InterruptManager;
 namespace Core {
 
 class ARM_Interface;
-class CoreManager;
+class CpuManager;
 class DeviceMemory;
 class ExclusiveMonitor;
 class FrameLimiter;
@@ -136,16 +137,16 @@ public:
     };
 
     /**
-     * Run the core CPU loop
-     * This function runs the core for the specified number of CPU instructions before trying to
-     * update hardware. This is much faster than SingleStep (and should be equivalent), as the CPU
-     * is not required to do a full dispatch with each instruction. NOTE: the number of instructions
-     * requested is not guaranteed to run, as this will be interrupted preemptively if a hardware
-     * update is requested (e.g. on a thread switch).
-     * @param tight_loop If false, the CPU single-steps.
-     * @return Result status, indicating whether or not the operation succeeded.
+     * Run the OS and Application
+     * This function will start emulation and run the relevant devices
+     */
+    ResultStatus Run();
+
+    /**
+     * Pause the OS and Application
+     * This function will pause emulation and stop the relevant devices
      */
-    ResultStatus RunLoop(bool tight_loop = true);
+    ResultStatus Pause();
 
     /**
      * Step the CPU one instruction
@@ -209,17 +210,21 @@ public:
     /// Gets the scheduler for the CPU core that is currently running
     const Kernel::Scheduler& CurrentScheduler() const;
 
+    /// Gets the physical core for the CPU core that is currently running
+    Kernel::PhysicalCore& CurrentPhysicalCore();
+
+    /// Gets the physical core for the CPU core that is currently running
+    const Kernel::PhysicalCore& CurrentPhysicalCore() const;
+
     /// Gets a reference to an ARM interface for the CPU core with the specified index
     ARM_Interface& ArmInterface(std::size_t core_index);
 
     /// Gets a const reference to an ARM interface from the CPU core with the specified index
     const ARM_Interface& ArmInterface(std::size_t core_index) const;
 
-    /// Gets a CPU interface to the CPU core with the specified index
-    CoreManager& GetCoreManager(std::size_t core_index);
+    CpuManager& GetCpuManager();
 
-    /// Gets a CPU interface to the CPU core with the specified index
-    const CoreManager& GetCoreManager(std::size_t core_index) const;
+    const CpuManager& GetCpuManager() const;
 
     /// Gets a reference to the exclusive monitor
     ExclusiveMonitor& Monitor();
@@ -370,14 +375,17 @@ public:
     /// Register a host thread as an auxiliary thread.
     void RegisterHostThread();
 
-private:
-    System();
+    /// Enter Dynarmic Microprofile
+    void EnterDynarmicProfile();
+
+    /// Exit Dynarmic Microprofile
+    void ExitDynarmicProfile();
 
-    /// Returns the currently running CPU core
-    CoreManager& CurrentCoreManager();
+    /// Tells if system is running on multicore.
+    bool IsMulticore() const;
 
-    /// Returns the currently running CPU core
-    const CoreManager& CurrentCoreManager() const;
+private:
+    System();
 
     /**
      * Initialize the emulated system.
diff --git a/src/core/core_manager.cpp b/src/core/core_manager.cpp
deleted file mode 100644
index b6b797c80..000000000
--- a/src/core/core_manager.cpp
+++ /dev/null
@@ -1,67 +0,0 @@
-// Copyright 2018 yuzu emulator team
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#include <condition_variable>
-#include <mutex>
-
-#include "common/logging/log.h"
-#include "core/arm/exclusive_monitor.h"
-#include "core/arm/unicorn/arm_unicorn.h"
-#include "core/core.h"
-#include "core/core_manager.h"
-#include "core/core_timing.h"
-#include "core/hle/kernel/kernel.h"
-#include "core/hle/kernel/physical_core.h"
-#include "core/hle/kernel/scheduler.h"
-#include "core/hle/kernel/thread.h"
-#include "core/hle/lock.h"
-#include "core/settings.h"
-
-namespace Core {
-
-CoreManager::CoreManager(System& system, std::size_t core_index)
-    : global_scheduler{system.GlobalScheduler()}, physical_core{system.Kernel().PhysicalCore(
-                                                      core_index)},
-      core_timing{system.CoreTiming()}, core_index{core_index} {}
-
-CoreManager::~CoreManager() = default;
-
-void CoreManager::RunLoop(bool tight_loop) {
-    Reschedule();
-
-    // If we don't have a currently active thread then don't execute instructions,
-    // instead advance to the next event and try to yield to the next thread
-    if (Kernel::GetCurrentThread() == nullptr) {
-        LOG_TRACE(Core, "Core-{} idling", core_index);
-        core_timing.Idle();
-    } else {
-        if (tight_loop) {
-            physical_core.Run();
-        } else {
-            physical_core.Step();
-        }
-    }
-    core_timing.Advance();
-
-    Reschedule();
-}
-
-void CoreManager::SingleStep() {
-    return RunLoop(false);
-}
-
-void CoreManager::PrepareReschedule() {
-    physical_core.Stop();
-}
-
-void CoreManager::Reschedule() {
-    // Lock the global kernel mutex when we manipulate the HLE state
-    std::lock_guard lock(HLE::g_hle_lock);
-
-    global_scheduler.SelectThread(core_index);
-
-    physical_core.Scheduler().TryDoContextSwitch();
-}
-
-} // namespace Core
diff --git a/src/core/core_manager.h b/src/core/core_manager.h
deleted file mode 100644
index d525de00a..000000000
--- a/src/core/core_manager.h
+++ /dev/null
@@ -1,63 +0,0 @@
-// Copyright 2018 yuzu emulator team
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#pragma once
-
-#include <atomic>
-#include <cstddef>
-#include <memory>
-#include "common/common_types.h"
-
-namespace Kernel {
-class GlobalScheduler;
-class PhysicalCore;
-} // namespace Kernel
-
-namespace Core {
-class System;
-}
-
-namespace Core::Timing {
-class CoreTiming;
-}
-
-namespace Core::Memory {
-class Memory;
-}
-
-namespace Core {
-
-constexpr unsigned NUM_CPU_CORES{4};
-
-class CoreManager {
-public:
-    CoreManager(System& system, std::size_t core_index);
-    ~CoreManager();
-
-    void RunLoop(bool tight_loop = true);
-
-    void SingleStep();
-
-    void PrepareReschedule();
-
-    bool IsMainCore() const {
-        return core_index == 0;
-    }
-
-    std::size_t CoreIndex() const {
-        return core_index;
-    }
-
-private:
-    void Reschedule();
-
-    Kernel::GlobalScheduler& global_scheduler;
-    Kernel::PhysicalCore& physical_core;
-    Timing::CoreTiming& core_timing;
-
-    std::atomic<bool> reschedule_pending = false;
-    std::size_t core_index;
-};
-
-} // namespace Core
diff --git a/src/core/core_timing.cpp b/src/core/core_timing.cpp
index 46d4178c4..a63e60461 100644
--- a/src/core/core_timing.cpp
+++ b/src/core/core_timing.cpp
@@ -1,29 +1,27 @@
-// Copyright 2008 Dolphin Emulator Project / 2017 Citra Emulator Project
-// Licensed under GPLv2+
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
-#include "core/core_timing.h"
-
 #include <algorithm>
 #include <mutex>
 #include <string>
 #include <tuple>
 
 #include "common/assert.h"
-#include "common/thread.h"
+#include "common/microprofile.h"
+#include "core/core_timing.h"
 #include "core/core_timing_util.h"
-#include "core/hardware_properties.h"
 
 namespace Core::Timing {
 
-constexpr int MAX_SLICE_LENGTH = 10000;
+constexpr u64 MAX_SLICE_LENGTH = 4000;
 
 std::shared_ptr<EventType> CreateEvent(std::string name, TimedCallback&& callback) {
     return std::make_shared<EventType>(std::move(callback), std::move(name));
 }
 
 struct CoreTiming::Event {
-    s64 time;
+    u64 time;
     u64 fifo_order;
     u64 userdata;
     std::weak_ptr<EventType> type;
@@ -39,51 +37,90 @@ struct CoreTiming::Event {
     }
 };
 
-CoreTiming::CoreTiming() = default;
-CoreTiming::~CoreTiming() = default;
+// Creates the wall clock that the multicore timing paths read from.
+CoreTiming::CoreTiming()
+    : clock{Common::CreateBestMatchingClock(Core::Hardware::BASE_CLOCK_RATE,
+                                            Core::Hardware::CNTFREQ)} {}
 
-void CoreTiming::Initialize() {
-    downcounts.fill(MAX_SLICE_LENGTH);
-    time_slice.fill(MAX_SLICE_LENGTH);
-    slice_length = MAX_SLICE_LENGTH;
-    global_timer = 0;
-    idled_cycles = 0;
-    current_context = 0;
+CoreTiming::~CoreTiming() = default;
 
-    // The time between CoreTiming being initialized and the first call to Advance() is considered
-    // the slice boundary between slice -1 and slice 0. Dispatcher loops must call Advance() before
-    // executing the first cycle of each slice to prepare the slice length and downcount for
-    // that slice.
-    is_global_timer_sane = true;
+void CoreTiming::ThreadEntry(CoreTiming& instance) { // Entry point of the timer thread.
+    constexpr char name[] = "yuzu:HostTiming";
+    MicroProfileOnThreadCreate(name);
+    Common::SetCurrentThreadName(name);
+    Common::SetCurrentThreadPriority(Common::ThreadPriority::VeryHigh);
+    instance.on_thread_init(); // Callback handed to Initialize(); runs on this new thread.
+    instance.ThreadLoop(); // Returns only once shutting_down has been set.
+}
 
+void CoreTiming::Initialize(std::function<void(void)>&& on_thread_init_) {
+    on_thread_init = std::move(on_thread_init_);
     event_fifo_id = 0;
-
+    shutting_down = false;
+    ticks = 0;
     const auto empty_timed_callback = [](u64, s64) {};
     ev_lost = CreateEvent("_lost_event", empty_timed_callback);
+    if (is_multicore) {
+        timer_thread = std::make_unique<std::thread>(ThreadEntry, std::ref(*this));
+    }
 }
 
 void CoreTiming::Shutdown() {
+    paused = true;
+    shutting_down = true;
+    pause_event.Set();
+    event.Set();
+    if (timer_thread) {
+        timer_thread->join();
+    }
     ClearPendingEvents();
+    timer_thread.reset();
+    has_started = false;
 }
 
-void CoreTiming::ScheduleEvent(s64 cycles_into_future, const std::shared_ptr<EventType>& event_type,
-                               u64 userdata) {
-    std::lock_guard guard{inner_mutex};
-    const s64 timeout = GetTicks() + cycles_into_future;
+void CoreTiming::Pause(bool is_paused) {
+    paused = is_paused; // Request only; ThreadLoop() acknowledges through paused_set.
+    pause_event.Set(); // Releases ThreadLoop() if it is blocked in pause_event.Wait().
+}
 
-    // If this event needs to be scheduled before the next advance(), force one early
-    if (!is_global_timer_sane) {
-        ForceExceptionCheck(cycles_into_future);
+void CoreTiming::SyncPause(bool is_paused) {
+    if (is_paused == paused && paused_set == paused) {
+        return; // Requested state is already set and acknowledged.
+    }
+    Pause(is_paused);
+    if (timer_thread) {
+        if (!is_paused) {
+            pause_event.Set();
+        }
+        event.Set(); // Wake the timer thread if it is blocked waiting for an event.
+        while (paused_set != is_paused)
+            std::this_thread::yield(); // Give up the time slice instead of hard-spinning.
+    }
+}
 
-    event_queue.emplace_back(Event{timeout, event_fifo_id++, userdata, event_type});
+bool CoreTiming::IsRunning() const {
+    return !paused_set;
+}
 
-    std::push_heap(event_queue.begin(), event_queue.end(), std::greater<>());
+bool CoreTiming::HasPendingEvents() const {
+    return !(wait_set && event_queue.empty());
 }
 
-void CoreTiming::UnscheduleEvent(const std::shared_ptr<EventType>& event_type, u64 userdata) {
-    std::lock_guard guard{inner_mutex};
+void CoreTiming::ScheduleEvent(s64 ns_into_future, const std::shared_ptr<EventType>& event_type,
+                               u64 userdata) {
+    {
+        std::scoped_lock scope{basic_lock};
+        const u64 timeout = static_cast<u64>(GetGlobalTimeNs().count() + ns_into_future);
+
+        event_queue.emplace_back(Event{timeout, event_fifo_id++, userdata, event_type});
 
+        std::push_heap(event_queue.begin(), event_queue.end(), std::greater<>());
+    }
+    event.Set();
+}
+
+void CoreTiming::UnscheduleEvent(const std::shared_ptr<EventType>& event_type, u64 userdata) {
+    std::scoped_lock scope{basic_lock};
     const auto itr = std::remove_if(event_queue.begin(), event_queue.end(), [&](const Event& e) {
         return e.type.lock().get() == event_type.get() && e.userdata == userdata;
     });
@@ -95,21 +132,39 @@ void CoreTiming::UnscheduleEvent(const std::shared_ptr<EventType>& event_type, u
     }
 }
 
-u64 CoreTiming::GetTicks() const {
-    u64 ticks = static_cast<u64>(global_timer);
-    if (!is_global_timer_sane) {
-        ticks += accumulated_ticks;
+void CoreTiming::AddTicks(u64 ticks) {
+    this->ticks += ticks; // 'this->' is needed because the parameter shadows the member.
+    downcount -= ticks; // Remaining budget since the last ResetTicks(); signed, may go negative.
+}
+
+void CoreTiming::Idle() {
+    if (!event_queue.empty()) { // Skip ahead to just past the earliest pending event.
+        const u64 next_event_time = event_queue.front().time; // Heap front: earliest, in ns.
+        const u64 next_ticks = nsToCycles(std::chrono::nanoseconds(next_event_time)) + 10U;
+        if (next_ticks > ticks) { // Only ever move the tick counter forward.
+            ticks = next_ticks;
+        }
+        return;
     }
-    return ticks;
+    ticks += 1000U; // Nothing queued: advance by a fixed step.
 }
 
-u64 CoreTiming::GetIdleTicks() const {
-    return static_cast<u64>(idled_cycles);
+void CoreTiming::ResetTicks() {
+    downcount = MAX_SLICE_LENGTH;
 }
 
-void CoreTiming::AddTicks(u64 ticks) {
-    accumulated_ticks += ticks;
-    downcounts[current_context] -= static_cast<s64>(ticks);
+u64 CoreTiming::GetCPUTicks() const {
+    if (!is_multicore) {
+        return ticks; // Single core: tick counter advanced by AddTicks() and Idle().
+    }
+    return clock->GetCPUCycles(); // Multicore: cycles reported by the wall clock.
+}
+
+u64 CoreTiming::GetClockTicks() const {
+    if (!is_multicore) {
+        return CpuCyclesToClockCycles(ticks); // Single core: derived from the tick counter.
+    }
+    return clock->GetClockCycles(); // Multicore: cycles reported by the wall clock.
 }
 
 void CoreTiming::ClearPendingEvents() {
@@ -117,7 +172,7 @@ void CoreTiming::ClearPendingEvents() {
 }
 
 void CoreTiming::RemoveEvent(const std::shared_ptr<EventType>& event_type) {
-    std::lock_guard guard{inner_mutex};
+    std::scoped_lock lock{basic_lock};
 
     const auto itr = std::remove_if(event_queue.begin(), event_queue.end(), [&](const Event& e) {
         return e.type.lock().get() == event_type.get();
@@ -130,97 +185,68 @@ void CoreTiming::RemoveEvent(const std::shared_ptr<EventType>& event_type) {
     }
 }
 
-void CoreTiming::ForceExceptionCheck(s64 cycles) {
-    cycles = std::max<s64>(0, cycles);
-    if (downcounts[current_context] <= cycles) {
-        return;
-    }
-
-    // downcount is always (much) smaller than MAX_INT so we can safely cast cycles to an int
-    // here. Account for cycles already executed by adjusting the g.slice_length
-    downcounts[current_context] = static_cast<int>(cycles);
-}
-
-std::optional<u64> CoreTiming::NextAvailableCore(const s64 needed_ticks) const {
-    const u64 original_context = current_context;
-    u64 next_context = (original_context + 1) % num_cpu_cores;
-    while (next_context != original_context) {
-        if (time_slice[next_context] >= needed_ticks) {
-            return {next_context};
-        } else if (time_slice[next_context] >= 0) {
-            return std::nullopt;
-        }
-        next_context = (next_context + 1) % num_cpu_cores;
-    }
-    return std::nullopt;
-}
-
-void CoreTiming::Advance() {
-    std::unique_lock<std::mutex> guard(inner_mutex);
-
-    const u64 cycles_executed = accumulated_ticks;
-    time_slice[current_context] = std::max<s64>(0, time_slice[current_context] - accumulated_ticks);
-    global_timer += cycles_executed;
-
-    is_global_timer_sane = true;
+std::optional<s64> CoreTiming::Advance() {
+    std::scoped_lock lock{advance_lock, basic_lock};
+    global_timer = GetGlobalTimeNs().count();
 
     while (!event_queue.empty() && event_queue.front().time <= global_timer) {
         Event evt = std::move(event_queue.front());
         std::pop_heap(event_queue.begin(), event_queue.end(), std::greater<>());
         event_queue.pop_back();
-        inner_mutex.unlock();
+        basic_lock.unlock();
 
         if (auto event_type{evt.type.lock()}) {
             event_type->callback(evt.userdata, global_timer - evt.time);
         }
 
-        inner_mutex.lock();
+        basic_lock.lock();
+        global_timer = GetGlobalTimeNs().count();
     }
 
-    is_global_timer_sane = false;
-
-    // Still events left (scheduled in the future)
     if (!event_queue.empty()) {
-        const s64 needed_ticks =
-            std::min<s64>(event_queue.front().time - global_timer, MAX_SLICE_LENGTH);
-        const auto next_core = NextAvailableCore(needed_ticks);
-        if (next_core) {
-            downcounts[*next_core] = needed_ticks;
-        }
+        const s64 next_time = event_queue.front().time - global_timer;
+        return next_time;
+    } else {
+        return std::nullopt;
     }
-
-    accumulated_ticks = 0;
-
-    downcounts[current_context] = time_slice[current_context];
 }
 
-void CoreTiming::ResetRun() {
-    downcounts.fill(MAX_SLICE_LENGTH);
-    time_slice.fill(MAX_SLICE_LENGTH);
-    current_context = 0;
-    // Still events left (scheduled in the future)
-    if (!event_queue.empty()) {
-        const s64 needed_ticks =
-            std::min<s64>(event_queue.front().time - global_timer, MAX_SLICE_LENGTH);
-        downcounts[current_context] = needed_ticks;
+void CoreTiming::ThreadLoop() { // Body of the timer thread; loops until shutting_down is set.
+    has_started = true;
+    while (!shutting_down) {
+        while (!paused) {
+            paused_set = false; // Acknowledge "running" for SyncPause() and IsRunning().
+            const auto next_time = Advance(); // Fires due events; ns until the next one, if any.
+            if (next_time) {
+                if (*next_time > 0) {
+                    std::chrono::nanoseconds next_time_ns = std::chrono::nanoseconds(*next_time);
+                    event.WaitFor(next_time_ns); // Sleep until due or until event is Set().
+                }
+            } else {
+                wait_set = true; // Queue is empty: flag the idle wait for HasPendingEvents().
+                event.Wait();
+            }
+            wait_set = false;
+        }
+        paused_set = true; // Acknowledge the pause request.
+        clock->Pause(true);
+        pause_event.Wait(); // Blocks until Pause() or Shutdown() signals pause_event.
+        clock->Pause(false);
     }
-
-    is_global_timer_sane = false;
-    accumulated_ticks = 0;
 }
 
-void CoreTiming::Idle() {
-    accumulated_ticks += downcounts[current_context];
-    idled_cycles += downcounts[current_context];
-    downcounts[current_context] = 0;
+std::chrono::nanoseconds CoreTiming::GetGlobalTimeNs() const {
+    if (is_multicore) {
+        return clock->GetTimeNS();
+    }
+    return CyclesToNs(ticks);
 }
 
 std::chrono::microseconds CoreTiming::GetGlobalTimeUs() const {
-    return std::chrono::microseconds{GetTicks() * 1000000 / Hardware::BASE_CLOCK_RATE};
-}
-
-s64 CoreTiming::GetDowncount() const {
-    return downcounts[current_context];
+    if (is_multicore) {
+        return clock->GetTimeUS();
+    }
+    return CyclesToUs(ticks);
 }
 
 } // namespace Core::Timing
diff --git a/src/core/core_timing.h b/src/core/core_timing.h
index d50f4eb8a..72faaab64 100644
--- a/src/core/core_timing.h
+++ b/src/core/core_timing.h
@@ -1,19 +1,25 @@
-// Copyright 2008 Dolphin Emulator Project / 2017 Citra Emulator Project
-// Licensed under GPLv2+
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
 #pragma once
 
+#include <atomic>
 #include <chrono>
 #include <functional>
 #include <memory>
 #include <mutex>
 #include <optional>
 #include <string>
+#include <thread>
 #include <vector>
 
 #include "common/common_types.h"
+#include "common/spin_lock.h"
+#include "common/thread.h"
 #include "common/threadsafe_queue.h"
+#include "common/wall_clock.h"
+#include "core/hardware_properties.h"
 
 namespace Core::Timing {
 
@@ -56,16 +62,40 @@ public:
 
     /// CoreTiming begins at the boundary of timing slice -1. An initial call to Advance() is
     /// required to end slice - 1 and start slice 0 before the first cycle of code is executed.
-    void Initialize();
+    void Initialize(std::function<void(void)>&& on_thread_init_);
 
     /// Tears down all timing related functionality.
     void Shutdown();
 
-    /// After the first Advance, the slice lengths and the downcount will be reduced whenever an
-    /// event is scheduled earlier than the current values.
-    ///
-    /// Scheduling from a callback will not update the downcount until the Advance() completes.
-    void ScheduleEvent(s64 cycles_into_future, const std::shared_ptr<EventType>& event_type,
+    /// Sets if emulation is multicore or single core, must be set before Initialize
+    void SetMulticore(bool is_multicore) {
+        this->is_multicore = is_multicore;
+    }
+
+    /// Check if it's using host timing.
+    bool IsHostTiming() const {
+        return is_multicore;
+    }
+
+    /// Pauses/Unpauses the execution of the timer thread.
+    void Pause(bool is_paused);
+
+    /// Pauses/Unpauses the execution of the timer thread and waits until paused.
+    void SyncPause(bool is_paused);
+
+    /// Checks if core timing is running.
+    bool IsRunning() const;
+
+    /// Checks if the timer thread has started.
+    bool HasStarted() const {
+        return has_started;
+    }
+
+    /// Checks if there are any pending time events.
+    bool HasPendingEvents() const;
+
+    /// Schedules an event in core timing
+    void ScheduleEvent(s64 ns_into_future, const std::shared_ptr<EventType>& event_type,
                        u64 userdata = 0);
 
     void UnscheduleEvent(const std::shared_ptr<EventType>& event_type, u64 userdata);
@@ -73,41 +103,30 @@ public:
     /// We only permit one event of each type in the queue at a time.
     void RemoveEvent(const std::shared_ptr<EventType>& event_type);
 
-    void ForceExceptionCheck(s64 cycles);
-
-    /// This should only be called from the emu thread, if you are calling it any other thread,
-    /// you are doing something evil
-    u64 GetTicks() const;
-
-    u64 GetIdleTicks() const;
-
     void AddTicks(u64 ticks);
 
-    /// Advance must be called at the beginning of dispatcher loops, not the end. Advance() ends
-    /// the previous timing slice and begins the next one, you must Advance from the previous
-    /// slice to the current one before executing any cycles. CoreTiming starts in slice -1 so an
-    /// Advance() is required to initialize the slice length before the first cycle of emulated
-    /// instructions is executed.
-    void Advance();
+    void ResetTicks();
 
-    /// Pretend that the main CPU has executed enough cycles to reach the next event.
     void Idle();
 
-    std::chrono::microseconds GetGlobalTimeUs() const;
+    s64 GetDowncount() const {
+        return downcount;
+    }
 
-    void ResetRun();
+    /// Returns current time in emulated CPU cycles
+    u64 GetCPUTicks() const;
 
-    s64 GetDowncount() const;
+    /// Returns current time in emulated clock cycles
+    u64 GetClockTicks() const;
 
-    void SwitchContext(u64 new_context) {
-        current_context = new_context;
-    }
+    /// Returns current time in microseconds.
+    std::chrono::microseconds GetGlobalTimeUs() const;
 
-    bool CanCurrentContextRun() const {
-        return time_slice[current_context] > 0;
-    }
+    /// Returns current time in nanoseconds.
+    std::chrono::nanoseconds GetGlobalTimeNs() const;
 
-    std::optional<u64> NextAvailableCore(const s64 needed_ticks) const;
+    /// Checks for events manually and returns time in nanoseconds for next event, threadsafe.
+    std::optional<s64> Advance();
 
 private:
     struct Event;
@@ -115,21 +134,14 @@ private:
     /// Clear all pending events. This should ONLY be done on exit.
     void ClearPendingEvents();
 
-    static constexpr u64 num_cpu_cores = 4;
+    static void ThreadEntry(CoreTiming& instance);
+    void ThreadLoop();
 
-    s64 global_timer = 0;
-    s64 idled_cycles = 0;
-    s64 slice_length = 0;
-    u64 accumulated_ticks = 0;
-    std::array<s64, num_cpu_cores> downcounts{};
-    // Slice of time assigned to each core per run.
-    std::array<s64, num_cpu_cores> time_slice{};
-    u64 current_context = 0;
+    std::unique_ptr<Common::WallClock> clock;
 
-    // Are we in a function that has been called from Advance()
-    // If events are scheduled from a function that gets called from Advance(),
-    // don't change slice_length and downcount.
-    bool is_global_timer_sane = false;
+    u64 global_timer = 0;
+
+    std::chrono::nanoseconds start_point;
 
     // The queue is a min-heap using std::make_heap/push_heap/pop_heap.
     // We don't use std::priority_queue because we need to be able to serialize, unserialize and
@@ -139,8 +151,23 @@ private:
     u64 event_fifo_id = 0;
 
     std::shared_ptr<EventType> ev_lost;
-
-    std::mutex inner_mutex;
+    Common::Event event{};
+    Common::Event pause_event{};
+    Common::SpinLock basic_lock{};
+    Common::SpinLock advance_lock{};
+    std::unique_ptr<std::thread> timer_thread;
+    std::atomic<bool> paused{};
+    std::atomic<bool> paused_set{};
+    std::atomic<bool> wait_set{};
+    std::atomic<bool> shutting_down{};
+    std::atomic<bool> has_started{};
+    std::function<void(void)> on_thread_init{};
+
+    bool is_multicore{};
+
+    /// Cycle timing
+    u64 ticks{};
+    s64 downcount{};
 };
 
 /// Creates a core timing event with the given name and callback.
diff --git a/src/core/core_timing_util.cpp b/src/core/core_timing_util.cpp
index de50d3b14..aefc63663 100644
--- a/src/core/core_timing_util.cpp
+++ b/src/core/core_timing_util.cpp
@@ -38,15 +38,23 @@ s64 usToCycles(std::chrono::microseconds us) {
 }
 
 s64 nsToCycles(std::chrono::nanoseconds ns) {
-    if (static_cast<u64>(ns.count() / 1000000000) > MAX_VALUE_TO_MULTIPLY) {
-        LOG_ERROR(Core_Timing, "Integer overflow, use max value");
-        return std::numeric_limits<s64>::max();
-    }
-    if (static_cast<u64>(ns.count()) > MAX_VALUE_TO_MULTIPLY) {
-        LOG_DEBUG(Core_Timing, "Time very big, do rounding");
-        return Hardware::BASE_CLOCK_RATE * (ns.count() / 1000000000);
-    }
-    return (Hardware::BASE_CLOCK_RATE * ns.count()) / 1000000000;
+    const u128 temporal = Common::Multiply64Into128(ns.count(), Hardware::BASE_CLOCK_RATE);
+    return Common::Divide128On32(temporal, static_cast<u32>(1000000000)).first;
+}
+
+u64 msToClockCycles(std::chrono::milliseconds ms) {
+    const u128 temp = Common::Multiply64Into128(ms.count(), Hardware::CNTFREQ); // ms * CNTFREQ
+    return Common::Divide128On32(temp, 1000).first; // ... / 1000 ms per second
+}
+
+u64 usToClockCycles(std::chrono::microseconds us) {
+    const u128 temp = Common::Multiply64Into128(us.count(), Hardware::CNTFREQ); // us * CNTFREQ
+    return Common::Divide128On32(temp, 1000000).first; // ... / 1000000 us per second
+}
+
+u64 nsToClockCycles(std::chrono::nanoseconds ns) {
+    const u128 temp = Common::Multiply64Into128(ns.count(), Hardware::CNTFREQ);
+    return Common::Divide128On32(temp, 1000000000).first;
 }
 
 u64 CpuCyclesToClockCycles(u64 ticks) {
@@ -54,4 +62,22 @@ u64 CpuCyclesToClockCycles(u64 ticks) {
     return Common::Divide128On32(temporal, static_cast<u32>(Hardware::BASE_CLOCK_RATE)).first;
 }
 
+std::chrono::milliseconds CyclesToMs(s64 cycles) { // cycles * 1000 / BASE_CLOCK_RATE
+    const u128 temporal = Common::Multiply64Into128(cycles, 1000); // widen before dividing
+    u64 ms = Common::Divide128On32(temporal, static_cast<u32>(Hardware::BASE_CLOCK_RATE)).first;
+    return std::chrono::milliseconds(ms); // NOTE(review): u64 result; confirm cycles is never < 0
+}
+
+std::chrono::nanoseconds CyclesToNs(s64 cycles) { // cycles * 1000000000 / BASE_CLOCK_RATE
+    const u128 temporal = Common::Multiply64Into128(cycles, 1000000000); // widen before dividing
+    u64 ns = Common::Divide128On32(temporal, static_cast<u32>(Hardware::BASE_CLOCK_RATE)).first;
+    return std::chrono::nanoseconds(ns); // NOTE(review): u64 result; confirm cycles is never < 0
+}
+
+std::chrono::microseconds CyclesToUs(s64 cycles) { // cycles * 1000000 / BASE_CLOCK_RATE
+    const u128 temporal = Common::Multiply64Into128(cycles, 1000000); // widen before dividing
+    u64 us = Common::Divide128On32(temporal, static_cast<u32>(Hardware::BASE_CLOCK_RATE)).first;
+    return std::chrono::microseconds(us); // NOTE(review): u64 result; confirm cycles is never < 0
+}
+
 } // namespace Core::Timing
diff --git a/src/core/core_timing_util.h b/src/core/core_timing_util.h
index addc72b19..2ed979e14 100644
--- a/src/core/core_timing_util.h
+++ b/src/core/core_timing_util.h
@@ -13,18 +13,12 @@ namespace Core::Timing {
 s64 msToCycles(std::chrono::milliseconds ms);
 s64 usToCycles(std::chrono::microseconds us);
 s64 nsToCycles(std::chrono::nanoseconds ns);
-
-inline std::chrono::milliseconds CyclesToMs(s64 cycles) {
-    return std::chrono::milliseconds(cycles * 1000 / Hardware::BASE_CLOCK_RATE);
-}
-
-inline std::chrono::nanoseconds CyclesToNs(s64 cycles) {
-    return std::chrono::nanoseconds(cycles * 1000000000 / Hardware::BASE_CLOCK_RATE);
-}
-
-inline std::chrono::microseconds CyclesToUs(s64 cycles) {
-    return std::chrono::microseconds(cycles * 1000000 / Hardware::BASE_CLOCK_RATE);
-}
+u64 msToClockCycles(std::chrono::milliseconds ms); // duration -> ticks at Hardware::CNTFREQ
+u64 usToClockCycles(std::chrono::microseconds us); // duration -> ticks at Hardware::CNTFREQ
+u64 nsToClockCycles(std::chrono::nanoseconds ns);  // duration -> ticks at Hardware::CNTFREQ
+std::chrono::milliseconds CyclesToMs(s64 cycles);  // cycles at BASE_CLOCK_RATE -> duration
+std::chrono::nanoseconds CyclesToNs(s64 cycles);   // cycles at BASE_CLOCK_RATE -> duration
+std::chrono::microseconds CyclesToUs(s64 cycles);  // cycles at BASE_CLOCK_RATE -> duration
 
 u64 CpuCyclesToClockCycles(u64 ticks);
 
diff --git a/src/core/cpu_manager.cpp b/src/core/cpu_manager.cpp
index 70ddbdcca..32afcf3ae 100644
--- a/src/core/cpu_manager.cpp
+++ b/src/core/cpu_manager.cpp
@@ -2,80 +2,372 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include "common/fiber.h"
+#include "common/microprofile.h"
+#include "common/thread.h"
 #include "core/arm/exclusive_monitor.h"
 #include "core/core.h"
-#include "core/core_manager.h"
 #include "core/core_timing.h"
 #include "core/cpu_manager.h"
 #include "core/gdbstub/gdbstub.h"
+#include "core/hle/kernel/kernel.h"
+#include "core/hle/kernel/physical_core.h"
+#include "core/hle/kernel/scheduler.h"
+#include "core/hle/kernel/thread.h"
+#include "video_core/gpu.h"
 
 namespace Core {
 
 CpuManager::CpuManager(System& system) : system{system} {}
 CpuManager::~CpuManager() = default;
 
+void CpuManager::ThreadStart(CpuManager& cpu_manager, std::size_t core) { // std::thread entry
+    cpu_manager.RunThread(core); // runs the per-core host loop until running_mode is cleared
+}
+
 void CpuManager::Initialize() {
-    for (std::size_t index = 0; index < core_managers.size(); ++index) {
-        core_managers[index] = std::make_unique<CoreManager>(system, index);
+    running_mode = true;
+    if (is_multicore) {
+        for (std::size_t core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
+            core_data[core].host_thread =
+                std::make_unique<std::thread>(ThreadStart, std::ref(*this), core);
+        }
+    } else {
+        core_data[0].host_thread = std::make_unique<std::thread>(ThreadStart, std::ref(*this), 0);
     }
 }
 
 void CpuManager::Shutdown() {
-    for (auto& cpu_core : core_managers) {
-        cpu_core.reset();
+    running_mode = false;
+    Pause(false);
+    if (is_multicore) {
+        for (std::size_t core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
+            core_data[core].host_thread->join();
+            core_data[core].host_thread.reset();
+        }
+    } else {
+        core_data[0].host_thread->join();
+        core_data[0].host_thread.reset();
     }
 }
 
-CoreManager& CpuManager::GetCoreManager(std::size_t index) {
-    return *core_managers.at(index);
+std::function<void(void*)> CpuManager::GetGuestThreadStartFunc() {
+    return std::function<void(void*)>(GuestThreadFunction);
 }
 
-const CoreManager& CpuManager::GetCoreManager(std::size_t index) const {
-    return *core_managers.at(index);
+std::function<void(void*)> CpuManager::GetIdleThreadStartFunc() {
+    return std::function<void(void*)>(IdleThreadFunction);
 }
 
-CoreManager& CpuManager::GetCurrentCoreManager() {
-    // Otherwise, use single-threaded mode active_core variable
-    return *core_managers[active_core];
+std::function<void(void*)> CpuManager::GetSuspendThreadStartFunc() {
+    return std::function<void(void*)>(SuspendThreadFunction);
 }
 
-const CoreManager& CpuManager::GetCurrentCoreManager() const {
-    // Otherwise, use single-threaded mode active_core variable
-    return *core_managers[active_core];
+void CpuManager::GuestThreadFunction(void* cpu_manager_) { // void* carries the CpuManager
+    auto& manager = *static_cast<CpuManager*>(cpu_manager_);
+    if (!manager.is_multicore) {
+        manager.SingleCoreRunGuestThread();
+    } else {
+        manager.MultiCoreRunGuestThread();
+    }
 }
 
-void CpuManager::RunLoop(bool tight_loop) {
-    if (GDBStub::IsServerEnabled()) {
-        GDBStub::HandlePacket();
+void CpuManager::GuestRewindFunction(void* cpu_manager_) { // re-enters the guest run loop
+    auto& manager = *static_cast<CpuManager*>(cpu_manager_);
+    if (!manager.is_multicore) {
+        manager.SingleCoreRunGuestLoop();
+    } else {
+        manager.MultiCoreRunGuestLoop();
+    }
+}
 
-        // If the loop is halted and we want to step, use a tiny (1) number of instructions to
-        // execute. Otherwise, get out of the loop function.
-        if (GDBStub::GetCpuHaltFlag()) {
-            if (GDBStub::GetCpuStepFlag()) {
-                tight_loop = false;
-            } else {
-                return;
+void CpuManager::IdleThreadFunction(void* cpu_manager_) { // void* carries the CpuManager
+    auto& manager = *static_cast<CpuManager*>(cpu_manager_);
+    if (!manager.is_multicore) {
+        manager.SingleCoreRunIdleThread();
+    } else {
+        manager.MultiCoreRunIdleThread();
+    }
+}
+
+void CpuManager::SuspendThreadFunction(void* cpu_manager_) { // void* carries the CpuManager
+    auto& manager = *static_cast<CpuManager*>(cpu_manager_);
+    if (!manager.is_multicore) {
+        manager.SingleCoreRunSuspendThread();
+    } else {
+        manager.MultiCoreRunSuspendThread();
+    }
+}
+
+void* CpuManager::GetStartFuncParamater() { // the static *ThreadFunction entries cast this back
+    return this; // implicit CpuManager* -> void* conversion
+}
+
+///////////////////////////////////////////////////////////////////////////////
+///                             MultiCore                                   ///
+///////////////////////////////////////////////////////////////////////////////
+
+void CpuManager::MultiCoreRunGuestThread() {
+    auto& kernel = system.Kernel();
+    {
+        auto& sched = kernel.CurrentScheduler();
+        sched.OnThreadStart();
+    }
+    MultiCoreRunGuestLoop();
+}
+
+void CpuManager::MultiCoreRunGuestLoop() {
+    auto& kernel = system.Kernel();
+    auto* thread = kernel.CurrentScheduler().GetCurrentThread();
+    while (true) {
+        auto* physical_core = &kernel.CurrentPhysicalCore();
+        auto& arm_interface = thread->ArmInterface();
+        system.EnterDynarmicProfile();
+        while (!physical_core->IsInterrupted()) {
+            arm_interface.Run();
+            physical_core = &kernel.CurrentPhysicalCore();
+        }
+        system.ExitDynarmicProfile();
+        arm_interface.ClearExclusiveState();
+        auto& scheduler = kernel.CurrentScheduler();
+        scheduler.TryDoContextSwitch();
+    }
+}
+
+void CpuManager::MultiCoreRunIdleThread() {
+    auto& kernel = system.Kernel();
+    while (true) {
+        auto& physical_core = kernel.CurrentPhysicalCore();
+        physical_core.Idle();
+        auto& scheduler = kernel.CurrentScheduler();
+        scheduler.TryDoContextSwitch();
+    }
+}
+
+void CpuManager::MultiCoreRunSuspendThread() {
+    auto& kernel = system.Kernel();
+    {
+        auto& sched = kernel.CurrentScheduler();
+        sched.OnThreadStart();
+    }
+    while (true) {
+        auto core = kernel.GetCurrentHostThreadID();
+        auto& scheduler = kernel.CurrentScheduler();
+        Kernel::Thread* current_thread = scheduler.GetCurrentThread();
+        Common::Fiber::YieldTo(current_thread->GetHostContext(), core_data[core].host_context);
+        ASSERT(scheduler.ContextSwitchPending());
+        ASSERT(core == kernel.GetCurrentHostThreadID());
+        scheduler.TryDoContextSwitch();
+    }
+}
+
+void CpuManager::MultiCorePause(bool paused) { // pause/resume handshake with RunThread
+    if (!paused) {
+        bool all_not_barrier = false;
+        while (!all_not_barrier) { // busy-wait: every core initialized and not running guest code
+            all_not_barrier = true;
+            for (std::size_t core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
+                all_not_barrier &=
+                    !core_data[core].is_running.load() && core_data[core].initialized.load();
+            }
+        }
+        for (std::size_t core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
+            core_data[core].enter_barrier->Set(); // RunThread waits on this before running
+        }
+        if (paused_state.load()) { // only when the previous call left the cores paused
+            bool all_barrier = false;
+            while (!all_barrier) { // busy-wait: every core reports is_paused
+                all_barrier = true;
+                for (std::size_t core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
+                    all_barrier &=
+                        core_data[core].is_paused.load() && core_data[core].initialized.load();
+                }
+            }
+            for (std::size_t core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
+                core_data[core].exit_barrier->Set(); // RunThread waits on this after pausing
+            }
+        }
+    } else {
+        /// Wait until all cores are paused.
+        bool all_barrier = false;
+        while (!all_barrier) { // busy-wait, no sleep or yield inside the loop
+            all_barrier = true;
+            for (std::size_t core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
+                all_barrier &=
+                    core_data[core].is_paused.load() && core_data[core].initialized.load();
             }
         }
+        /// Don't release the barrier: exit_barrier is only set by a later Pause(false)
     }
+    paused_state = paused; // remembered so the next unpause knows to set exit_barrier
+}
+
+///////////////////////////////////////////////////////////////////////////////
+///                             SingleCore                                   ///
+///////////////////////////////////////////////////////////////////////////////
 
-    auto& core_timing = system.CoreTiming();
-    core_timing.ResetRun();
-    bool keep_running{};
-    do {
-        keep_running = false;
-        for (active_core = 0; active_core < NUM_CPU_CORES; ++active_core) {
-            core_timing.SwitchContext(active_core);
-            if (core_timing.CanCurrentContextRun()) {
-                core_managers[active_core]->RunLoop(tight_loop);
+void CpuManager::SingleCoreRunGuestThread() {
+    auto& kernel = system.Kernel();
+    {
+        auto& sched = kernel.CurrentScheduler();
+        sched.OnThreadStart();
+    }
+    SingleCoreRunGuestLoop();
+}
+
+void CpuManager::SingleCoreRunGuestLoop() {
+    auto& kernel = system.Kernel();
+    auto* thread = kernel.CurrentScheduler().GetCurrentThread();
+    while (true) {
+        auto* physical_core = &kernel.CurrentPhysicalCore();
+        auto& arm_interface = thread->ArmInterface();
+        system.EnterDynarmicProfile();
+        if (!physical_core->IsInterrupted()) {
+            arm_interface.Run();
+            physical_core = &kernel.CurrentPhysicalCore();
+        }
+        system.ExitDynarmicProfile();
+        thread->SetPhantomMode(true);
+        system.CoreTiming().Advance();
+        thread->SetPhantomMode(false);
+        arm_interface.ClearExclusiveState();
+        PreemptSingleCore();
+        auto& scheduler = kernel.Scheduler(current_core);
+        scheduler.TryDoContextSwitch();
+    }
+}
+
+void CpuManager::SingleCoreRunIdleThread() {
+    auto& kernel = system.Kernel();
+    while (true) {
+        auto& physical_core = kernel.CurrentPhysicalCore();
+        PreemptSingleCore(false);
+        system.CoreTiming().AddTicks(1000U);
+        idle_count++;
+        auto& scheduler = physical_core.Scheduler();
+        scheduler.TryDoContextSwitch();
+    }
+}
+
+void CpuManager::SingleCoreRunSuspendThread() {
+    auto& kernel = system.Kernel();
+    {
+        auto& sched = kernel.CurrentScheduler();
+        sched.OnThreadStart();
+    }
+    while (true) {
+        auto core = kernel.GetCurrentHostThreadID();
+        auto& scheduler = kernel.CurrentScheduler();
+        Kernel::Thread* current_thread = scheduler.GetCurrentThread();
+        Common::Fiber::YieldTo(current_thread->GetHostContext(), core_data[0].host_context);
+        ASSERT(scheduler.ContextSwitchPending());
+        ASSERT(core == kernel.GetCurrentHostThreadID());
+        scheduler.TryDoContextSwitch();
+    }
+}
+
+void CpuManager::PreemptSingleCore(bool from_running_enviroment) {
+    const std::size_t previous_core = current_core.load();
+    auto& outgoing_scheduler = system.Kernel().Scheduler(previous_core);
+    Kernel::Thread* const outgoing_thread = outgoing_scheduler.GetCurrentThread();
+    if (from_running_enviroment || idle_count >= 4) {
+        if (!from_running_enviroment) {
+            system.CoreTiming().Idle();
+            idle_count = 0;
+        }
+        outgoing_thread->SetPhantomMode(true);
+        system.CoreTiming().Advance();
+        outgoing_thread->SetPhantomMode(false);
+    }
+    current_core.store((current_core + 1) % Core::Hardware::NUM_CPU_CORES);
+    system.CoreTiming().ResetTicks();
+    outgoing_scheduler.Unload();
+    auto& incoming_scheduler = system.Kernel().Scheduler(current_core);
+    Common::Fiber::YieldTo(outgoing_thread->GetHostContext(), incoming_scheduler.ControlContext());
+    /// Resumed after the yield: look the scheduler up again, current_core may have changed
+    auto& resumed_scheduler = system.Kernel().Scheduler(current_core);
+    resumed_scheduler.Reload();
+    auto* resumed_thread = resumed_scheduler.GetCurrentThread();
+    if (!resumed_thread->IsIdleThread()) {
+        idle_count = 0;
+    }
+}
+
+void CpuManager::SingleCorePause(bool paused) {
+    if (!paused) {
+        bool all_not_barrier = false;
+        while (!all_not_barrier) {
+            all_not_barrier = !core_data[0].is_running.load() && core_data[0].initialized.load();
+        }
+        core_data[0].enter_barrier->Set();
+        if (paused_state.load()) {
+            bool all_barrier = false;
+            while (!all_barrier) {
+                all_barrier = core_data[0].is_paused.load() && core_data[0].initialized.load();
             }
-            keep_running |= core_timing.CanCurrentContextRun();
+            core_data[0].exit_barrier->Set();
         }
-    } while (keep_running);
+    } else {
+        /// Wait until all cores are paused.
+        bool all_barrier = false;
+        while (!all_barrier) {
+            all_barrier = core_data[0].is_paused.load() && core_data[0].initialized.load();
+        }
+        /// Don't release the barrier
+    }
+    paused_state = paused;
+}
+
+void CpuManager::Pause(bool paused) { // forwards to the handler for the configured mode
+    if (!is_multicore) {
+        SingleCorePause(paused);
+    } else {
+        MultiCorePause(paused);
+    }
+}
 
-    if (GDBStub::IsServerEnabled()) {
-        GDBStub::SetCpuStepFlag(false);
+void CpuManager::RunThread(std::size_t core) { // body of each host thread made in Initialize
+    /// Initialization
+    system.RegisterCoreThread(core);
+    std::string name;
+    if (is_multicore) {
+        name = "yuzu:CoreCPUThread_" + std::to_string(core);
+    } else {
+        name = "yuzu:CPUThread";
+    }
+    MicroProfileOnThreadCreate(name.c_str());
+    Common::SetCurrentThreadName(name.c_str());
+    Common::SetCurrentThreadPriority(Common::ThreadPriority::High);
+    auto& data = core_data[core];
+    data.enter_barrier = std::make_unique<Common::Event>();
+    data.exit_barrier = std::make_unique<Common::Event>();
+    data.host_context = Common::Fiber::ThreadToFiber();
+    data.is_running = false;
+    data.initialized = true; // the *Pause functions poll this before touching the barriers
+    const bool sc_sync = !is_async_gpu && !is_multicore; // single core with synchronous GPU
+    bool sc_sync_first_use = sc_sync;
+    /// Running
+    while (running_mode) { // cleared by Shutdown()
+        data.is_running = false;
+        data.enter_barrier->Wait(); // set by Pause(false)
+        if (sc_sync_first_use) { // done once, on the first pass through the loop
+            system.GPU().ObtainContext();
+            sc_sync_first_use = false;
+        }
+        auto& scheduler = system.Kernel().CurrentScheduler();
+        Kernel::Thread* current_thread = scheduler.GetCurrentThread();
+        data.is_running = true;
+        Common::Fiber::YieldTo(data.host_context, current_thread->GetHostContext());
+        data.is_running = false; // control came back to the host fiber
+        data.is_paused = true; // Pause() polls this flag
+        data.exit_barrier->Wait(); // set by Pause(false) when the previous state was paused
+        data.is_paused = false;
     }
+    /// Time to cleanup
+    data.host_context->Exit();
+    data.enter_barrier.reset();
+    data.exit_barrier.reset();
+    data.initialized = false;
 }
 
 } // namespace Core
diff --git a/src/core/cpu_manager.h b/src/core/cpu_manager.h
index 97554d1bb..35929ed94 100644
--- a/src/core/cpu_manager.h
+++ b/src/core/cpu_manager.h
@@ -5,12 +5,19 @@
 #pragma once
 
 #include <array>
+#include <atomic>
+#include <functional>
 #include <memory>
+#include <thread>
 #include "core/hardware_properties.h"
 
+namespace Common {
+class Event;
+class Fiber;
+} // namespace Common
+
 namespace Core {
 
-class CoreManager;
 class System;
 
 class CpuManager {
@@ -24,24 +31,75 @@ public:
     CpuManager& operator=(const CpuManager&) = delete;
     CpuManager& operator=(CpuManager&&) = delete;
 
+    /// Sets if emulation is multicore or single core, must be set before Initialize
+    void SetMulticore(bool is_multicore) {
+        this->is_multicore = is_multicore;
+    }
+
+    /// Sets if emulation is using an asynchronous GPU.
+    void SetAsyncGpu(bool is_async_gpu) {
+        this->is_async_gpu = is_async_gpu;
+    }
+
     void Initialize();
     void Shutdown();
 
-    CoreManager& GetCoreManager(std::size_t index);
-    const CoreManager& GetCoreManager(std::size_t index) const;
+    void Pause(bool paused);
 
-    CoreManager& GetCurrentCoreManager();
-    const CoreManager& GetCurrentCoreManager() const;
+    std::function<void(void*)> GetGuestThreadStartFunc();
+    std::function<void(void*)> GetIdleThreadStartFunc();
+    std::function<void(void*)> GetSuspendThreadStartFunc();
+    void* GetStartFuncParamater();
 
-    std::size_t GetActiveCoreIndex() const {
-        return active_core;
-    }
+    void PreemptSingleCore(bool from_running_enviroment = true);
 
-    void RunLoop(bool tight_loop);
+    std::size_t CurrentCore() const {
+        return current_core.load();
+    }
 
 private:
-    std::array<std::unique_ptr<CoreManager>, Hardware::NUM_CPU_CORES> core_managers;
-    std::size_t active_core{}; ///< Active core, only used in single thread mode
+    static void GuestThreadFunction(void* cpu_manager);
+    static void GuestRewindFunction(void* cpu_manager);
+    static void IdleThreadFunction(void* cpu_manager);
+    static void SuspendThreadFunction(void* cpu_manager);
+
+    void MultiCoreRunGuestThread();
+    void MultiCoreRunGuestLoop();
+    void MultiCoreRunIdleThread();
+    void MultiCoreRunSuspendThread();
+    void MultiCorePause(bool paused);
+
+    void SingleCoreRunGuestThread();
+    void SingleCoreRunGuestLoop();
+    void SingleCoreRunIdleThread();
+    void SingleCoreRunSuspendThread();
+    void SingleCorePause(bool paused);
+
+    static void ThreadStart(CpuManager& cpu_manager, std::size_t core);
+
+    void RunThread(std::size_t core);
+
+    struct CoreData { // per-host-thread state; the flags are polled by the *Pause functions
+        std::shared_ptr<Common::Fiber> host_context;  // assigned in RunThread via ThreadToFiber
+        std::unique_ptr<Common::Event> enter_barrier; // RunThread waits on it before running
+        std::unique_ptr<Common::Event> exit_barrier;  // RunThread waits on it while paused
+        std::atomic<bool> is_running{};  // explicit false; may be read before RunThread writes
+        std::atomic<bool> is_paused{};   // explicit false; may be read before RunThread writes
+        std::atomic<bool> initialized{}; // explicit false until RunThread finishes its setup
+        std::unique_ptr<std::thread> host_thread; // created in Initialize, joined in Shutdown
+    };
+
+    std::atomic<bool> running_mode{};
+    std::atomic<bool> paused_state{};
+
+    std::array<CoreData, Core::Hardware::NUM_CPU_CORES> core_data{};
+
+    bool is_async_gpu{};
+    bool is_multicore{};
+    std::atomic<std::size_t> current_core{};
+    std::size_t preemption_count{};
+    std::size_t idle_count{};
+    static constexpr std::size_t max_cycle_runs = 5;
 
     System& system;
 };
diff --git a/src/core/crypto/key_manager.cpp b/src/core/crypto/key_manager.cpp
index 8997c7082..f87fe0abc 100644
--- a/src/core/crypto/key_manager.cpp
+++ b/src/core/crypto/key_manager.cpp
@@ -695,8 +695,9 @@ void KeyManager::WriteKeyToFile(KeyCategory category, std::string_view keyname,
 }
 
 void KeyManager::SetKey(S128KeyType id, Key128 key, u64 field1, u64 field2) {
-    if (s128_keys.find({id, field1, field2}) != s128_keys.end())
+    if (s128_keys.find({id, field1, field2}) != s128_keys.end() || key == Key128{}) {
         return;
+    }
     if (id == S128KeyType::Titlekey) {
         Key128 rights_id;
         std::memcpy(rights_id.data(), &field2, sizeof(u64));
@@ -716,8 +717,9 @@ void KeyManager::SetKey(S128KeyType id, Key128 key, u64 field1, u64 field2) {
             return std::tie(elem.second.type, elem.second.field1, elem.second.field2) ==
                    std::tie(id, field1, field2);
         });
-    if (iter2 != s128_file_id.end())
+    if (iter2 != s128_file_id.end()) {
         WriteKeyToFile(category, iter2->first, key);
+    }
 
     // Variable cases
     if (id == S128KeyType::KeyArea) {
@@ -745,16 +747,18 @@ void KeyManager::SetKey(S128KeyType id, Key128 key, u64 field1, u64 field2) {
 }
 
 void KeyManager::SetKey(S256KeyType id, Key256 key, u64 field1, u64 field2) {
-    if (s256_keys.find({id, field1, field2}) != s256_keys.end())
+    if (s256_keys.find({id, field1, field2}) != s256_keys.end() || key == Key256{}) {
         return;
+    }
     const auto iter = std::find_if(
         s256_file_id.begin(), s256_file_id.end(),
         [&id, &field1, &field2](const std::pair<std::string, KeyIndex<S256KeyType>> elem) {
             return std::tie(elem.second.type, elem.second.field1, elem.second.field2) ==
                    std::tie(id, field1, field2);
         });
-    if (iter != s256_file_id.end())
+    if (iter != s256_file_id.end()) {
         WriteKeyToFile(KeyCategory::Standard, iter->first, key);
+    }
     s256_keys[{id, field1, field2}] = key;
 }
 
diff --git a/src/core/crypto/key_manager.h b/src/core/crypto/key_manager.h
index 7265c4171..9269a73f2 100644
--- a/src/core/crypto/key_manager.h
+++ b/src/core/crypto/key_manager.h
@@ -223,7 +223,16 @@ bool operator<(const KeyIndex<KeyType>& lhs, const KeyIndex<KeyType>& rhs) {
 
 class KeyManager {
 public:
-    KeyManager();
+    static KeyManager& Instance() {
+        static KeyManager instance;
+        return instance;
+    }
+
+    KeyManager(const KeyManager&) = delete;
+    KeyManager& operator=(const KeyManager&) = delete;
+
+    KeyManager(KeyManager&&) = delete;
+    KeyManager& operator=(KeyManager&&) = delete;
 
     bool HasKey(S128KeyType id, u64 field1 = 0, u64 field2 = 0) const;
     bool HasKey(S256KeyType id, u64 field1 = 0, u64 field2 = 0) const;
@@ -257,6 +266,8 @@ public:
     bool AddTicketPersonalized(Ticket raw);
 
 private:
+    KeyManager();
+
     std::map<KeyIndex<S128KeyType>, Key128> s128_keys;
     std::map<KeyIndex<S256KeyType>, Key256> s256_keys;
 
diff --git a/src/core/file_sys/bis_factory.cpp b/src/core/file_sys/bis_factory.cpp
index 0af44f340..8935a62c3 100644
--- a/src/core/file_sys/bis_factory.cpp
+++ b/src/core/file_sys/bis_factory.cpp
@@ -79,7 +79,7 @@ VirtualDir BISFactory::OpenPartition(BisPartitionId id) const {
 }
 
 VirtualFile BISFactory::OpenPartitionStorage(BisPartitionId id) const {
-    Core::Crypto::KeyManager keys;
+    auto& keys = Core::Crypto::KeyManager::Instance();
     Core::Crypto::PartitionDataManager pdm{
         Core::System::GetInstance().GetFilesystem()->OpenDirectory(
             FileUtil::GetUserPath(FileUtil::UserPath::SysDataDir), Mode::Read)};
diff --git a/src/core/file_sys/card_image.cpp b/src/core/file_sys/card_image.cpp
index 07d0c8d5d..664a47e7f 100644
--- a/src/core/file_sys/card_image.cpp
+++ b/src/core/file_sys/card_image.cpp
@@ -178,7 +178,7 @@ u32 XCI::GetSystemUpdateVersion() {
         return 0;
 
     for (const auto& file : update->GetFiles()) {
-        NCA nca{file, nullptr, 0, keys};
+        NCA nca{file, nullptr, 0};
 
         if (nca.GetStatus() != Loader::ResultStatus::Success)
             continue;
@@ -286,7 +286,7 @@ Loader::ResultStatus XCI::AddNCAFromPartition(XCIPartition part) {
             continue;
         }
 
-        auto nca = std::make_shared<NCA>(file, nullptr, 0, keys);
+        auto nca = std::make_shared<NCA>(file, nullptr, 0);
         if (nca->IsUpdate()) {
             continue;
         }
diff --git a/src/core/file_sys/card_image.h b/src/core/file_sys/card_image.h
index c2ee0ea99..e1b136426 100644
--- a/src/core/file_sys/card_image.h
+++ b/src/core/file_sys/card_image.h
@@ -140,6 +140,6 @@ private:
 
     u64 update_normal_partition_end;
 
-    Core::Crypto::KeyManager keys;
+    Core::Crypto::KeyManager& keys = Core::Crypto::KeyManager::Instance();
 };
 } // namespace FileSys
diff --git a/src/core/file_sys/content_archive.cpp b/src/core/file_sys/content_archive.cpp
index b8bbdd1ef..473245d5a 100644
--- a/src/core/file_sys/content_archive.cpp
+++ b/src/core/file_sys/content_archive.cpp
@@ -118,9 +118,8 @@ static bool IsValidNCA(const NCAHeader& header) {
     return header.magic == Common::MakeMagic('N', 'C', 'A', '3');
 }
 
-NCA::NCA(VirtualFile file_, VirtualFile bktr_base_romfs_, u64 bktr_base_ivfc_offset,
-         Core::Crypto::KeyManager keys_)
-    : file(std::move(file_)), bktr_base_romfs(std::move(bktr_base_romfs_)), keys(std::move(keys_)) {
+NCA::NCA(VirtualFile file_, VirtualFile bktr_base_romfs_, u64 bktr_base_ivfc_offset)
+    : file(std::move(file_)), bktr_base_romfs(std::move(bktr_base_romfs_)) {
     if (file == nullptr) {
         status = Loader::ResultStatus::ErrorNullFile;
         return;
diff --git a/src/core/file_sys/content_archive.h b/src/core/file_sys/content_archive.h
index e249079b5..d25cbcf91 100644
--- a/src/core/file_sys/content_archive.h
+++ b/src/core/file_sys/content_archive.h
@@ -99,8 +99,7 @@ inline bool IsDirectoryLogoPartition(const VirtualDir& pfs) {
 class NCA : public ReadOnlyVfsDirectory {
 public:
     explicit NCA(VirtualFile file, VirtualFile bktr_base_romfs = nullptr,
-                 u64 bktr_base_ivfc_offset = 0,
-                 Core::Crypto::KeyManager keys = Core::Crypto::KeyManager());
+                 u64 bktr_base_ivfc_offset = 0);
     ~NCA() override;
 
     Loader::ResultStatus GetStatus() const;
@@ -159,7 +158,7 @@ private:
     bool encrypted = false;
     bool is_update = false;
 
-    Core::Crypto::KeyManager keys;
+    Core::Crypto::KeyManager& keys = Core::Crypto::KeyManager::Instance();
 };
 
 } // namespace FileSys
diff --git a/src/core/file_sys/registered_cache.cpp b/src/core/file_sys/registered_cache.cpp
index ba5f76288..27c1b0233 100644
--- a/src/core/file_sys/registered_cache.cpp
+++ b/src/core/file_sys/registered_cache.cpp
@@ -408,7 +408,7 @@ void RegisteredCache::ProcessFiles(const std::vector<NcaID>& ids) {
 
         if (file == nullptr)
             continue;
-        const auto nca = std::make_shared<NCA>(parser(file, id), nullptr, 0, keys);
+        const auto nca = std::make_shared<NCA>(parser(file, id), nullptr, 0);
         if (nca->GetStatus() != Loader::ResultStatus::Success ||
             nca->GetType() != NCAContentType::Meta) {
             continue;
@@ -486,7 +486,7 @@ std::unique_ptr<NCA> RegisteredCache::GetEntry(u64 title_id, ContentRecordType t
     const auto raw = GetEntryRaw(title_id, type);
     if (raw == nullptr)
         return nullptr;
-    return std::make_unique<NCA>(raw, nullptr, 0, keys);
+    return std::make_unique<NCA>(raw, nullptr, 0);
 }
 
 template <typename T>
@@ -865,7 +865,7 @@ std::unique_ptr<NCA> ManualContentProvider::GetEntry(u64 title_id, ContentRecord
     const auto res = GetEntryRaw(title_id, type);
     if (res == nullptr)
         return nullptr;
-    return std::make_unique<NCA>(res, nullptr, 0, keys);
+    return std::make_unique<NCA>(res, nullptr, 0);
 }
 
 std::vector<ContentProviderEntry> ManualContentProvider::ListEntriesFilter(
diff --git a/src/core/file_sys/registered_cache.h b/src/core/file_sys/registered_cache.h
index d1eec240e..f339cd17b 100644
--- a/src/core/file_sys/registered_cache.h
+++ b/src/core/file_sys/registered_cache.h
@@ -88,7 +88,7 @@ public:
 
 protected:
     // A single instance of KeyManager to be used by GetEntry()
-    Core::Crypto::KeyManager keys;
+    Core::Crypto::KeyManager& keys = Core::Crypto::KeyManager::Instance();
 };
 
 class PlaceholderCache {
diff --git a/src/core/file_sys/submission_package.cpp b/src/core/file_sys/submission_package.cpp
index ef3084681..175a8266a 100644
--- a/src/core/file_sys/submission_package.cpp
+++ b/src/core/file_sys/submission_package.cpp
@@ -21,7 +21,7 @@
 namespace FileSys {
 namespace {
 void SetTicketKeys(const std::vector<VirtualFile>& files) {
-    Core::Crypto::KeyManager keys;
+    auto& keys = Core::Crypto::KeyManager::Instance();
 
     for (const auto& ticket_file : files) {
         if (ticket_file == nullptr) {
@@ -285,7 +285,7 @@ void NSP::ReadNCAs(const std::vector<VirtualFile>& files) {
                     continue;
                 }
 
-                auto next_nca = std::make_shared<NCA>(std::move(next_file), nullptr, 0, keys);
+                auto next_nca = std::make_shared<NCA>(std::move(next_file), nullptr, 0);
                 if (next_nca->GetType() == NCAContentType::Program) {
                     program_status[cnmt.GetTitleID()] = next_nca->GetStatus();
                 }
diff --git a/src/core/file_sys/submission_package.h b/src/core/file_sys/submission_package.h
index ee9b6ce17..cf89de6a9 100644
--- a/src/core/file_sys/submission_package.h
+++ b/src/core/file_sys/submission_package.h
@@ -73,7 +73,7 @@ private:
     std::map<u64, std::map<std::pair<TitleType, ContentRecordType>, std::shared_ptr<NCA>>> ncas;
     std::vector<VirtualFile> ticket_files;
 
-    Core::Crypto::KeyManager keys;
+    Core::Crypto::KeyManager& keys = Core::Crypto::KeyManager::Instance();
 
     VirtualFile romfs;
     VirtualDir exefs;
diff --git a/src/core/file_sys/system_archive/mii_model.cpp b/src/core/file_sys/system_archive/mii_model.cpp
index 6a9add87c..61bb67945 100644
--- a/src/core/file_sys/system_archive/mii_model.cpp
+++ b/src/core/file_sys/system_archive/mii_model.cpp
@@ -40,7 +40,7 @@ VirtualDir MiiModel() {
     out->AddFile(std::make_shared<ArrayVfsFile<MiiModelData::SHAPE_MID.size()>>(
         MiiModelData::SHAPE_MID, "ShapeMid.dat"));
 
-    return std::move(out);
+    return out;
 }
 
 } // namespace FileSys::SystemArchive
diff --git a/src/core/file_sys/system_archive/shared_font.cpp b/src/core/file_sys/system_archive/shared_font.cpp
index 2c05eb42e..c5cdf7d9b 100644
--- a/src/core/file_sys/system_archive/shared_font.cpp
+++ b/src/core/file_sys/system_archive/shared_font.cpp
@@ -23,7 +23,7 @@ VirtualFile PackBFTTF(const std::array<u8, Size>& data, const std::string& name)
 
     std::vector<u8> bfttf(Size + sizeof(u64));
 
-    u64 offset = 0;
+    size_t offset = 0;
     Service::NS::EncryptSharedFont(vec, bfttf, offset);
     return std::make_shared<VectorVfsFile>(std::move(bfttf), name);
 }
diff --git a/src/core/file_sys/xts_archive.h b/src/core/file_sys/xts_archive.h
index 7704dee90..563531bb6 100644
--- a/src/core/file_sys/xts_archive.h
+++ b/src/core/file_sys/xts_archive.h
@@ -62,6 +62,6 @@ private:
 
     VirtualFile dec_file;
 
-    Core::Crypto::KeyManager keys;
+    Core::Crypto::KeyManager& keys = Core::Crypto::KeyManager::Instance();
 };
 } // namespace FileSys
diff --git a/src/core/frontend/framebuffer_layout.cpp b/src/core/frontend/framebuffer_layout.cpp
index d0c43447c..c1fbc235b 100644
--- a/src/core/frontend/framebuffer_layout.cpp
+++ b/src/core/frontend/framebuffer_layout.cpp
@@ -29,7 +29,7 @@ FramebufferLayout DefaultFrameLayout(u32 width, u32 height) {
 
     const float window_aspect_ratio = static_cast<float>(height) / width;
     const float emulation_aspect_ratio = EmulationAspectRatio(
-        static_cast<AspectRatio>(Settings::values.aspect_ratio), window_aspect_ratio);
+        static_cast<AspectRatio>(Settings::values.aspect_ratio.GetValue()), window_aspect_ratio);
 
     const Common::Rectangle<u32> screen_window_area{0, 0, width, height};
     Common::Rectangle<u32> screen = MaxRectangle(screen_window_area, emulation_aspect_ratio);
diff --git a/src/core/gdbstub/gdbstub.cpp b/src/core/gdbstub/gdbstub.cpp
index 70c0f8b80..79f22a403 100644
--- a/src/core/gdbstub/gdbstub.cpp
+++ b/src/core/gdbstub/gdbstub.cpp
@@ -35,7 +35,6 @@
 #include "common/swap.h"
 #include "core/arm/arm_interface.h"
 #include "core/core.h"
-#include "core/core_manager.h"
 #include "core/gdbstub/gdbstub.h"
 #include "core/hle/kernel/memory/page_table.h"
 #include "core/hle/kernel/process.h"
diff --git a/src/core/hardware_properties.h b/src/core/hardware_properties.h
index b04e046ed..456b41e1b 100644
--- a/src/core/hardware_properties.h
+++ b/src/core/hardware_properties.h
@@ -42,6 +42,10 @@ struct EmuThreadHandle {
         constexpr u32 invalid_handle = 0xFFFFFFFF;
         return {invalid_handle, invalid_handle};
     }
+
+    bool IsInvalid() const {
+        return (*this) == InvalidHandle();
+    }
 };
 
 } // namespace Core
diff --git a/src/core/hle/kernel/address_arbiter.cpp b/src/core/hle/kernel/address_arbiter.cpp
index 8475b698c..4d2a9b35d 100644
--- a/src/core/hle/kernel/address_arbiter.cpp
+++ b/src/core/hle/kernel/address_arbiter.cpp
@@ -7,11 +7,15 @@
 
 #include "common/assert.h"
 #include "common/common_types.h"
+#include "core/arm/exclusive_monitor.h"
 #include "core/core.h"
 #include "core/hle/kernel/address_arbiter.h"
 #include "core/hle/kernel/errors.h"
+#include "core/hle/kernel/handle_table.h"
+#include "core/hle/kernel/kernel.h"
 #include "core/hle/kernel/scheduler.h"
 #include "core/hle/kernel/thread.h"
+#include "core/hle/kernel/time_manager.h"
 #include "core/hle/result.h"
 #include "core/memory.h"
 
@@ -20,6 +24,7 @@ namespace Kernel {
 // Wake up num_to_wake (or all) threads in a vector.
 void AddressArbiter::WakeThreads(const std::vector<std::shared_ptr<Thread>>& waiting_threads,
                                  s32 num_to_wake) {
+    auto& time_manager = system.Kernel().TimeManager();
     // Only process up to 'target' threads, unless 'target' is <= 0, in which case process
     // them all.
     std::size_t last = waiting_threads.size();
@@ -29,12 +34,10 @@ void AddressArbiter::WakeThreads(const std::vector<std::shared_ptr<Thread>>& wai
 
     // Signal the waiting threads.
     for (std::size_t i = 0; i < last; i++) {
-        ASSERT(waiting_threads[i]->GetStatus() == ThreadStatus::WaitArb);
-        waiting_threads[i]->SetWaitSynchronizationResult(RESULT_SUCCESS);
+        waiting_threads[i]->SetSynchronizationResults(nullptr, RESULT_SUCCESS);
         RemoveThread(waiting_threads[i]);
-        waiting_threads[i]->SetArbiterWaitAddress(0);
+        waiting_threads[i]->WaitForArbitration(false);
         waiting_threads[i]->ResumeFromWait();
-        system.PrepareReschedule(waiting_threads[i]->GetProcessorID());
     }
 }
 
@@ -56,6 +59,7 @@ ResultCode AddressArbiter::SignalToAddress(VAddr address, SignalType type, s32 v
 }
 
 ResultCode AddressArbiter::SignalToAddressOnly(VAddr address, s32 num_to_wake) {
+    SchedulerLock lock(system.Kernel());
     const std::vector<std::shared_ptr<Thread>> waiting_threads =
         GetThreadsWaitingOnAddress(address);
     WakeThreads(waiting_threads, num_to_wake);
@@ -64,6 +68,7 @@ ResultCode AddressArbiter::SignalToAddressOnly(VAddr address, s32 num_to_wake) {
 
 ResultCode AddressArbiter::IncrementAndSignalToAddressIfEqual(VAddr address, s32 value,
                                                               s32 num_to_wake) {
+    SchedulerLock lock(system.Kernel());
     auto& memory = system.Memory();
 
     // Ensure that we can write to the address.
@@ -71,16 +76,24 @@ ResultCode AddressArbiter::IncrementAndSignalToAddressIfEqual(VAddr address, s32
         return ERR_INVALID_ADDRESS_STATE;
     }
 
-    if (static_cast<s32>(memory.Read32(address)) != value) {
-        return ERR_INVALID_STATE;
-    }
+    const std::size_t current_core = system.CurrentCoreIndex();
+    auto& monitor = system.Monitor();
+    u32 current_value;
+    do {
+        current_value = monitor.ExclusiveRead32(current_core, address);
+
+        if (current_value != value) {
+            return ERR_INVALID_STATE;
+        }
+        current_value++;
+    } while (!monitor.ExclusiveWrite32(current_core, address, current_value));
 
-    memory.Write32(address, static_cast<u32>(value + 1));
     return SignalToAddressOnly(address, num_to_wake);
 }
 
 ResultCode AddressArbiter::ModifyByWaitingCountAndSignalToAddressIfEqual(VAddr address, s32 value,
                                                                          s32 num_to_wake) {
+    SchedulerLock lock(system.Kernel());
     auto& memory = system.Memory();
 
     // Ensure that we can write to the address.
@@ -92,29 +105,33 @@ ResultCode AddressArbiter::ModifyByWaitingCountAndSignalToAddressIfEqual(VAddr a
     const std::vector<std::shared_ptr<Thread>> waiting_threads =
         GetThreadsWaitingOnAddress(address);
 
-    // Determine the modified value depending on the waiting count.
+    const std::size_t current_core = system.CurrentCoreIndex();
+    auto& monitor = system.Monitor();
     s32 updated_value;
-    if (num_to_wake <= 0) {
-        if (waiting_threads.empty()) {
-            updated_value = value + 1;
-        } else {
-            updated_value = value - 1;
+    do {
+        updated_value = monitor.ExclusiveRead32(current_core, address);
+
+        if (updated_value != value) {
+            return ERR_INVALID_STATE;
         }
-    } else {
-        if (waiting_threads.empty()) {
-            updated_value = value + 1;
-        } else if (waiting_threads.size() <= static_cast<u32>(num_to_wake)) {
-            updated_value = value - 1;
+        // Determine the modified value depending on the waiting count.
+        if (num_to_wake <= 0) {
+            if (waiting_threads.empty()) {
+                updated_value = value + 1;
+            } else {
+                updated_value = value - 1;
+            }
         } else {
-            updated_value = value;
+            if (waiting_threads.empty()) {
+                updated_value = value + 1;
+            } else if (waiting_threads.size() <= static_cast<u32>(num_to_wake)) {
+                updated_value = value - 1;
+            } else {
+                updated_value = value;
+            }
         }
-    }
+    } while (!monitor.ExclusiveWrite32(current_core, address, updated_value));
 
-    if (static_cast<s32>(memory.Read32(address)) != value) {
-        return ERR_INVALID_STATE;
-    }
-
-    memory.Write32(address, static_cast<u32>(updated_value));
     WakeThreads(waiting_threads, num_to_wake);
     return RESULT_SUCCESS;
 }
@@ -136,60 +153,127 @@ ResultCode AddressArbiter::WaitForAddress(VAddr address, ArbitrationType type, s
 ResultCode AddressArbiter::WaitForAddressIfLessThan(VAddr address, s32 value, s64 timeout,
                                                     bool should_decrement) {
     auto& memory = system.Memory();
+    auto& kernel = system.Kernel();
+    Thread* current_thread = system.CurrentScheduler().GetCurrentThread();
 
-    // Ensure that we can read the address.
-    if (!memory.IsValidVirtualAddress(address)) {
-        return ERR_INVALID_ADDRESS_STATE;
-    }
+    Handle event_handle = InvalidHandle;
+    {
+        SchedulerLockAndSleep lock(kernel, event_handle, current_thread, timeout);
+
+        if (current_thread->IsPendingTermination()) {
+            lock.CancelSleep();
+            return ERR_THREAD_TERMINATING;
+        }
+
+        // Ensure that we can read the address.
+        if (!memory.IsValidVirtualAddress(address)) {
+            lock.CancelSleep();
+            return ERR_INVALID_ADDRESS_STATE;
+        }
+
+        s32 current_value = static_cast<s32>(memory.Read32(address));
+        if (current_value >= value) {
+            lock.CancelSleep();
+            return ERR_INVALID_STATE;
+        }
+
+        current_thread->SetSynchronizationResults(nullptr, RESULT_TIMEOUT);
+
+        s32 decrement_value;
+
+        const std::size_t current_core = system.CurrentCoreIndex();
+        auto& monitor = system.Monitor();
+        do {
+            current_value = static_cast<s32>(monitor.ExclusiveRead32(current_core, address));
+            if (should_decrement) {
+                decrement_value = current_value - 1;
+            } else {
+                decrement_value = current_value;
+            }
+        } while (
+            !monitor.ExclusiveWrite32(current_core, address, static_cast<u32>(decrement_value)));
+
+        // Short-circuit without rescheduling, if timeout is zero.
+        if (timeout == 0) {
+            lock.CancelSleep();
+            return RESULT_TIMEOUT;
+        }
 
-    const s32 cur_value = static_cast<s32>(memory.Read32(address));
-    if (cur_value >= value) {
-        return ERR_INVALID_STATE;
+        current_thread->SetArbiterWaitAddress(address);
+        InsertThread(SharedFrom(current_thread));
+        current_thread->SetStatus(ThreadStatus::WaitArb);
+        current_thread->WaitForArbitration(true);
     }
 
-    if (should_decrement) {
-        memory.Write32(address, static_cast<u32>(cur_value - 1));
+    if (event_handle != InvalidHandle) {
+        auto& time_manager = kernel.TimeManager();
+        time_manager.UnscheduleTimeEvent(event_handle);
     }
 
-    // Short-circuit without rescheduling, if timeout is zero.
-    if (timeout == 0) {
-        return RESULT_TIMEOUT;
+    {
+        SchedulerLock lock(kernel);
+        if (current_thread->IsWaitingForArbitration()) {
+            RemoveThread(SharedFrom(current_thread));
+            current_thread->WaitForArbitration(false);
+        }
     }
 
-    return WaitForAddressImpl(address, timeout);
+    return current_thread->GetSignalingResult();
 }
 
 ResultCode AddressArbiter::WaitForAddressIfEqual(VAddr address, s32 value, s64 timeout) {
     auto& memory = system.Memory();
+    auto& kernel = system.Kernel();
+    Thread* current_thread = system.CurrentScheduler().GetCurrentThread();
 
-    // Ensure that we can read the address.
-    if (!memory.IsValidVirtualAddress(address)) {
-        return ERR_INVALID_ADDRESS_STATE;
-    }
+    Handle event_handle = InvalidHandle;
+    {
+        SchedulerLockAndSleep lock(kernel, event_handle, current_thread, timeout);
+
+        if (current_thread->IsPendingTermination()) {
+            lock.CancelSleep();
+            return ERR_THREAD_TERMINATING;
+        }
+
+        // Ensure that we can read the address.
+        if (!memory.IsValidVirtualAddress(address)) {
+            lock.CancelSleep();
+            return ERR_INVALID_ADDRESS_STATE;
+        }
 
-    // Only wait for the address if equal.
-    if (static_cast<s32>(memory.Read32(address)) != value) {
-        return ERR_INVALID_STATE;
+        s32 current_value = static_cast<s32>(memory.Read32(address));
+        if (current_value != value) {
+            lock.CancelSleep();
+            return ERR_INVALID_STATE;
+        }
+
+        // Short-circuit without rescheduling, if timeout is zero.
+        if (timeout == 0) {
+            lock.CancelSleep();
+            return RESULT_TIMEOUT;
+        }
+
+        current_thread->SetSynchronizationResults(nullptr, RESULT_TIMEOUT);
+        current_thread->SetArbiterWaitAddress(address);
+        InsertThread(SharedFrom(current_thread));
+        current_thread->SetStatus(ThreadStatus::WaitArb);
+        current_thread->WaitForArbitration(true);
     }
 
-    // Short-circuit without rescheduling if timeout is zero.
-    if (timeout == 0) {
-        return RESULT_TIMEOUT;
+    if (event_handle != InvalidHandle) {
+        auto& time_manager = kernel.TimeManager();
+        time_manager.UnscheduleTimeEvent(event_handle);
     }
 
-    return WaitForAddressImpl(address, timeout);
-}
+    {
+        SchedulerLock lock(kernel);
+        if (current_thread->IsWaitingForArbitration()) {
+            RemoveThread(SharedFrom(current_thread));
+            current_thread->WaitForArbitration(false);
+        }
+    }
 
-ResultCode AddressArbiter::WaitForAddressImpl(VAddr address, s64 timeout) {
-    Thread* current_thread = system.CurrentScheduler().GetCurrentThread();
-    current_thread->SetArbiterWaitAddress(address);
-    InsertThread(SharedFrom(current_thread));
-    current_thread->SetStatus(ThreadStatus::WaitArb);
-    current_thread->InvalidateWakeupCallback();
-    current_thread->WakeAfterDelay(timeout);
-
-    system.PrepareReschedule(current_thread->GetProcessorID());
-    return RESULT_TIMEOUT;
+    return current_thread->GetSignalingResult();
 }
 
 void AddressArbiter::HandleWakeupThread(std::shared_ptr<Thread> thread) {
@@ -221,9 +305,9 @@ void AddressArbiter::RemoveThread(std::shared_ptr<Thread> thread) {
     const auto iter = std::find_if(thread_list.cbegin(), thread_list.cend(),
                                    [&thread](const auto& entry) { return thread == entry; });
 
-    ASSERT(iter != thread_list.cend());
-
-    thread_list.erase(iter);
+    if (iter != thread_list.cend()) {
+        thread_list.erase(iter);
+    }
 }
 
 std::vector<std::shared_ptr<Thread>> AddressArbiter::GetThreadsWaitingOnAddress(
diff --git a/src/core/hle/kernel/address_arbiter.h b/src/core/hle/kernel/address_arbiter.h
index f958eee5a..0b05d533c 100644
--- a/src/core/hle/kernel/address_arbiter.h
+++ b/src/core/hle/kernel/address_arbiter.h
@@ -73,9 +73,6 @@ private:
     /// Waits on an address if the value passed is equal to the argument value.
     ResultCode WaitForAddressIfEqual(VAddr address, s32 value, s64 timeout);
 
-    // Waits on the given address with a timeout in nanoseconds
-    ResultCode WaitForAddressImpl(VAddr address, s64 timeout);
-
     /// Wake up num_to_wake (or all) threads in a vector.
     void WakeThreads(const std::vector<std::shared_ptr<Thread>>& waiting_threads, s32 num_to_wake);
 
diff --git a/src/core/hle/kernel/client_port.cpp b/src/core/hle/kernel/client_port.cpp
index 5498fd313..8aff2227a 100644
--- a/src/core/hle/kernel/client_port.cpp
+++ b/src/core/hle/kernel/client_port.cpp
@@ -34,7 +34,7 @@ ResultVal<std::shared_ptr<ClientSession>> ClientPort::Connect() {
     }
 
     // Wake the threads waiting on the ServerPort
-    server_port->WakeupAllWaitingThreads();
+    server_port->Signal();
 
     return MakeResult(std::move(client));
 }
diff --git a/src/core/hle/kernel/errors.h b/src/core/hle/kernel/errors.h
index 29bfa3621..d4e5d88cf 100644
--- a/src/core/hle/kernel/errors.h
+++ b/src/core/hle/kernel/errors.h
@@ -12,6 +12,7 @@ namespace Kernel {
 
 constexpr ResultCode ERR_MAX_CONNECTIONS_REACHED{ErrorModule::Kernel, 7};
 constexpr ResultCode ERR_INVALID_CAPABILITY_DESCRIPTOR{ErrorModule::Kernel, 14};
+constexpr ResultCode ERR_THREAD_TERMINATING{ErrorModule::Kernel, 59};
 constexpr ResultCode ERR_INVALID_SIZE{ErrorModule::Kernel, 101};
 constexpr ResultCode ERR_INVALID_ADDRESS{ErrorModule::Kernel, 102};
 constexpr ResultCode ERR_OUT_OF_RESOURCES{ErrorModule::Kernel, 103};
diff --git a/src/core/hle/kernel/hle_ipc.cpp b/src/core/hle/kernel/hle_ipc.cpp
index ba0eac4c2..9277b5d08 100644
--- a/src/core/hle/kernel/hle_ipc.cpp
+++ b/src/core/hle/kernel/hle_ipc.cpp
@@ -14,14 +14,17 @@
 #include "common/common_types.h"
 #include "common/logging/log.h"
 #include "core/hle/ipc_helpers.h"
+#include "core/hle/kernel/errors.h"
 #include "core/hle/kernel/handle_table.h"
 #include "core/hle/kernel/hle_ipc.h"
 #include "core/hle/kernel/kernel.h"
 #include "core/hle/kernel/object.h"
 #include "core/hle/kernel/process.h"
 #include "core/hle/kernel/readable_event.h"
+#include "core/hle/kernel/scheduler.h"
 #include "core/hle/kernel/server_session.h"
 #include "core/hle/kernel/thread.h"
+#include "core/hle/kernel/time_manager.h"
 #include "core/hle/kernel/writable_event.h"
 #include "core/memory.h"
 
@@ -46,15 +49,6 @@ std::shared_ptr<WritableEvent> HLERequestContext::SleepClientThread(
     const std::string& reason, u64 timeout, WakeupCallback&& callback,
     std::shared_ptr<WritableEvent> writable_event) {
     // Put the client thread to sleep until the wait event is signaled or the timeout expires.
-    thread->SetWakeupCallback(
-        [context = *this, callback](ThreadWakeupReason reason, std::shared_ptr<Thread> thread,
-                                    std::shared_ptr<SynchronizationObject> object,
-                                    std::size_t index) mutable -> bool {
-            ASSERT(thread->GetStatus() == ThreadStatus::WaitHLEEvent);
-            callback(thread, context, reason);
-            context.WriteToOutgoingCommandBuffer(*thread);
-            return true;
-        });
 
     if (!writable_event) {
         // Create event if not provided
@@ -62,14 +56,26 @@ std::shared_ptr<WritableEvent> HLERequestContext::SleepClientThread(
         writable_event = pair.writable;
     }
 
-    const auto readable_event{writable_event->GetReadableEvent()};
-    writable_event->Clear();
-    thread->SetStatus(ThreadStatus::WaitHLEEvent);
-    thread->SetSynchronizationObjects({readable_event});
-    readable_event->AddWaitingThread(thread);
-
-    if (timeout > 0) {
-        thread->WakeAfterDelay(timeout);
+    {
+        Handle event_handle = InvalidHandle;
+        SchedulerLockAndSleep lock(kernel, event_handle, thread.get(), timeout);
+        thread->SetHLECallback(
+            [context = *this, callback](std::shared_ptr<Thread> thread) mutable -> bool {
+                ThreadWakeupReason reason = thread->GetSignalingResult() == RESULT_TIMEOUT
+                                                ? ThreadWakeupReason::Timeout
+                                                : ThreadWakeupReason::Signal;
+                callback(thread, context, reason);
+                context.WriteToOutgoingCommandBuffer(*thread);
+                return true;
+            });
+        const auto readable_event{writable_event->GetReadableEvent()};
+        writable_event->Clear();
+        thread->SetHLESyncObject(readable_event.get());
+        thread->SetStatus(ThreadStatus::WaitHLEEvent);
+        thread->SetSynchronizationResults(nullptr, RESULT_TIMEOUT);
+        readable_event->AddWaitingThread(thread);
+        lock.Release();
+        thread->SetHLETimeEvent(event_handle);
     }
 
     is_thread_waiting = true;
@@ -282,18 +288,18 @@ ResultCode HLERequestContext::WriteToOutgoingCommandBuffer(Thread& thread) {
 }
 
 std::vector<u8> HLERequestContext::ReadBuffer(std::size_t buffer_index) const {
-    std::vector<u8> buffer;
+    std::vector<u8> buffer{};
     const bool is_buffer_a{BufferDescriptorA().size() > buffer_index &&
                            BufferDescriptorA()[buffer_index].Size()};
 
     if (is_buffer_a) {
-        ASSERT_MSG(BufferDescriptorA().size() > buffer_index,
-                   "BufferDescriptorA invalid buffer_index {}", buffer_index);
+        ASSERT_OR_EXECUTE_MSG(BufferDescriptorA().size() > buffer_index, { return buffer; },
+                              "BufferDescriptorA invalid buffer_index {}", buffer_index);
         buffer.resize(BufferDescriptorA()[buffer_index].Size());
         memory.ReadBlock(BufferDescriptorA()[buffer_index].Address(), buffer.data(), buffer.size());
     } else {
-        ASSERT_MSG(BufferDescriptorX().size() > buffer_index,
-                   "BufferDescriptorX invalid buffer_index {}", buffer_index);
+        ASSERT_OR_EXECUTE_MSG(BufferDescriptorX().size() > buffer_index, { return buffer; },
+                              "BufferDescriptorX invalid buffer_index {}", buffer_index);
         buffer.resize(BufferDescriptorX()[buffer_index].Size());
         memory.ReadBlock(BufferDescriptorX()[buffer_index].Address(), buffer.data(), buffer.size());
     }
@@ -318,16 +324,16 @@ std::size_t HLERequestContext::WriteBuffer(const void* buffer, std::size_t size,
     }
 
     if (is_buffer_b) {
-        ASSERT_MSG(BufferDescriptorB().size() > buffer_index,
-                   "BufferDescriptorB invalid buffer_index {}", buffer_index);
-        ASSERT_MSG(BufferDescriptorB()[buffer_index].Size() >= size,
-                   "BufferDescriptorB buffer_index {} is not large enough", buffer_index);
+        ASSERT_OR_EXECUTE_MSG(BufferDescriptorB().size() > buffer_index &&
+                                  BufferDescriptorB()[buffer_index].Size() >= size,
+                              { return 0; }, "BufferDescriptorB is invalid, index={}, size={}",
+                              buffer_index, size);
         memory.WriteBlock(BufferDescriptorB()[buffer_index].Address(), buffer, size);
     } else {
-        ASSERT_MSG(BufferDescriptorC().size() > buffer_index,
-                   "BufferDescriptorC invalid buffer_index {}", buffer_index);
-        ASSERT_MSG(BufferDescriptorC()[buffer_index].Size() >= size,
-                   "BufferDescriptorC buffer_index {} is not large enough", buffer_index);
+        ASSERT_OR_EXECUTE_MSG(BufferDescriptorC().size() > buffer_index &&
+                                  BufferDescriptorC()[buffer_index].Size() >= size,
+                              { return 0; }, "BufferDescriptorC is invalid, index={}, size={}",
+                              buffer_index, size);
         memory.WriteBlock(BufferDescriptorC()[buffer_index].Address(), buffer, size);
     }
 
@@ -338,16 +344,12 @@ std::size_t HLERequestContext::GetReadBufferSize(std::size_t buffer_index) const
     const bool is_buffer_a{BufferDescriptorA().size() > buffer_index &&
                            BufferDescriptorA()[buffer_index].Size()};
     if (is_buffer_a) {
-        ASSERT_MSG(BufferDescriptorA().size() > buffer_index,
-                   "BufferDescriptorA invalid buffer_index {}", buffer_index);
-        ASSERT_MSG(BufferDescriptorA()[buffer_index].Size() > 0,
-                   "BufferDescriptorA buffer_index {} is empty", buffer_index);
+        ASSERT_OR_EXECUTE_MSG(BufferDescriptorA().size() > buffer_index, { return 0; },
+                              "BufferDescriptorA invalid buffer_index {}", buffer_index);
         return BufferDescriptorA()[buffer_index].Size();
     } else {
-        ASSERT_MSG(BufferDescriptorX().size() > buffer_index,
-                   "BufferDescriptorX invalid buffer_index {}", buffer_index);
-        ASSERT_MSG(BufferDescriptorX()[buffer_index].Size() > 0,
-                   "BufferDescriptorX buffer_index {} is empty", buffer_index);
+        ASSERT_OR_EXECUTE_MSG(BufferDescriptorX().size() > buffer_index, { return 0; },
+                              "BufferDescriptorX invalid buffer_index {}", buffer_index);
         return BufferDescriptorX()[buffer_index].Size();
     }
 }
@@ -356,14 +358,15 @@ std::size_t HLERequestContext::GetWriteBufferSize(std::size_t buffer_index) cons
     const bool is_buffer_b{BufferDescriptorB().size() > buffer_index &&
                            BufferDescriptorB()[buffer_index].Size()};
     if (is_buffer_b) {
-        ASSERT_MSG(BufferDescriptorB().size() > buffer_index,
-                   "BufferDescriptorB invalid buffer_index {}", buffer_index);
+        ASSERT_OR_EXECUTE_MSG(BufferDescriptorB().size() > buffer_index, { return 0; },
+                              "BufferDescriptorB invalid buffer_index {}", buffer_index);
         return BufferDescriptorB()[buffer_index].Size();
     } else {
-        ASSERT_MSG(BufferDescriptorC().size() > buffer_index,
-                   "BufferDescriptorC invalid buffer_index {}", buffer_index);
+        ASSERT_OR_EXECUTE_MSG(BufferDescriptorC().size() > buffer_index, { return 0; },
+                              "BufferDescriptorC invalid buffer_index {}", buffer_index);
         return BufferDescriptorC()[buffer_index].Size();
     }
+    return 0;
 }
 
 std::string HLERequestContext::Description() const {
diff --git a/src/core/hle/kernel/kernel.cpp b/src/core/hle/kernel/kernel.cpp
index 7655382fa..1f2af7a1b 100644
--- a/src/core/hle/kernel/kernel.cpp
+++ b/src/core/hle/kernel/kernel.cpp
@@ -2,6 +2,7 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <array>
 #include <atomic>
 #include <bitset>
 #include <functional>
@@ -13,11 +14,15 @@
 
 #include "common/assert.h"
 #include "common/logging/log.h"
+#include "common/microprofile.h"
+#include "common/thread.h"
 #include "core/arm/arm_interface.h"
+#include "core/arm/cpu_interrupt_handler.h"
 #include "core/arm/exclusive_monitor.h"
 #include "core/core.h"
 #include "core/core_timing.h"
 #include "core/core_timing_util.h"
+#include "core/cpu_manager.h"
 #include "core/device_memory.h"
 #include "core/hardware_properties.h"
 #include "core/hle/kernel/client_port.h"
@@ -39,85 +44,28 @@
 #include "core/hle/result.h"
 #include "core/memory.h"
 
-namespace Kernel {
-
-/**
- * Callback that will wake up the thread it was scheduled for
- * @param thread_handle The handle of the thread that's been awoken
- * @param cycles_late The number of CPU cycles that have passed since the desired wakeup time
- */
-static void ThreadWakeupCallback(u64 thread_handle, [[maybe_unused]] s64 cycles_late) {
-    const auto proper_handle = static_cast<Handle>(thread_handle);
-    const auto& system = Core::System::GetInstance();
-
-    // Lock the global kernel mutex when we enter the kernel HLE.
-    std::lock_guard lock{HLE::g_hle_lock};
-
-    std::shared_ptr<Thread> thread =
-        system.Kernel().RetrieveThreadFromGlobalHandleTable(proper_handle);
-    if (thread == nullptr) {
-        LOG_CRITICAL(Kernel, "Callback fired for invalid thread {:08X}", proper_handle);
-        return;
-    }
-
-    bool resume = true;
-
-    if (thread->GetStatus() == ThreadStatus::WaitSynch ||
-        thread->GetStatus() == ThreadStatus::WaitHLEEvent) {
-        // Remove the thread from each of its waiting objects' waitlists
-        for (const auto& object : thread->GetSynchronizationObjects()) {
-            object->RemoveWaitingThread(thread);
-        }
-        thread->ClearSynchronizationObjects();
-
-        // Invoke the wakeup callback before clearing the wait objects
-        if (thread->HasWakeupCallback()) {
-            resume = thread->InvokeWakeupCallback(ThreadWakeupReason::Timeout, thread, nullptr, 0);
-        }
-    } else if (thread->GetStatus() == ThreadStatus::WaitMutex ||
-               thread->GetStatus() == ThreadStatus::WaitCondVar) {
-        thread->SetMutexWaitAddress(0);
-        thread->SetWaitHandle(0);
-        if (thread->GetStatus() == ThreadStatus::WaitCondVar) {
-            thread->GetOwnerProcess()->RemoveConditionVariableThread(thread);
-            thread->SetCondVarWaitAddress(0);
-        }
-
-        auto* const lock_owner = thread->GetLockOwner();
-        // Threads waking up by timeout from WaitProcessWideKey do not perform priority inheritance
-        // and don't have a lock owner unless SignalProcessWideKey was called first and the thread
-        // wasn't awakened due to the mutex already being acquired.
-        if (lock_owner != nullptr) {
-            lock_owner->RemoveMutexWaiter(thread);
-        }
-    }
+MICROPROFILE_DEFINE(Kernel_SVC, "Kernel", "SVC", MP_RGB(70, 200, 70));
 
-    if (thread->GetStatus() == ThreadStatus::WaitArb) {
-        auto& address_arbiter = thread->GetOwnerProcess()->GetAddressArbiter();
-        address_arbiter.HandleWakeupThread(thread);
-    }
-
-    if (resume) {
-        if (thread->GetStatus() == ThreadStatus::WaitCondVar ||
-            thread->GetStatus() == ThreadStatus::WaitArb) {
-            thread->SetWaitSynchronizationResult(RESULT_TIMEOUT);
-        }
-        thread->ResumeFromWait();
-    }
-}
+namespace Kernel {
 
 struct KernelCore::Impl {
     explicit Impl(Core::System& system, KernelCore& kernel)
         : global_scheduler{kernel}, synchronization{system}, time_manager{system}, system{system} {}
 
+    void SetMulticore(bool is_multicore) {
+        this->is_multicore = is_multicore;
+    }
+
     void Initialize(KernelCore& kernel) {
         Shutdown();
+        RegisterHostThread();
 
         InitializePhysicalCores();
         InitializeSystemResourceLimit(kernel);
         InitializeMemoryLayout();
-        InitializeThreads();
-        InitializePreemption();
+        InitializePreemption(kernel);
+        InitializeSchedulers();
+        InitializeSuspendThreads();
     }
 
     void Shutdown() {
@@ -126,13 +74,26 @@ struct KernelCore::Impl {
         next_user_process_id = Process::ProcessIDMin;
         next_thread_id = 1;
 
+        for (std::size_t i = 0; i < Core::Hardware::NUM_CPU_CORES; i++) {
+            if (suspend_threads[i]) {
+                suspend_threads[i].reset();
+            }
+        }
+
+        for (std::size_t i = 0; i < cores.size(); i++) {
+            cores[i].Shutdown();
+            schedulers[i].reset();
+        }
+        cores.clear();
+
+        registered_core_threads.reset();
+
         process_list.clear();
         current_process = nullptr;
 
         system_resource_limit = nullptr;
 
         global_handle_table.Clear();
-        thread_wakeup_event_type = nullptr;
         preemption_event = nullptr;
 
         global_scheduler.Shutdown();
@@ -145,13 +106,21 @@ struct KernelCore::Impl {
         cores.clear();
 
         exclusive_monitor.reset();
+        host_thread_ids.clear();
     }
 
     void InitializePhysicalCores() {
         exclusive_monitor =
             Core::MakeExclusiveMonitor(system.Memory(), Core::Hardware::NUM_CPU_CORES);
         for (std::size_t i = 0; i < Core::Hardware::NUM_CPU_CORES; i++) {
-            cores.emplace_back(system, i, *exclusive_monitor);
+            schedulers[i] = std::make_unique<Kernel::Scheduler>(system, i);
+            cores.emplace_back(system, i, *schedulers[i], interrupts[i]);
+        }
+    }
+
+    void InitializeSchedulers() {
+        for (std::size_t i = 0; i < Core::Hardware::NUM_CPU_CORES; i++) {
+            cores[i].Scheduler().Initialize();
         }
     }
 
@@ -173,15 +142,13 @@ struct KernelCore::Impl {
         }
     }
 
-    void InitializeThreads() {
-        thread_wakeup_event_type =
-            Core::Timing::CreateEvent("ThreadWakeupCallback", ThreadWakeupCallback);
-    }
-
-    void InitializePreemption() {
-        preemption_event =
-            Core::Timing::CreateEvent("PreemptionCallback", [this](u64 userdata, s64 cycles_late) {
-                global_scheduler.PreemptThreads();
+    void InitializePreemption(KernelCore& kernel) {
+        preemption_event = Core::Timing::CreateEvent(
+            "PreemptionCallback", [this, &kernel](u64 userdata, s64 cycles_late) {
+                {
+                    SchedulerLock lock(kernel);
+                    global_scheduler.PreemptThreads();
+                }
                 s64 time_interval = Core::Timing::msToCycles(std::chrono::milliseconds(10));
                 system.CoreTiming().ScheduleEvent(time_interval, preemption_event);
             });
@@ -190,6 +157,20 @@ struct KernelCore::Impl {
         system.CoreTiming().ScheduleEvent(time_interval, preemption_event);
     }
 
+    void InitializeSuspendThreads() {
+        for (std::size_t i = 0; i < Core::Hardware::NUM_CPU_CORES; i++) {
+            std::string name = "Suspend Thread Id:" + std::to_string(i);
+            std::function<void(void*)> init_func =
+                system.GetCpuManager().GetSuspendThreadStartFunc();
+            void* init_func_parameter = system.GetCpuManager().GetStartFuncParamater();
+            ThreadType type =
+                static_cast<ThreadType>(THREADTYPE_KERNEL | THREADTYPE_HLE | THREADTYPE_SUSPEND);
+            auto thread_res = Thread::Create(system, type, name, 0, 0, 0, static_cast<u32>(i), 0,
+                                             nullptr, std::move(init_func), init_func_parameter);
+            suspend_threads[i] = std::move(thread_res).Unwrap();
+        }
+    }
+
     void MakeCurrentProcess(Process* process) {
         current_process = process;
 
@@ -197,15 +178,17 @@ struct KernelCore::Impl {
             return;
         }
 
-        for (auto& core : cores) {
-            core.SetIs64Bit(process->Is64BitProcess());
+        u32 core_id = GetCurrentHostThreadID();
+        if (core_id < Core::Hardware::NUM_CPU_CORES) {
+            system.Memory().SetCurrentPageTable(*process, core_id);
         }
-
-        system.Memory().SetCurrentPageTable(*process);
     }
 
     void RegisterCoreThread(std::size_t core_id) {
         std::unique_lock lock{register_thread_mutex};
+        if (!is_multicore) {
+            single_core_thread_id = std::this_thread::get_id();
+        }
         const std::thread::id this_id = std::this_thread::get_id();
         const auto it = host_thread_ids.find(this_id);
         ASSERT(core_id < Core::Hardware::NUM_CPU_CORES);
@@ -219,12 +202,19 @@ struct KernelCore::Impl {
         std::unique_lock lock{register_thread_mutex};
         const std::thread::id this_id = std::this_thread::get_id();
         const auto it = host_thread_ids.find(this_id);
-        ASSERT(it == host_thread_ids.end());
+        if (it != host_thread_ids.end()) {
+            return;
+        }
         host_thread_ids[this_id] = registered_thread_ids++;
     }
 
     u32 GetCurrentHostThreadID() const {
         const std::thread::id this_id = std::this_thread::get_id();
+        if (!is_multicore) {
+            if (single_core_thread_id == this_id) {
+                return static_cast<u32>(system.GetCpuManager().CurrentCore());
+            }
+        }
         const auto it = host_thread_ids.find(this_id);
         if (it == host_thread_ids.end()) {
             return Core::INVALID_HOST_THREAD_ID;
@@ -240,7 +230,7 @@ struct KernelCore::Impl {
         }
         const Kernel::Scheduler& sched = cores[result.host_handle].Scheduler();
         const Kernel::Thread* current = sched.GetCurrentThread();
-        if (current != nullptr) {
+        if (current != nullptr && !current->IsPhantomMode()) {
             result.guest_handle = current->GetGlobalHandle();
         } else {
             result.guest_handle = InvalidHandle;
@@ -313,7 +303,6 @@ struct KernelCore::Impl {
 
     std::shared_ptr<ResourceLimit> system_resource_limit;
 
-    std::shared_ptr<Core::Timing::EventType> thread_wakeup_event_type;
     std::shared_ptr<Core::Timing::EventType> preemption_event;
 
     // This is the kernel's handle table or supervisor handle table which
@@ -343,6 +332,15 @@ struct KernelCore::Impl {
     std::shared_ptr<Kernel::SharedMemory> irs_shared_mem;
     std::shared_ptr<Kernel::SharedMemory> time_shared_mem;
 
+    std::array<std::shared_ptr<Thread>, Core::Hardware::NUM_CPU_CORES> suspend_threads{};
+    std::array<Core::CPUInterruptHandler, Core::Hardware::NUM_CPU_CORES> interrupts{};
+    std::array<std::unique_ptr<Kernel::Scheduler>, Core::Hardware::NUM_CPU_CORES> schedulers{};
+
+    bool is_multicore{};
+    std::thread::id single_core_thread_id{};
+
+    std::array<u64, Core::Hardware::NUM_CPU_CORES> svc_ticks{};
+
     // System context
     Core::System& system;
 };
@@ -352,6 +350,10 @@ KernelCore::~KernelCore() {
     Shutdown();
 }
 
+void KernelCore::SetMulticore(bool is_multicore) {
+    impl->SetMulticore(is_multicore);
+}
+
 void KernelCore::Initialize() {
     impl->Initialize(*this);
 }
@@ -397,11 +399,11 @@ const Kernel::GlobalScheduler& KernelCore::GlobalScheduler() const {
 }
 
 Kernel::Scheduler& KernelCore::Scheduler(std::size_t id) {
-    return impl->cores[id].Scheduler();
+    return *impl->schedulers[id];
 }
 
 const Kernel::Scheduler& KernelCore::Scheduler(std::size_t id) const {
-    return impl->cores[id].Scheduler();
+    return *impl->schedulers[id];
 }
 
 Kernel::PhysicalCore& KernelCore::PhysicalCore(std::size_t id) {
@@ -412,6 +414,39 @@ const Kernel::PhysicalCore& KernelCore::PhysicalCore(std::size_t id) const {
     return impl->cores[id];
 }
 
+Kernel::PhysicalCore& KernelCore::CurrentPhysicalCore() {
+    u32 core_id = impl->GetCurrentHostThreadID();
+    ASSERT(core_id < Core::Hardware::NUM_CPU_CORES);
+    return impl->cores[core_id];
+}
+
+const Kernel::PhysicalCore& KernelCore::CurrentPhysicalCore() const {
+    u32 core_id = impl->GetCurrentHostThreadID();
+    ASSERT(core_id < Core::Hardware::NUM_CPU_CORES);
+    return impl->cores[core_id];
+}
+
+Kernel::Scheduler& KernelCore::CurrentScheduler() {
+    u32 core_id = impl->GetCurrentHostThreadID();
+    ASSERT(core_id < Core::Hardware::NUM_CPU_CORES);
+    return *impl->schedulers[core_id];
+}
+
+const Kernel::Scheduler& KernelCore::CurrentScheduler() const {
+    u32 core_id = impl->GetCurrentHostThreadID();
+    ASSERT(core_id < Core::Hardware::NUM_CPU_CORES);
+    return *impl->schedulers[core_id];
+}
+
+std::array<Core::CPUInterruptHandler, Core::Hardware::NUM_CPU_CORES>& KernelCore::Interrupts() {
+    return impl->interrupts;
+}
+
+const std::array<Core::CPUInterruptHandler, Core::Hardware::NUM_CPU_CORES>& KernelCore::Interrupts()
+    const {
+    return impl->interrupts;
+}
+
 Kernel::Synchronization& KernelCore::Synchronization() {
     return impl->synchronization;
 }
@@ -437,15 +472,17 @@ const Core::ExclusiveMonitor& KernelCore::GetExclusiveMonitor() const {
 }
 
 void KernelCore::InvalidateAllInstructionCaches() {
-    for (std::size_t i = 0; i < impl->global_scheduler.CpuCoresCount(); i++) {
-        PhysicalCore(i).ArmInterface().ClearInstructionCache();
+    auto& threads = GlobalScheduler().GetThreadList();
+    for (auto& thread : threads) {
+        if (!thread->IsHLEThread()) {
+            auto& arm_interface = thread->ArmInterface();
+            arm_interface.ClearInstructionCache();
+        }
     }
 }
 
 void KernelCore::PrepareReschedule(std::size_t id) {
-    if (id < impl->global_scheduler.CpuCoresCount()) {
-        impl->cores[id].Stop();
-    }
+    // TODO: Reimplement this.
 }
 
 void KernelCore::AddNamedPort(std::string name, std::shared_ptr<ClientPort> port) {
@@ -481,10 +518,6 @@ u64 KernelCore::CreateNewUserProcessID() {
     return impl->next_user_process_id++;
 }
 
-const std::shared_ptr<Core::Timing::EventType>& KernelCore::ThreadWakeupCallbackEventType() const {
-    return impl->thread_wakeup_event_type;
-}
-
 Kernel::HandleTable& KernelCore::GlobalHandleTable() {
     return impl->global_handle_table;
 }
@@ -557,4 +590,43 @@ const Kernel::SharedMemory& KernelCore::GetTimeSharedMem() const {
     return *impl->time_shared_mem;
 }
 
+void KernelCore::Suspend(bool in_suspention) {
+    const bool should_suspend = exception_exited || in_suspention;
+    {
+        SchedulerLock lock(*this);
+        ThreadStatus status = should_suspend ? ThreadStatus::Ready : ThreadStatus::WaitSleep;
+        for (std::size_t i = 0; i < Core::Hardware::NUM_CPU_CORES; i++) {
+            impl->suspend_threads[i]->SetStatus(status);
+        }
+    }
+}
+
+bool KernelCore::IsMulticore() const {
+    return impl->is_multicore;
+}
+
+void KernelCore::ExceptionalExit() {
+    exception_exited = true;
+    Suspend(true);
+}
+
+void KernelCore::EnterSVCProfile() {
+    const std::size_t core = impl->GetCurrentHostThreadID();
+    // svc_ticks only has one slot per emulated core. GetCurrentHostThreadID() can return an id
+    // outside that range (e.g. Core::INVALID_HOST_THREAD_ID), so never index with it unchecked.
+    if (core >= Core::Hardware::NUM_CPU_CORES) {
+        return;
+    }
+    impl->svc_ticks[core] = MicroProfileEnter(MICROPROFILE_TOKEN(Kernel_SVC));
+}
+
+void KernelCore::ExitSVCProfile() {
+    const std::size_t core = impl->GetCurrentHostThreadID();
+    // Mirrors the guard in EnterSVCProfile: no tick was recorded for an out-of-range id.
+    if (core >= Core::Hardware::NUM_CPU_CORES) {
+        return;
+    }
+    MicroProfileLeave(MICROPROFILE_TOKEN(Kernel_SVC), impl->svc_ticks[core]);
+}
+
 } // namespace Kernel
diff --git a/src/core/hle/kernel/kernel.h b/src/core/hle/kernel/kernel.h
index 83de1f542..49bd47e89 100644
--- a/src/core/hle/kernel/kernel.h
+++ b/src/core/hle/kernel/kernel.h
@@ -4,15 +4,17 @@
 
 #pragma once
 
+#include <array>
 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <vector>
+#include "core/hardware_properties.h"
 #include "core/hle/kernel/memory/memory_types.h"
 #include "core/hle/kernel/object.h"
 
 namespace Core {
-struct EmuThreadHandle;
+class CPUInterruptHandler;
 class ExclusiveMonitor;
 class System;
 } // namespace Core
@@ -65,6 +67,9 @@ public:
     KernelCore(KernelCore&&) = delete;
     KernelCore& operator=(KernelCore&&) = delete;
 
+    /// Sets whether emulation is multicore or single core; must be called before Initialize().
+    void SetMulticore(bool is_multicore);
+
     /// Resets the kernel to a clean slate for use.
     void Initialize();
 
@@ -110,6 +115,18 @@ public:
     /// Gets the an instance of the respective physical CPU core.
     const Kernel::PhysicalCore& PhysicalCore(std::size_t id) const;
 
+    /// Gets the sole instance of the Scheduler at the current running core.
+    Kernel::Scheduler& CurrentScheduler();
+
+    /// Gets the sole instance of the Scheduler at the current running core.
+    const Kernel::Scheduler& CurrentScheduler() const;
+
+    /// Gets an instance of the current physical CPU core.
+    Kernel::PhysicalCore& CurrentPhysicalCore();
+
+    /// Gets an instance of the current physical CPU core.
+    const Kernel::PhysicalCore& CurrentPhysicalCore() const;
+
     /// Gets the an instance of the Synchronization Interface.
     Kernel::Synchronization& Synchronization();
 
@@ -129,6 +146,10 @@ public:
 
     const Core::ExclusiveMonitor& GetExclusiveMonitor() const;
 
+    std::array<Core::CPUInterruptHandler, Core::Hardware::NUM_CPU_CORES>& Interrupts();
+
+    const std::array<Core::CPUInterruptHandler, Core::Hardware::NUM_CPU_CORES>& Interrupts() const;
+
     void InvalidateAllInstructionCaches();
 
     /// Adds a port to the named port table
@@ -191,6 +212,18 @@ public:
     /// Gets the shared memory object for Time services.
     const Kernel::SharedMemory& GetTimeSharedMem() const;
 
+    /// Suspend/unsuspend the OS.
+    void Suspend(bool in_suspention);
+
+    /// Performs an exceptional exit of the OS.
+    void ExceptionalExit();
+
+    bool IsMulticore() const;
+
+    void EnterSVCProfile();
+
+    void ExitSVCProfile();
+
 private:
     friend class Object;
     friend class Process;
@@ -208,9 +241,6 @@ private:
     /// Creates a new thread ID, incrementing the internal thread ID counter.
     u64 CreateNewThreadID();
 
-    /// Retrieves the event type used for thread wakeup callbacks.
-    const std::shared_ptr<Core::Timing::EventType>& ThreadWakeupCallbackEventType() const;
-
     /// Provides a reference to the global handle table.
     Kernel::HandleTable& GlobalHandleTable();
 
@@ -219,6 +249,7 @@ private:
 
     struct Impl;
     std::unique_ptr<Impl> impl;
+    bool exception_exited{};
 };
 
 } // namespace Kernel
diff --git a/src/core/hle/kernel/memory/memory_manager.cpp b/src/core/hle/kernel/memory/memory_manager.cpp
index 6b432e1b2..acf13585c 100644
--- a/src/core/hle/kernel/memory/memory_manager.cpp
+++ b/src/core/hle/kernel/memory/memory_manager.cpp
@@ -104,7 +104,7 @@ ResultCode MemoryManager::Allocate(PageLinkedList& page_list, std::size_t num_pa
     // Ensure that we don't leave anything un-freed
     auto group_guard = detail::ScopeExit([&] {
         for (const auto& it : page_list.Nodes()) {
-            const auto min_num_pages{std::min(
+            const auto min_num_pages{std::min<size_t>(
                 it.GetNumPages(), (chosen_manager.GetEndAddress() - it.GetAddress()) / PageSize)};
             chosen_manager.Free(it.GetAddress(), min_num_pages);
         }
@@ -139,7 +139,6 @@ ResultCode MemoryManager::Allocate(PageLinkedList& page_list, std::size_t num_pa
     }
 
     // Only succeed if we allocated as many pages as we wanted
-    ASSERT(num_pages >= 0);
     if (num_pages) {
         return ERR_OUT_OF_MEMORY;
     }
@@ -165,7 +164,7 @@ ResultCode MemoryManager::Free(PageLinkedList& page_list, std::size_t num_pages,
 
     // Free all of the pages
     for (const auto& it : page_list.Nodes()) {
-        const auto min_num_pages{std::min(
+        const auto min_num_pages{std::min<size_t>(
             it.GetNumPages(), (chosen_manager.GetEndAddress() - it.GetAddress()) / PageSize)};
         chosen_manager.Free(it.GetAddress(), min_num_pages);
     }
diff --git a/src/core/hle/kernel/mutex.cpp b/src/core/hle/kernel/mutex.cpp
index 7869eb32b..8f6c944d1 100644
--- a/src/core/hle/kernel/mutex.cpp
+++ b/src/core/hle/kernel/mutex.cpp
@@ -34,8 +34,6 @@ static std::pair<std::shared_ptr<Thread>, u32> GetHighestPriorityMutexWaitingThr
         if (thread->GetMutexWaitAddress() != mutex_addr)
             continue;
 
-        ASSERT(thread->GetStatus() == ThreadStatus::WaitMutex);
-
         ++num_waiters;
         if (highest_priority_thread == nullptr ||
             thread->GetPriority() < highest_priority_thread->GetPriority()) {
@@ -49,6 +47,7 @@ static std::pair<std::shared_ptr<Thread>, u32> GetHighestPriorityMutexWaitingThr
 /// Update the mutex owner field of all threads waiting on the mutex to point to the new owner.
 static void TransferMutexOwnership(VAddr mutex_addr, std::shared_ptr<Thread> current_thread,
                                    std::shared_ptr<Thread> new_owner) {
+    current_thread->RemoveMutexWaiter(new_owner);
     const auto threads = current_thread->GetMutexWaitingThreads();
     for (const auto& thread : threads) {
         if (thread->GetMutexWaitAddress() != mutex_addr)
@@ -72,85 +71,100 @@ ResultCode Mutex::TryAcquire(VAddr address, Handle holding_thread_handle,
         return ERR_INVALID_ADDRESS;
     }
 
-    const auto& handle_table = system.Kernel().CurrentProcess()->GetHandleTable();
+    auto& kernel = system.Kernel();
     std::shared_ptr<Thread> current_thread =
-        SharedFrom(system.CurrentScheduler().GetCurrentThread());
-    std::shared_ptr<Thread> holding_thread = handle_table.Get<Thread>(holding_thread_handle);
-    std::shared_ptr<Thread> requesting_thread = handle_table.Get<Thread>(requesting_thread_handle);
+        SharedFrom(kernel.CurrentScheduler().GetCurrentThread());
+    {
+        SchedulerLock lock(kernel);
+        // The mutex address must be 4-byte aligned
+        if ((address % sizeof(u32)) != 0) {
+            return ERR_INVALID_ADDRESS;
+        }
 
-    // TODO(Subv): It is currently unknown if it is possible to lock a mutex in behalf of another
-    // thread.
-    ASSERT(requesting_thread == current_thread);
+        const auto& handle_table = kernel.CurrentProcess()->GetHandleTable();
+        std::shared_ptr<Thread> holding_thread = handle_table.Get<Thread>(holding_thread_handle);
+        std::shared_ptr<Thread> requesting_thread =
+            handle_table.Get<Thread>(requesting_thread_handle);
 
-    const u32 addr_value = system.Memory().Read32(address);
+        // TODO(Subv): It is currently unknown if it is possible to lock a mutex on behalf of
+        // another thread.
+        ASSERT(requesting_thread == current_thread);
 
-    // If the mutex isn't being held, just return success.
-    if (addr_value != (holding_thread_handle | Mutex::MutexHasWaitersFlag)) {
-        return RESULT_SUCCESS;
-    }
+        current_thread->SetSynchronizationResults(nullptr, RESULT_SUCCESS);
 
-    if (holding_thread == nullptr) {
-        LOG_ERROR(Kernel, "Holding thread does not exist! thread_handle={:08X}",
-                  holding_thread_handle);
-        return ERR_INVALID_HANDLE;
-    }
+        const u32 addr_value = system.Memory().Read32(address);
+
+        // If the mutex isn't being held, just return success.
+        if (addr_value != (holding_thread_handle | Mutex::MutexHasWaitersFlag)) {
+            return RESULT_SUCCESS;
+        }
 
-    // Wait until the mutex is released
-    current_thread->SetMutexWaitAddress(address);
-    current_thread->SetWaitHandle(requesting_thread_handle);
+        if (holding_thread == nullptr) {
+            return ERR_INVALID_HANDLE;
+        }
 
-    current_thread->SetStatus(ThreadStatus::WaitMutex);
-    current_thread->InvalidateWakeupCallback();
+        // Wait until the mutex is released
+        current_thread->SetMutexWaitAddress(address);
+        current_thread->SetWaitHandle(requesting_thread_handle);
 
-    // Update the lock holder thread's priority to prevent priority inversion.
-    holding_thread->AddMutexWaiter(current_thread);
+        current_thread->SetStatus(ThreadStatus::WaitMutex);
 
-    system.PrepareReschedule();
+        // Update the lock holder thread's priority to prevent priority inversion.
+        holding_thread->AddMutexWaiter(current_thread);
+    }
 
-    return RESULT_SUCCESS;
+    {
+        SchedulerLock lock(kernel);
+        auto* owner = current_thread->GetLockOwner();
+        if (owner != nullptr) {
+            owner->RemoveMutexWaiter(current_thread);
+        }
+    }
+    return current_thread->GetSignalingResult();
 }
 
-ResultCode Mutex::Release(VAddr address) {
+std::pair<ResultCode, std::shared_ptr<Thread>> Mutex::Unlock(std::shared_ptr<Thread> owner,
+                                                             VAddr address) {
     // The mutex address must be 4-byte aligned
     if ((address % sizeof(u32)) != 0) {
         LOG_ERROR(Kernel, "Address is not 4-byte aligned! address={:016X}", address);
-        return ERR_INVALID_ADDRESS;
+        return {ERR_INVALID_ADDRESS, nullptr};
     }
 
-    std::shared_ptr<Thread> current_thread =
-        SharedFrom(system.CurrentScheduler().GetCurrentThread());
-    auto [thread, num_waiters] = GetHighestPriorityMutexWaitingThread(current_thread, address);
-
-    // There are no more threads waiting for the mutex, release it completely.
-    if (thread == nullptr) {
+    auto [new_owner, num_waiters] = GetHighestPriorityMutexWaitingThread(owner, address);
+    if (new_owner == nullptr) {
         system.Memory().Write32(address, 0);
-        return RESULT_SUCCESS;
+        return {RESULT_SUCCESS, nullptr};
     }
-
     // Transfer the ownership of the mutex from the previous owner to the new one.
-    TransferMutexOwnership(address, current_thread, thread);
-
-    u32 mutex_value = thread->GetWaitHandle();
-
+    TransferMutexOwnership(address, owner, new_owner);
+    u32 mutex_value = new_owner->GetWaitHandle();
     if (num_waiters >= 2) {
         // Notify the guest that there are still some threads waiting for the mutex
         mutex_value |= Mutex::MutexHasWaitersFlag;
     }
+    new_owner->SetSynchronizationResults(nullptr, RESULT_SUCCESS);
+    new_owner->SetLockOwner(nullptr);
+    new_owner->ResumeFromWait();
 
-    // Grant the mutex to the next waiting thread and resume it.
     system.Memory().Write32(address, mutex_value);
+    return {RESULT_SUCCESS, new_owner};
+}
 
-    ASSERT(thread->GetStatus() == ThreadStatus::WaitMutex);
-    thread->ResumeFromWait();
+ResultCode Mutex::Release(VAddr address) {
+    auto& kernel = system.Kernel();
+    SchedulerLock lock(kernel);
 
-    thread->SetLockOwner(nullptr);
-    thread->SetCondVarWaitAddress(0);
-    thread->SetMutexWaitAddress(0);
-    thread->SetWaitHandle(0);
-    thread->SetWaitSynchronizationResult(RESULT_SUCCESS);
+    std::shared_ptr<Thread> current_thread =
+        SharedFrom(kernel.CurrentScheduler().GetCurrentThread());
 
-    system.PrepareReschedule();
+    auto [result, new_owner] = Unlock(current_thread, address);
 
-    return RESULT_SUCCESS;
+    if (result != RESULT_SUCCESS && new_owner != nullptr) {
+        new_owner->SetSynchronizationResults(nullptr, result);
+    }
+
+    return result;
 }
+
 } // namespace Kernel
diff --git a/src/core/hle/kernel/mutex.h b/src/core/hle/kernel/mutex.h
index b904de2e8..3b81dc3df 100644
--- a/src/core/hle/kernel/mutex.h
+++ b/src/core/hle/kernel/mutex.h
@@ -28,6 +28,10 @@ public:
     ResultCode TryAcquire(VAddr address, Handle holding_thread_handle,
                           Handle requesting_thread_handle);
 
+    /// Unlocks a mutex for owner at address
+    std::pair<ResultCode, std::shared_ptr<Thread>> Unlock(std::shared_ptr<Thread> owner,
+                                                          VAddr address);
+
     /// Releases the mutex at the specified address.
     ResultCode Release(VAddr address);
 
diff --git a/src/core/hle/kernel/physical_core.cpp b/src/core/hle/kernel/physical_core.cpp
index a15011076..c6bbdb080 100644
--- a/src/core/hle/kernel/physical_core.cpp
+++ b/src/core/hle/kernel/physical_core.cpp
@@ -2,12 +2,15 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include "common/assert.h"
 #include "common/logging/log.h"
+#include "common/spin_lock.h"
 #include "core/arm/arm_interface.h"
 #ifdef ARCHITECTURE_x86_64
 #include "core/arm/dynarmic/arm_dynarmic_32.h"
 #include "core/arm/dynarmic/arm_dynarmic_64.h"
 #endif
+#include "core/arm/cpu_interrupt_handler.h"
 #include "core/arm/exclusive_monitor.h"
 #include "core/arm/unicorn/arm_unicorn.h"
 #include "core/core.h"
@@ -17,50 +20,37 @@
 
 namespace Kernel {
 
-PhysicalCore::PhysicalCore(Core::System& system, std::size_t id,
-                           Core::ExclusiveMonitor& exclusive_monitor)
-    : core_index{id} {
-#ifdef ARCHITECTURE_x86_64
-    arm_interface_32 =
-        std::make_unique<Core::ARM_Dynarmic_32>(system, exclusive_monitor, core_index);
-    arm_interface_64 =
-        std::make_unique<Core::ARM_Dynarmic_64>(system, exclusive_monitor, core_index);
-
-#else
-    using Core::ARM_Unicorn;
-    arm_interface_32 = std::make_unique<ARM_Unicorn>(system, ARM_Unicorn::Arch::AArch32);
-    arm_interface_64 = std::make_unique<ARM_Unicorn>(system, ARM_Unicorn::Arch::AArch64);
-    LOG_WARNING(Core, "CPU JIT requested, but Dynarmic not available");
-#endif
+PhysicalCore::PhysicalCore(Core::System& system, std::size_t id, Kernel::Scheduler& scheduler,
+                           Core::CPUInterruptHandler& interrupt_handler)
+    : interrupt_handler{interrupt_handler}, core_index{id}, scheduler{scheduler} {
 
-    scheduler = std::make_unique<Kernel::Scheduler>(system, core_index);
+    guard = std::make_unique<Common::SpinLock>();
 }
 
 PhysicalCore::~PhysicalCore() = default;
 
-void PhysicalCore::Run() {
-    arm_interface->Run();
-    arm_interface->ClearExclusiveState();
+void PhysicalCore::Idle() {
+    interrupt_handler.AwaitInterrupt();
 }
 
-void PhysicalCore::Step() {
-    arm_interface->Step();
+void PhysicalCore::Shutdown() {
+    scheduler.Shutdown();
 }
 
-void PhysicalCore::Stop() {
-    arm_interface->PrepareReschedule();
+bool PhysicalCore::IsInterrupted() const {
+    return interrupt_handler.IsInterrupted();
 }
 
-void PhysicalCore::Shutdown() {
-    scheduler->Shutdown();
+void PhysicalCore::Interrupt() {
+    guard->lock();
+    interrupt_handler.SetInterrupt(true);
+    guard->unlock();
 }
 
-void PhysicalCore::SetIs64Bit(bool is_64_bit) {
-    if (is_64_bit) {
-        arm_interface = arm_interface_64.get();
-    } else {
-        arm_interface = arm_interface_32.get();
-    }
+void PhysicalCore::ClearInterrupt() {
+    guard->lock();
+    interrupt_handler.SetInterrupt(false);
+    guard->unlock();
 }
 
 } // namespace Kernel
diff --git a/src/core/hle/kernel/physical_core.h b/src/core/hle/kernel/physical_core.h
index 3269166be..d7a7a951c 100644
--- a/src/core/hle/kernel/physical_core.h
+++ b/src/core/hle/kernel/physical_core.h
@@ -7,12 +7,17 @@
 #include <cstddef>
 #include <memory>
 
+namespace Common {
+class SpinLock;
+}
+
 namespace Kernel {
 class Scheduler;
 } // namespace Kernel
 
 namespace Core {
 class ARM_Interface;
+class CPUInterruptHandler;
 class ExclusiveMonitor;
 class System;
 } // namespace Core
@@ -21,7 +26,8 @@ namespace Kernel {
 
 class PhysicalCore {
 public:
-    PhysicalCore(Core::System& system, std::size_t id, Core::ExclusiveMonitor& exclusive_monitor);
+    PhysicalCore(Core::System& system, std::size_t id, Kernel::Scheduler& scheduler,
+                 Core::CPUInterruptHandler& interrupt_handler);
     ~PhysicalCore();
 
     PhysicalCore(const PhysicalCore&) = delete;
@@ -30,23 +36,18 @@ public:
     PhysicalCore(PhysicalCore&&) = default;
     PhysicalCore& operator=(PhysicalCore&&) = default;
 
-    /// Execute current jit state
-    void Run();
-    /// Execute a single instruction in current jit.
-    void Step();
-    /// Stop JIT execution/exit
-    void Stop();
+    void Idle();
+    /// Interrupt this physical core.
+    void Interrupt();
 
-    // Shutdown this physical core.
-    void Shutdown();
+    /// Clear this core's interrupt
+    void ClearInterrupt();
 
-    Core::ARM_Interface& ArmInterface() {
-        return *arm_interface;
-    }
+    /// Check if this core is interrupted
+    bool IsInterrupted() const;
 
-    const Core::ARM_Interface& ArmInterface() const {
-        return *arm_interface;
-    }
+    // Shutdown this physical core.
+    void Shutdown();
 
     bool IsMainCore() const {
         return core_index == 0;
@@ -61,21 +62,18 @@ public:
     }
 
     Kernel::Scheduler& Scheduler() {
-        return *scheduler;
+        return scheduler;
     }
 
     const Kernel::Scheduler& Scheduler() const {
-        return *scheduler;
+        return scheduler;
     }
 
-    void SetIs64Bit(bool is_64_bit);
-
 private:
+    Core::CPUInterruptHandler& interrupt_handler;
     std::size_t core_index;
-    std::unique_ptr<Core::ARM_Interface> arm_interface_32;
-    std::unique_ptr<Core::ARM_Interface> arm_interface_64;
-    std::unique_ptr<Kernel::Scheduler> scheduler;
-    Core::ARM_Interface* arm_interface{};
+    Kernel::Scheduler& scheduler;
+    std::unique_ptr<Common::SpinLock> guard;
 };
 
 } // namespace Kernel
diff --git a/src/core/hle/kernel/process.cpp b/src/core/hle/kernel/process.cpp
index 36724569f..c6fcb56ad 100644
--- a/src/core/hle/kernel/process.cpp
+++ b/src/core/hle/kernel/process.cpp
@@ -22,6 +22,7 @@
 #include "core/hle/kernel/resource_limit.h"
 #include "core/hle/kernel/scheduler.h"
 #include "core/hle/kernel/thread.h"
+#include "core/hle/lock.h"
 #include "core/memory.h"
 #include "core/settings.h"
 
@@ -30,14 +31,15 @@ namespace {
 /**
  * Sets up the primary application thread
  *
+ * @param system The system instance to create the main thread under.
  * @param owner_process The parent process for the main thread
- * @param kernel The kernel instance to create the main thread under.
  * @param priority The priority to give the main thread
  */
-void SetupMainThread(Process& owner_process, KernelCore& kernel, u32 priority, VAddr stack_top) {
+void SetupMainThread(Core::System& system, Process& owner_process, u32 priority, VAddr stack_top) {
     const VAddr entry_point = owner_process.PageTable().GetCodeRegionStart();
-    auto thread_res = Thread::Create(kernel, "main", entry_point, priority, 0,
-                                     owner_process.GetIdealCore(), stack_top, owner_process);
+    ThreadType type = THREADTYPE_USER;
+    auto thread_res = Thread::Create(system, type, "main", entry_point, priority, 0,
+                                     owner_process.GetIdealCore(), stack_top, &owner_process);
 
     std::shared_ptr<Thread> thread = std::move(thread_res).Unwrap();
 
@@ -48,8 +50,12 @@ void SetupMainThread(Process& owner_process, KernelCore& kernel, u32 priority, V
     thread->GetContext32().cpu_registers[1] = thread_handle;
     thread->GetContext64().cpu_registers[1] = thread_handle;
 
+    auto& kernel = system.Kernel();
     // Threads by default are dormant, wake up the main thread so it runs when the scheduler fires
-    thread->ResumeFromWait();
+    {
+        SchedulerLock lock{kernel};
+        thread->SetStatus(ThreadStatus::Ready);
+    }
 }
 } // Anonymous namespace
 
@@ -117,7 +123,7 @@ std::shared_ptr<Process> Process::Create(Core::System& system, std::string name,
                                                               : kernel.CreateNewUserProcessID();
     process->capabilities.InitializeForMetadatalessProcess();
 
-    std::mt19937 rng(Settings::values.rng_seed.value_or(0));
+    std::mt19937 rng(Settings::values.rng_seed.GetValue().value_or(0));
     std::uniform_int_distribution<u64> distribution;
     std::generate(process->random_entropy.begin(), process->random_entropy.end(),
                   [&] { return distribution(rng); });
@@ -132,7 +138,8 @@ std::shared_ptr<ResourceLimit> Process::GetResourceLimit() const {
 
 u64 Process::GetTotalPhysicalMemoryAvailable() const {
     const u64 capacity{resource_limit->GetCurrentResourceValue(ResourceType::PhysicalMemory) +
-                       page_table->GetTotalHeapSize() + image_size + main_thread_stack_size};
+                       page_table->GetTotalHeapSize() + GetSystemResourceSize() + image_size +
+                       main_thread_stack_size};
 
     if (capacity < memory_usage_capacity) {
         return capacity;
@@ -146,7 +153,8 @@ u64 Process::GetTotalPhysicalMemoryAvailableWithoutSystemResource() const {
 }
 
 u64 Process::GetTotalPhysicalMemoryUsed() const {
-    return image_size + main_thread_stack_size + page_table->GetTotalHeapSize();
+    return image_size + main_thread_stack_size + page_table->GetTotalHeapSize() +
+           GetSystemResourceSize();
 }
 
 u64 Process::GetTotalPhysicalMemoryUsedWithoutSystemResource() const {
@@ -180,7 +188,6 @@ void Process::RemoveConditionVariableThread(std::shared_ptr<Thread> thread) {
         }
         ++it;
     }
-    UNREACHABLE();
 }
 
 std::vector<std::shared_ptr<Thread>> Process::GetConditionVariableThreads(
@@ -205,6 +212,7 @@ void Process::UnregisterThread(const Thread* thread) {
 }
 
 ResultCode Process::ClearSignalState() {
+    SchedulerLock lock(system.Kernel());
     if (status == ProcessStatus::Exited) {
         LOG_ERROR(Kernel, "called on a terminated process instance.");
         return ERR_INVALID_STATE;
@@ -292,7 +300,7 @@ void Process::Run(s32 main_thread_priority, u64 stack_size) {
 
     ChangeStatus(ProcessStatus::Running);
 
-    SetupMainThread(*this, kernel, main_thread_priority, main_thread_stack_top);
+    SetupMainThread(system, *this, main_thread_priority, main_thread_stack_top);
     resource_limit->Reserve(ResourceType::Threads, 1);
     resource_limit->Reserve(ResourceType::PhysicalMemory, main_thread_stack_size);
 }
@@ -338,6 +346,7 @@ static auto FindTLSPageWithAvailableSlots(std::vector<TLSPage>& tls_pages) {
 }
 
 VAddr Process::CreateTLSRegion() {
+    SchedulerLock lock(system.Kernel());
     if (auto tls_page_iter{FindTLSPageWithAvailableSlots(tls_pages)};
         tls_page_iter != tls_pages.cend()) {
         return *tls_page_iter->ReserveSlot();
@@ -368,6 +377,7 @@ VAddr Process::CreateTLSRegion() {
 }
 
 void Process::FreeTLSRegion(VAddr tls_address) {
+    SchedulerLock lock(system.Kernel());
     const VAddr aligned_address = Common::AlignDown(tls_address, Core::Memory::PAGE_SIZE);
     auto iter =
         std::find_if(tls_pages.begin(), tls_pages.end(), [aligned_address](const auto& page) {
@@ -382,6 +392,7 @@ void Process::FreeTLSRegion(VAddr tls_address) {
 }
 
 void Process::LoadModule(CodeSet code_set, VAddr base_addr) {
+    std::lock_guard lock{HLE::g_hle_lock};
     const auto ReprotectSegment = [&](const CodeSet::Segment& segment,
                                       Memory::MemoryPermission permission) {
         page_table->SetCodeMemoryPermission(segment.addr + base_addr, segment.size, permission);
diff --git a/src/core/hle/kernel/readable_event.cpp b/src/core/hle/kernel/readable_event.cpp
index 00860fcbd..6e286419e 100644
--- a/src/core/hle/kernel/readable_event.cpp
+++ b/src/core/hle/kernel/readable_event.cpp
@@ -6,8 +6,10 @@
 #include "common/assert.h"
 #include "common/logging/log.h"
 #include "core/hle/kernel/errors.h"
+#include "core/hle/kernel/kernel.h"
 #include "core/hle/kernel/object.h"
 #include "core/hle/kernel/readable_event.h"
+#include "core/hle/kernel/scheduler.h"
 #include "core/hle/kernel/thread.h"
 
 namespace Kernel {
@@ -37,8 +39,9 @@ void ReadableEvent::Clear() {
 }
 
 ResultCode ReadableEvent::Reset() {
+    SchedulerLock lock(kernel);
     if (!is_signaled) {
-        LOG_ERROR(Kernel, "Handle is not signaled! object_id={}, object_type={}, object_name={}",
+        LOG_TRACE(Kernel, "Handle is not signaled! object_id={}, object_type={}, object_name={}",
                   GetObjectId(), GetTypeName(), GetName());
         return ERR_INVALID_STATE;
     }
diff --git a/src/core/hle/kernel/resource_limit.cpp b/src/core/hle/kernel/resource_limit.cpp
index d9beaa3a4..212e442f4 100644
--- a/src/core/hle/kernel/resource_limit.cpp
+++ b/src/core/hle/kernel/resource_limit.cpp
@@ -24,13 +24,9 @@ bool ResourceLimit::Reserve(ResourceType resource, s64 amount, u64 timeout) {
     const std::size_t index{ResourceTypeToIndex(resource)};
 
     s64 new_value = current[index] + amount;
-    while (new_value > limit[index] && available[index] + amount <= limit[index]) {
+    if (new_value > limit[index] && available[index] + amount <= limit[index]) {
         // TODO(bunnei): This is wrong for multicore, we should wait the calling thread for timeout
         new_value = current[index] + amount;
-
-        if (timeout >= 0) {
-            break;
-        }
     }
 
     if (new_value <= limit[index]) {
diff --git a/src/core/hle/kernel/scheduler.cpp b/src/core/hle/kernel/scheduler.cpp
index 1140c72a3..7b929781c 100644
--- a/src/core/hle/kernel/scheduler.cpp
+++ b/src/core/hle/kernel/scheduler.cpp
@@ -6,16 +6,21 @@
 // licensed under GPLv2 or later under exception provided by the author.
 
 #include <algorithm>
+#include <mutex>
 #include <set>
 #include <unordered_set>
 #include <utility>
 
 #include "common/assert.h"
+#include "common/bit_util.h"
+#include "common/fiber.h"
 #include "common/logging/log.h"
 #include "core/arm/arm_interface.h"
 #include "core/core.h"
 #include "core/core_timing.h"
+#include "core/cpu_manager.h"
 #include "core/hle/kernel/kernel.h"
+#include "core/hle/kernel/physical_core.h"
 #include "core/hle/kernel/process.h"
 #include "core/hle/kernel/scheduler.h"
 #include "core/hle/kernel/time_manager.h"
@@ -27,103 +32,148 @@ GlobalScheduler::GlobalScheduler(KernelCore& kernel) : kernel{kernel} {}
 GlobalScheduler::~GlobalScheduler() = default;
 
 void GlobalScheduler::AddThread(std::shared_ptr<Thread> thread) {
+    std::scoped_lock lock{global_list_guard};
     thread_list.push_back(std::move(thread));
 }
 
 void GlobalScheduler::RemoveThread(std::shared_ptr<Thread> thread) {
+    std::scoped_lock lock{global_list_guard};
     thread_list.erase(std::remove(thread_list.begin(), thread_list.end(), thread),
                       thread_list.end());
 }
 
-void GlobalScheduler::UnloadThread(std::size_t core) {
-    Scheduler& sched = kernel.Scheduler(core);
-    sched.UnloadThread();
-}
-
-void GlobalScheduler::SelectThread(std::size_t core) {
+u32 GlobalScheduler::SelectThreads() {
+    ASSERT(is_locked);
     const auto update_thread = [](Thread* thread, Scheduler& sched) {
-        if (thread != sched.selected_thread.get()) {
+        std::scoped_lock lock{sched.guard};
+        if (thread != sched.selected_thread_set.get()) {
             if (thread == nullptr) {
                 ++sched.idle_selection_count;
             }
-            sched.selected_thread = SharedFrom(thread);
+            sched.selected_thread_set = SharedFrom(thread);
         }
-        sched.is_context_switch_pending = sched.selected_thread != sched.current_thread;
+        const bool reschedule_pending =
+            sched.is_context_switch_pending || (sched.selected_thread_set != sched.current_thread);
+        sched.is_context_switch_pending = reschedule_pending;
         std::atomic_thread_fence(std::memory_order_seq_cst);
+        return reschedule_pending;
     };
-    Scheduler& sched = kernel.Scheduler(core);
-    Thread* current_thread = nullptr;
-    // Step 1: Get top thread in schedule queue.
-    current_thread = scheduled_queue[core].empty() ? nullptr : scheduled_queue[core].front();
-    if (current_thread) {
-        update_thread(current_thread, sched);
-        return;
+    if (!is_reselection_pending.load()) {
+        return 0;
     }
-    // Step 2: Try selecting a suggested thread.
-    Thread* winner = nullptr;
-    std::set<s32> sug_cores;
-    for (auto thread : suggested_queue[core]) {
-        s32 this_core = thread->GetProcessorID();
-        Thread* thread_on_core = nullptr;
-        if (this_core >= 0) {
-            thread_on_core = scheduled_queue[this_core].front();
-        }
-        if (this_core < 0 || thread != thread_on_core) {
-            winner = thread;
-            break;
+    std::array<Thread*, Core::Hardware::NUM_CPU_CORES> top_threads{};
+
+    u32 idle_cores{};
+
+    // Step 1: Get top thread in schedule queue.
+    for (u32 core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
+        Thread* top_thread =
+            scheduled_queue[core].empty() ? nullptr : scheduled_queue[core].front();
+        if (top_thread != nullptr) {
+            // TODO(Blinkhawk): Implement Thread Pinning
+        } else {
+            idle_cores |= (1ul << core);
         }
-        sug_cores.insert(this_core);
+        top_threads[core] = top_thread;
     }
-    // if we got a suggested thread, select it, else do a second pass.
-    if (winner && winner->GetPriority() > 2) {
-        if (winner->IsRunning()) {
-            UnloadThread(static_cast<u32>(winner->GetProcessorID()));
+
+    while (idle_cores != 0) {
+        u32 core_id = Common::CountTrailingZeroes32(idle_cores);
+
+        if (!suggested_queue[core_id].empty()) {
+            std::array<s32, Core::Hardware::NUM_CPU_CORES> migration_candidates{};
+            std::size_t num_candidates = 0;
+            auto iter = suggested_queue[core_id].begin();
+            Thread* suggested = nullptr;
+            // Step 2: Try selecting a suggested thread.
+            while (iter != suggested_queue[core_id].end()) {
+                suggested = *iter;
+                iter++;
+                s32 suggested_core_id = suggested->GetProcessorID();
+                Thread* top_thread =
+                    suggested_core_id >= 0 ? top_threads[suggested_core_id] : nullptr;
+                if (top_thread != suggested) {
+                    if (top_thread != nullptr &&
+                        top_thread->GetPriority() < THREADPRIO_MAX_CORE_MIGRATION) {
+                        suggested = nullptr;
+                        break;
+                        // Top thread's priority is too high for core migration; cancel.
+                    }
+                    TransferToCore(suggested->GetPriority(), static_cast<s32>(core_id), suggested);
+                    break;
+                }
+                suggested = nullptr;
+                migration_candidates[num_candidates++] = suggested_core_id;
+            }
+            // Step 3: Select a suggested thread from another core
+            if (suggested == nullptr) {
+                for (std::size_t i = 0; i < num_candidates; i++) {
+                    s32 candidate_core = migration_candidates[i];
+                    suggested = top_threads[candidate_core];
+                    auto it = scheduled_queue[candidate_core].begin();
+                    it++;
+                    Thread* next = it != scheduled_queue[candidate_core].end() ? *it : nullptr;
+                    if (next != nullptr) {
+                        TransferToCore(suggested->GetPriority(), static_cast<s32>(core_id),
+                                       suggested);
+                        top_threads[candidate_core] = next;
+                        break;
+                    } else {
+                        suggested = nullptr;
+                    }
+                }
+            }
+            top_threads[core_id] = suggested;
         }
-        TransferToCore(winner->GetPriority(), static_cast<s32>(core), winner);
-        update_thread(winner, sched);
-        return;
+
+        idle_cores &= ~(1ul << core_id);
     }
-    // Step 3: Select a suggested thread from another core
-    for (auto& src_core : sug_cores) {
-        auto it = scheduled_queue[src_core].begin();
-        it++;
-        if (it != scheduled_queue[src_core].end()) {
-            Thread* thread_on_core = scheduled_queue[src_core].front();
-            Thread* to_change = *it;
-            if (thread_on_core->IsRunning() || to_change->IsRunning()) {
-                UnloadThread(static_cast<u32>(src_core));
-            }
-            TransferToCore(thread_on_core->GetPriority(), static_cast<s32>(core), thread_on_core);
-            current_thread = thread_on_core;
-            break;
+    u32 cores_needing_context_switch{};
+    for (u32 core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
+        Scheduler& sched = kernel.Scheduler(core);
+        ASSERT(top_threads[core] == nullptr || top_threads[core]->GetProcessorID() == core);
+        if (update_thread(top_threads[core], sched)) {
+            cores_needing_context_switch |= (1ul << core);
         }
     }
-    update_thread(current_thread, sched);
+    return cores_needing_context_switch;
 }
 
 bool GlobalScheduler::YieldThread(Thread* yielding_thread) {
+    ASSERT(is_locked);
     // Note: caller should use critical section, etc.
+    if (!yielding_thread->IsRunnable()) {
+        // Normally this case shouldn't happen except for SetThreadActivity.
+        is_reselection_pending.store(true, std::memory_order_release);
+        return false;
+    }
     const u32 core_id = static_cast<u32>(yielding_thread->GetProcessorID());
     const u32 priority = yielding_thread->GetPriority();
 
     // Yield the thread
-    const Thread* const winner = scheduled_queue[core_id].front(priority);
-    ASSERT_MSG(yielding_thread == winner, "Thread yielding without being in front");
-    scheduled_queue[core_id].yield(priority);
+    Reschedule(priority, core_id, yielding_thread);
+    const Thread* const winner = scheduled_queue[core_id].front();
+    if (kernel.GetCurrentHostThreadID() != core_id) {
+        is_reselection_pending.store(true, std::memory_order_release);
+    }
 
     return AskForReselectionOrMarkRedundant(yielding_thread, winner);
 }
 
 bool GlobalScheduler::YieldThreadAndBalanceLoad(Thread* yielding_thread) {
+    ASSERT(is_locked);
     // Note: caller should check if !thread.IsSchedulerOperationRedundant and use critical section,
     // etc.
+    if (!yielding_thread->IsRunnable()) {
+        // Normally this case shouldn't happen except for SetThreadActivity.
+        is_reselection_pending.store(true, std::memory_order_release);
+        return false;
+    }
     const u32 core_id = static_cast<u32>(yielding_thread->GetProcessorID());
     const u32 priority = yielding_thread->GetPriority();
 
     // Yield the thread
-    ASSERT_MSG(yielding_thread == scheduled_queue[core_id].front(priority),
-               "Thread yielding without being in front");
-    scheduled_queue[core_id].yield(priority);
+    Reschedule(priority, core_id, yielding_thread);
 
     std::array<Thread*, Core::Hardware::NUM_CPU_CORES> current_threads;
     for (std::size_t i = 0; i < current_threads.size(); i++) {
@@ -153,21 +203,28 @@ bool GlobalScheduler::YieldThreadAndBalanceLoad(Thread* yielding_thread) {
 
     if (winner != nullptr) {
         if (winner != yielding_thread) {
-            if (winner->IsRunning()) {
-                UnloadThread(static_cast<u32>(winner->GetProcessorID()));
-            }
             TransferToCore(winner->GetPriority(), s32(core_id), winner);
         }
     } else {
         winner = next_thread;
     }
 
+    if (kernel.GetCurrentHostThreadID() != core_id) {
+        is_reselection_pending.store(true, std::memory_order_release);
+    }
+
     return AskForReselectionOrMarkRedundant(yielding_thread, winner);
 }
 
 bool GlobalScheduler::YieldThreadAndWaitForLoadBalancing(Thread* yielding_thread) {
+    ASSERT(is_locked);
     // Note: caller should check if !thread.IsSchedulerOperationRedundant and use critical section,
     // etc.
+    if (!yielding_thread->IsRunnable()) {
+        // Normally this case shouldn't happen except for SetThreadActivity.
+        is_reselection_pending.store(true, std::memory_order_release);
+        return false;
+    }
     Thread* winner = nullptr;
     const u32 core_id = static_cast<u32>(yielding_thread->GetProcessorID());
 
@@ -195,25 +252,31 @@ bool GlobalScheduler::YieldThreadAndWaitForLoadBalancing(Thread* yielding_thread
         }
         if (winner != nullptr) {
             if (winner != yielding_thread) {
-                if (winner->IsRunning()) {
-                    UnloadThread(static_cast<u32>(winner->GetProcessorID()));
-                }
                 TransferToCore(winner->GetPriority(), static_cast<s32>(core_id), winner);
             }
         } else {
             winner = yielding_thread;
         }
+    } else {
+        winner = scheduled_queue[core_id].front();
+    }
+
+    if (kernel.GetCurrentHostThreadID() != core_id) {
+        is_reselection_pending.store(true, std::memory_order_release);
     }
 
     return AskForReselectionOrMarkRedundant(yielding_thread, winner);
 }
 
 void GlobalScheduler::PreemptThreads() {
+    ASSERT(is_locked);
     for (std::size_t core_id = 0; core_id < Core::Hardware::NUM_CPU_CORES; core_id++) {
         const u32 priority = preemption_priorities[core_id];
 
         if (scheduled_queue[core_id].size(priority) > 0) {
-            scheduled_queue[core_id].front(priority)->IncrementYieldCount();
+            if (scheduled_queue[core_id].size(priority) > 1) {
+                scheduled_queue[core_id].front(priority)->IncrementYieldCount();
+            }
             scheduled_queue[core_id].yield(priority);
             if (scheduled_queue[core_id].size(priority) > 1) {
                 scheduled_queue[core_id].front(priority)->IncrementYieldCount();
@@ -247,9 +310,6 @@ void GlobalScheduler::PreemptThreads() {
         }
 
         if (winner != nullptr) {
-            if (winner->IsRunning()) {
-                UnloadThread(static_cast<u32>(winner->GetProcessorID()));
-            }
             TransferToCore(winner->GetPriority(), s32(core_id), winner);
             current_thread =
                 winner->GetPriority() <= current_thread->GetPriority() ? winner : current_thread;
@@ -280,9 +340,6 @@ void GlobalScheduler::PreemptThreads() {
             }
 
             if (winner != nullptr) {
-                if (winner->IsRunning()) {
-                    UnloadThread(static_cast<u32>(winner->GetProcessorID()));
-                }
                 TransferToCore(winner->GetPriority(), s32(core_id), winner);
                 current_thread = winner;
             }
@@ -292,34 +349,65 @@ void GlobalScheduler::PreemptThreads() {
     }
 }
 
+void GlobalScheduler::EnableInterruptAndSchedule(u32 cores_pending_reschedule,
+                                                 Core::EmuThreadHandle global_thread) {
+    u32 current_core = global_thread.host_handle;
+    bool must_context_switch = global_thread.guest_handle != InvalidHandle &&
+                               (current_core < Core::Hardware::NUM_CPU_CORES);
+    while (cores_pending_reschedule != 0) {
+        u32 core = Common::CountTrailingZeroes32(cores_pending_reschedule);
+        ASSERT(core < Core::Hardware::NUM_CPU_CORES);
+        if (!must_context_switch || core != current_core) {
+            auto& phys_core = kernel.PhysicalCore(core);
+            phys_core.Interrupt();
+        } else {
+            must_context_switch = true;
+        }
+        cores_pending_reschedule &= ~(1ul << core);
+    }
+    if (must_context_switch) {
+        auto& core_scheduler = kernel.CurrentScheduler();
+        kernel.ExitSVCProfile();
+        core_scheduler.TryDoContextSwitch();
+        kernel.EnterSVCProfile();
+    }
+}
+
 void GlobalScheduler::Suggest(u32 priority, std::size_t core, Thread* thread) {
+    ASSERT(is_locked);
     suggested_queue[core].add(thread, priority);
 }
 
 void GlobalScheduler::Unsuggest(u32 priority, std::size_t core, Thread* thread) {
+    ASSERT(is_locked);
     suggested_queue[core].remove(thread, priority);
 }
 
 void GlobalScheduler::Schedule(u32 priority, std::size_t core, Thread* thread) {
+    ASSERT(is_locked);
     ASSERT_MSG(thread->GetProcessorID() == s32(core), "Thread must be assigned to this core.");
     scheduled_queue[core].add(thread, priority);
 }
 
 void GlobalScheduler::SchedulePrepend(u32 priority, std::size_t core, Thread* thread) {
+    ASSERT(is_locked);
     ASSERT_MSG(thread->GetProcessorID() == s32(core), "Thread must be assigned to this core.");
     scheduled_queue[core].add(thread, priority, false);
 }
 
 void GlobalScheduler::Reschedule(u32 priority, std::size_t core, Thread* thread) {
+    ASSERT(is_locked);
     scheduled_queue[core].remove(thread, priority);
     scheduled_queue[core].add(thread, priority);
 }
 
 void GlobalScheduler::Unschedule(u32 priority, std::size_t core, Thread* thread) {
+    ASSERT(is_locked);
     scheduled_queue[core].remove(thread, priority);
 }
 
 void GlobalScheduler::TransferToCore(u32 priority, s32 destination_core, Thread* thread) {
+    ASSERT(is_locked);
     const bool schedulable = thread->GetPriority() < THREADPRIO_COUNT;
     const s32 source_core = thread->GetProcessorID();
     if (source_core == destination_core || !schedulable) {
@@ -349,6 +437,108 @@ bool GlobalScheduler::AskForReselectionOrMarkRedundant(Thread* current_thread,
     }
 }
 
+void GlobalScheduler::AdjustSchedulingOnStatus(Thread* thread, u32 old_flags) {
+    if (old_flags == thread->scheduling_state) {
+        return;
+    }
+    ASSERT(is_locked);
+
+    if (old_flags == static_cast<u32>(ThreadSchedStatus::Runnable)) {
+        // In this case the thread was runnable, now it's pausing/exiting
+        if (thread->processor_id >= 0) {
+            Unschedule(thread->current_priority, static_cast<u32>(thread->processor_id), thread);
+        }
+
+        for (u32 core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
+            if (core != static_cast<u32>(thread->processor_id) &&
+                ((thread->affinity_mask >> core) & 1) != 0) {
+                Unsuggest(thread->current_priority, core, thread);
+            }
+        }
+    } else if (thread->scheduling_state == static_cast<u32>(ThreadSchedStatus::Runnable)) {
+        // The thread was not runnable before and has now become runnable
+        if (thread->processor_id >= 0) {
+            Schedule(thread->current_priority, static_cast<u32>(thread->processor_id), thread);
+        }
+
+        for (u32 core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
+            if (core != static_cast<u32>(thread->processor_id) &&
+                ((thread->affinity_mask >> core) & 1) != 0) {
+                Suggest(thread->current_priority, core, thread);
+            }
+        }
+    }
+
+    SetReselectionPending();
+}
+
+void GlobalScheduler::AdjustSchedulingOnPriority(Thread* thread, u32 old_priority) {
+    if (thread->scheduling_state != static_cast<u32>(ThreadSchedStatus::Runnable)) {
+        return;
+    }
+    ASSERT(is_locked);
+    if (thread->processor_id >= 0) {
+        Unschedule(old_priority, static_cast<u32>(thread->processor_id), thread);
+    }
+
+    for (u32 core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
+        if (core != static_cast<u32>(thread->processor_id) &&
+            ((thread->affinity_mask >> core) & 1) != 0) {
+            Unsuggest(old_priority, core, thread);
+        }
+    }
+
+    if (thread->processor_id >= 0) {
+        if (thread == kernel.CurrentScheduler().GetCurrentThread()) {
+            SchedulePrepend(thread->current_priority, static_cast<u32>(thread->processor_id),
+                            thread);
+        } else {
+            Schedule(thread->current_priority, static_cast<u32>(thread->processor_id), thread);
+        }
+    }
+
+    for (u32 core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
+        if (core != static_cast<u32>(thread->processor_id) &&
+            ((thread->affinity_mask >> core) & 1) != 0) {
+            Suggest(thread->current_priority, core, thread);
+        }
+    }
+    thread->IncrementYieldCount();
+    SetReselectionPending();
+}
+
+void GlobalScheduler::AdjustSchedulingOnAffinity(Thread* thread, u64 old_affinity_mask,
+                                                 s32 old_core) {
+    if (thread->scheduling_state != static_cast<u32>(ThreadSchedStatus::Runnable) ||
+        thread->current_priority >= THREADPRIO_COUNT) {
+        return;
+    }
+    ASSERT(is_locked);
+
+    for (u32 core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
+        if (((old_affinity_mask >> core) & 1) != 0) {
+            if (core == static_cast<u32>(old_core)) {
+                Unschedule(thread->current_priority, core, thread);
+            } else {
+                Unsuggest(thread->current_priority, core, thread);
+            }
+        }
+    }
+
+    for (u32 core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
+        if (((thread->affinity_mask >> core) & 1) != 0) {
+            if (core == static_cast<u32>(thread->processor_id)) {
+                Schedule(thread->current_priority, core, thread);
+            } else {
+                Suggest(thread->current_priority, core, thread);
+            }
+        }
+    }
+
+    thread->IncrementYieldCount();
+    SetReselectionPending();
+}
+
 void GlobalScheduler::Shutdown() {
     for (std::size_t core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
         scheduled_queue[core].clear();
@@ -359,10 +549,12 @@ void GlobalScheduler::Shutdown() {
 
 void GlobalScheduler::Lock() {
     Core::EmuThreadHandle current_thread = kernel.GetCurrentEmuThreadID();
+    ASSERT(!current_thread.IsInvalid());
     if (current_thread == current_owner) {
         ++scope_lock;
     } else {
         inner_lock.lock();
+        is_locked = true;
         current_owner = current_thread;
         ASSERT(current_owner != Core::EmuThreadHandle::InvalidHandle());
         scope_lock = 1;
@@ -374,17 +566,18 @@ void GlobalScheduler::Unlock() {
         ASSERT(scope_lock > 0);
         return;
     }
-    for (std::size_t i = 0; i < Core::Hardware::NUM_CPU_CORES; i++) {
-        SelectThread(i);
-    }
+    u32 cores_pending_reschedule = SelectThreads();
+    Core::EmuThreadHandle leaving_thread = current_owner;
     current_owner = Core::EmuThreadHandle::InvalidHandle();
     scope_lock = 1;
+    is_locked = false;
     inner_lock.unlock();
-    // TODO(Blinkhawk): Setup the interrupts and change context on current core.
+    EnableInterruptAndSchedule(cores_pending_reschedule, leaving_thread);
 }
 
-Scheduler::Scheduler(Core::System& system, std::size_t core_id)
-    : system{system}, core_id{core_id} {}
+Scheduler::Scheduler(Core::System& system, std::size_t core_id) : system(system), core_id(core_id) {
+    switch_fiber = std::make_shared<Common::Fiber>(std::function<void(void*)>(OnSwitch), this);
+}
 
 Scheduler::~Scheduler() = default;
 
@@ -393,56 +586,128 @@ bool Scheduler::HaveReadyThreads() const {
 }
 
 Thread* Scheduler::GetCurrentThread() const {
-    return current_thread.get();
+    if (current_thread) {
+        return current_thread.get();
+    }
+    return idle_thread.get();
 }
 
 Thread* Scheduler::GetSelectedThread() const {
     return selected_thread.get();
 }
 
-void Scheduler::SelectThreads() {
-    system.GlobalScheduler().SelectThread(core_id);
-}
-
 u64 Scheduler::GetLastContextSwitchTicks() const {
     return last_context_switch_time;
 }
 
 void Scheduler::TryDoContextSwitch() {
+    auto& phys_core = system.Kernel().CurrentPhysicalCore();
+    if (phys_core.IsInterrupted()) {
+        phys_core.ClearInterrupt();
+    }
+    guard.lock();
     if (is_context_switch_pending) {
         SwitchContext();
+    } else {
+        guard.unlock();
     }
 }
 
-void Scheduler::UnloadThread() {
-    Thread* const previous_thread = GetCurrentThread();
-    Process* const previous_process = system.Kernel().CurrentProcess();
+void Scheduler::OnThreadStart() {
+    SwitchContextStep2();
+}
 
-    UpdateLastContextSwitchTime(previous_thread, previous_process);
+void Scheduler::Unload() {
+    Thread* thread = current_thread.get();
+    if (thread) {
+        thread->SetContinuousOnSVC(false);
+        thread->last_running_ticks = system.CoreTiming().GetCPUTicks();
+        thread->SetIsRunning(false);
+        if (!thread->IsHLEThread() && !thread->HasExited()) {
+            Core::ARM_Interface& cpu_core = thread->ArmInterface();
+            cpu_core.SaveContext(thread->GetContext32());
+            cpu_core.SaveContext(thread->GetContext64());
+            // Save the TPIDR_EL0 system register in case it was modified.
+            thread->SetTPIDR_EL0(cpu_core.GetTPIDR_EL0());
+            cpu_core.ClearExclusiveState();
+        }
+        thread->context_guard.unlock();
+    }
+}
 
-    // Save context for previous thread
-    if (previous_thread) {
-        system.ArmInterface(core_id).SaveContext(previous_thread->GetContext32());
-        system.ArmInterface(core_id).SaveContext(previous_thread->GetContext64());
-        // Save the TPIDR_EL0 system register in case it was modified.
-        previous_thread->SetTPIDR_EL0(system.ArmInterface(core_id).GetTPIDR_EL0());
+void Scheduler::Reload() {
+    Thread* thread = current_thread.get();
+    if (thread) {
+        ASSERT_MSG(thread->GetSchedulingStatus() == ThreadSchedStatus::Runnable,
+                   "Thread must be runnable.");
+
+        // Mark the thread as running and record the tick at which it resumed
+        thread->SetIsRunning(true);
+        thread->SetWasRunning(false);
+        thread->last_running_ticks = system.CoreTiming().GetCPUTicks();
 
-        if (previous_thread->GetStatus() == ThreadStatus::Running) {
-            // This is only the case when a reschedule is triggered without the current thread
-            // yielding execution (i.e. an event triggered, system core time-sliced, etc)
-            previous_thread->SetStatus(ThreadStatus::Ready);
+        auto* const thread_owner_process = thread->GetOwnerProcess();
+        if (thread_owner_process != nullptr) {
+            system.Kernel().MakeCurrentProcess(thread_owner_process);
+        }
+        if (!thread->IsHLEThread()) {
+            Core::ARM_Interface& cpu_core = thread->ArmInterface();
+            cpu_core.LoadContext(thread->GetContext32());
+            cpu_core.LoadContext(thread->GetContext64());
+            cpu_core.SetTlsAddress(thread->GetTLSAddress());
+            cpu_core.SetTPIDR_EL0(thread->GetTPIDR_EL0());
+            cpu_core.ChangeProcessorID(this->core_id);
+            cpu_core.ClearExclusiveState();
         }
-        previous_thread->SetIsRunning(false);
     }
-    current_thread = nullptr;
+}
+
+void Scheduler::SwitchContextStep2() {
+    Thread* previous_thread = current_thread_prev.get();
+    Thread* new_thread = selected_thread.get();
+
+    // Load context of new thread
+    Process* const previous_process =
+        previous_thread != nullptr ? previous_thread->GetOwnerProcess() : nullptr;
+
+    if (new_thread) {
+        ASSERT_MSG(new_thread->GetSchedulingStatus() == ThreadSchedStatus::Runnable,
+                   "Thread must be runnable.");
+
+        // Mark the thread as running and record the tick at which it resumed
+        new_thread->SetIsRunning(true);
+        new_thread->last_running_ticks = system.CoreTiming().GetCPUTicks();
+        new_thread->SetWasRunning(false);
+
+        auto* const thread_owner_process = current_thread->GetOwnerProcess();
+        if (thread_owner_process != nullptr) {
+            system.Kernel().MakeCurrentProcess(thread_owner_process);
+        }
+        if (!new_thread->IsHLEThread()) {
+            Core::ARM_Interface& cpu_core = new_thread->ArmInterface();
+            cpu_core.LoadContext(new_thread->GetContext32());
+            cpu_core.LoadContext(new_thread->GetContext64());
+            cpu_core.SetTlsAddress(new_thread->GetTLSAddress());
+            cpu_core.SetTPIDR_EL0(new_thread->GetTPIDR_EL0());
+            cpu_core.ChangeProcessorID(this->core_id);
+            cpu_core.ClearExclusiveState();
+        }
+    }
+
+    TryDoContextSwitch();
 }
 
 void Scheduler::SwitchContext() {
-    Thread* const previous_thread = GetCurrentThread();
-    Thread* const new_thread = GetSelectedThread();
+    current_thread_prev = current_thread;
+    selected_thread = selected_thread_set;
+    Thread* previous_thread = current_thread_prev.get();
+    Thread* new_thread = selected_thread.get();
+    current_thread = selected_thread;
 
     is_context_switch_pending = false;
+
     if (new_thread == previous_thread) {
+        guard.unlock();
         return;
     }
 
@@ -452,51 +717,76 @@ void Scheduler::SwitchContext() {
 
     // Save context for previous thread
     if (previous_thread) {
-        system.ArmInterface(core_id).SaveContext(previous_thread->GetContext32());
-        system.ArmInterface(core_id).SaveContext(previous_thread->GetContext64());
-        // Save the TPIDR_EL0 system register in case it was modified.
-        previous_thread->SetTPIDR_EL0(system.ArmInterface(core_id).GetTPIDR_EL0());
-
-        if (previous_thread->GetStatus() == ThreadStatus::Running) {
-            // This is only the case when a reschedule is triggered without the current thread
-            // yielding execution (i.e. an event triggered, system core time-sliced, etc)
-            previous_thread->SetStatus(ThreadStatus::Ready);
+        if (new_thread != nullptr && new_thread->IsSuspendThread()) {
+            previous_thread->SetWasRunning(true);
         }
+        previous_thread->SetContinuousOnSVC(false);
+        previous_thread->last_running_ticks = system.CoreTiming().GetCPUTicks();
         previous_thread->SetIsRunning(false);
+        if (!previous_thread->IsHLEThread() && !previous_thread->HasExited()) {
+            Core::ARM_Interface& cpu_core = previous_thread->ArmInterface();
+            cpu_core.SaveContext(previous_thread->GetContext32());
+            cpu_core.SaveContext(previous_thread->GetContext64());
+            // Save the TPIDR_EL0 system register in case it was modified.
+            previous_thread->SetTPIDR_EL0(cpu_core.GetTPIDR_EL0());
+            cpu_core.ClearExclusiveState();
+        }
+        previous_thread->context_guard.unlock();
     }
 
-    // Load context of new thread
-    if (new_thread) {
-        ASSERT_MSG(new_thread->GetProcessorID() == s32(this->core_id),
-                   "Thread must be assigned to this core.");
-        ASSERT_MSG(new_thread->GetStatus() == ThreadStatus::Ready,
-                   "Thread must be ready to become running.");
+    std::shared_ptr<Common::Fiber>* old_context;
+    if (previous_thread != nullptr) {
+        old_context = &previous_thread->GetHostContext();
+    } else {
+        old_context = &idle_thread->GetHostContext();
+    }
+    guard.unlock();
 
-        // Cancel any outstanding wakeup events for this thread
-        new_thread->CancelWakeupTimer();
-        current_thread = SharedFrom(new_thread);
-        new_thread->SetStatus(ThreadStatus::Running);
-        new_thread->SetIsRunning(true);
+    Common::Fiber::YieldTo(*old_context, switch_fiber);
+    /// When a thread wakes up, it may be on another core, so fetch that core's scheduler.
+    auto& next_scheduler = system.Kernel().CurrentScheduler();
+    next_scheduler.SwitchContextStep2();
+}
 
-        auto* const thread_owner_process = current_thread->GetOwnerProcess();
-        if (previous_process != thread_owner_process) {
-            system.Kernel().MakeCurrentProcess(thread_owner_process);
-        }
+void Scheduler::OnSwitch(void* this_scheduler) {
+    Scheduler* sched = static_cast<Scheduler*>(this_scheduler);
+    sched->SwitchToCurrent();
+}
 
-        system.ArmInterface(core_id).LoadContext(new_thread->GetContext32());
-        system.ArmInterface(core_id).LoadContext(new_thread->GetContext64());
-        system.ArmInterface(core_id).SetTlsAddress(new_thread->GetTLSAddress());
-        system.ArmInterface(core_id).SetTPIDR_EL0(new_thread->GetTPIDR_EL0());
-    } else {
-        current_thread = nullptr;
-        // Note: We do not reset the current process and current page table when idling because
-        // technically we haven't changed processes, our threads are just paused.
+void Scheduler::SwitchToCurrent() {
+    while (true) {
+        {
+            std::scoped_lock lock{guard};
+            selected_thread = selected_thread_set;
+            current_thread = selected_thread;
+            is_context_switch_pending = false;
+        }
+        while (!is_context_switch_pending) {
+            if (current_thread != nullptr && !current_thread->IsHLEThread()) {
+                current_thread->context_guard.lock();
+                if (!current_thread->IsRunnable()) {
+                    current_thread->context_guard.unlock();
+                    break;
+                }
+                if (current_thread->GetProcessorID() != core_id) {
+                    current_thread->context_guard.unlock();
+                    break;
+                }
+            }
+            std::shared_ptr<Common::Fiber>* next_context;
+            if (current_thread != nullptr) {
+                next_context = &current_thread->GetHostContext();
+            } else {
+                next_context = &idle_thread->GetHostContext();
+            }
+            Common::Fiber::YieldTo(switch_fiber, *next_context);
+        }
     }
 }
 
 void Scheduler::UpdateLastContextSwitchTime(Thread* thread, Process* process) {
     const u64 prev_switch_ticks = last_context_switch_time;
-    const u64 most_recent_switch_ticks = system.CoreTiming().GetTicks();
+    const u64 most_recent_switch_ticks = system.CoreTiming().GetCPUTicks();
     const u64 update_ticks = most_recent_switch_ticks - prev_switch_ticks;
 
     if (thread != nullptr) {
@@ -510,6 +800,16 @@ void Scheduler::UpdateLastContextSwitchTime(Thread* thread, Process* process) {
     last_context_switch_time = most_recent_switch_ticks;
 }
 
+void Scheduler::Initialize() {
+    // Creates this core's idle thread (kernel + HLE + idle flags) and stores it in idle_thread.
+    std::string idle_name = "Idle Thread Id:" + std::to_string(core_id);
+    std::function<void(void*)> entry = system.GetCpuManager().GetIdleThreadStartFunc();
+    void* entry_arg = system.GetCpuManager().GetStartFuncParamater();
+    auto kind = static_cast<ThreadType>(THREADTYPE_KERNEL | THREADTYPE_HLE | THREADTYPE_IDLE);
+    idle_thread = Thread::Create(system, kind, idle_name, 0, 64, 0, static_cast<u32>(core_id), 0,
+                                 nullptr, std::move(entry), entry_arg).Unwrap();
+}
+
 void Scheduler::Shutdown() {
     current_thread = nullptr;
     selected_thread = nullptr;
@@ -538,4 +838,13 @@ SchedulerLockAndSleep::~SchedulerLockAndSleep() {
     time_manager.ScheduleTimeEvent(event_handle, time_task, nanoseconds);
 }
 
+// Schedules the time event once; does nothing when sleep_cancelled is already set.
+void SchedulerLockAndSleep::Release() {
+    if (!sleep_cancelled) {
+        auto& timer = kernel.TimeManager();
+        timer.ScheduleTimeEvent(event_handle, time_task, nanoseconds);
+        sleep_cancelled = true; // a second call becomes a no-op
+    }
+}
+
 } // namespace Kernel
diff --git a/src/core/hle/kernel/scheduler.h b/src/core/hle/kernel/scheduler.h
index 07df33f9c..b3b4b5169 100644
--- a/src/core/hle/kernel/scheduler.h
+++ b/src/core/hle/kernel/scheduler.h
@@ -11,9 +11,14 @@
 
 #include "common/common_types.h"
 #include "common/multi_level_queue.h"
+#include "common/spin_lock.h"
 #include "core/hardware_properties.h"
 #include "core/hle/kernel/thread.h"
 
+namespace Common {
+class Fiber;
+}
+
 namespace Core {
 class ARM_Interface;
 class System;
@@ -41,41 +46,17 @@ public:
         return thread_list;
     }
 
-    /**
-     * Add a thread to the suggested queue of a cpu core. Suggested threads may be
-     * picked if no thread is scheduled to run on the core.
-     */
-    void Suggest(u32 priority, std::size_t core, Thread* thread);
-
-    /**
-     * Remove a thread to the suggested queue of a cpu core. Suggested threads may be
-     * picked if no thread is scheduled to run on the core.
-     */
-    void Unsuggest(u32 priority, std::size_t core, Thread* thread);
-
-    /**
-     * Add a thread to the scheduling queue of a cpu core. The thread is added at the
-     * back the queue in its priority level.
-     */
-    void Schedule(u32 priority, std::size_t core, Thread* thread);
-
-    /**
-     * Add a thread to the scheduling queue of a cpu core. The thread is added at the
-     * front the queue in its priority level.
-     */
-    void SchedulePrepend(u32 priority, std::size_t core, Thread* thread);
+    /// Notify the scheduler a thread's status has changed.
+    void AdjustSchedulingOnStatus(Thread* thread, u32 old_flags);
 
-    /// Reschedule an already scheduled thread based on a new priority
-    void Reschedule(u32 priority, std::size_t core, Thread* thread);
-
-    /// Unschedules a thread.
-    void Unschedule(u32 priority, std::size_t core, Thread* thread);
+    /// Notify the scheduler a thread's priority has changed.
+    void AdjustSchedulingOnPriority(Thread* thread, u32 old_priority);
 
-    /// Selects a core and forces it to unload its current thread's context
-    void UnloadThread(std::size_t core);
+    /// Notify the scheduler a thread's core and/or affinity mask has changed.
+    void AdjustSchedulingOnAffinity(Thread* thread, u64 old_affinity_mask, s32 old_core);
 
     /**
-     * Takes care of selecting the new scheduled thread in three steps:
+     * Takes care of selecting the new scheduled threads in three steps:
      *
      * 1. First a thread is selected from the top of the priority queue. If no thread
      *    is obtained then we move to step two, else we are done.
@@ -85,8 +66,10 @@ public:
      *
      * 3. Third is no suggested thread is found, we do a second pass and pick a running
      *    thread in another core and swap it with its current thread.
+     *
+     * returns the cores needing scheduling.
      */
-    void SelectThread(std::size_t core);
+    u32 SelectThreads();
 
     bool HaveReadyThreads(std::size_t core_id) const {
         return !scheduled_queue[core_id].empty();
@@ -149,6 +132,40 @@ private:
     /// Unlocks the scheduler, reselects threads, interrupts cores for rescheduling
     /// and reschedules current core if needed.
     void Unlock();
+
+    void EnableInterruptAndSchedule(u32 cores_pending_reschedule,
+                                    Core::EmuThreadHandle global_thread);
+
+    /**
+     * Add a thread to the suggested queue of a cpu core. Suggested threads may be
+     * picked if no thread is scheduled to run on the core.
+     */
+    void Suggest(u32 priority, std::size_t core, Thread* thread);
+
+    /**
+     * Remove a thread from the suggested queue of a cpu core. Suggested threads may be
+     * picked if no thread is scheduled to run on the core.
+     */
+    void Unsuggest(u32 priority, std::size_t core, Thread* thread);
+
+    /**
+     * Add a thread to the scheduling queue of a cpu core. The thread is added at the
+     * back of the queue in its priority level.
+     */
+    void Schedule(u32 priority, std::size_t core, Thread* thread);
+
+    /**
+     * Add a thread to the scheduling queue of a cpu core. The thread is added at the
+     * front of the queue in its priority level.
+     */
+    void SchedulePrepend(u32 priority, std::size_t core, Thread* thread);
+
+    /// Reschedule an already scheduled thread based on a new priority
+    void Reschedule(u32 priority, std::size_t core, Thread* thread);
+
+    /// Unschedules a thread.
+    void Unschedule(u32 priority, std::size_t core, Thread* thread);
+
     /**
      * Transfers a thread into an specific core. If the destination_core is -1
      * it will be unscheduled from its source code and added into its suggested
@@ -170,10 +187,13 @@ private:
     std::array<u32, Core::Hardware::NUM_CPU_CORES> preemption_priorities = {59, 59, 59, 62};
 
     /// Scheduler lock mechanisms.
-    std::mutex inner_lock{}; // TODO(Blinkhawk): Replace for a SpinLock
+    bool is_locked{};
+    Common::SpinLock inner_lock{};
     std::atomic<s64> scope_lock{};
     Core::EmuThreadHandle current_owner{Core::EmuThreadHandle::InvalidHandle()};
 
+    Common::SpinLock global_list_guard{};
+
     /// Lists all thread ids that aren't deleted/etc.
     std::vector<std::shared_ptr<Thread>> thread_list;
     KernelCore& kernel;
@@ -190,11 +210,11 @@ public:
     /// Reschedules to the next available thread (call after current thread is suspended)
     void TryDoContextSwitch();
 
-    /// Unloads currently running thread
-    void UnloadThread();
-
-    /// Select the threads in top of the scheduling multilist.
-    void SelectThreads();
+    /// The next two are for SingleCore Only.
+    /// Unload current thread before preempting core.
+    void Unload();
+    /// Reload current thread after core preemption.
+    void Reload();
 
     /// Gets the current running thread
     Thread* GetCurrentThread() const;
@@ -209,15 +229,30 @@ public:
         return is_context_switch_pending;
     }
 
+    void Initialize();
+
     /// Shutdowns the scheduler.
     void Shutdown();
 
+    void OnThreadStart();
+
+    std::shared_ptr<Common::Fiber>& ControlContext() {
+        return switch_fiber;
+    }
+
+    const std::shared_ptr<Common::Fiber>& ControlContext() const {
+        return switch_fiber;
+    }
+
 private:
     friend class GlobalScheduler;
 
     /// Switches the CPU's active thread context to that of the specified thread
     void SwitchContext();
 
+    /// When a thread wakes up, it must run this through its new scheduler
+    void SwitchContextStep2();
+
     /**
      * Called on every context switch to update the internal timestamp
      * This also updates the running time ticks for the given thread and
@@ -231,14 +266,24 @@ private:
      */
     void UpdateLastContextSwitchTime(Thread* thread, Process* process);
 
+    static void OnSwitch(void* this_scheduler);
+    void SwitchToCurrent();
+
     std::shared_ptr<Thread> current_thread = nullptr;
     std::shared_ptr<Thread> selected_thread = nullptr;
+    std::shared_ptr<Thread> current_thread_prev = nullptr;
+    std::shared_ptr<Thread> selected_thread_set = nullptr;
+    std::shared_ptr<Thread> idle_thread = nullptr;
+
+    std::shared_ptr<Common::Fiber> switch_fiber = nullptr;
 
     Core::System& system;
     u64 last_context_switch_time = 0;
     u64 idle_selection_count = 0;
     const std::size_t core_id;
 
+    Common::SpinLock guard{};
+
     bool is_context_switch_pending = false;
 };
 
@@ -261,6 +306,8 @@ public:
         sleep_cancelled = true;
     }
 
+    void Release();
+
 private:
     Handle& event_handle;
     Thread* time_task;
diff --git a/src/core/hle/kernel/server_session.cpp b/src/core/hle/kernel/server_session.cpp
index 25438b86b..7b23a6889 100644
--- a/src/core/hle/kernel/server_session.cpp
+++ b/src/core/hle/kernel/server_session.cpp
@@ -17,6 +17,7 @@
 #include "core/hle/kernel/hle_ipc.h"
 #include "core/hle/kernel/kernel.h"
 #include "core/hle/kernel/process.h"
+#include "core/hle/kernel/scheduler.h"
 #include "core/hle/kernel/server_session.h"
 #include "core/hle/kernel/session.h"
 #include "core/hle/kernel/thread.h"
@@ -168,9 +169,12 @@ ResultCode ServerSession::CompleteSyncRequest() {
     }
 
     // Some service requests require the thread to block
-    if (!context.IsThreadWaiting()) {
-        context.GetThread().ResumeFromWait();
-        context.GetThread().SetWaitSynchronizationResult(result);
+    {
+        SchedulerLock lock(kernel);
+        if (!context.IsThreadWaiting()) {
+            context.GetThread().ResumeFromWait();
+            context.GetThread().SetSynchronizationResults(nullptr, result);
+        }
     }
 
     request_queue.Pop();
@@ -180,8 +184,10 @@ ResultCode ServerSession::CompleteSyncRequest() {
 
 ResultCode ServerSession::HandleSyncRequest(std::shared_ptr<Thread> thread,
                                             Core::Memory::Memory& memory) {
-    Core::System::GetInstance().CoreTiming().ScheduleEvent(20000, request_event, {});
-    return QueueSyncRequest(std::move(thread), memory);
+    ResultCode result = QueueSyncRequest(std::move(thread), memory);
+    const u64 delay = kernel.IsMulticore() ? 0U : 20000U;
+    Core::System::GetInstance().CoreTiming().ScheduleEvent(delay, request_event, {});
+    return result;
 }
 
 } // namespace Kernel
diff --git a/src/core/hle/kernel/svc.cpp b/src/core/hle/kernel/svc.cpp
index 4ae4529f5..5db19dcf3 100644
--- a/src/core/hle/kernel/svc.cpp
+++ b/src/core/hle/kernel/svc.cpp
@@ -10,14 +10,15 @@
 
 #include "common/alignment.h"
 #include "common/assert.h"
+#include "common/fiber.h"
 #include "common/logging/log.h"
 #include "common/microprofile.h"
 #include "common/string_util.h"
 #include "core/arm/exclusive_monitor.h"
 #include "core/core.h"
-#include "core/core_manager.h"
 #include "core/core_timing.h"
 #include "core/core_timing_util.h"
+#include "core/cpu_manager.h"
 #include "core/hle/kernel/address_arbiter.h"
 #include "core/hle/kernel/client_port.h"
 #include "core/hle/kernel/client_session.h"
@@ -27,6 +28,7 @@
 #include "core/hle/kernel/memory/memory_block.h"
 #include "core/hle/kernel/memory/page_table.h"
 #include "core/hle/kernel/mutex.h"
+#include "core/hle/kernel/physical_core.h"
 #include "core/hle/kernel/process.h"
 #include "core/hle/kernel/readable_event.h"
 #include "core/hle/kernel/resource_limit.h"
@@ -37,6 +39,7 @@
 #include "core/hle/kernel/svc_wrap.h"
 #include "core/hle/kernel/synchronization.h"
 #include "core/hle/kernel/thread.h"
+#include "core/hle/kernel/time_manager.h"
 #include "core/hle/kernel/transfer_memory.h"
 #include "core/hle/kernel/writable_event.h"
 #include "core/hle/lock.h"
@@ -133,6 +136,7 @@ enum class ResourceLimitValueType {
 
 ResultVal<s64> RetrieveResourceLimitValue(Core::System& system, Handle resource_limit,
                                           u32 resource_type, ResourceLimitValueType value_type) {
+    std::lock_guard lock{HLE::g_hle_lock};
     const auto type = static_cast<ResourceType>(resource_type);
     if (!IsValidResourceType(type)) {
         LOG_ERROR(Kernel_SVC, "Invalid resource limit type: '{}'", resource_type);
@@ -160,6 +164,7 @@ ResultVal<s64> RetrieveResourceLimitValue(Core::System& system, Handle resource_
 
 /// Set the process heap to a given Size. It can both extend and shrink the heap.
 static ResultCode SetHeapSize(Core::System& system, VAddr* heap_addr, u64 heap_size) {
+    std::lock_guard lock{HLE::g_hle_lock};
     LOG_TRACE(Kernel_SVC, "called, heap_size=0x{:X}", heap_size);
 
     // Size must be a multiple of 0x200000 (2MB) and be equal to or less than 8GB.
@@ -190,6 +195,7 @@ static ResultCode SetHeapSize32(Core::System& system, u32* heap_addr, u32 heap_s
 
 static ResultCode SetMemoryAttribute(Core::System& system, VAddr address, u64 size, u32 mask,
                                      u32 attribute) {
+    std::lock_guard lock{HLE::g_hle_lock};
     LOG_DEBUG(Kernel_SVC,
               "called, address=0x{:016X}, size=0x{:X}, mask=0x{:08X}, attribute=0x{:08X}", address,
               size, mask, attribute);
@@ -226,8 +232,15 @@ static ResultCode SetMemoryAttribute(Core::System& system, VAddr address, u64 si
                                          static_cast<Memory::MemoryAttribute>(attribute));
 }
 
+static ResultCode SetMemoryAttribute32(Core::System& system, u32 address, u32 size, u32 mask,
+                                       u32 attribute) {
+    const auto wide_address = static_cast<VAddr>(address); // convert to the callee's types
+    return SetMemoryAttribute(system, wide_address, static_cast<u64>(size), mask, attribute);
+}
+
 /// Maps a memory range into a different range.
 static ResultCode MapMemory(Core::System& system, VAddr dst_addr, VAddr src_addr, u64 size) {
+    std::lock_guard lock{HLE::g_hle_lock};
     LOG_TRACE(Kernel_SVC, "called, dst_addr=0x{:X}, src_addr=0x{:X}, size=0x{:X}", dst_addr,
               src_addr, size);
 
@@ -241,8 +254,14 @@ static ResultCode MapMemory(Core::System& system, VAddr dst_addr, VAddr src_addr
     return page_table.Map(dst_addr, src_addr, size);
 }
 
+static ResultCode MapMemory32(Core::System& system, u32 dst_addr, u32 src_addr, u32 size) {
+    return MapMemory(system, static_cast<VAddr>(dst_addr), static_cast<VAddr>(src_addr),
+                     static_cast<std::size_t>(size));
+}
+
 /// Unmaps a region that was previously mapped with svcMapMemory
 static ResultCode UnmapMemory(Core::System& system, VAddr dst_addr, VAddr src_addr, u64 size) {
+    std::lock_guard lock{HLE::g_hle_lock};
     LOG_TRACE(Kernel_SVC, "called, dst_addr=0x{:X}, src_addr=0x{:X}, size=0x{:X}", dst_addr,
               src_addr, size);
 
@@ -256,9 +275,15 @@ static ResultCode UnmapMemory(Core::System& system, VAddr dst_addr, VAddr src_ad
     return page_table.Unmap(dst_addr, src_addr, size);
 }
 
+static ResultCode UnmapMemory32(Core::System& system, u32 dst_addr, u32 src_addr, u32 size) {
+    return UnmapMemory(system, static_cast<VAddr>(dst_addr), static_cast<VAddr>(src_addr),
+                       static_cast<std::size_t>(size));
+}
+
 /// Connect to an OS service given the port name, returns the handle to the port to out
 static ResultCode ConnectToNamedPort(Core::System& system, Handle* out_handle,
                                      VAddr port_name_address) {
+    std::lock_guard lock{HLE::g_hle_lock};
     auto& memory = system.Memory();
 
     if (!memory.IsValidVirtualAddress(port_name_address)) {
@@ -317,11 +342,30 @@ static ResultCode SendSyncRequest(Core::System& system, Handle handle) {
     LOG_TRACE(Kernel_SVC, "called handle=0x{:08X}({})", handle, session->GetName());
 
     auto thread = system.CurrentScheduler().GetCurrentThread();
-    thread->InvalidateWakeupCallback();
-    thread->SetStatus(ThreadStatus::WaitIPC);
-    system.PrepareReschedule(thread->GetProcessorID());
+    {
+        SchedulerLock lock(system.Kernel());
+        thread->InvalidateHLECallback();
+        thread->SetStatus(ThreadStatus::WaitIPC);
+        session->SendSyncRequest(SharedFrom(thread), system.Memory());
+    }
+
+    if (thread->HasHLECallback()) {
+        Handle event_handle = thread->GetHLETimeEvent();
+        if (event_handle != InvalidHandle) {
+            auto& time_manager = system.Kernel().TimeManager();
+            time_manager.UnscheduleTimeEvent(event_handle);
+        }
+
+        {
+            SchedulerLock lock(system.Kernel());
+            auto* sync_object = thread->GetHLESyncObject();
+            sync_object->RemoveWaitingThread(SharedFrom(thread));
+        }
+
+        thread->InvokeHLECallback(SharedFrom(thread));
+    }
 
-    return session->SendSyncRequest(SharedFrom(thread), system.Memory());
+    return thread->GetSignalingResult();
 }
 
 static ResultCode SendSyncRequest32(Core::System& system, Handle handle) {
@@ -383,6 +427,15 @@ static ResultCode GetProcessId(Core::System& system, u64* process_id, Handle han
     return ERR_INVALID_HANDLE;
 }
 
+static ResultCode GetProcessId32(Core::System& system, u32* process_id_low, u32* process_id_high,
+                                 Handle handle) {
+    u64 full_id = 0; // stays 0 unless GetProcessId writes to it
+    const ResultCode rc = GetProcessId(system, &full_id, handle);
+    *process_id_high = static_cast<u32>(full_id >> 32);
+    *process_id_low = static_cast<u32>(full_id & 0xFFFFFFFFULL);
+    return rc;
+}
+
 /// Wait for the given handles to synchronize, timeout after the specified nanoseconds
 static ResultCode WaitSynchronization(Core::System& system, Handle* index, VAddr handles_address,
                                       u64 handle_count, s64 nano_seconds) {
@@ -447,10 +500,13 @@ static ResultCode CancelSynchronization(Core::System& system, Handle thread_hand
     }
 
     thread->CancelWait();
-    system.PrepareReschedule(thread->GetProcessorID());
     return RESULT_SUCCESS;
 }
 
+static ResultCode CancelSynchronization32(Core::System& system, Handle thread_handle) {
+    return CancelSynchronization(system, thread_handle);
+}
+
 /// Attempts to locks a mutex, creating it if it does not already exist
 static ResultCode ArbitrateLock(Core::System& system, Handle holding_thread_handle,
                                 VAddr mutex_addr, Handle requesting_thread_handle) {
@@ -475,6 +531,12 @@ static ResultCode ArbitrateLock(Core::System& system, Handle holding_thread_hand
                                                   requesting_thread_handle);
 }
 
+static ResultCode ArbitrateLock32(Core::System& system, Handle holding_thread_handle,
+                                  u32 mutex_addr, Handle requesting_thread_handle) {
+    return ArbitrateLock(system, holding_thread_handle, static_cast<VAddr>(mutex_addr),
+                         requesting_thread_handle);
+}
+
 /// Unlock a mutex
 static ResultCode ArbitrateUnlock(Core::System& system, VAddr mutex_addr) {
     LOG_TRACE(Kernel_SVC, "called mutex_addr=0x{:X}", mutex_addr);
@@ -494,6 +556,10 @@ static ResultCode ArbitrateUnlock(Core::System& system, VAddr mutex_addr) {
     return current_process->GetMutex().Release(mutex_addr);
 }
 
+static ResultCode ArbitrateUnlock32(Core::System& system, u32 mutex_addr) {
+    return ArbitrateUnlock(system, static_cast<VAddr>(mutex_addr));
+}
+
 enum class BreakType : u32 {
     Panic = 0,
     AssertionFailed = 1,
@@ -594,6 +660,7 @@ static void Break(Core::System& system, u32 reason, u64 info1, u64 info2) {
         info2, has_dumped_buffer ? std::make_optional(debug_buffer) : std::nullopt);
 
     if (!break_reason.signal_debugger) {
+        SchedulerLock lock(system.Kernel());
         LOG_CRITICAL(
             Debug_Emulated,
             "Emulated program broke execution! reason=0x{:016X}, info1=0x{:016X}, info2=0x{:016X}",
@@ -605,14 +672,16 @@ static void Break(Core::System& system, u32 reason, u64 info1, u64 info2) {
         const auto thread_processor_id = current_thread->GetProcessorID();
         system.ArmInterface(static_cast<std::size_t>(thread_processor_id)).LogBacktrace();
 
-        system.Kernel().CurrentProcess()->PrepareForTermination();
-
         // Kill the current thread
+        system.Kernel().ExceptionalExit();
         current_thread->Stop();
-        system.PrepareReschedule();
     }
 }
 
+static void Break32(Core::System& system, u32 reason, u32 info1, u32 info2) {
+    Break(system, reason, static_cast<u64>(info1), static_cast<u64>(info2));
+}
+
 /// Used to output a message on a debug hardware unit - does nothing on a retail unit
 static void OutputDebugString([[maybe_unused]] Core::System& system, VAddr address, u64 len) {
     if (len == 0) {
@@ -627,6 +696,7 @@ static void OutputDebugString([[maybe_unused]] Core::System& system, VAddr addre
 /// Gets system/memory information for the current process
 static ResultCode GetInfo(Core::System& system, u64* result, u64 info_id, u64 handle,
                           u64 info_sub_id) {
+    std::lock_guard lock{HLE::g_hle_lock};
     LOG_TRACE(Kernel_SVC, "called info_id=0x{:X}, info_sub_id=0x{:X}, handle=0x{:08X}", info_id,
               info_sub_id, handle);
 
@@ -863,9 +933,9 @@ static ResultCode GetInfo(Core::System& system, u64* result, u64 info_id, u64 ha
         if (same_thread && info_sub_id == 0xFFFFFFFFFFFFFFFF) {
             const u64 thread_ticks = current_thread->GetTotalCPUTimeTicks();
 
-            out_ticks = thread_ticks + (core_timing.GetTicks() - prev_ctx_ticks);
+            out_ticks = thread_ticks + (core_timing.GetCPUTicks() - prev_ctx_ticks);
         } else if (same_thread && info_sub_id == system.CurrentCoreIndex()) {
-            out_ticks = core_timing.GetTicks() - prev_ctx_ticks;
+            out_ticks = core_timing.GetCPUTicks() - prev_ctx_ticks;
         }
 
         *result = out_ticks;
@@ -892,6 +962,7 @@ static ResultCode GetInfo32(Core::System& system, u32* result_low, u32* result_h
 
 /// Maps memory at a desired address
 static ResultCode MapPhysicalMemory(Core::System& system, VAddr addr, u64 size) {
+    std::lock_guard lock{HLE::g_hle_lock};
     LOG_DEBUG(Kernel_SVC, "called, addr=0x{:016X}, size=0x{:X}", addr, size);
 
     if (!Common::Is4KBAligned(addr)) {
@@ -939,8 +1010,13 @@ static ResultCode MapPhysicalMemory(Core::System& system, VAddr addr, u64 size)
     return page_table.MapPhysicalMemory(addr, size);
 }
 
+static ResultCode MapPhysicalMemory32(Core::System& system, u32 addr, u32 size) {
+    return MapPhysicalMemory(system, static_cast<VAddr>(addr), static_cast<std::size_t>(size));
+}
+
 /// Unmaps memory previously mapped via MapPhysicalMemory
 static ResultCode UnmapPhysicalMemory(Core::System& system, VAddr addr, u64 size) {
+    std::lock_guard lock{HLE::g_hle_lock};
     LOG_DEBUG(Kernel_SVC, "called, addr=0x{:016X}, size=0x{:X}", addr, size);
 
     if (!Common::Is4KBAligned(addr)) {
@@ -988,6 +1064,10 @@ static ResultCode UnmapPhysicalMemory(Core::System& system, VAddr addr, u64 size
     return page_table.UnmapPhysicalMemory(addr, size);
 }
 
+static ResultCode UnmapPhysicalMemory32(Core::System& system, u32 addr, u32 size) {
+    return UnmapPhysicalMemory(system, static_cast<VAddr>(addr), static_cast<std::size_t>(size));
+}
+
 /// Sets the thread activity
 static ResultCode SetThreadActivity(Core::System& system, Handle handle, u32 activity) {
     LOG_DEBUG(Kernel_SVC, "called, handle=0x{:08X}, activity=0x{:08X}", handle, activity);
@@ -1017,10 +1097,11 @@ static ResultCode SetThreadActivity(Core::System& system, Handle handle, u32 act
         return ERR_BUSY;
     }
 
-    thread->SetActivity(static_cast<ThreadActivity>(activity));
+    return thread->SetActivity(static_cast<ThreadActivity>(activity));
+}
 
-    system.PrepareReschedule(thread->GetProcessorID());
-    return RESULT_SUCCESS;
+static ResultCode SetThreadActivity32(Core::System& system, Handle handle, u32 activity) {
+    return SetThreadActivity(system, handle, activity);
 }
 
 /// Gets the thread context
@@ -1064,6 +1145,10 @@ static ResultCode GetThreadContext(Core::System& system, VAddr thread_context, H
     return RESULT_SUCCESS;
 }
 
+static ResultCode GetThreadContext32(Core::System& system, u32 thread_context, Handle handle) {
+    return GetThreadContext(system, static_cast<VAddr>(thread_context), handle);
+}
+
 /// Gets the priority for the specified thread
 static ResultCode GetThreadPriority(Core::System& system, u32* priority, Handle handle) {
     LOG_TRACE(Kernel_SVC, "called");
@@ -1071,6 +1156,7 @@ static ResultCode GetThreadPriority(Core::System& system, u32* priority, Handle
     const auto& handle_table = system.Kernel().CurrentProcess()->GetHandleTable();
     const std::shared_ptr<Thread> thread = handle_table.Get<Thread>(handle);
     if (!thread) {
+        *priority = 0;
         LOG_ERROR(Kernel_SVC, "Thread handle does not exist, handle=0x{:08X}", handle);
         return ERR_INVALID_HANDLE;
     }
@@ -1105,18 +1191,26 @@ static ResultCode SetThreadPriority(Core::System& system, Handle handle, u32 pri
 
     thread->SetPriority(priority);
 
-    system.PrepareReschedule(thread->GetProcessorID());
     return RESULT_SUCCESS;
 }
 
+static ResultCode SetThreadPriority32(Core::System& system, Handle handle, u32 priority) {
+    return SetThreadPriority(system, handle, priority);
+}
+
 /// Get which CPU core is executing the current thread
 static u32 GetCurrentProcessorNumber(Core::System& system) {
     LOG_TRACE(Kernel_SVC, "called");
-    return system.CurrentScheduler().GetCurrentThread()->GetProcessorID();
+    return static_cast<u32>(system.CurrentPhysicalCore().CoreIndex());
+}
+
+static u32 GetCurrentProcessorNumber32(Core::System& system) {
+    return GetCurrentProcessorNumber(system);
 }
 
 static ResultCode MapSharedMemory(Core::System& system, Handle shared_memory_handle, VAddr addr,
                                   u64 size, u32 permissions) {
+    std::lock_guard lock{HLE::g_hle_lock};
     LOG_TRACE(Kernel_SVC,
               "called, shared_memory_handle=0x{:X}, addr=0x{:X}, size=0x{:X}, permissions=0x{:08X}",
               shared_memory_handle, addr, size, permissions);
@@ -1187,9 +1281,16 @@ static ResultCode MapSharedMemory(Core::System& system, Handle shared_memory_han
     return shared_memory->Map(*current_process, addr, size, permission_type);
 }
 
+static ResultCode MapSharedMemory32(Core::System& system, Handle shared_memory_handle, u32 addr,
+                                    u32 size, u32 permissions) {
+    return MapSharedMemory(system, shared_memory_handle, static_cast<VAddr>(addr),
+                           static_cast<std::size_t>(size), permissions);
+}
+
 static ResultCode QueryProcessMemory(Core::System& system, VAddr memory_info_address,
                                      VAddr page_info_address, Handle process_handle,
                                      VAddr address) {
+    std::lock_guard lock{HLE::g_hle_lock};
     LOG_TRACE(Kernel_SVC, "called process=0x{:08X} address={:X}", process_handle, address);
     const auto& handle_table = system.Kernel().CurrentProcess()->GetHandleTable();
     std::shared_ptr<Process> process = handle_table.Get<Process>(process_handle);
@@ -1372,6 +1473,7 @@ static ResultCode UnmapProcessCodeMemory(Core::System& system, Handle process_ha
 /// Exits the current process
 static void ExitProcess(Core::System& system) {
     auto* current_process = system.Kernel().CurrentProcess();
+    UNIMPLEMENTED();
 
     LOG_INFO(Kernel_SVC, "Process {} exiting", current_process->GetProcessID());
     ASSERT_MSG(current_process->GetStatus() == ProcessStatus::Running,
@@ -1381,8 +1483,10 @@ static void ExitProcess(Core::System& system) {
 
     // Kill the current thread
     system.CurrentScheduler().GetCurrentThread()->Stop();
+}
 
-    system.PrepareReschedule();
+static void ExitProcess32(Core::System& system) {
+    ExitProcess(system);
 }
 
 /// Creates a new thread
@@ -1428,9 +1532,10 @@ static ResultCode CreateThread(Core::System& system, Handle* out_handle, VAddr e
 
     ASSERT(kernel.CurrentProcess()->GetResourceLimit()->Reserve(ResourceType::Threads, 1));
 
+    ThreadType type = THREADTYPE_USER;
     CASCADE_RESULT(std::shared_ptr<Thread> thread,
-                   Thread::Create(kernel, "", entry_point, priority, arg, processor_id, stack_top,
-                                  *current_process));
+                   Thread::Create(system, type, "", entry_point, priority, arg, processor_id,
+                                  stack_top, current_process));
 
     const auto new_thread_handle = current_process->GetHandleTable().Create(thread);
     if (new_thread_handle.Failed()) {
@@ -1444,11 +1549,15 @@ static ResultCode CreateThread(Core::System& system, Handle* out_handle, VAddr e
     thread->SetName(
         fmt::format("thread[entry_point={:X}, handle={:X}]", entry_point, *new_thread_handle));
 
-    system.PrepareReschedule(thread->GetProcessorID());
-
     return RESULT_SUCCESS;
 }
 
+static ResultCode CreateThread32(Core::System& system, Handle* out_handle, u32 priority,
+                                 u32 entry_point, u32 arg, u32 stack_top, s32 processor_id) {
+    return CreateThread(system, out_handle, static_cast<VAddr>(entry_point), static_cast<u64>(arg),
+                        static_cast<VAddr>(stack_top), priority, processor_id);
+}
+
 /// Starts the thread for the provided handle
 static ResultCode StartThread(Core::System& system, Handle thread_handle) {
     LOG_DEBUG(Kernel_SVC, "called thread=0x{:08X}", thread_handle);
@@ -1463,13 +1572,11 @@ static ResultCode StartThread(Core::System& system, Handle thread_handle) {
 
     ASSERT(thread->GetStatus() == ThreadStatus::Dormant);
 
-    thread->ResumeFromWait();
-
-    if (thread->GetStatus() == ThreadStatus::Ready) {
-        system.PrepareReschedule(thread->GetProcessorID());
-    }
+    return thread->Start();
+}
 
-    return RESULT_SUCCESS;
+static ResultCode StartThread32(Core::System& system, Handle thread_handle) {
+    return StartThread(system, thread_handle);
 }
 
 /// Called when a thread exits
@@ -1477,9 +1584,12 @@ static void ExitThread(Core::System& system) {
     LOG_DEBUG(Kernel_SVC, "called, pc=0x{:08X}", system.CurrentArmInterface().GetPC());
 
     auto* const current_thread = system.CurrentScheduler().GetCurrentThread();
-    current_thread->Stop();
     system.GlobalScheduler().RemoveThread(SharedFrom(current_thread));
-    system.PrepareReschedule();
+    current_thread->Stop();
+}
+
+static void ExitThread32(Core::System& system) {
+    ExitThread(system);
 }
 
 /// Sleep the current thread
@@ -1498,15 +1608,21 @@ static void SleepThread(Core::System& system, s64 nanoseconds) {
 
     if (nanoseconds <= 0) {
         switch (static_cast<SleepType>(nanoseconds)) {
-        case SleepType::YieldWithoutLoadBalancing:
-            is_redundant = current_thread->YieldSimple();
+        case SleepType::YieldWithoutLoadBalancing: {
+            auto pair = current_thread->YieldSimple();
+            is_redundant = pair.second;
             break;
-        case SleepType::YieldWithLoadBalancing:
-            is_redundant = current_thread->YieldAndBalanceLoad();
+        }
+        case SleepType::YieldWithLoadBalancing: {
+            auto pair = current_thread->YieldAndBalanceLoad();
+            is_redundant = pair.second;
             break;
-        case SleepType::YieldAndWaitForLoadBalancing:
-            is_redundant = current_thread->YieldAndWaitForLoadBalancing();
+        }
+        case SleepType::YieldAndWaitForLoadBalancing: {
+            auto pair = current_thread->YieldAndWaitForLoadBalancing();
+            is_redundant = pair.second;
             break;
+        }
         default:
             UNREACHABLE_MSG("Unimplemented sleep yield type '{:016X}'!", nanoseconds);
         }
@@ -1514,13 +1630,18 @@ static void SleepThread(Core::System& system, s64 nanoseconds) {
         current_thread->Sleep(nanoseconds);
     }
 
-    if (is_redundant) {
-        // If it's redundant, the core is pretty much idle. Some games keep idling
-        // a core while it's doing nothing, we advance timing to avoid costly continuous
-        // calls.
-        system.CoreTiming().AddTicks(2000);
+    if (is_redundant && !system.Kernel().IsMulticore()) {
+        system.Kernel().ExitSVCProfile();
+        system.CoreTiming().AddTicks(1000U);
+        system.GetCpuManager().PreemptSingleCore();
+        system.Kernel().EnterSVCProfile();
     }
-    system.PrepareReschedule(current_thread->GetProcessorID());
+}
+
+static void SleepThread32(Core::System& system, u32 nanoseconds_low, u32 nanoseconds_high) {
+    const s64 nanoseconds = static_cast<s64>(static_cast<u64>(nanoseconds_low) |
+                                             (static_cast<u64>(nanoseconds_high) << 32));
+    SleepThread(system, nanoseconds);
 }
 
 /// Wait process wide key atomic
@@ -1547,31 +1668,69 @@ static ResultCode WaitProcessWideKeyAtomic(Core::System& system, VAddr mutex_add
     }
 
     ASSERT(condition_variable_addr == Common::AlignDown(condition_variable_addr, 4));
-
+    auto& kernel = system.Kernel();
+    Handle event_handle;
+    Thread* current_thread = system.CurrentScheduler().GetCurrentThread();
     auto* const current_process = system.Kernel().CurrentProcess();
-    const auto& handle_table = current_process->GetHandleTable();
-    std::shared_ptr<Thread> thread = handle_table.Get<Thread>(thread_handle);
-    ASSERT(thread);
+    {
+        SchedulerLockAndSleep lock(kernel, event_handle, current_thread, nano_seconds);
+        const auto& handle_table = current_process->GetHandleTable();
+        std::shared_ptr<Thread> thread = handle_table.Get<Thread>(thread_handle);
+        ASSERT(thread);
+
+        current_thread->SetSynchronizationResults(nullptr, RESULT_TIMEOUT);
+
+        if (thread->IsPendingTermination()) {
+            lock.CancelSleep();
+            return ERR_THREAD_TERMINATING;
+        }
+
+        const auto release_result = current_process->GetMutex().Release(mutex_addr);
+        if (release_result.IsError()) {
+            lock.CancelSleep();
+            return release_result;
+        }
+
+        if (nano_seconds == 0) {
+            lock.CancelSleep();
+            return RESULT_TIMEOUT;
+        }
 
-    const auto release_result = current_process->GetMutex().Release(mutex_addr);
-    if (release_result.IsError()) {
-        return release_result;
+        current_thread->SetCondVarWaitAddress(condition_variable_addr);
+        current_thread->SetMutexWaitAddress(mutex_addr);
+        current_thread->SetWaitHandle(thread_handle);
+        current_thread->SetStatus(ThreadStatus::WaitCondVar);
+        current_process->InsertConditionVariableThread(SharedFrom(current_thread));
     }
 
-    Thread* current_thread = system.CurrentScheduler().GetCurrentThread();
-    current_thread->SetCondVarWaitAddress(condition_variable_addr);
-    current_thread->SetMutexWaitAddress(mutex_addr);
-    current_thread->SetWaitHandle(thread_handle);
-    current_thread->SetStatus(ThreadStatus::WaitCondVar);
-    current_thread->InvalidateWakeupCallback();
-    current_process->InsertConditionVariableThread(SharedFrom(current_thread));
+    if (event_handle != InvalidHandle) {
+        auto& time_manager = kernel.TimeManager();
+        time_manager.UnscheduleTimeEvent(event_handle);
+    }
+
+    {
+        SchedulerLock lock(kernel);
 
-    current_thread->WakeAfterDelay(nano_seconds);
+        auto* owner = current_thread->GetLockOwner();
+        if (owner != nullptr) {
+            owner->RemoveMutexWaiter(SharedFrom(current_thread));
+        }
 
+        current_process->RemoveConditionVariableThread(SharedFrom(current_thread));
+    }
     // Note: Deliberately don't attempt to inherit the lock owner's priority.
 
-    system.PrepareReschedule(current_thread->GetProcessorID());
-    return RESULT_SUCCESS;
+    return current_thread->GetSignalingResult();
+}
+
+static ResultCode WaitProcessWideKeyAtomic32(Core::System& system, u32 mutex_addr,
+                                             u32 condition_variable_addr, Handle thread_handle,
+                                             u32 nanoseconds_low, u32 nanoseconds_high) {
+    const s64 nanoseconds =
+        static_cast<s64>(nanoseconds_low | (static_cast<u64>(nanoseconds_high) << 32));
+    return WaitProcessWideKeyAtomic(system, static_cast<VAddr>(mutex_addr),
+                                    static_cast<VAddr>(condition_variable_addr), thread_handle,
+                                    nanoseconds);
 }
 
 /// Signal process wide key
@@ -1582,7 +1741,9 @@ static void SignalProcessWideKey(Core::System& system, VAddr condition_variable_
     ASSERT(condition_variable_addr == Common::AlignDown(condition_variable_addr, 4));
 
     // Retrieve a list of all threads that are waiting for this condition variable.
-    auto* const current_process = system.Kernel().CurrentProcess();
+    auto& kernel = system.Kernel();
+    SchedulerLock lock(kernel);
+    auto* const current_process = kernel.CurrentProcess();
     std::vector<std::shared_ptr<Thread>> waiting_threads =
         current_process->GetConditionVariableThreads(condition_variable_addr);
 
@@ -1591,7 +1752,7 @@ static void SignalProcessWideKey(Core::System& system, VAddr condition_variable_
     std::size_t last = waiting_threads.size();
     if (target > 0)
         last = std::min(waiting_threads.size(), static_cast<std::size_t>(target));
-
+    auto& time_manager = kernel.TimeManager();
     for (std::size_t index = 0; index < last; ++index) {
         auto& thread = waiting_threads[index];
 
@@ -1599,7 +1760,6 @@ static void SignalProcessWideKey(Core::System& system, VAddr condition_variable_
 
         // liberate Cond Var Thread.
         current_process->RemoveConditionVariableThread(thread);
-        thread->SetCondVarWaitAddress(0);
 
         const std::size_t current_core = system.CurrentCoreIndex();
         auto& monitor = system.Monitor();
@@ -1610,10 +1770,8 @@ static void SignalProcessWideKey(Core::System& system, VAddr condition_variable_
         u32 update_val = 0;
         const VAddr mutex_address = thread->GetMutexWaitAddress();
         do {
-            monitor.SetExclusive(current_core, mutex_address);
-
             // If the mutex is not yet acquired, acquire it.
-            mutex_val = memory.Read32(mutex_address);
+            mutex_val = monitor.ExclusiveRead32(current_core, mutex_address);
 
             if (mutex_val != 0) {
                 update_val = mutex_val | Mutex::MutexHasWaitersFlag;
@@ -1621,33 +1779,28 @@ static void SignalProcessWideKey(Core::System& system, VAddr condition_variable_
                 update_val = thread->GetWaitHandle();
             }
         } while (!monitor.ExclusiveWrite32(current_core, mutex_address, update_val));
+        monitor.ClearExclusive();
         if (mutex_val == 0) {
             // We were able to acquire the mutex, resume this thread.
-            ASSERT(thread->GetStatus() == ThreadStatus::WaitCondVar);
-            thread->ResumeFromWait();
-
             auto* const lock_owner = thread->GetLockOwner();
             if (lock_owner != nullptr) {
                 lock_owner->RemoveMutexWaiter(thread);
             }
 
             thread->SetLockOwner(nullptr);
-            thread->SetMutexWaitAddress(0);
-            thread->SetWaitHandle(0);
-            thread->SetWaitSynchronizationResult(RESULT_SUCCESS);
-            system.PrepareReschedule(thread->GetProcessorID());
+            thread->SetSynchronizationResults(nullptr, RESULT_SUCCESS);
+            thread->ResumeFromWait();
         } else {
             // The mutex is already owned by some other thread, make this thread wait on it.
             const Handle owner_handle = static_cast<Handle>(mutex_val & Mutex::MutexOwnerMask);
             const auto& handle_table = system.Kernel().CurrentProcess()->GetHandleTable();
             auto owner = handle_table.Get<Thread>(owner_handle);
             ASSERT(owner);
-            ASSERT(thread->GetStatus() == ThreadStatus::WaitCondVar);
-            thread->InvalidateWakeupCallback();
-            thread->SetStatus(ThreadStatus::WaitMutex);
+            if (thread->GetStatus() == ThreadStatus::WaitCondVar) {
+                thread->SetStatus(ThreadStatus::WaitMutex);
+            }
 
             owner->AddMutexWaiter(thread);
-            system.PrepareReschedule(thread->GetProcessorID());
         }
     }
 }
@@ -1678,12 +1831,15 @@ static ResultCode WaitForAddress(Core::System& system, VAddr address, u32 type,
     auto& address_arbiter = system.Kernel().CurrentProcess()->GetAddressArbiter();
     const ResultCode result =
         address_arbiter.WaitForAddress(address, arbitration_type, value, timeout);
-    if (result == RESULT_SUCCESS) {
-        system.PrepareReschedule();
-    }
     return result;
 }
 
+static ResultCode WaitForAddress32(Core::System& system, u32 address, u32 type, s32 value,
+                                   u32 timeout_low, u32 timeout_high) {
+    const auto timeout = static_cast<s64>((static_cast<u64>(timeout_high) << 32) | timeout_low);
+    return WaitForAddress(system, static_cast<VAddr>(address), type, value, timeout);
+}
+
 // Signals to an address (via Address Arbiter)
 static ResultCode SignalToAddress(Core::System& system, VAddr address, u32 type, s32 value,
                                   s32 num_to_wake) {
@@ -1707,6 +1863,11 @@ static ResultCode SignalToAddress(Core::System& system, VAddr address, u32 type,
     return address_arbiter.SignalToAddress(address, signal_type, value, num_to_wake);
 }
 
+static ResultCode SignalToAddress32(Core::System& system, u32 address, u32 type, s32 value,
+                                    s32 num_to_wake) {
+    return SignalToAddress(system, static_cast<VAddr>(address), type, value, num_to_wake);
+}
+
 static void KernelDebug([[maybe_unused]] Core::System& system,
                         [[maybe_unused]] u32 kernel_debug_type, [[maybe_unused]] u64 param1,
                         [[maybe_unused]] u64 param2, [[maybe_unused]] u64 param3) {
@@ -1725,14 +1886,21 @@ static u64 GetSystemTick(Core::System& system) {
     auto& core_timing = system.CoreTiming();
 
     // Returns the value of cntpct_el0 (https://switchbrew.org/wiki/SVC#svcGetSystemTick)
-    const u64 result{Core::Timing::CpuCyclesToClockCycles(system.CoreTiming().GetTicks())};
+    const u64 result{system.CoreTiming().GetClockTicks()};
 
-    // Advance time to defeat dumb games that busy-wait for the frame to end.
-    core_timing.AddTicks(400);
+    if (!system.Kernel().IsMulticore()) {
+        core_timing.AddTicks(400U);
+    }
 
     return result;
 }
 
+static void GetSystemTick32(Core::System& system, u32* time_low, u32* time_high) {
+    const u64 ticks = GetSystemTick(system); // 64-bit value, split into two halves below
+    *time_low = static_cast<u32>(ticks & 0xFFFFFFFFULL);
+    *time_high = static_cast<u32>(ticks >> 32);
+}
+
 /// Close a handle
 static ResultCode CloseHandle(Core::System& system, Handle handle) {
     LOG_TRACE(Kernel_SVC, "Closing handle 0x{:08X}", handle);
@@ -1765,9 +1933,14 @@ static ResultCode ResetSignal(Core::System& system, Handle handle) {
     return ERR_INVALID_HANDLE;
 }
 
+static ResultCode ResetSignal32(Core::System& system, Handle handle) {
+    return ResetSignal(system, handle);
+}
+
 /// Creates a TransferMemory object
 static ResultCode CreateTransferMemory(Core::System& system, Handle* handle, VAddr addr, u64 size,
                                        u32 permissions) {
+    std::lock_guard lock{HLE::g_hle_lock};
     LOG_DEBUG(Kernel_SVC, "called addr=0x{:X}, size=0x{:X}, perms=0x{:08X}", addr, size,
               permissions);
 
@@ -1812,6 +1985,12 @@ static ResultCode CreateTransferMemory(Core::System& system, Handle* handle, VAd
     return RESULT_SUCCESS;
 }
 
+static ResultCode CreateTransferMemory32(Core::System& system, Handle* handle, u32 addr, u32 size,
+                                         u32 permissions) {
+    return CreateTransferMemory(system, handle, static_cast<VAddr>(addr),
+                                static_cast<std::size_t>(size), permissions);
+}
+
 static ResultCode GetThreadCoreMask(Core::System& system, Handle thread_handle, u32* core,
                                     u64* mask) {
     LOG_TRACE(Kernel_SVC, "called, handle=0x{:08X}", thread_handle);
@@ -1821,6 +2000,8 @@ static ResultCode GetThreadCoreMask(Core::System& system, Handle thread_handle,
     if (!thread) {
         LOG_ERROR(Kernel_SVC, "Thread handle does not exist, thread_handle=0x{:08X}",
                   thread_handle);
+        *core = 0;
+        *mask = 0;
         return ERR_INVALID_HANDLE;
     }
 
@@ -1830,6 +2011,15 @@ static ResultCode GetThreadCoreMask(Core::System& system, Handle thread_handle,
     return RESULT_SUCCESS;
 }
 
+static ResultCode GetThreadCoreMask32(Core::System& system, Handle thread_handle, u32* core,
+                                      u32* mask_low, u32* mask_high) {
+    u64 full_mask = 0; // filled by the 64-bit implementation, then split into halves
+    const ResultCode rc = GetThreadCoreMask(system, thread_handle, core, &full_mask);
+    *mask_high = static_cast<u32>(full_mask >> 32);
+    *mask_low = static_cast<u32>(full_mask & 0xFFFFFFFFULL);
+    return rc;
+}
+
 static ResultCode SetThreadCoreMask(Core::System& system, Handle thread_handle, u32 core,
                                     u64 affinity_mask) {
     LOG_DEBUG(Kernel_SVC, "called, handle=0x{:08X}, core=0x{:X}, affinity_mask=0x{:016X}",
@@ -1861,7 +2051,7 @@ static ResultCode SetThreadCoreMask(Core::System& system, Handle thread_handle,
             return ERR_INVALID_COMBINATION;
         }
 
-        if (core < Core::NUM_CPU_CORES) {
+        if (core < Core::Hardware::NUM_CPU_CORES) {
             if ((affinity_mask & (1ULL << core)) == 0) {
                 LOG_ERROR(Kernel_SVC,
                           "Core is not enabled for the current mask, core={}, mask={:016X}", core,
@@ -1883,11 +2073,14 @@ static ResultCode SetThreadCoreMask(Core::System& system, Handle thread_handle,
         return ERR_INVALID_HANDLE;
     }
 
-    system.PrepareReschedule(thread->GetProcessorID());
-    thread->ChangeCore(core, affinity_mask);
-    system.PrepareReschedule(thread->GetProcessorID());
+    return thread->SetCoreAndAffinityMask(core, affinity_mask);
+}
 
-    return RESULT_SUCCESS;
+static ResultCode SetThreadCoreMask32(Core::System& system, Handle thread_handle, u32 core,
+                                      u32 affinity_mask_low, u32 affinity_mask_high) {
+    const u64 affinity_mask =
+        static_cast<u64>(affinity_mask_low) | (static_cast<u64>(affinity_mask_high) << 32);
+    return SetThreadCoreMask(system, thread_handle, core, affinity_mask);
 }
 
 static ResultCode CreateEvent(Core::System& system, Handle* write_handle, Handle* read_handle) {
@@ -1918,6 +2111,10 @@ static ResultCode CreateEvent(Core::System& system, Handle* write_handle, Handle
     return RESULT_SUCCESS;
 }
 
+static ResultCode CreateEvent32(Core::System& system, Handle* write_handle, Handle* read_handle) {
+    return CreateEvent(system, write_handle, read_handle);
+}
+
 static ResultCode ClearEvent(Core::System& system, Handle handle) {
     LOG_TRACE(Kernel_SVC, "called, event=0x{:08X}", handle);
 
@@ -1939,6 +2136,10 @@ static ResultCode ClearEvent(Core::System& system, Handle handle) {
     return ERR_INVALID_HANDLE;
 }
 
+static ResultCode ClearEvent32(Core::System& system, Handle handle) {
+    return ClearEvent(system, handle);
+}
+
 static ResultCode SignalEvent(Core::System& system, Handle handle) {
     LOG_DEBUG(Kernel_SVC, "called. Handle=0x{:08X}", handle);
 
@@ -1951,10 +2152,13 @@ static ResultCode SignalEvent(Core::System& system, Handle handle) {
     }
 
     writable_event->Signal();
-    system.PrepareReschedule();
     return RESULT_SUCCESS;
 }
 
+static ResultCode SignalEvent32(Core::System& system, Handle handle) {
+    return SignalEvent(system, handle);
+}
+
 static ResultCode GetProcessInfo(Core::System& system, u64* out, Handle process_handle, u32 type) {
     LOG_DEBUG(Kernel_SVC, "called, handle=0x{:08X}, type=0x{:X}", process_handle, type);
 
@@ -1982,6 +2186,7 @@ static ResultCode GetProcessInfo(Core::System& system, u64* out, Handle process_
 }
 
 static ResultCode CreateResourceLimit(Core::System& system, Handle* out_handle) {
+    std::lock_guard lock{HLE::g_hle_lock};
     LOG_DEBUG(Kernel_SVC, "called");
 
     auto& kernel = system.Kernel();
@@ -2139,6 +2344,15 @@ static ResultCode GetThreadList(Core::System& system, u32* out_num_threads, VAdd
     return RESULT_SUCCESS;
 }
 
+static ResultCode FlushProcessDataCache32(Core::System& system, Handle handle, u32 address,
+                                          u32 size) {
+    // Note(Blinkhawk): For emulation purposes of the data cache this is mostly a no-op,
+    // as all emulation is done in the same cache level in host architecture, thus the data
+    // cache does not need flushing. The arguments are ignored and success is always returned.
+    LOG_DEBUG(Kernel_SVC, "called");
+    return RESULT_SUCCESS;
+}
+
 namespace {
 struct FunctionDef {
     using Func = void(Core::System&);
@@ -2153,57 +2367,57 @@ static const FunctionDef SVC_Table_32[] = {
     {0x00, nullptr, "Unknown"},
     {0x01, SvcWrap32<SetHeapSize32>, "SetHeapSize32"},
     {0x02, nullptr, "Unknown"},
-    {0x03, nullptr, "SetMemoryAttribute32"},
-    {0x04, nullptr, "MapMemory32"},
-    {0x05, nullptr, "UnmapMemory32"},
+    {0x03, SvcWrap32<SetMemoryAttribute32>, "SetMemoryAttribute32"},
+    {0x04, SvcWrap32<MapMemory32>, "MapMemory32"},
+    {0x05, SvcWrap32<UnmapMemory32>, "UnmapMemory32"},
     {0x06, SvcWrap32<QueryMemory32>, "QueryMemory32"},
-    {0x07, nullptr, "ExitProcess32"},
-    {0x08, nullptr, "CreateThread32"},
-    {0x09, nullptr, "StartThread32"},
-    {0x0a, nullptr, "ExitThread32"},
-    {0x0b, nullptr, "SleepThread32"},
+    {0x07, SvcWrap32<ExitProcess32>, "ExitProcess32"},
+    {0x08, SvcWrap32<CreateThread32>, "CreateThread32"},
+    {0x09, SvcWrap32<StartThread32>, "StartThread32"},
+    {0x0a, SvcWrap32<ExitThread32>, "ExitThread32"},
+    {0x0b, SvcWrap32<SleepThread32>, "SleepThread32"},
     {0x0c, SvcWrap32<GetThreadPriority32>, "GetThreadPriority32"},
-    {0x0d, nullptr, "SetThreadPriority32"},
-    {0x0e, nullptr, "GetThreadCoreMask32"},
-    {0x0f, nullptr, "SetThreadCoreMask32"},
-    {0x10, nullptr, "GetCurrentProcessorNumber32"},
-    {0x11, nullptr, "SignalEvent32"},
-    {0x12, nullptr, "ClearEvent32"},
-    {0x13, nullptr, "MapSharedMemory32"},
+    {0x0d, SvcWrap32<SetThreadPriority32>, "SetThreadPriority32"},
+    {0x0e, SvcWrap32<GetThreadCoreMask32>, "GetThreadCoreMask32"},
+    {0x0f, SvcWrap32<SetThreadCoreMask32>, "SetThreadCoreMask32"},
+    {0x10, SvcWrap32<GetCurrentProcessorNumber32>, "GetCurrentProcessorNumber32"},
+    {0x11, SvcWrap32<SignalEvent32>, "SignalEvent32"},
+    {0x12, SvcWrap32<ClearEvent32>, "ClearEvent32"},
+    {0x13, SvcWrap32<MapSharedMemory32>, "MapSharedMemory32"},
     {0x14, nullptr, "UnmapSharedMemory32"},
-    {0x15, nullptr, "CreateTransferMemory32"},
+    {0x15, SvcWrap32<CreateTransferMemory32>, "CreateTransferMemory32"},
     {0x16, SvcWrap32<CloseHandle32>, "CloseHandle32"},
-    {0x17, nullptr, "ResetSignal32"},
+    {0x17, SvcWrap32<ResetSignal32>, "ResetSignal32"},
     {0x18, SvcWrap32<WaitSynchronization32>, "WaitSynchronization32"},
-    {0x19, nullptr, "CancelSynchronization32"},
-    {0x1a, nullptr, "ArbitrateLock32"},
-    {0x1b, nullptr, "ArbitrateUnlock32"},
-    {0x1c, nullptr, "WaitProcessWideKeyAtomic32"},
+    {0x19, SvcWrap32<CancelSynchronization32>, "CancelSynchronization32"},
+    {0x1a, SvcWrap32<ArbitrateLock32>, "ArbitrateLock32"},
+    {0x1b, SvcWrap32<ArbitrateUnlock32>, "ArbitrateUnlock32"},
+    {0x1c, SvcWrap32<WaitProcessWideKeyAtomic32>, "WaitProcessWideKeyAtomic32"},
     {0x1d, SvcWrap32<SignalProcessWideKey32>, "SignalProcessWideKey32"},
-    {0x1e, nullptr, "GetSystemTick32"},
+    {0x1e, SvcWrap32<GetSystemTick32>, "GetSystemTick32"},
     {0x1f, SvcWrap32<ConnectToNamedPort32>, "ConnectToNamedPort32"},
     {0x20, nullptr, "Unknown"},
     {0x21, SvcWrap32<SendSyncRequest32>, "SendSyncRequest32"},
     {0x22, nullptr, "SendSyncRequestWithUserBuffer32"},
     {0x23, nullptr, "Unknown"},
-    {0x24, nullptr, "GetProcessId32"},
+    {0x24, SvcWrap32<GetProcessId32>, "GetProcessId32"},
     {0x25, SvcWrap32<GetThreadId32>, "GetThreadId32"},
-    {0x26, nullptr, "Break32"},
+    {0x26, SvcWrap32<Break32>, "Break32"},
     {0x27, nullptr, "OutputDebugString32"},
     {0x28, nullptr, "Unknown"},
     {0x29, SvcWrap32<GetInfo32>, "GetInfo32"},
     {0x2a, nullptr, "Unknown"},
     {0x2b, nullptr, "Unknown"},
-    {0x2c, nullptr, "MapPhysicalMemory32"},
-    {0x2d, nullptr, "UnmapPhysicalMemory32"},
+    {0x2c, SvcWrap32<MapPhysicalMemory32>, "MapPhysicalMemory32"},
+    {0x2d, SvcWrap32<UnmapPhysicalMemory32>, "UnmapPhysicalMemory32"},
     {0x2e, nullptr, "Unknown"},
     {0x2f, nullptr, "Unknown"},
     {0x30, nullptr, "Unknown"},
     {0x31, nullptr, "Unknown"},
-    {0x32, nullptr, "SetThreadActivity32"},
-    {0x33, nullptr, "GetThreadContext32"},
-    {0x34, nullptr, "WaitForAddress32"},
-    {0x35, nullptr, "SignalToAddress32"},
+    {0x32, SvcWrap32<SetThreadActivity32>, "SetThreadActivity32"},
+    {0x33, SvcWrap32<GetThreadContext32>, "GetThreadContext32"},
+    {0x34, SvcWrap32<WaitForAddress32>, "WaitForAddress32"},
+    {0x35, SvcWrap32<SignalToAddress32>, "SignalToAddress32"},
     {0x36, nullptr, "Unknown"},
     {0x37, nullptr, "Unknown"},
     {0x38, nullptr, "Unknown"},
@@ -2219,7 +2433,7 @@ static const FunctionDef SVC_Table_32[] = {
     {0x42, nullptr, "Unknown"},
     {0x43, nullptr, "ReplyAndReceive32"},
     {0x44, nullptr, "Unknown"},
-    {0x45, nullptr, "CreateEvent32"},
+    {0x45, SvcWrap32<CreateEvent32>, "CreateEvent32"},
     {0x46, nullptr, "Unknown"},
     {0x47, nullptr, "Unknown"},
     {0x48, nullptr, "Unknown"},
@@ -2245,7 +2459,7 @@ static const FunctionDef SVC_Table_32[] = {
     {0x5c, nullptr, "Unknown"},
     {0x5d, nullptr, "Unknown"},
     {0x5e, nullptr, "Unknown"},
-    {0x5F, nullptr, "FlushProcessDataCache32"},
+    {0x5F, SvcWrap32<FlushProcessDataCache32>, "FlushProcessDataCache32"},
     {0x60, nullptr, "Unknown"},
     {0x61, nullptr, "Unknown"},
     {0x62, nullptr, "Unknown"},
@@ -2423,13 +2637,10 @@ static const FunctionDef* GetSVCInfo64(u32 func_num) {
     return &SVC_Table_64[func_num];
 }
 
-MICROPROFILE_DEFINE(Kernel_SVC, "Kernel", "SVC", MP_RGB(70, 200, 70));
-
 void Call(Core::System& system, u32 immediate) {
-    MICROPROFILE_SCOPE(Kernel_SVC);
-
-    // Lock the global kernel mutex when we enter the kernel HLE.
-    std::lock_guard lock{HLE::g_hle_lock};
+    system.ExitDynarmicProfile();
+    auto& kernel = system.Kernel();
+    kernel.EnterSVCProfile();
 
     const FunctionDef* info = system.CurrentProcess()->Is64BitProcess() ? GetSVCInfo64(immediate)
                                                                         : GetSVCInfo32(immediate);
@@ -2442,6 +2653,9 @@ void Call(Core::System& system, u32 immediate) {
     } else {
         LOG_CRITICAL(Kernel_SVC, "Unknown SVC function 0x{:X}", immediate);
     }
+
+    kernel.ExitSVCProfile();
+    system.EnterDynarmicProfile();
 }
 
 } // namespace Kernel::Svc
diff --git a/src/core/hle/kernel/svc_wrap.h b/src/core/hle/kernel/svc_wrap.h
index 7d735e3fa..0b6dd9df0 100644
--- a/src/core/hle/kernel/svc_wrap.h
+++ b/src/core/hle/kernel/svc_wrap.h
@@ -350,13 +350,50 @@ void SvcWrap64(Core::System& system) {
     func(system, static_cast<u32>(Param(system, 0)), Param(system, 1), Param(system, 2));
 }
 
-// Used by QueryMemory32
+// Used by QueryMemory32, ArbitrateLock32
 template <ResultCode func(Core::System&, u32, u32, u32)>
 void SvcWrap32(Core::System& system) {
     FuncReturn32(system,
                  func(system, Param32(system, 0), Param32(system, 1), Param32(system, 2)).raw);
 }
 
+// Used by Break32
+template <void func(Core::System&, u32, u32, u32)>
+void SvcWrap32(Core::System& system) {
+    func(system, Param32(system, 0), Param32(system, 1), Param32(system, 2));
+}
+
+// Used by ExitProcess32, ExitThread32
+template <void func(Core::System&)>
+void SvcWrap32(Core::System& system) {
+    func(system);
+}
+
+// Used by GetCurrentProcessorNumber32
+template <u32 func(Core::System&)>
+void SvcWrap32(Core::System& system) {
+    FuncReturn32(system, func(system));
+}
+
+// Used by SleepThread32
+template <void func(Core::System&, u32, u32)>
+void SvcWrap32(Core::System& system) {
+    func(system, Param32(system, 0), Param32(system, 1));
+}
+
+// Used by CreateThread32
+template <ResultCode func(Core::System&, Handle*, u32, u32, u32, u32, s32)>
+void SvcWrap32(Core::System& system) {
+    Handle param_1 = 0;
+
+    const u32 retval = func(system, &param_1, Param32(system, 0), Param32(system, 1),
+                            Param32(system, 2), Param32(system, 3), Param32(system, 4))
+                           .raw;
+
+    system.CurrentArmInterface().SetReg(1, param_1);
+    FuncReturn(system, retval);
+}
+
 // Used by GetInfo32
 template <ResultCode func(Core::System&, u32*, u32*, u32, u32, u32, u32)>
 void SvcWrap32(Core::System& system) {
@@ -393,18 +430,114 @@ void SvcWrap32(Core::System& system) {
     FuncReturn(system, retval);
 }
 
+// Used by GetSystemTick32
+template <void func(Core::System&, u32*, u32*)>
+void SvcWrap32(Core::System& system) {
+    u32 param_1 = 0;
+    u32 param_2 = 0;
+
+    func(system, &param_1, &param_2);
+    system.CurrentArmInterface().SetReg(0, param_1);
+    system.CurrentArmInterface().SetReg(1, param_2);
+}
+
+// Used by CreateEvent32
+template <ResultCode func(Core::System&, Handle*, Handle*)>
+void SvcWrap32(Core::System& system) {
+    Handle param_1 = 0;
+    Handle param_2 = 0;
+
+    const u32 retval = func(system, &param_1, &param_2).raw;
+    system.CurrentArmInterface().SetReg(1, param_1);
+    system.CurrentArmInterface().SetReg(2, param_2);
+    FuncReturn(system, retval);
+}
+
+// Used by GetThreadCoreMask32
+template <ResultCode func(Core::System&, Handle, u32*, u32*, u32*)>
+void SvcWrap32(Core::System& system) {
+    u32 param_1 = 0;
+    u32 param_2 = 0;
+    u32 param_3 = 0;
+
+    const u32 retval = func(system, Param32(system, 2), &param_1, &param_2, &param_3).raw;
+    system.CurrentArmInterface().SetReg(1, param_1);
+    system.CurrentArmInterface().SetReg(2, param_2);
+    system.CurrentArmInterface().SetReg(3, param_3);
+    FuncReturn(system, retval);
+}
+
 // Used by SignalProcessWideKey32
 template <void func(Core::System&, u32, s32)>
 void SvcWrap32(Core::System& system) {
     func(system, static_cast<u32>(Param(system, 0)), static_cast<s32>(Param(system, 1)));
 }
 
-// Used by SendSyncRequest32
+// Used by SetThreadPriority32
+template <ResultCode func(Core::System&, Handle, u32)>
+void SvcWrap32(Core::System& system) {
+    const u32 retval =
+        func(system, static_cast<Handle>(Param(system, 0)), static_cast<u32>(Param(system, 1))).raw;
+    FuncReturn(system, retval);
+}
+
+// Used by SetThreadCoreMask32
+template <ResultCode func(Core::System&, Handle, u32, u32, u32)>
+void SvcWrap32(Core::System& system) {
+    const u32 retval =
+        func(system, static_cast<Handle>(Param(system, 0)), static_cast<u32>(Param(system, 1)),
+             static_cast<u32>(Param(system, 2)), static_cast<u32>(Param(system, 3)))
+            .raw;
+    FuncReturn(system, retval);
+}
+
+// Used by WaitProcessWideKeyAtomic32
+template <ResultCode func(Core::System&, u32, u32, Handle, u32, u32)>
+void SvcWrap32(Core::System& system) {
+    const u32 retval =
+        func(system, static_cast<u32>(Param(system, 0)), static_cast<u32>(Param(system, 1)),
+             static_cast<Handle>(Param(system, 2)), static_cast<u32>(Param(system, 3)),
+             static_cast<u32>(Param(system, 4)))
+            .raw;
+    FuncReturn(system, retval);
+}
+
+// Used by WaitForAddress32
+template <ResultCode func(Core::System&, u32, u32, s32, u32, u32)>
+void SvcWrap32(Core::System& system) {
+    const u32 retval = func(system, static_cast<u32>(Param(system, 0)),
+                            static_cast<u32>(Param(system, 1)), static_cast<s32>(Param(system, 2)),
+                            static_cast<u32>(Param(system, 3)), static_cast<u32>(Param(system, 4)))
+                           .raw;
+    FuncReturn(system, retval);
+}
+
+// Used by SignalToAddress32
+template <ResultCode func(Core::System&, u32, u32, s32, s32)>
+void SvcWrap32(Core::System& system) {
+    const u32 retval =
+        func(system, static_cast<u32>(Param(system, 0)), static_cast<u32>(Param(system, 1)),
+             static_cast<s32>(Param(system, 2)), static_cast<s32>(Param(system, 3)))
+            .raw;
+    FuncReturn(system, retval);
+}
+
+// Used by SendSyncRequest32, ArbitrateUnlock32
 template <ResultCode func(Core::System&, u32)>
 void SvcWrap32(Core::System& system) {
     FuncReturn(system, func(system, static_cast<u32>(Param(system, 0))).raw);
 }
 
+// Used by CreateTransferMemory32
+template <ResultCode func(Core::System&, Handle*, u32, u32, u32)>
+void SvcWrap32(Core::System& system) {
+    Handle handle = 0;
+    const u32 retval =
+        func(system, &handle, Param32(system, 1), Param32(system, 2), Param32(system, 3)).raw;
+    system.CurrentArmInterface().SetReg(1, handle);
+    FuncReturn(system, retval);
+}
+
 // Used by WaitSynchronization32
 template <ResultCode func(Core::System&, u32, u32, s32, u32, Handle*)>
 void SvcWrap32(Core::System& system) {
diff --git a/src/core/hle/kernel/synchronization.cpp b/src/core/hle/kernel/synchronization.cpp
index dc37fad1a..851b702a5 100644
--- a/src/core/hle/kernel/synchronization.cpp
+++ b/src/core/hle/kernel/synchronization.cpp
@@ -10,78 +10,107 @@
 #include "core/hle/kernel/synchronization.h"
 #include "core/hle/kernel/synchronization_object.h"
 #include "core/hle/kernel/thread.h"
+#include "core/hle/kernel/time_manager.h"
 
 namespace Kernel {
 
-/// Default thread wakeup callback for WaitSynchronization
-static bool DefaultThreadWakeupCallback(ThreadWakeupReason reason, std::shared_ptr<Thread> thread,
-                                        std::shared_ptr<SynchronizationObject> object,
-                                        std::size_t index) {
-    ASSERT(thread->GetStatus() == ThreadStatus::WaitSynch);
-
-    if (reason == ThreadWakeupReason::Timeout) {
-        thread->SetWaitSynchronizationResult(RESULT_TIMEOUT);
-        return true;
-    }
-
-    ASSERT(reason == ThreadWakeupReason::Signal);
-    thread->SetWaitSynchronizationResult(RESULT_SUCCESS);
-    thread->SetWaitSynchronizationOutput(static_cast<u32>(index));
-    return true;
-}
-
 Synchronization::Synchronization(Core::System& system) : system{system} {}
 
 void Synchronization::SignalObject(SynchronizationObject& obj) const {
+    auto& kernel = system.Kernel();
+    SchedulerLock lock(kernel);
+    // Waiters are only resumed (and the wait list cleared) while the object reports signaled.
     if (obj.IsSignaled()) {
-        obj.WakeupAllWaitingThreads();
+        for (const auto& thread : obj.GetWaitingThreads()) {
+            if (thread->GetSchedulingStatus() == ThreadSchedStatus::Paused) {
+                if (thread->GetStatus() != ThreadStatus::WaitHLEEvent) {
+                    ASSERT(thread->GetStatus() == ThreadStatus::WaitSynch);
+                    ASSERT(thread->IsWaitingSync());
+                }
+                thread->SetSynchronizationResults(&obj, RESULT_SUCCESS);
+                thread->ResumeFromWait();
+            }
+        }
+        obj.ClearWaitingThreads();
     }
 }
 
 std::pair<ResultCode, Handle> Synchronization::WaitFor(
     std::vector<std::shared_ptr<SynchronizationObject>>& sync_objects, s64 nano_seconds) {
+    auto& kernel = system.Kernel();
     auto* const thread = system.CurrentScheduler().GetCurrentThread();
-    // Find the first object that is acquirable in the provided list of objects
-    const auto itr = std::find_if(sync_objects.begin(), sync_objects.end(),
-                                  [thread](const std::shared_ptr<SynchronizationObject>& object) {
-                                      return object->IsSignaled();
-                                  });
-
-    if (itr != sync_objects.end()) {
-        // We found a ready object, acquire it and set the result value
-        SynchronizationObject* object = itr->get();
-        object->Acquire(thread);
-        const u32 index = static_cast<s32>(std::distance(sync_objects.begin(), itr));
-        return {RESULT_SUCCESS, index};
+    Handle event_handle = InvalidHandle;
+    {
+        SchedulerLockAndSleep lock(kernel, event_handle, thread, nano_seconds);
+        const auto itr =
+            std::find_if(sync_objects.begin(), sync_objects.end(),
+                         [](const std::shared_ptr<SynchronizationObject>& object) {
+                             return object->IsSignaled();
+                         });
+
+        if (itr != sync_objects.end()) {
+            // We found a ready object, acquire it and set the result value
+            SynchronizationObject* object = itr->get();
+            object->Acquire(thread);
+            const u32 index = static_cast<u32>(std::distance(sync_objects.begin(), itr));
+            lock.CancelSleep();
+            return {RESULT_SUCCESS, index};
+        }
+
+        if (nano_seconds == 0) {
+            lock.CancelSleep();
+            return {RESULT_TIMEOUT, InvalidHandle};
+        }
+
+        if (thread->IsPendingTermination()) {
+            lock.CancelSleep();
+            return {ERR_THREAD_TERMINATING, InvalidHandle};
+        }
+
+        if (thread->IsSyncCancelled()) {
+            thread->SetSyncCancelled(false);
+            lock.CancelSleep();
+            return {ERR_SYNCHRONIZATION_CANCELED, InvalidHandle};
+        }
+
+        for (auto& object : sync_objects) {
+            object->AddWaitingThread(SharedFrom(thread));
+        }
+
+        thread->SetSynchronizationObjects(&sync_objects);
+        thread->SetSynchronizationResults(nullptr, RESULT_TIMEOUT);
+        thread->SetStatus(ThreadStatus::WaitSynch);
+        thread->SetWaitingSync(true);
     }
+    thread->SetWaitingSync(false);
 
-    // No objects were ready to be acquired, prepare to suspend the thread.
-
-    // If a timeout value of 0 was provided, just return the Timeout error code instead of
-    // suspending the thread.
-    if (nano_seconds == 0) {
-        return {RESULT_TIMEOUT, InvalidHandle};
+    if (event_handle != InvalidHandle) {
+        auto& time_manager = kernel.TimeManager();
+        time_manager.UnscheduleTimeEvent(event_handle);
     }
 
-    if (thread->IsSyncCancelled()) {
-        thread->SetSyncCancelled(false);
-        return {ERR_SYNCHRONIZATION_CANCELED, InvalidHandle};
+    {
+        SchedulerLock lock(kernel);
+        ResultCode signaling_result = thread->GetSignalingResult();
+        SynchronizationObject* signaling_object = thread->GetSignalingObject();
+        thread->SetSynchronizationObjects(nullptr);
+        auto shared_thread = SharedFrom(thread);
+        for (auto& obj : sync_objects) {
+            obj->RemoveWaitingThread(shared_thread);
+        }
+        if (signaling_object != nullptr) {
+            const auto itr = std::find_if(
+                sync_objects.begin(), sync_objects.end(),
+                [signaling_object](const std::shared_ptr<SynchronizationObject>& object) {
+                    return object.get() == signaling_object;
+                });
+            ASSERT(itr != sync_objects.end());
+            signaling_object->Acquire(thread);
+            const u32 index = static_cast<u32>(std::distance(sync_objects.begin(), itr));
+            return {signaling_result, index};
+        }
+        return {signaling_result, -1};
     }
-
-    for (auto& object : sync_objects) {
-        object->AddWaitingThread(SharedFrom(thread));
-    }
-
-    thread->SetSynchronizationObjects(std::move(sync_objects));
-    thread->SetStatus(ThreadStatus::WaitSynch);
-
-    // Create an event to wake the thread up after the specified nanosecond delay has passed
-    thread->WakeAfterDelay(nano_seconds);
-    thread->SetWakeupCallback(DefaultThreadWakeupCallback);
-
-    system.PrepareReschedule(thread->GetProcessorID());
-
-    return {RESULT_TIMEOUT, InvalidHandle};
 }
 
 } // namespace Kernel
diff --git a/src/core/hle/kernel/synchronization_object.cpp b/src/core/hle/kernel/synchronization_object.cpp
index 43f3eef18..ba4d39157 100644
--- a/src/core/hle/kernel/synchronization_object.cpp
+++ b/src/core/hle/kernel/synchronization_object.cpp
@@ -38,68 +38,8 @@ void SynchronizationObject::RemoveWaitingThread(std::shared_ptr<Thread> thread)
         waiting_threads.erase(itr);
 }
 
-std::shared_ptr<Thread> SynchronizationObject::GetHighestPriorityReadyThread() const {
-    Thread* candidate = nullptr;
-    u32 candidate_priority = THREADPRIO_LOWEST + 1;
-
-    for (const auto& thread : waiting_threads) {
-        const ThreadStatus thread_status = thread->GetStatus();
-
-        // The list of waiting threads must not contain threads that are not waiting to be awakened.
-        ASSERT_MSG(thread_status == ThreadStatus::WaitSynch ||
-                       thread_status == ThreadStatus::WaitHLEEvent,
-                   "Inconsistent thread statuses in waiting_threads");
-
-        if (thread->GetPriority() >= candidate_priority)
-            continue;
-
-        if (ShouldWait(thread.get()))
-            continue;
-
-        candidate = thread.get();
-        candidate_priority = thread->GetPriority();
-    }
-
-    return SharedFrom(candidate);
-}
-
-void SynchronizationObject::WakeupWaitingThread(std::shared_ptr<Thread> thread) {
-    ASSERT(!ShouldWait(thread.get()));
-
-    if (!thread) {
-        return;
-    }
-
-    if (thread->IsSleepingOnWait()) {
-        for (const auto& object : thread->GetSynchronizationObjects()) {
-            ASSERT(!object->ShouldWait(thread.get()));
-            object->Acquire(thread.get());
-        }
-    } else {
-        Acquire(thread.get());
-    }
-
-    const std::size_t index = thread->GetSynchronizationObjectIndex(SharedFrom(this));
-
-    thread->ClearSynchronizationObjects();
-
-    thread->CancelWakeupTimer();
-
-    bool resume = true;
-    if (thread->HasWakeupCallback()) {
-        resume = thread->InvokeWakeupCallback(ThreadWakeupReason::Signal, thread, SharedFrom(this),
-                                              index);
-    }
-    if (resume) {
-        thread->ResumeFromWait();
-        kernel.PrepareReschedule(thread->GetProcessorID());
-    }
-}
-
-void SynchronizationObject::WakeupAllWaitingThreads() {
-    while (auto thread = GetHighestPriorityReadyThread()) {
-        WakeupWaitingThread(thread);
-    }
+void SynchronizationObject::ClearWaitingThreads() {
+    waiting_threads.clear();
 }
 
 const std::vector<std::shared_ptr<Thread>>& SynchronizationObject::GetWaitingThreads() const {
diff --git a/src/core/hle/kernel/synchronization_object.h b/src/core/hle/kernel/synchronization_object.h
index 741c31faf..f89b24204 100644
--- a/src/core/hle/kernel/synchronization_object.h
+++ b/src/core/hle/kernel/synchronization_object.h
@@ -12,6 +12,7 @@
 namespace Kernel {
 
 class KernelCore;
+class Synchronization;
 class Thread;
 
 /// Class that represents a Kernel object that a thread can be waiting on
@@ -49,24 +50,11 @@ public:
      */
     void RemoveWaitingThread(std::shared_ptr<Thread> thread);
 
-    /**
-     * Wake up all threads waiting on this object that can be awoken, in priority order,
-     * and set the synchronization result and output of the thread.
-     */
-    void WakeupAllWaitingThreads();
-
-    /**
-     * Wakes up a single thread waiting on this object.
-     * @param thread Thread that is waiting on this object to wakeup.
-     */
-    void WakeupWaitingThread(std::shared_ptr<Thread> thread);
-
-    /// Obtains the highest priority thread that is ready to run from this object's waiting list.
-    std::shared_ptr<Thread> GetHighestPriorityReadyThread() const;
-
     /// Get a const reference to the waiting threads list for debug use
     const std::vector<std::shared_ptr<Thread>>& GetWaitingThreads() const;
 
+    void ClearWaitingThreads();
+
 protected:
     bool is_signaled{}; // Tells if this sync object is signalled;
 
diff --git a/src/core/hle/kernel/thread.cpp b/src/core/hle/kernel/thread.cpp
index db7f379ac..2b1092697 100644
--- a/src/core/hle/kernel/thread.cpp
+++ b/src/core/hle/kernel/thread.cpp
@@ -9,12 +9,21 @@
 
 #include "common/assert.h"
 #include "common/common_types.h"
+#include "common/fiber.h"
 #include "common/logging/log.h"
 #include "common/thread_queue_list.h"
 #include "core/arm/arm_interface.h"
+#ifdef ARCHITECTURE_x86_64
+#include "core/arm/dynarmic/arm_dynarmic_32.h"
+#include "core/arm/dynarmic/arm_dynarmic_64.h"
+#endif
+#include "core/arm/cpu_interrupt_handler.h"
+#include "core/arm/exclusive_monitor.h"
+#include "core/arm/unicorn/arm_unicorn.h"
 #include "core/core.h"
 #include "core/core_timing.h"
 #include "core/core_timing_util.h"
+#include "core/cpu_manager.h"
 #include "core/hardware_properties.h"
 #include "core/hle/kernel/errors.h"
 #include "core/hle/kernel/handle_table.h"
@@ -23,6 +32,7 @@
 #include "core/hle/kernel/process.h"
 #include "core/hle/kernel/scheduler.h"
 #include "core/hle/kernel/thread.h"
+#include "core/hle/kernel/time_manager.h"
 #include "core/hle/result.h"
 #include "core/memory.h"
 
@@ -44,46 +54,26 @@ Thread::Thread(KernelCore& kernel) : SynchronizationObject{kernel} {}
 Thread::~Thread() = default;
 
 void Thread::Stop() {
-    // Cancel any outstanding wakeup events for this thread
-    Core::System::GetInstance().CoreTiming().UnscheduleEvent(kernel.ThreadWakeupCallbackEventType(),
-                                                             global_handle);
-    kernel.GlobalHandleTable().Close(global_handle);
-    global_handle = 0;
-    SetStatus(ThreadStatus::Dead);
-    Signal();
-
-    // Clean up any dangling references in objects that this thread was waiting for
-    for (auto& wait_object : wait_objects) {
-        wait_object->RemoveWaitingThread(SharedFrom(this));
-    }
-    wait_objects.clear();
-
-    owner_process->UnregisterThread(this);
-
-    // Mark the TLS slot in the thread's page as free.
-    owner_process->FreeTLSRegion(tls_address);
-}
-
-void Thread::WakeAfterDelay(s64 nanoseconds) {
-    // Don't schedule a wakeup if the thread wants to wait forever
-    if (nanoseconds == -1)
-        return;
+    {
+        SchedulerLock lock(kernel);
+        SetStatus(ThreadStatus::Dead);
+        Signal();
+        kernel.GlobalHandleTable().Close(global_handle);
 
-    // This function might be called from any thread so we have to be cautious and use the
-    // thread-safe version of ScheduleEvent.
-    const s64 cycles = Core::Timing::nsToCycles(std::chrono::nanoseconds{nanoseconds});
-    Core::System::GetInstance().CoreTiming().ScheduleEvent(
-        cycles, kernel.ThreadWakeupCallbackEventType(), global_handle);
-}
+        if (owner_process) {
+            owner_process->UnregisterThread(this);
 
-void Thread::CancelWakeupTimer() {
-    Core::System::GetInstance().CoreTiming().UnscheduleEvent(kernel.ThreadWakeupCallbackEventType(),
-                                                             global_handle);
+            // Mark the TLS slot in the thread's page as free.
+            owner_process->FreeTLSRegion(tls_address);
+        }
+        arm_interface.reset();
+        has_exited = true;
+    }
+    global_handle = 0;
 }
 
 void Thread::ResumeFromWait() {
-    ASSERT_MSG(wait_objects.empty(), "Thread is waking up while waiting for objects");
-
+    SchedulerLock lock(kernel);
     switch (status) {
     case ThreadStatus::Paused:
     case ThreadStatus::WaitSynch:
@@ -99,7 +89,7 @@ void Thread::ResumeFromWait() {
     case ThreadStatus::Ready:
         // The thread's wakeup callback must have already been cleared when the thread was first
         // awoken.
-        ASSERT(wakeup_callback == nullptr);
+        ASSERT(hle_callback == nullptr);
         // If the thread is waiting on multiple wait objects, it might be awoken more than once
         // before actually resuming. We can ignore subsequent wakeups if the thread status has
         // already been set to ThreadStatus::Ready.
@@ -115,24 +105,31 @@ void Thread::ResumeFromWait() {
         return;
     }
 
-    wakeup_callback = nullptr;
+    SetStatus(ThreadStatus::Ready);
+}
+
+void Thread::OnWakeUp() {
+    SchedulerLock lock(kernel);
 
-    if (activity == ThreadActivity::Paused) {
-        SetStatus(ThreadStatus::Paused);
-        return;
-    }
+    SetStatus(ThreadStatus::Ready);
+}
 
+ResultCode Thread::Start() {
+    SchedulerLock lock(kernel);
     SetStatus(ThreadStatus::Ready);
+    return RESULT_SUCCESS;
 }
 
 void Thread::CancelWait() {
-    if (GetSchedulingStatus() != ThreadSchedStatus::Paused) {
+    SchedulerLock lock(kernel);
+    if (GetSchedulingStatus() != ThreadSchedStatus::Paused || !is_waiting_on_sync) {
         is_sync_cancelled = true;
         return;
     }
+    // TODO(Blinkhawk): Implement cancel of server session
     is_sync_cancelled = false;
-    SetWaitSynchronizationResult(ERR_SYNCHRONIZATION_CANCELED);
-    ResumeFromWait();
+    SetSynchronizationResults(nullptr, ERR_SYNCHRONIZATION_CANCELED);
+    SetStatus(ThreadStatus::Ready);
 }
 
 static void ResetThreadContext32(Core::ARM_Interface::ThreadContext32& context, u32 stack_top,
@@ -153,12 +150,29 @@ static void ResetThreadContext64(Core::ARM_Interface::ThreadContext64& context,
     context.fpcr = 0;
 }
 
-ResultVal<std::shared_ptr<Thread>> Thread::Create(KernelCore& kernel, std::string name,
-                                                  VAddr entry_point, u32 priority, u64 arg,
-                                                  s32 processor_id, VAddr stack_top,
-                                                  Process& owner_process) {
+std::shared_ptr<Common::Fiber>& Thread::GetHostContext() {
+    return host_context;
+}
+
+ResultVal<std::shared_ptr<Thread>> Thread::Create(Core::System& system, ThreadType type_flags,
+                                                  std::string name, VAddr entry_point, u32 priority,
+                                                  u64 arg, s32 processor_id, VAddr stack_top,
+                                                  Process* owner_process) {
+    std::function<void(void*)> init_func = system.GetCpuManager().GetGuestThreadStartFunc();
+    void* init_func_parameter = system.GetCpuManager().GetStartFuncParamater();
+    return Create(system, type_flags, std::move(name), entry_point, priority, arg, processor_id,
+                  stack_top, owner_process, std::move(init_func), init_func_parameter);
+}
+
+ResultVal<std::shared_ptr<Thread>> Thread::Create(Core::System& system, ThreadType type_flags,
+                                                  std::string name, VAddr entry_point, u32 priority,
+                                                  u64 arg, s32 processor_id, VAddr stack_top,
+                                                  Process* owner_process,
+                                                  std::function<void(void*)>&& thread_start_func,
+                                                  void* thread_start_parameter) {
+    auto& kernel = system.Kernel();
     // Check if priority is in ranged. Lowest priority -> highest priority id.
-    if (priority > THREADPRIO_LOWEST) {
+    if (priority > THREADPRIO_LOWEST && ((type_flags & THREADTYPE_IDLE) == 0)) {
         LOG_ERROR(Kernel_SVC, "Invalid thread priority: {}", priority);
         return ERR_INVALID_THREAD_PRIORITY;
     }
@@ -168,11 +182,12 @@ ResultVal<std::shared_ptr<Thread>> Thread::Create(KernelCore& kernel, std::strin
         return ERR_INVALID_PROCESSOR_ID;
     }
 
-    auto& system = Core::System::GetInstance();
-    if (!system.Memory().IsValidVirtualAddress(owner_process, entry_point)) {
-        LOG_ERROR(Kernel_SVC, "(name={}): invalid entry {:016X}", name, entry_point);
-        // TODO (bunnei): Find the correct error code to use here
-        return RESULT_UNKNOWN;
+    if (owner_process) {
+        if (!system.Memory().IsValidVirtualAddress(*owner_process, entry_point)) {
+            LOG_ERROR(Kernel_SVC, "(name={}): invalid entry {:016X}", name, entry_point);
+            // TODO (bunnei): Find the correct error code to use here
+            return RESULT_UNKNOWN;
+        }
     }
 
     std::shared_ptr<Thread> thread = std::make_shared<Thread>(kernel);
@@ -183,51 +198,82 @@ ResultVal<std::shared_ptr<Thread>> Thread::Create(KernelCore& kernel, std::strin
     thread->stack_top = stack_top;
     thread->tpidr_el0 = 0;
     thread->nominal_priority = thread->current_priority = priority;
-    thread->last_running_ticks = system.CoreTiming().GetTicks();
+    thread->last_running_ticks = 0;
     thread->processor_id = processor_id;
     thread->ideal_core = processor_id;
     thread->affinity_mask = 1ULL << processor_id;
-    thread->wait_objects.clear();
+    thread->wait_objects = nullptr;
     thread->mutex_wait_address = 0;
     thread->condvar_wait_address = 0;
     thread->wait_handle = 0;
     thread->name = std::move(name);
     thread->global_handle = kernel.GlobalHandleTable().Create(thread).Unwrap();
-    thread->owner_process = &owner_process;
-    auto& scheduler = kernel.GlobalScheduler();
-    scheduler.AddThread(thread);
-    thread->tls_address = thread->owner_process->CreateTLSRegion();
-
-    thread->owner_process->RegisterThread(thread.get());
+    thread->owner_process = owner_process;
+    thread->type = type_flags;
+    if ((type_flags & THREADTYPE_IDLE) == 0) {
+        auto& scheduler = kernel.GlobalScheduler();
+        scheduler.AddThread(thread);
+    }
+    if (owner_process) {
+        thread->tls_address = thread->owner_process->CreateTLSRegion();
+        thread->owner_process->RegisterThread(thread.get());
+    } else {
+        thread->tls_address = 0;
+    }
+    // TODO(peachum): move to ScheduleThread() when scheduler is added so selected core is used
+    // to initialize the context
+    thread->arm_interface.reset();
+    if ((type_flags & THREADTYPE_HLE) == 0) {
+#ifdef ARCHITECTURE_x86_64
+        if (owner_process && !owner_process->Is64BitProcess()) {
+            thread->arm_interface = std::make_unique<Core::ARM_Dynarmic_32>(
+                system, kernel.Interrupts(), kernel.IsMulticore(), kernel.GetExclusiveMonitor(),
+                processor_id);
+        } else {
+            thread->arm_interface = std::make_unique<Core::ARM_Dynarmic_64>(
+                system, kernel.Interrupts(), kernel.IsMulticore(), kernel.GetExclusiveMonitor(),
+                processor_id);
+        }
 
-    ResetThreadContext32(thread->context_32, static_cast<u32>(stack_top),
-                         static_cast<u32>(entry_point), static_cast<u32>(arg));
-    ResetThreadContext64(thread->context_64, stack_top, entry_point, arg);
+#else
+        if (owner_process && !owner_process->Is64BitProcess()) {
+            thread->arm_interface = std::make_unique<Core::ARM_Unicorn>(
+                system, kernel.Interrupts(), kernel.IsMulticore(), Core::ARM_Unicorn::Arch::AArch32,
+                processor_id);
+        } else {
+            thread->arm_interface = std::make_unique<Core::ARM_Unicorn>(
+                system, kernel.Interrupts(), kernel.IsMulticore(), Core::ARM_Unicorn::Arch::AArch64,
+                processor_id);
+        }
+        LOG_WARNING(Core, "CPU JIT requested, but Dynarmic not available");
+#endif
+        ResetThreadContext32(thread->context_32, static_cast<u32>(stack_top),
+                             static_cast<u32>(entry_point), static_cast<u32>(arg));
+        ResetThreadContext64(thread->context_64, stack_top, entry_point, arg);
+    }
+    thread->host_context =
+        std::make_shared<Common::Fiber>(std::move(thread_start_func), thread_start_parameter);
 
     return MakeResult<std::shared_ptr<Thread>>(std::move(thread));
 }
 
 void Thread::SetPriority(u32 priority) {
+    SchedulerLock lock(kernel);
     ASSERT_MSG(priority <= THREADPRIO_LOWEST && priority >= THREADPRIO_HIGHEST,
                "Invalid priority value.");
     nominal_priority = priority;
     UpdatePriority();
 }
 
-void Thread::SetWaitSynchronizationResult(ResultCode result) {
-    context_32.cpu_registers[0] = result.raw;
-    context_64.cpu_registers[0] = result.raw;
-}
-
-void Thread::SetWaitSynchronizationOutput(s32 output) {
-    context_32.cpu_registers[1] = output;
-    context_64.cpu_registers[1] = output;
+void Thread::SetSynchronizationResults(SynchronizationObject* object, ResultCode result) {
+    signaling_object = object;
+    signaling_result = result;
 }
 
 s32 Thread::GetSynchronizationObjectIndex(std::shared_ptr<SynchronizationObject> object) const {
-    ASSERT_MSG(!wait_objects.empty(), "Thread is not waiting for anything");
-    const auto match = std::find(wait_objects.rbegin(), wait_objects.rend(), object);
-    return static_cast<s32>(std::distance(match, wait_objects.rend()) - 1);
+    ASSERT_MSG(wait_objects && !wait_objects->empty(), "Thread is not waiting for anything");
+    const auto match = std::find(wait_objects->rbegin(), wait_objects->rend(), object);
+    return static_cast<s32>(std::distance(match, wait_objects->rend()) - 1);
 }
 
 VAddr Thread::GetCommandBufferAddress() const {
@@ -236,6 +282,14 @@ VAddr Thread::GetCommandBufferAddress() const {
     return GetTLSAddress() + command_header_offset;
 }
 
+Core::ARM_Interface& Thread::ArmInterface() {
+    return *arm_interface;
+}
+
+const Core::ARM_Interface& Thread::ArmInterface() const {
+    return *arm_interface;
+}
+
 void Thread::SetStatus(ThreadStatus new_status) {
     if (new_status == status) {
         return;
@@ -257,10 +311,6 @@ void Thread::SetStatus(ThreadStatus new_status) {
         break;
     }
 
-    if (status == ThreadStatus::Running) {
-        last_running_ticks = Core::System::GetInstance().CoreTiming().GetTicks();
-    }
-
     status = new_status;
 }
 
@@ -341,75 +391,116 @@ void Thread::UpdatePriority() {
     lock_owner->UpdatePriority();
 }
 
-void Thread::ChangeCore(u32 core, u64 mask) {
-    SetCoreAndAffinityMask(core, mask);
-}
-
 bool Thread::AllSynchronizationObjectsReady() const {
-    return std::none_of(wait_objects.begin(), wait_objects.end(),
+    return std::none_of(wait_objects->begin(), wait_objects->end(),
                         [this](const std::shared_ptr<SynchronizationObject>& object) {
                             return object->ShouldWait(this);
                         });
 }
 
-bool Thread::InvokeWakeupCallback(ThreadWakeupReason reason, std::shared_ptr<Thread> thread,
-                                  std::shared_ptr<SynchronizationObject> object,
-                                  std::size_t index) {
-    ASSERT(wakeup_callback);
-    return wakeup_callback(reason, std::move(thread), std::move(object), index);
+bool Thread::InvokeHLECallback(std::shared_ptr<Thread> thread) {
+    ASSERT(hle_callback);
+    return hle_callback(std::move(thread));
 }
 
-void Thread::SetActivity(ThreadActivity value) {
-    activity = value;
+ResultCode Thread::SetActivity(ThreadActivity value) {
+    SchedulerLock lock(kernel);
+
+    auto sched_status = GetSchedulingStatus();
+
+    if (sched_status != ThreadSchedStatus::Runnable && sched_status != ThreadSchedStatus::Paused) {
+        return ERR_INVALID_STATE;
+    }
+
+    if (IsPendingTermination()) {
+        return RESULT_SUCCESS;
+    }
 
     if (value == ThreadActivity::Paused) {
-        // Set status if not waiting
-        if (status == ThreadStatus::Ready || status == ThreadStatus::Running) {
-            SetStatus(ThreadStatus::Paused);
-            kernel.PrepareReschedule(processor_id);
+        if ((pausing_state & static_cast<u32>(ThreadSchedFlags::ThreadPauseFlag)) != 0) {
+            return ERR_INVALID_STATE;
+        }
+        AddSchedulingFlag(ThreadSchedFlags::ThreadPauseFlag);
+    } else {
+        if ((pausing_state & static_cast<u32>(ThreadSchedFlags::ThreadPauseFlag)) == 0) {
+            return ERR_INVALID_STATE;
         }
-    } else if (status == ThreadStatus::Paused) {
-        // Ready to reschedule
-        ResumeFromWait();
+        RemoveSchedulingFlag(ThreadSchedFlags::ThreadPauseFlag);
     }
+    return RESULT_SUCCESS;
 }
 
-void Thread::Sleep(s64 nanoseconds) {
-    // Sleep current thread and check for next thread to schedule
-    SetStatus(ThreadStatus::WaitSleep);
+ResultCode Thread::Sleep(s64 nanoseconds) {
+    Handle event_handle{};
+    {
+        SchedulerLockAndSleep lock(kernel, event_handle, this, nanoseconds);
+        SetStatus(ThreadStatus::WaitSleep);
+    }
 
-    // Create an event to wake the thread up after the specified nanosecond delay has passed
-    WakeAfterDelay(nanoseconds);
+    if (event_handle != InvalidHandle) {
+        auto& time_manager = kernel.TimeManager();
+        time_manager.UnscheduleTimeEvent(event_handle);
+    }
+    return RESULT_SUCCESS;
+}
+
+std::pair<ResultCode, bool> Thread::YieldSimple() {
+    bool is_redundant = false;
+    {
+        SchedulerLock lock(kernel);
+        is_redundant = kernel.GlobalScheduler().YieldThread(this);
+    }
+    return {RESULT_SUCCESS, is_redundant};
+}
+
+std::pair<ResultCode, bool> Thread::YieldAndBalanceLoad() {
+    bool is_redundant = false;
+    {
+        SchedulerLock lock(kernel);
+        is_redundant = kernel.GlobalScheduler().YieldThreadAndBalanceLoad(this);
+    }
+    return {RESULT_SUCCESS, is_redundant};
 }
 
-bool Thread::YieldSimple() {
-    auto& scheduler = kernel.GlobalScheduler();
-    return scheduler.YieldThread(this);
+std::pair<ResultCode, bool> Thread::YieldAndWaitForLoadBalancing() {
+    bool is_redundant = false;
+    {
+        SchedulerLock lock(kernel);
+        is_redundant = kernel.GlobalScheduler().YieldThreadAndWaitForLoadBalancing(this);
+    }
+    return {RESULT_SUCCESS, is_redundant};
 }
 
-bool Thread::YieldAndBalanceLoad() {
-    auto& scheduler = kernel.GlobalScheduler();
-    return scheduler.YieldThreadAndBalanceLoad(this);
+void Thread::AddSchedulingFlag(ThreadSchedFlags flag) {
+    const u32 old_state = scheduling_state;
+    pausing_state |= static_cast<u32>(flag);
+    const u32 base_scheduling = static_cast<u32>(GetSchedulingStatus());
+    scheduling_state = base_scheduling | pausing_state;
+    kernel.GlobalScheduler().AdjustSchedulingOnStatus(this, old_state);
 }
 
-bool Thread::YieldAndWaitForLoadBalancing() {
-    auto& scheduler = kernel.GlobalScheduler();
-    return scheduler.YieldThreadAndWaitForLoadBalancing(this);
+void Thread::RemoveSchedulingFlag(ThreadSchedFlags flag) {
+    const u32 old_state = scheduling_state;
+    pausing_state &= ~static_cast<u32>(flag);
+    const u32 base_scheduling = static_cast<u32>(GetSchedulingStatus());
+    scheduling_state = base_scheduling | pausing_state;
+    kernel.GlobalScheduler().AdjustSchedulingOnStatus(this, old_state);
 }
 
 void Thread::SetSchedulingStatus(ThreadSchedStatus new_status) {
-    const u32 old_flags = scheduling_state;
+    const u32 old_state = scheduling_state;
     scheduling_state = (scheduling_state & static_cast<u32>(ThreadSchedMasks::HighMask)) |
                        static_cast<u32>(new_status);
-    AdjustSchedulingOnStatus(old_flags);
+    kernel.GlobalScheduler().AdjustSchedulingOnStatus(this, old_state);
 }
 
 void Thread::SetCurrentPriority(u32 new_priority) {
     const u32 old_priority = std::exchange(current_priority, new_priority);
-    AdjustSchedulingOnPriority(old_priority);
+    kernel.GlobalScheduler().AdjustSchedulingOnPriority(this, old_priority);
 }
 
 ResultCode Thread::SetCoreAndAffinityMask(s32 new_core, u64 new_affinity_mask) {
+    SchedulerLock lock(kernel);
     const auto HighestSetCore = [](u64 mask, u32 max_cores) {
         for (s32 core = static_cast<s32>(max_cores - 1); core >= 0; core--) {
             if (((mask >> core) & 1) != 0) {
@@ -443,111 +534,12 @@ ResultCode Thread::SetCoreAndAffinityMask(s32 new_core, u64 new_affinity_mask) {
                     processor_id = ideal_core;
                 }
             }
-            AdjustSchedulingOnAffinity(old_affinity_mask, old_core);
+            kernel.GlobalScheduler().AdjustSchedulingOnAffinity(this, old_affinity_mask, old_core);
         }
     }
     return RESULT_SUCCESS;
 }
 
-void Thread::AdjustSchedulingOnStatus(u32 old_flags) {
-    if (old_flags == scheduling_state) {
-        return;
-    }
-
-    auto& scheduler = kernel.GlobalScheduler();
-    if (static_cast<ThreadSchedStatus>(old_flags & static_cast<u32>(ThreadSchedMasks::LowMask)) ==
-        ThreadSchedStatus::Runnable) {
-        // In this case the thread was running, now it's pausing/exitting
-        if (processor_id >= 0) {
-            scheduler.Unschedule(current_priority, static_cast<u32>(processor_id), this);
-        }
-
-        for (u32 core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
-            if (core != static_cast<u32>(processor_id) && ((affinity_mask >> core) & 1) != 0) {
-                scheduler.Unsuggest(current_priority, core, this);
-            }
-        }
-    } else if (GetSchedulingStatus() == ThreadSchedStatus::Runnable) {
-        // The thread is now set to running from being stopped
-        if (processor_id >= 0) {
-            scheduler.Schedule(current_priority, static_cast<u32>(processor_id), this);
-        }
-
-        for (u32 core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
-            if (core != static_cast<u32>(processor_id) && ((affinity_mask >> core) & 1) != 0) {
-                scheduler.Suggest(current_priority, core, this);
-            }
-        }
-    }
-
-    scheduler.SetReselectionPending();
-}
-
-void Thread::AdjustSchedulingOnPriority(u32 old_priority) {
-    if (GetSchedulingStatus() != ThreadSchedStatus::Runnable) {
-        return;
-    }
-    auto& scheduler = kernel.GlobalScheduler();
-    if (processor_id >= 0) {
-        scheduler.Unschedule(old_priority, static_cast<u32>(processor_id), this);
-    }
-
-    for (u32 core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
-        if (core != static_cast<u32>(processor_id) && ((affinity_mask >> core) & 1) != 0) {
-            scheduler.Unsuggest(old_priority, core, this);
-        }
-    }
-
-    // Add thread to the new priority queues.
-    Thread* current_thread = GetCurrentThread();
-
-    if (processor_id >= 0) {
-        if (current_thread == this) {
-            scheduler.SchedulePrepend(current_priority, static_cast<u32>(processor_id), this);
-        } else {
-            scheduler.Schedule(current_priority, static_cast<u32>(processor_id), this);
-        }
-    }
-
-    for (u32 core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
-        if (core != static_cast<u32>(processor_id) && ((affinity_mask >> core) & 1) != 0) {
-            scheduler.Suggest(current_priority, core, this);
-        }
-    }
-
-    scheduler.SetReselectionPending();
-}
-
-void Thread::AdjustSchedulingOnAffinity(u64 old_affinity_mask, s32 old_core) {
-    auto& scheduler = kernel.GlobalScheduler();
-    if (GetSchedulingStatus() != ThreadSchedStatus::Runnable ||
-        current_priority >= THREADPRIO_COUNT) {
-        return;
-    }
-
-    for (u32 core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
-        if (((old_affinity_mask >> core) & 1) != 0) {
-            if (core == static_cast<u32>(old_core)) {
-                scheduler.Unschedule(current_priority, core, this);
-            } else {
-                scheduler.Unsuggest(current_priority, core, this);
-            }
-        }
-    }
-
-    for (u32 core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
-        if (((affinity_mask >> core) & 1) != 0) {
-            if (core == static_cast<u32>(processor_id)) {
-                scheduler.Schedule(current_priority, core, this);
-            } else {
-                scheduler.Suggest(current_priority, core, this);
-            }
-        }
-    }
-
-    scheduler.SetReselectionPending();
-}
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
 /**
diff --git a/src/core/hle/kernel/thread.h b/src/core/hle/kernel/thread.h
index 23fdef8a4..c0342c462 100644
--- a/src/core/hle/kernel/thread.h
+++ b/src/core/hle/kernel/thread.h
@@ -6,26 +6,47 @@
 
 #include <functional>
 #include <string>
+#include <utility>
 #include <vector>
 
 #include "common/common_types.h"
+#include "common/spin_lock.h"
 #include "core/arm/arm_interface.h"
 #include "core/hle/kernel/object.h"
 #include "core/hle/kernel/synchronization_object.h"
 #include "core/hle/result.h"
 
+namespace Common {
+class Fiber;
+}
+
+namespace Core {
+class ARM_Interface;
+class System;
+} // namespace Core
+
 namespace Kernel {
 
+class GlobalScheduler;
 class KernelCore;
 class Process;
 class Scheduler;
 
 enum ThreadPriority : u32 {
-    THREADPRIO_HIGHEST = 0,       ///< Highest thread priority
-    THREADPRIO_USERLAND_MAX = 24, ///< Highest thread priority for userland apps
-    THREADPRIO_DEFAULT = 44,      ///< Default thread priority for userland apps
-    THREADPRIO_LOWEST = 63,       ///< Lowest thread priority
-    THREADPRIO_COUNT = 64,        ///< Total number of possible thread priorities.
+    THREADPRIO_HIGHEST = 0,            ///< Highest thread priority
+    THREADPRIO_MAX_CORE_MIGRATION = 2, ///< Highest priority for a core migration
+    THREADPRIO_USERLAND_MAX = 24,      ///< Highest thread priority for userland apps
+    THREADPRIO_DEFAULT = 44,           ///< Default thread priority for userland apps
+    THREADPRIO_LOWEST = 63,            ///< Lowest thread priority
+    THREADPRIO_COUNT = 64,             ///< Total number of possible thread priorities.
+};
+
+enum ThreadType : u32 {
+    THREADTYPE_USER = 0x1,
+    THREADTYPE_KERNEL = 0x2,
+    THREADTYPE_HLE = 0x4,      ///< Flag bit tested by Thread::IsHLEThread()
+    THREADTYPE_IDLE = 0x8,     ///< Flag bit tested by Thread::IsIdleThread()
+    THREADTYPE_SUSPEND = 0x10, ///< Flag bit tested by Thread::IsSuspendThread()
 };
 
 enum ThreadProcessorId : s32 {
@@ -107,26 +128,45 @@ public:
 
     using ThreadSynchronizationObjects = std::vector<std::shared_ptr<SynchronizationObject>>;
 
-    using WakeupCallback =
-        std::function<bool(ThreadWakeupReason reason, std::shared_ptr<Thread> thread,
-                           std::shared_ptr<SynchronizationObject> object, std::size_t index)>;
+    using HLECallback = std::function<bool(std::shared_ptr<Thread> thread)>;
+
+    /**
+     * Creates and returns a new thread. The new thread is immediately scheduled
+     * @param system The instance of the whole system
+     * @param name The friendly name desired for the thread
+     * @param entry_point The address at which the thread should start execution
+     * @param priority The thread's priority
+     * @param arg User data to pass to the thread
+     * @param processor_id The ID(s) of the processors on which the thread is desired to be run
+     * @param stack_top The address of the thread's stack top
+     * @param owner_process The parent process for the thread, if null, it's a kernel thread
+     * @return A shared pointer to the newly created thread
+     */
+    static ResultVal<std::shared_ptr<Thread>> Create(Core::System& system, ThreadType type_flags,
+                                                     std::string name, VAddr entry_point,
+                                                     u32 priority, u64 arg, s32 processor_id,
+                                                     VAddr stack_top, Process* owner_process);
 
     /**
      * Creates and returns a new thread. The new thread is immediately scheduled
-     * @param kernel The kernel instance this thread will be created under.
+     * @param system The instance of the whole system
      * @param name The friendly name desired for the thread
      * @param entry_point The address at which the thread should start execution
      * @param priority The thread's priority
      * @param arg User data to pass to the thread
      * @param processor_id The ID(s) of the processors on which the thread is desired to be run
      * @param stack_top The address of the thread's stack top
-     * @param owner_process The parent process for the thread
+     * @param owner_process The parent process for the thread, if null, it's a kernel thread
+     * @param thread_start_func The function where the host context will start.
+     * @param thread_start_parameter The parameter which will be passed to host context on init
      * @return A shared pointer to the newly created thread
      */
-    static ResultVal<std::shared_ptr<Thread>> Create(KernelCore& kernel, std::string name,
-                                                     VAddr entry_point, u32 priority, u64 arg,
-                                                     s32 processor_id, VAddr stack_top,
-                                                     Process& owner_process);
+    static ResultVal<std::shared_ptr<Thread>> Create(Core::System& system, ThreadType type_flags,
+                                                     std::string name, VAddr entry_point,
+                                                     u32 priority, u64 arg, s32 processor_id,
+                                                     VAddr stack_top, Process* owner_process,
+                                                     std::function<void(void*)>&& thread_start_func,
+                                                     void* thread_start_parameter);
 
     std::string GetName() const override {
         return name;
@@ -181,7 +221,7 @@ public:
     void UpdatePriority();
 
     /// Changes the core that the thread is running or scheduled to run on.
-    void ChangeCore(u32 core, u64 mask);
+    ResultCode SetCoreAndAffinityMask(s32 new_core, u64 new_affinity_mask);
 
     /**
      * Gets the thread's thread ID
@@ -194,6 +234,10 @@ public:
     /// Resumes a thread from waiting
     void ResumeFromWait();
 
+    void OnWakeUp();
+
+    ResultCode Start();
+
     /// Cancels a waiting operation that this thread may or may not be within.
     ///
     /// When the thread is within a waiting state, this will set the thread's
@@ -202,26 +246,19 @@ public:
     ///
     void CancelWait();
 
-    /**
-     * Schedules an event to wake up the specified thread after the specified delay
-     * @param nanoseconds The time this thread will be allowed to sleep for
-     */
-    void WakeAfterDelay(s64 nanoseconds);
+    void SetSynchronizationResults(SynchronizationObject* object, ResultCode result);
 
-    /// Cancel any outstanding wakeup events for this thread
-    void CancelWakeupTimer();
+    Core::ARM_Interface& ArmInterface();
 
-    /**
-     * Sets the result after the thread awakens (from svcWaitSynchronization)
-     * @param result Value to set to the returned result
-     */
-    void SetWaitSynchronizationResult(ResultCode result);
+    const Core::ARM_Interface& ArmInterface() const;
 
-    /**
-     * Sets the output parameter value after the thread awakens (from svcWaitSynchronization)
-     * @param output Value to set to the output parameter
-     */
-    void SetWaitSynchronizationOutput(s32 output);
+    SynchronizationObject* GetSignalingObject() const {
+        return signaling_object;
+    }
+
+    ResultCode GetSignalingResult() const {
+        return signaling_result;
+    }
 
     /**
      * Retrieves the index that this particular object occupies in the list of objects
@@ -269,11 +306,6 @@ public:
      */
     VAddr GetCommandBufferAddress() const;
 
-    /// Returns whether this thread is waiting on objects from a WaitSynchronization call.
-    bool IsSleepingOnWait() const {
-        return status == ThreadStatus::WaitSynch;
-    }
-
     ThreadContext32& GetContext32() {
         return context_32;
     }
@@ -290,6 +322,28 @@ public:
         return context_64;
     }
 
+    bool IsHLEThread() const {
+        return (type & THREADTYPE_HLE) != 0;
+    }
+
+    bool IsSuspendThread() const {
+        return (type & THREADTYPE_SUSPEND) != 0;
+    }
+
+    bool IsIdleThread() const {
+        return (type & THREADTYPE_IDLE) != 0;
+    }
+
+    bool WasRunning() const {
+        return was_running;
+    }
+
+    void SetWasRunning(bool value) {
+        was_running = value;
+    }
+
+    std::shared_ptr<Common::Fiber>& GetHostContext();
+
     ThreadStatus GetStatus() const {
         return status;
     }
@@ -325,18 +379,18 @@ public:
     }
 
     const ThreadSynchronizationObjects& GetSynchronizationObjects() const {
-        return wait_objects;
+        return *wait_objects;
     }
 
-    void SetSynchronizationObjects(ThreadSynchronizationObjects objects) {
-        wait_objects = std::move(objects);
+    void SetSynchronizationObjects(ThreadSynchronizationObjects* objects) {
+        wait_objects = objects;
     }
 
     void ClearSynchronizationObjects() {
-        for (const auto& waiting_object : wait_objects) {
+        for (const auto& waiting_object : *wait_objects) {
             waiting_object->RemoveWaitingThread(SharedFrom(this));
         }
-        wait_objects.clear();
+        wait_objects->clear();
     }
 
     /// Determines whether all the objects this thread is waiting on are ready.
@@ -386,26 +440,35 @@ public:
         arb_wait_address = address;
     }
 
-    bool HasWakeupCallback() const {
-        return wakeup_callback != nullptr;
+    bool HasHLECallback() const {
+        return hle_callback != nullptr;
     }
 
-    void SetWakeupCallback(WakeupCallback callback) {
-        wakeup_callback = std::move(callback);
+    void SetHLECallback(HLECallback callback) {
+        hle_callback = std::move(callback);
     }
 
-    void InvalidateWakeupCallback() {
-        SetWakeupCallback(nullptr);
+    void SetHLETimeEvent(Handle time_event) {
+        hle_time_event = time_event;
     }
 
-    /**
-     * Invokes the thread's wakeup callback.
-     *
-     * @pre A valid wakeup callback has been set. Violating this precondition
-     *      will cause an assertion to trigger.
-     */
-    bool InvokeWakeupCallback(ThreadWakeupReason reason, std::shared_ptr<Thread> thread,
-                              std::shared_ptr<SynchronizationObject> object, std::size_t index);
+    void SetHLESyncObject(SynchronizationObject* object) {
+        hle_object = object;
+    }
+
+    Handle GetHLETimeEvent() const {
+        return hle_time_event;
+    }
+
+    SynchronizationObject* GetHLESyncObject() const {
+        return hle_object;
+    }
+
+    void InvalidateHLECallback() {
+        SetHLECallback(nullptr);
+    }
+
+    bool InvokeHLECallback(std::shared_ptr<Thread> thread);
 
     u32 GetIdealCore() const {
         return ideal_core;
@@ -415,23 +478,19 @@ public:
         return affinity_mask;
     }
 
-    ThreadActivity GetActivity() const {
-        return activity;
-    }
-
-    void SetActivity(ThreadActivity value);
+    ResultCode SetActivity(ThreadActivity value);
 
     /// Sleeps this thread for the given amount of nanoseconds.
-    void Sleep(s64 nanoseconds);
+    ResultCode Sleep(s64 nanoseconds);
 
     /// Yields this thread without rebalancing loads.
-    bool YieldSimple();
+    std::pair<ResultCode, bool> YieldSimple();
 
     /// Yields this thread and does a load rebalancing.
-    bool YieldAndBalanceLoad();
+    std::pair<ResultCode, bool> YieldAndBalanceLoad();
 
     /// Yields this thread and if the core is left idle, loads are rebalanced
-    bool YieldAndWaitForLoadBalancing();
+    std::pair<ResultCode, bool> YieldAndWaitForLoadBalancing();
 
     void IncrementYieldCount() {
         yield_count++;
@@ -446,6 +505,10 @@ public:
                                               static_cast<u32>(ThreadSchedMasks::LowMask));
     }
 
+    bool IsRunnable() const {
+        return scheduling_state == static_cast<u32>(ThreadSchedStatus::Runnable);
+    }
+
     bool IsRunning() const {
         return is_running;
     }
@@ -466,17 +529,67 @@ public:
         return global_handle;
     }
 
+    bool IsWaitingForArbitration() const {
+        return waiting_for_arbitration;
+    }
+
+    void WaitForArbitration(bool set) {
+        waiting_for_arbitration = set;
+    }
+
+    bool IsWaitingSync() const {
+        return is_waiting_on_sync;
+    }
+
+    void SetWaitingSync(bool is_waiting) {
+        is_waiting_on_sync = is_waiting;
+    }
+
+    bool IsPendingTermination() const {
+        return will_be_terminated || GetSchedulingStatus() == ThreadSchedStatus::Exited;
+    }
+
+    bool IsPaused() const {
+        return pausing_state != 0;
+    }
+
+    bool IsContinuousOnSVC() const {
+        return is_continuous_on_svc;
+    }
+
+    void SetContinuousOnSVC(bool is_continuous) {
+        is_continuous_on_svc = is_continuous;
+    }
+
+    bool IsPhantomMode() const {
+        return is_phantom_mode;
+    }
+
+    void SetPhantomMode(bool phantom) {
+        is_phantom_mode = phantom;
+    }
+
+    bool HasExited() const {
+        return has_exited;
+    }
+
 private:
+    friend class GlobalScheduler;
+    friend class Scheduler;
+
     void SetSchedulingStatus(ThreadSchedStatus new_status);
+    void AddSchedulingFlag(ThreadSchedFlags flag);
+    void RemoveSchedulingFlag(ThreadSchedFlags flag);
+
     void SetCurrentPriority(u32 new_priority);
-    ResultCode SetCoreAndAffinityMask(s32 new_core, u64 new_affinity_mask);
 
-    void AdjustSchedulingOnStatus(u32 old_flags);
-    void AdjustSchedulingOnPriority(u32 old_priority);
     void AdjustSchedulingOnAffinity(u64 old_affinity_mask, s32 old_core);
 
+    Common::SpinLock context_guard{};
     ThreadContext32 context_32{};
     ThreadContext64 context_64{};
+    std::unique_ptr<Core::ARM_Interface> arm_interface{};
+    std::shared_ptr<Common::Fiber> host_context{};
 
     u64 thread_id = 0;
 
@@ -485,6 +598,8 @@ private:
     VAddr entry_point = 0;
     VAddr stack_top = 0;
 
+    ThreadType type;
+
     /// Nominal thread priority, as set by the emulated application.
     /// The nominal priority is the thread priority without priority
     /// inheritance taken into account.
@@ -509,7 +624,10 @@ private:
 
     /// Objects that the thread is waiting on, in the same order as they were
     /// passed to WaitSynchronization.
-    ThreadSynchronizationObjects wait_objects;
+    ThreadSynchronizationObjects* wait_objects = nullptr;
+
+    SynchronizationObject* signaling_object = nullptr; ///< Returned by GetSignalingObject()
+    ResultCode signaling_result{RESULT_SUCCESS};
 
     /// List of threads that are waiting for a mutex that is held by this thread.
     MutexWaitingThreads wait_mutex_threads;
@@ -526,30 +644,39 @@ private:
 
     /// If waiting for an AddressArbiter, this is the address being waited on.
     VAddr arb_wait_address{0};
+    bool waiting_for_arbitration{};
 
     /// Handle used as userdata to reference this object when inserting into the CoreTiming queue.
     Handle global_handle = 0;
 
-    /// Callback that will be invoked when the thread is resumed from a waiting state. If the thread
-    /// was waiting via WaitSynchronization then the object will be the last object that became
-    /// available. In case of a timeout, the object will be nullptr.
-    WakeupCallback wakeup_callback;
+    /// Callback for HLE Events
+    HLECallback hle_callback;
+    Handle hle_time_event{};                     ///< Returned by GetHLETimeEvent()
+    SynchronizationObject* hle_object = nullptr; ///< Returned by GetHLESyncObject()
 
     Scheduler* scheduler = nullptr;
 
     u32 ideal_core{0xFFFFFFFF};
     u64 affinity_mask{0x1};
 
-    ThreadActivity activity = ThreadActivity::Normal;
-
     s32 ideal_core_override = -1;
     u64 affinity_mask_override = 0x1;
     u32 affinity_override_count = 0;
 
     u32 scheduling_state = 0;
+    u32 pausing_state = 0;
     bool is_running = false;
+    bool is_waiting_on_sync = false;
     bool is_sync_cancelled = false;
 
+    bool is_continuous_on_svc = false;
+
+    bool will_be_terminated = false;
+    bool is_phantom_mode = false;
+    bool has_exited = false;
+
+    bool was_running = false;
+
     std::string name;
 };
 
diff --git a/src/core/hle/kernel/time_manager.cpp b/src/core/hle/kernel/time_manager.cpp
index 21b290468..941305e8e 100644
--- a/src/core/hle/kernel/time_manager.cpp
+++ b/src/core/hle/kernel/time_manager.cpp
@@ -8,30 +8,37 @@
 #include "core/core_timing_util.h"
 #include "core/hle/kernel/handle_table.h"
 #include "core/hle/kernel/kernel.h"
+#include "core/hle/kernel/scheduler.h"
 #include "core/hle/kernel/thread.h"
 #include "core/hle/kernel/time_manager.h"
 
 namespace Kernel {
 
-TimeManager::TimeManager(Core::System& system) : system{system} {
+TimeManager::TimeManager(Core::System& system_) : system{system_} {
     time_manager_event_type = Core::Timing::CreateEvent(
         "Kernel::TimeManagerCallback", [this](u64 thread_handle, [[maybe_unused]] s64 cycles_late) {
+            SchedulerLock lock(system.Kernel());
             Handle proper_handle = static_cast<Handle>(thread_handle);
+            if (cancelled_events[proper_handle]) {
+                return;
+            }
             std::shared_ptr<Thread> thread =
                 this->system.Kernel().RetrieveThreadFromGlobalHandleTable(proper_handle);
-            thread->ResumeFromWait();
+            thread->OnWakeUp();
         });
 }
 
 void TimeManager::ScheduleTimeEvent(Handle& event_handle, Thread* timetask, s64 nanoseconds) {
     if (nanoseconds > 0) {
         ASSERT(timetask);
         event_handle = timetask->GetGlobalHandle();
-        const s64 cycles = Core::Timing::nsToCycles(std::chrono::nanoseconds{nanoseconds});
-        system.CoreTiming().ScheduleEvent(cycles, time_manager_event_type, event_handle);
+        ASSERT(timetask->GetStatus() != ThreadStatus::Ready);
+        ASSERT(timetask->GetStatus() != ThreadStatus::WaitMutex);
+        cancelled_events[event_handle] = false; // Clear any stale cancel mark before arming.
+        system.CoreTiming().ScheduleEvent(nanoseconds, time_manager_event_type, event_handle);
     } else {
         event_handle = InvalidHandle;
     }
 }
 
 void TimeManager::UnscheduleTimeEvent(Handle event_handle) {
@@ -39,6 +46,12 @@ void TimeManager::UnscheduleTimeEvent(Handle event_handle) {
         return;
     }
     system.CoreTiming().UnscheduleEvent(time_manager_event_type, event_handle);
+    cancelled_events[event_handle] = true;
+}
+
+void TimeManager::CancelTimeEvent(Thread* time_task) {
+    Handle event_handle = time_task->GetGlobalHandle();
+    UnscheduleTimeEvent(event_handle);
 }
 
 } // namespace Kernel
diff --git a/src/core/hle/kernel/time_manager.h b/src/core/hle/kernel/time_manager.h
index eaec486d1..307a18765 100644
--- a/src/core/hle/kernel/time_manager.h
+++ b/src/core/hle/kernel/time_manager.h
@@ -5,6 +5,7 @@
 #pragma once
 
 #include <memory>
+#include <unordered_map>
 
 #include "core/hle/kernel/object.h"
 
@@ -35,9 +36,12 @@ public:
     /// Unschedule an existing time event
     void UnscheduleTimeEvent(Handle event_handle);
 
+    void CancelTimeEvent(Thread* time_task);
+
 private:
     Core::System& system;
     std::shared_ptr<Core::Timing::EventType> time_manager_event_type;
+    std::unordered_map<Handle, bool> cancelled_events;
 };
 
 } // namespace Kernel
diff --git a/src/core/hle/service/acc/acc.cpp b/src/core/hle/service/acc/acc.cpp
index 630a8b048..8ac856ec3 100644
--- a/src/core/hle/service/acc/acc.cpp
+++ b/src/core/hle/service/acc/acc.cpp
@@ -44,6 +44,218 @@ static constexpr u32 SanitizeJPEGSize(std::size_t size) {
     return static_cast<u32>(std::min(size, max_jpeg_image_size));
 }
 
+class IManagerForSystemService final : public ServiceFramework<IManagerForSystemService> {
+public:
+    explicit IManagerForSystemService(Common::UUID user_id)
+        : ServiceFramework("IManagerForSystemService") {
+        // clang-format off
+        static const FunctionInfo functions[] = {
+            {0, nullptr, "CheckAvailability"},
+            {1, nullptr, "GetAccountId"},
+            {2, nullptr, "EnsureIdTokenCacheAsync"},
+            {3, nullptr, "LoadIdTokenCache"},
+            {100, nullptr, "SetSystemProgramIdentification"},
+            {101, nullptr, "RefreshNotificationTokenAsync"}, // 7.0.0+
+            {110, nullptr, "GetServiceEntryRequirementCache"}, // 4.0.0+
+            {111, nullptr, "InvalidateServiceEntryRequirementCache"}, // 4.0.0+
+            {112, nullptr, "InvalidateTokenCache"}, // 4.0.0 - 6.2.0
+            {113, nullptr, "GetServiceEntryRequirementCacheForOnlinePlay"}, // 6.1.0+
+            {120, nullptr, "GetNintendoAccountId"},
+            {121, nullptr, "CalculateNintendoAccountAuthenticationFingerprint"}, // 9.0.0+
+            {130, nullptr, "GetNintendoAccountUserResourceCache"},
+            {131, nullptr, "RefreshNintendoAccountUserResourceCacheAsync"},
+            {132, nullptr, "RefreshNintendoAccountUserResourceCacheAsyncIfSecondsElapsed"},
+            {133, nullptr, "GetNintendoAccountVerificationUrlCache"}, // 9.0.0+
+            {134, nullptr, "RefreshNintendoAccountVerificationUrlCacheAsync"}, // 9.0.0+
+            {135, nullptr, "RefreshNintendoAccountVerificationUrlCacheAsyncIfSecondsElapsed"}, // 9.0.0+
+            {140, nullptr, "GetNetworkServiceLicenseCache"}, // 5.0.0+
+            {141, nullptr, "RefreshNetworkServiceLicenseCacheAsync"}, // 5.0.0+
+            {142, nullptr, "RefreshNetworkServiceLicenseCacheAsyncIfSecondsElapsed"}, // 5.0.0+
+            {150, nullptr, "CreateAuthorizationRequest"},
+        };
+        // clang-format on
+
+        RegisterHandlers(functions);
+    }
+};
+
+// 3.0.0+
+class IFloatingRegistrationRequest final : public ServiceFramework<IFloatingRegistrationRequest> {
+public:
+    explicit IFloatingRegistrationRequest(Common::UUID user_id)
+        : ServiceFramework("IFloatingRegistrationRequest") {
+        // clang-format off
+        static const FunctionInfo functions[] = {
+            {0, nullptr, "GetSessionId"},
+            {12, nullptr, "GetAccountId"},
+            {13, nullptr, "GetLinkedNintendoAccountId"},
+            {14, nullptr, "GetNickname"},
+            {15, nullptr, "GetProfileImage"},
+            {21, nullptr, "LoadIdTokenCache"},
+            {100, nullptr, "RegisterUser"}, // [1.0.0-3.0.2] RegisterAsync
+            {101, nullptr, "RegisterUserWithUid"}, // [1.0.0-3.0.2] RegisterWithUidAsync
+            {102, nullptr, "RegisterNetworkServiceAccountAsync"}, // 4.0.0+
+            {103, nullptr, "RegisterNetworkServiceAccountWithUidAsync"}, // 4.0.0+
+            {110, nullptr, "SetSystemProgramIdentification"},
+            {111, nullptr, "EnsureIdTokenCacheAsync"},
+        };
+        // clang-format on
+
+        RegisterHandlers(functions);
+    }
+};
+
+class IAdministrator final : public ServiceFramework<IAdministrator> {
+public:
+    explicit IAdministrator(Common::UUID user_id) : ServiceFramework("IAdministrator") {
+        // clang-format off
+        static const FunctionInfo functions[] = {
+            {0, nullptr, "CheckAvailability"},
+            {1, nullptr, "GetAccountId"},
+            {2, nullptr, "EnsureIdTokenCacheAsync"},
+            {3, nullptr, "LoadIdTokenCache"},
+            {100, nullptr, "SetSystemProgramIdentification"},
+            {101, nullptr, "RefreshNotificationTokenAsync"}, // 7.0.0+
+            {110, nullptr, "GetServiceEntryRequirementCache"}, // 4.0.0+
+            {111, nullptr, "InvalidateServiceEntryRequirementCache"}, // 4.0.0+
+            {112, nullptr, "InvalidateTokenCache"}, // 4.0.0 - 6.2.0
+            {113, nullptr, "GetServiceEntryRequirementCacheForOnlinePlay"}, // 6.1.0+
+            {120, nullptr, "GetNintendoAccountId"},
+            {121, nullptr, "CalculateNintendoAccountAuthenticationFingerprint"}, // 9.0.0+
+            {130, nullptr, "GetNintendoAccountUserResourceCache"},
+            {131, nullptr, "RefreshNintendoAccountUserResourceCacheAsync"},
+            {132, nullptr, "RefreshNintendoAccountUserResourceCacheAsyncIfSecondsElapsed"},
+            {133, nullptr, "GetNintendoAccountVerificationUrlCache"}, // 9.0.0+
+            {134, nullptr, "RefreshNintendoAccountVerificationUrlCacheAsync"}, // 9.0.0+
+            {135, nullptr, "RefreshNintendoAccountVerificationUrlCacheAsyncIfSecondsElapsed"}, // 9.0.0+
+            {140, nullptr, "GetNetworkServiceLicenseCache"}, // 5.0.0+
+            {141, nullptr, "RefreshNetworkServiceLicenseCacheAsync"}, // 5.0.0+
+            {142, nullptr, "RefreshNetworkServiceLicenseCacheAsyncIfSecondsElapsed"}, // 5.0.0+
+            {150, nullptr, "CreateAuthorizationRequest"},
+            {200, nullptr, "IsRegistered"},
+            {201, nullptr, "RegisterAsync"},
+            {202, nullptr, "UnregisterAsync"},
+            {203, nullptr, "DeleteRegistrationInfoLocally"},
+            {220, nullptr, "SynchronizeProfileAsync"},
+            {221, nullptr, "UploadProfileAsync"},
+            {222, nullptr, "SynchronizeProfileAsyncIfSecondsElapsed"},
+            {250, nullptr, "IsLinkedWithNintendoAccount"},
+            {251, nullptr, "CreateProcedureToLinkWithNintendoAccount"},
+            {252, nullptr, "ResumeProcedureToLinkWithNintendoAccount"},
+            {255, nullptr, "CreateProcedureToUpdateLinkageStateOfNintendoAccount"},
+            {256, nullptr, "ResumeProcedureToUpdateLinkageStateOfNintendoAccount"},
+            {260, nullptr, "CreateProcedureToLinkNnidWithNintendoAccount"}, // 3.0.0+
+            {261, nullptr, "ResumeProcedureToLinkNnidWithNintendoAccount"}, // 3.0.0+
+            {280, nullptr, "ProxyProcedureToAcquireApplicationAuthorizationForNintendoAccount"},
+            {290, nullptr, "GetRequestForNintendoAccountUserResourceView"}, // 8.0.0+
+            {300, nullptr, "TryRecoverNintendoAccountUserStateAsync"}, // 6.0.0+
+            {400, nullptr, "IsServiceEntryRequirementCacheRefreshRequiredForOnlinePlay"}, // 6.1.0+
+            {401, nullptr, "RefreshServiceEntryRequirementCacheForOnlinePlayAsync"}, // 6.1.0+
+            {900, nullptr, "GetAuthenticationInfoForWin"}, // 9.0.0+
+            {901, nullptr, "ImportAsyncForWin"}, // 9.0.0+
+            {997, nullptr, "DebugUnlinkNintendoAccountAsync"},
+            {998, nullptr, "DebugSetAvailabilityErrorDetail"},
+        };
+        // clang-format on
+
+        RegisterHandlers(functions);
+    }
+};
+
+class IAuthorizationRequest final : public ServiceFramework<IAuthorizationRequest> {
+public:
+    explicit IAuthorizationRequest(Common::UUID user_id)
+        : ServiceFramework("IAuthorizationRequest") {
+        // clang-format off
+        static const FunctionInfo functions[] = {
+            {0, nullptr, "GetSessionId"},
+            {10, nullptr, "InvokeWithoutInteractionAsync"},
+            {19, nullptr, "IsAuthorized"},
+            {20, nullptr, "GetAuthorizationCode"},
+            {21, nullptr, "GetIdToken"},
+            {22, nullptr, "GetState"},
+        };
+        // clang-format on
+
+        RegisterHandlers(functions);
+    }
+};
+
+class IOAuthProcedure final : public ServiceFramework<IOAuthProcedure> {
+public:
+    explicit IOAuthProcedure(Common::UUID user_id) : ServiceFramework("IOAuthProcedure") {
+        // clang-format off
+        static const FunctionInfo functions[] = {
+            {0, nullptr, "PrepareAsync"},
+            {1, nullptr, "GetRequest"},
+            {2, nullptr, "ApplyResponse"},
+            {3, nullptr, "ApplyResponseAsync"},
+            {10, nullptr, "Suspend"},
+        };
+        // clang-format on
+
+        RegisterHandlers(functions);
+    }
+};
+
+// 3.0.0+
+class IOAuthProcedureForExternalNsa final : public ServiceFramework<IOAuthProcedureForExternalNsa> {
+public:
+    explicit IOAuthProcedureForExternalNsa(Common::UUID user_id)
+        : ServiceFramework("IOAuthProcedureForExternalNsa") {
+        // clang-format off
+        static const FunctionInfo functions[] = {
+            {0, nullptr, "PrepareAsync"},
+            {1, nullptr, "GetRequest"},
+            {2, nullptr, "ApplyResponse"},
+            {3, nullptr, "ApplyResponseAsync"},
+            {10, nullptr, "Suspend"},
+            {100, nullptr, "GetAccountId"},
+            {101, nullptr, "GetLinkedNintendoAccountId"},
+            {102, nullptr, "GetNickname"},
+            {103, nullptr, "GetProfileImage"},
+        };
+        // clang-format on
+
+        RegisterHandlers(functions);
+    }
+};
+
+class IOAuthProcedureForNintendoAccountLinkage final
+    : public ServiceFramework<IOAuthProcedureForNintendoAccountLinkage> {
+public:
+    explicit IOAuthProcedureForNintendoAccountLinkage(Common::UUID user_id)
+        : ServiceFramework("IOAuthProcedureForNintendoAccountLinkage") {
+        // clang-format off
+        static const FunctionInfo functions[] = {
+            {0, nullptr, "PrepareAsync"},
+            {1, nullptr, "GetRequest"},
+            {2, nullptr, "ApplyResponse"},
+            {3, nullptr, "ApplyResponseAsync"},
+            {10, nullptr, "Suspend"},
+            {100, nullptr, "GetRequestWithTheme"},
+            {101, nullptr, "IsNetworkServiceAccountReplaced"},
+            {199, nullptr, "GetUrlForIntroductionOfExtraMembership"}, // 2.0.0 - 5.1.0
+        };
+        // clang-format on
+
+        RegisterHandlers(functions);
+    }
+};
+
+class INotifier final : public ServiceFramework<INotifier> {
+public:
+    explicit INotifier(Common::UUID user_id) : ServiceFramework("INotifier") {
+        // clang-format off
+        static const FunctionInfo functions[] = {
+            {0, nullptr, "GetSystemEvent"},
+        };
+        // clang-format on
+
+        RegisterHandlers(functions);
+    }
+};
+
 class IProfileCommon : public ServiceFramework<IProfileCommon> {
 public:
     explicit IProfileCommon(const char* name, bool editor_commands, Common::UUID user_id,
@@ -226,6 +438,54 @@ public:
         : IProfileCommon("IProfileEditor", true, user_id, profile_manager) {}
 };
 
+class IAsyncContext final : public ServiceFramework<IAsyncContext> {
+public:
+    explicit IAsyncContext(Common::UUID) : ServiceFramework("IAsyncContext") {
+        // clang-format off
+        static const FunctionInfo functions[] = {
+            {0, nullptr, "GetSystemEvent"},
+            {1, nullptr, "Cancel"},
+            {2, nullptr, "HasDone"},
+            {3, nullptr, "GetResult"},
+        };
+        // clang-format on
+
+        RegisterHandlers(functions); // stub: all handlers nullptr, UUID unused
+    }
+};
+
+class ISessionObject final : public ServiceFramework<ISessionObject> {
+public:
+    explicit ISessionObject(Common::UUID) : ServiceFramework("ISessionObject") {
+        // clang-format off
+        static const FunctionInfo functions[] = {
+            {999, nullptr, "Dummy"},
+        };
+        // clang-format on
+
+        RegisterHandlers(functions); // stub: all handlers nullptr, UUID unused
+    }
+};
+
+class IGuestLoginRequest final : public ServiceFramework<IGuestLoginRequest> {
+public:
+    explicit IGuestLoginRequest(Common::UUID) : ServiceFramework("IGuestLoginRequest") {
+        // clang-format off
+        static const FunctionInfo functions[] = {
+            {0, nullptr, "GetSessionId"},
+            {11, nullptr, "Unknown"}, // 1.0.0 - 2.3.0 (the name is blank on Switchbrew)
+            {12, nullptr, "GetAccountId"},
+            {13, nullptr, "GetLinkedNintendoAccountId"},
+            {14, nullptr, "GetNickname"},
+            {15, nullptr, "GetProfileImage"},
+            {21, nullptr, "LoadIdTokenCache"}, // 3.0.0+
+        };
+        // clang-format on
+
+        RegisterHandlers(functions); // stub: all handlers nullptr, UUID unused
+    }
+};
+
 class IManagerForApplication final : public ServiceFramework<IManagerForApplication> {
 public:
     explicit IManagerForApplication(Common::UUID user_id)
@@ -265,6 +525,87 @@ private:
     Common::UUID user_id;
 };
 
+// 6.0.0+
+class IAsyncNetworkServiceLicenseKindContext final
+    : public ServiceFramework<IAsyncNetworkServiceLicenseKindContext> {
+public:
+    explicit IAsyncNetworkServiceLicenseKindContext(Common::UUID)
+        : ServiceFramework("IAsyncNetworkServiceLicenseKindContext") {
+        // clang-format off
+        static const FunctionInfo functions[] = {
+            {0, nullptr, "GetSystemEvent"},
+            {1, nullptr, "Cancel"},
+            {2, nullptr, "HasDone"},
+            {3, nullptr, "GetResult"},
+            {4, nullptr, "GetNetworkServiceLicenseKind"},
+        };
+        // clang-format on
+
+        RegisterHandlers(functions); // stub: all handlers nullptr, UUID unused
+    }
+};
+
+// 8.0.0+
+class IOAuthProcedureForUserRegistration final
+    : public ServiceFramework<IOAuthProcedureForUserRegistration> {
+public:
+    explicit IOAuthProcedureForUserRegistration(Common::UUID)
+        : ServiceFramework("IOAuthProcedureForUserRegistration") {
+        // clang-format off
+        static const FunctionInfo functions[] = {
+            {0, nullptr, "PrepareAsync"},
+            {1, nullptr, "GetRequest"},
+            {2, nullptr, "ApplyResponse"},
+            {3, nullptr, "ApplyResponseAsync"},
+            {10, nullptr, "Suspend"},
+            {100, nullptr, "GetAccountId"},
+            {101, nullptr, "GetLinkedNintendoAccountId"},
+            {102, nullptr, "GetNickname"},
+            {103, nullptr, "GetProfileImage"},
+            {110, nullptr, "RegisterUserAsync"},
+            {111, nullptr, "GetUid"},
+        };
+        // clang-format on
+
+        RegisterHandlers(functions); // stub: all handlers nullptr, UUID unused
+    }
+};
+
+class DAUTH_O final : public ServiceFramework<DAUTH_O> {
+public:
+    explicit DAUTH_O(Common::UUID) : ServiceFramework("dauth:o") {
+        // clang-format off
+        static const FunctionInfo functions[] = {
+            {0, nullptr, "EnsureAuthenticationTokenCacheAsync"}, // [5.0.0-5.1.0] GeneratePostData
+            {1, nullptr, "LoadAuthenticationTokenCache"}, // 6.0.0+
+            {2, nullptr, "InvalidateAuthenticationTokenCache"}, // 6.0.0+
+            {10, nullptr, "EnsureEdgeTokenCacheAsync"}, // 6.0.0+
+            {11, nullptr, "LoadEdgeTokenCache"}, // 6.0.0+
+            {12, nullptr, "InvalidateEdgeTokenCache"}, // 6.0.0+
+        };
+        // clang-format on
+
+        RegisterHandlers(functions); // stub: all handlers nullptr, UUID unused
+    }
+};
+
+// 6.0.0+
+class IAsyncResult final : public ServiceFramework<IAsyncResult> {
+public:
+    explicit IAsyncResult(Common::UUID) : ServiceFramework("IAsyncResult") {
+        // clang-format off
+        static const FunctionInfo functions[] = {
+            {0, nullptr, "GetResult"},
+            {1, nullptr, "Cancel"},
+            {2, nullptr, "IsAvailable"},
+            {3, nullptr, "GetSystemEvent"},
+        };
+        // clang-format on
+
+        RegisterHandlers(functions); // stub: all handlers nullptr, UUID unused
+    }
+};
+
 void Module::Interface::GetUserCount(Kernel::HLERequestContext& ctx) {
     LOG_DEBUG(Service_ACC, "called");
     IPC::ResponseBuilder rb{ctx, 3};
@@ -435,6 +776,15 @@ void Module::Interface::ListQualifiedUsers(Kernel::HLERequestContext& ctx) {
     rb.Push(RESULT_SUCCESS);
 }
 
+void Module::Interface::ListOpenContextStoredUsers(Kernel::HLERequestContext& ctx) {
+    LOG_WARNING(Service_ACC, "(STUBBED) called");
+
+    // TODO(ogniK): Handle open contexts; for now the open-user list is written out as-is.
+    ctx.WriteBuffer(profile_manager->GetOpenUsers());
+    IPC::ResponseBuilder rb{ctx, 2};
+    rb.Push(RESULT_SUCCESS);
+}
+
 void Module::Interface::TrySelectUserWithoutInteraction(Kernel::HLERequestContext& ctx) {
     LOG_DEBUG(Service_ACC, "called");
     // A u8 is passed into this function which we can safely ignore. It's to determine if we have
diff --git a/src/core/hle/service/acc/acc.h b/src/core/hle/service/acc/acc.h
index 74ca39d6e..d4c6395c6 100644
--- a/src/core/hle/service/acc/acc.h
+++ b/src/core/hle/service/acc/acc.h
@@ -34,6 +34,7 @@ public:
         void IsUserAccountSwitchLocked(Kernel::HLERequestContext& ctx);
         void GetProfileEditor(Kernel::HLERequestContext& ctx);
         void ListQualifiedUsers(Kernel::HLERequestContext& ctx);
+        void ListOpenContextStoredUsers(Kernel::HLERequestContext& ctx);
 
     private:
         ResultCode InitializeApplicationInfoBase();
diff --git a/src/core/hle/service/acc/acc_aa.cpp b/src/core/hle/service/acc/acc_aa.cpp
index 3bac6bcd1..51f119b12 100644
--- a/src/core/hle/service/acc/acc_aa.cpp
+++ b/src/core/hle/service/acc/acc_aa.cpp
@@ -13,8 +13,8 @@ ACC_AA::ACC_AA(std::shared_ptr<Module> module, std::shared_ptr<ProfileManager> p
         {0, nullptr, "EnsureCacheAsync"},
         {1, nullptr, "LoadCache"},
         {2, nullptr, "GetDeviceAccountId"},
-        {50, nullptr, "RegisterNotificationTokenAsync"},
-        {51, nullptr, "UnregisterNotificationTokenAsync"},
+        {50, nullptr, "RegisterNotificationTokenAsync"},   // 1.0.0 - 6.2.0
+        {51, nullptr, "UnregisterNotificationTokenAsync"}, // 1.0.0 - 6.2.0
     };
     RegisterHandlers(functions);
 }
diff --git a/src/core/hle/service/acc/acc_su.cpp b/src/core/hle/service/acc/acc_su.cpp
index 2eefc6df5..d2bb8c2c8 100644
--- a/src/core/hle/service/acc/acc_su.cpp
+++ b/src/core/hle/service/acc/acc_su.cpp
@@ -17,28 +17,28 @@ ACC_SU::ACC_SU(std::shared_ptr<Module> module, std::shared_ptr<ProfileManager> p
         {3, &ACC_SU::ListOpenUsers, "ListOpenUsers"},
         {4, &ACC_SU::GetLastOpenedUser, "GetLastOpenedUser"},
         {5, &ACC_SU::GetProfile, "GetProfile"},
-        {6, nullptr, "GetProfileDigest"},
+        {6, nullptr, "GetProfileDigest"}, // 3.0.0+
         {50, &ACC_SU::IsUserRegistrationRequestPermitted, "IsUserRegistrationRequestPermitted"},
         {51, &ACC_SU::TrySelectUserWithoutInteraction, "TrySelectUserWithoutInteraction"},
-        {60, nullptr, "ListOpenContextStoredUsers"},
-        {99, nullptr, "DebugActivateOpenContextRetention"},
+        {60, &ACC_SU::ListOpenContextStoredUsers, "ListOpenContextStoredUsers"}, // 5.0.0 - 5.1.0
+        {99, nullptr, "DebugActivateOpenContextRetention"}, // 6.0.0+
         {100, nullptr, "GetUserRegistrationNotifier"},
         {101, nullptr, "GetUserStateChangeNotifier"},
         {102, nullptr, "GetBaasAccountManagerForSystemService"},
         {103, nullptr, "GetBaasUserAvailabilityChangeNotifier"},
         {104, nullptr, "GetProfileUpdateNotifier"},
-        {105, nullptr, "CheckNetworkServiceAvailabilityAsync"},
-        {106, nullptr, "GetProfileSyncNotifier"},
+        {105, nullptr, "CheckNetworkServiceAvailabilityAsync"}, // 4.0.0+
+        {106, nullptr, "GetProfileSyncNotifier"}, // 9.0.0+
         {110, nullptr, "StoreSaveDataThumbnail"},
         {111, nullptr, "ClearSaveDataThumbnail"},
         {112, nullptr, "LoadSaveDataThumbnail"},
-        {113, nullptr, "GetSaveDataThumbnailExistence"},
-        {120, nullptr, "ListOpenUsersInApplication"},
-        {130, nullptr, "ActivateOpenContextRetention"},
-        {140, &ACC_SU::ListQualifiedUsers, "ListQualifiedUsers"},
-        {150, nullptr, "AuthenticateApplicationAsync"},
-        {190, nullptr, "GetUserLastOpenedApplication"},
-        {191, nullptr, "ActivateOpenContextHolder"},
+        {113, nullptr, "GetSaveDataThumbnailExistence"}, // 5.0.0+
+        {120, nullptr, "ListOpenUsersInApplication"}, // 10.0.0+
+        {130, nullptr, "ActivateOpenContextRetention"}, // 6.0.0+
+        {140, &ACC_SU::ListQualifiedUsers, "ListQualifiedUsers"}, // 6.0.0+
+        {150, nullptr, "AuthenticateApplicationAsync"}, // 10.0.0+
+        {190, nullptr, "GetUserLastOpenedApplication"}, // 1.0.0 - 9.2.0
+        {191, nullptr, "ActivateOpenContextHolder"}, // 7.0.0+
         {200, nullptr, "BeginUserRegistration"},
         {201, nullptr, "CompleteUserRegistration"},
         {202, nullptr, "CancelUserRegistration"},
@@ -46,15 +46,15 @@ ACC_SU::ACC_SU(std::shared_ptr<Module> module, std::shared_ptr<ProfileManager> p
         {204, nullptr, "SetUserPosition"},
         {205, &ACC_SU::GetProfileEditor, "GetProfileEditor"},
         {206, nullptr, "CompleteUserRegistrationForcibly"},
-        {210, nullptr, "CreateFloatingRegistrationRequest"},
-        {211, nullptr, "CreateProcedureToRegisterUserWithNintendoAccount"},
-        {212, nullptr, "ResumeProcedureToRegisterUserWithNintendoAccount"},
+        {210, nullptr, "CreateFloatingRegistrationRequest"}, // 3.0.0+
+        {211, nullptr, "CreateProcedureToRegisterUserWithNintendoAccount"}, // 8.0.0+
+        {212, nullptr, "ResumeProcedureToRegisterUserWithNintendoAccount"}, // 8.0.0+
         {230, nullptr, "AuthenticateServiceAsync"},
         {250, nullptr, "GetBaasAccountAdministrator"},
         {290, nullptr, "ProxyProcedureForGuestLoginWithNintendoAccount"},
-        {291, nullptr, "ProxyProcedureForFloatingRegistrationWithNintendoAccount"},
+        {291, nullptr, "ProxyProcedureForFloatingRegistrationWithNintendoAccount"}, // 3.0.0+
         {299, nullptr, "SuspendBackgroundDaemon"},
-        {997, nullptr, "DebugInvalidateTokenCacheForUser"},
+        {997, nullptr, "DebugInvalidateTokenCacheForUser"}, // 3.0.0+
         {998, nullptr, "DebugSetUserStateClose"},
         {999, nullptr, "DebugSetUserStateOpen"},
     };
diff --git a/src/core/hle/service/acc/acc_u0.cpp b/src/core/hle/service/acc/acc_u0.cpp
index fb4e7e772..cb44e06b7 100644
--- a/src/core/hle/service/acc/acc_u0.cpp
+++ b/src/core/hle/service/acc/acc_u0.cpp
@@ -17,23 +17,23 @@ ACC_U0::ACC_U0(std::shared_ptr<Module> module, std::shared_ptr<ProfileManager> p
         {3, &ACC_U0::ListOpenUsers, "ListOpenUsers"},
         {4, &ACC_U0::GetLastOpenedUser, "GetLastOpenedUser"},
         {5, &ACC_U0::GetProfile, "GetProfile"},
-        {6, nullptr, "GetProfileDigest"},
+        {6, nullptr, "GetProfileDigest"}, // 3.0.0+
         {50, &ACC_U0::IsUserRegistrationRequestPermitted, "IsUserRegistrationRequestPermitted"},
         {51, &ACC_U0::TrySelectUserWithoutInteraction, "TrySelectUserWithoutInteraction"},
-        {60, nullptr, "ListOpenContextStoredUsers"},
-        {99, nullptr, "DebugActivateOpenContextRetention"},
+        {60, &ACC_U0::ListOpenContextStoredUsers, "ListOpenContextStoredUsers"}, // 5.0.0 - 5.1.0
+        {99, nullptr, "DebugActivateOpenContextRetention"}, // 6.0.0+
         {100, &ACC_U0::InitializeApplicationInfo, "InitializeApplicationInfo"},
         {101, &ACC_U0::GetBaasAccountManagerForApplication, "GetBaasAccountManagerForApplication"},
         {102, nullptr, "AuthenticateApplicationAsync"},
-        {103, nullptr, "CheckNetworkServiceAvailabilityAsync"},
+        {103, nullptr, "CheckNetworkServiceAvailabilityAsync"}, // 4.0.0+
         {110, nullptr, "StoreSaveDataThumbnail"},
         {111, nullptr, "ClearSaveDataThumbnail"},
         {120, nullptr, "CreateGuestLoginRequest"},
-        {130, nullptr, "LoadOpenContext"},
-        {131, nullptr, "ListOpenContextStoredUsers"},
-        {140, &ACC_U0::InitializeApplicationInfoRestricted, "InitializeApplicationInfoRestricted"},
-        {141, &ACC_U0::ListQualifiedUsers, "ListQualifiedUsers"},
-        {150, &ACC_U0::IsUserAccountSwitchLocked, "IsUserAccountSwitchLocked"},
+        {130, nullptr, "LoadOpenContext"}, // 5.0.0+
+        {131, &ACC_U0::ListOpenContextStoredUsers, "ListOpenContextStoredUsers"}, // 6.0.0+
+        {140, &ACC_U0::InitializeApplicationInfoRestricted, "InitializeApplicationInfoRestricted"}, // 6.0.0+
+        {141, &ACC_U0::ListQualifiedUsers, "ListQualifiedUsers"}, // 6.0.0+
+        {150, &ACC_U0::IsUserAccountSwitchLocked, "IsUserAccountSwitchLocked"}, // 6.0.0+
     };
     // clang-format on
 
diff --git a/src/core/hle/service/acc/acc_u1.cpp b/src/core/hle/service/acc/acc_u1.cpp
index 9f29cdc82..a4aa5316a 100644
--- a/src/core/hle/service/acc/acc_u1.cpp
+++ b/src/core/hle/service/acc/acc_u1.cpp
@@ -17,28 +17,29 @@ ACC_U1::ACC_U1(std::shared_ptr<Module> module, std::shared_ptr<ProfileManager> p
         {3, &ACC_U1::ListOpenUsers, "ListOpenUsers"},
         {4, &ACC_U1::GetLastOpenedUser, "GetLastOpenedUser"},
         {5, &ACC_U1::GetProfile, "GetProfile"},
-        {6, nullptr, "GetProfileDigest"},
+        {6, nullptr, "GetProfileDigest"}, // 3.0.0+
         {50, &ACC_U1::IsUserRegistrationRequestPermitted, "IsUserRegistrationRequestPermitted"},
         {51, &ACC_U1::TrySelectUserWithoutInteraction, "TrySelectUserWithoutInteraction"},
-        {60, nullptr, "ListOpenContextStoredUsers"},
-        {99, nullptr, "DebugActivateOpenContextRetention"},
+        {60, &ACC_U1::ListOpenContextStoredUsers, "ListOpenContextStoredUsers"}, // 5.0.0 - 5.1.0
+        {99, nullptr, "DebugActivateOpenContextRetention"}, // 6.0.0+
         {100, nullptr, "GetUserRegistrationNotifier"},
         {101, nullptr, "GetUserStateChangeNotifier"},
         {102, nullptr, "GetBaasAccountManagerForSystemService"},
-        {103, nullptr, "GetProfileUpdateNotifier"},
-        {104, nullptr, "CheckNetworkServiceAvailabilityAsync"},
-        {105, nullptr, "GetBaasUserAvailabilityChangeNotifier"},
-        {106, nullptr, "GetProfileSyncNotifier"},
+        {103, nullptr, "GetBaasUserAvailabilityChangeNotifier"},
+        {104, nullptr, "GetProfileUpdateNotifier"},
+        {105, nullptr, "CheckNetworkServiceAvailabilityAsync"}, // 4.0.0+
+        {106, nullptr, "GetProfileSyncNotifier"}, // 9.0.0+
         {110, nullptr, "StoreSaveDataThumbnail"},
         {111, nullptr, "ClearSaveDataThumbnail"},
         {112, nullptr, "LoadSaveDataThumbnail"},
-        {113, nullptr, "GetSaveDataThumbnailExistence"},
-        {130, nullptr, "ActivateOpenContextRetention"},
-        {140, &ACC_U1::ListQualifiedUsers, "ListQualifiedUsers"},
-        {150, nullptr, "AuthenticateApplicationAsync"},
-        {190, nullptr, "GetUserLastOpenedApplication"},
-        {191, nullptr, "ActivateOpenContextHolder"},
-        {997, nullptr, "DebugInvalidateTokenCacheForUser"},
+        {113, nullptr, "GetSaveDataThumbnailExistence"}, // 5.0.0+
+        {120, nullptr, "ListOpenUsersInApplication"}, // 10.0.0+
+        {130, nullptr, "ActivateOpenContextRetention"}, // 6.0.0+
+        {140, &ACC_U1::ListQualifiedUsers, "ListQualifiedUsers"}, // 6.0.0+
+        {150, nullptr, "AuthenticateApplicationAsync"}, // 10.0.0+
+        {190, nullptr, "GetUserLastOpenedApplication"}, // 1.0.0 - 9.2.0
+        {191, nullptr, "ActivateOpenContextHolder"}, // 7.0.0+
+        {997, nullptr, "DebugInvalidateTokenCacheForUser"}, // 3.0.0+
         {998, nullptr, "DebugSetUserStateClose"},
         {999, nullptr, "DebugSetUserStateOpen"},
     };
diff --git a/src/core/hle/service/am/am.cpp b/src/core/hle/service/am/am.cpp
index 4df74c4f9..256449aa7 100644
--- a/src/core/hle/service/am/am.cpp
+++ b/src/core/hle/service/am/am.cpp
@@ -10,6 +10,7 @@
 #include "core/core.h"
 #include "core/file_sys/control_metadata.h"
 #include "core/file_sys/patch_manager.h"
+#include "core/file_sys/registered_cache.h"
 #include "core/file_sys/savedata_factory.h"
 #include "core/hle/ipc_helpers.h"
 #include "core/hle/kernel/kernel.h"
@@ -68,6 +69,7 @@ IWindowController::IWindowController(Core::System& system_)
     static const FunctionInfo functions[] = {
         {0, nullptr, "CreateWindow"},
         {1, &IWindowController::GetAppletResourceUserId, "GetAppletResourceUserId"},
+        {2, nullptr, "GetAppletResourceUserIdOfCallerApplet"},
         {10, &IWindowController::AcquireForegroundRights, "AcquireForegroundRights"},
         {11, nullptr, "ReleaseForegroundRights"},
         {12, nullptr, "RejectToChangeIntoBackground"},
@@ -189,8 +191,8 @@ IDisplayController::IDisplayController() : ServiceFramework("IDisplayController"
         {5, nullptr, "GetLastForegroundCaptureImageEx"},
         {6, nullptr, "GetLastApplicationCaptureImageEx"},
         {7, nullptr, "GetCallerAppletCaptureImageEx"},
-        {8, nullptr, "TakeScreenShotOfOwnLayer"},  // 2.0.0+
-        {9, nullptr, "CopyBetweenCaptureBuffers"}, // 5.0.0+
+        {8, nullptr, "TakeScreenShotOfOwnLayer"},
+        {9, nullptr, "CopyBetweenCaptureBuffers"},
         {10, nullptr, "AcquireLastApplicationCaptureBuffer"},
         {11, nullptr, "ReleaseLastApplicationCaptureBuffer"},
         {12, nullptr, "AcquireLastForegroundCaptureBuffer"},
@@ -200,17 +202,14 @@ IDisplayController::IDisplayController() : ServiceFramework("IDisplayController"
         {16, nullptr, "AcquireLastApplicationCaptureBufferEx"},
         {17, nullptr, "AcquireLastForegroundCaptureBufferEx"},
         {18, nullptr, "AcquireCallerAppletCaptureBufferEx"},
-        // 2.0.0+
         {20, nullptr, "ClearCaptureBuffer"},
         {21, nullptr, "ClearAppletTransitionBuffer"},
-        // 4.0.0+
         {22, nullptr, "AcquireLastApplicationCaptureSharedBuffer"},
         {23, nullptr, "ReleaseLastApplicationCaptureSharedBuffer"},
         {24, nullptr, "AcquireLastForegroundCaptureSharedBuffer"},
         {25, nullptr, "ReleaseLastForegroundCaptureSharedBuffer"},
         {26, nullptr, "AcquireCallerAppletCaptureSharedBuffer"},
         {27, nullptr, "ReleaseCallerAppletCaptureSharedBuffer"},
-        // 6.0.0+
         {28, nullptr, "TakeScreenShotOfOwnLayerEx"},
     };
     // clang-format on
@@ -225,7 +224,7 @@ IDebugFunctions::IDebugFunctions() : ServiceFramework{"IDebugFunctions"} {
     static const FunctionInfo functions[] = {
         {0, nullptr, "NotifyMessageToHomeMenuForDebug"},
         {1, nullptr, "OpenMainApplication"},
-        {10, nullptr, "EmulateButtonEvent"},
+        {10, nullptr, "PerformSystemButtonPressing"},
         {20, nullptr, "InvalidateTransitionLayer"},
         {30, nullptr, "RequestLaunchApplicationWithUserAndArgumentForDebug"},
         {40, nullptr, "GetAppletResourceUsageInfo"},
@@ -267,13 +266,13 @@ ISelfController::ISelfController(Core::System& system,
         {16, &ISelfController::SetOutOfFocusSuspendingEnabled, "SetOutOfFocusSuspendingEnabled"},
         {17, nullptr, "SetControllerFirmwareUpdateSection"},
         {18, nullptr, "SetRequiresCaptureButtonShortPressedMessage"},
-        {19, &ISelfController::SetScreenShotImageOrientation, "SetScreenShotImageOrientation"},
+        {19, &ISelfController::SetAlbumImageOrientation, "SetAlbumImageOrientation"},
         {20, nullptr, "SetDesirableKeyboardLayout"},
         {40, &ISelfController::CreateManagedDisplayLayer, "CreateManagedDisplayLayer"},
         {41, nullptr, "IsSystemBufferSharingEnabled"},
         {42, nullptr, "GetSystemSharedLayerHandle"},
         {43, nullptr, "GetSystemSharedBufferHandle"},
-        {44, nullptr, "CreateManagedDisplaySeparableLayer"},
+        {44, &ISelfController::CreateManagedDisplaySeparableLayer, "CreateManagedDisplaySeparableLayer"},
         {45, nullptr, "SetManagedDisplayLayerSeparationMode"},
         {50, &ISelfController::SetHandlesRequestToDisplay, "SetHandlesRequestToDisplay"},
         {51, nullptr, "ApproveToDisplay"},
@@ -443,7 +442,7 @@ void ISelfController::SetOutOfFocusSuspendingEnabled(Kernel::HLERequestContext&
     rb.Push(RESULT_SUCCESS);
 }
 
-void ISelfController::SetScreenShotImageOrientation(Kernel::HLERequestContext& ctx) {
+void ISelfController::SetAlbumImageOrientation(Kernel::HLERequestContext& ctx) {
     LOG_WARNING(Service_AM, "(STUBBED) called");
 
     IPC::ResponseBuilder rb{ctx, 2};
@@ -463,6 +462,24 @@ void ISelfController::CreateManagedDisplayLayer(Kernel::HLERequestContext& ctx)
     rb.Push(*layer_id);
 }
 
+void ISelfController::CreateManagedDisplaySeparableLayer(Kernel::HLERequestContext& ctx) {
+    LOG_WARNING(Service_AM, "(STUBBED) called");
+
+    // TODO(Subv): Find out how AM determines the display to use; for now the layer is always
+    // created in the "Default" display.
+    // This calls nn::vi::CreateRecordingLayer(), which creates another layer.
+    // We do not support more than one layer per display yet, so only one layer id is output
+    // instead of the expected two; this has not been observed to cause adverse side effects.
+    // NOTE(review): display_id and layer_id are dereferenced without a check -- confirm safe.
+    // TODO: Support multiple layers
+    const auto display_id = nvflinger->OpenDisplay("Default");
+    const auto layer_id = nvflinger->CreateLayer(*display_id);
+
+    IPC::ResponseBuilder rb{ctx, 4};
+    rb.Push(RESULT_SUCCESS);
+    rb.Push(*layer_id);
+}
+
 void ISelfController::SetHandlesRequestToDisplay(Kernel::HLERequestContext& ctx) {
     LOG_WARNING(Service_AM, "(STUBBED) called");
 
@@ -607,6 +624,7 @@ ICommonStateGetter::ICommonStateGetter(Core::System& system,
         {20, nullptr, "PushToGeneralChannel"},
         {30, nullptr, "GetHomeButtonReaderLockAccessor"},
         {31, nullptr, "GetReaderLockAccessorEx"},
+        {32, nullptr, "GetWriterLockAccessorEx"},
         {40, nullptr, "GetCradleFwVersion"},
         {50, &ICommonStateGetter::IsVrModeEnabled, "IsVrModeEnabled"},
         {51, &ICommonStateGetter::SetVrModeEnabled, "SetVrModeEnabled"},
@@ -731,14 +749,14 @@ void ICommonStateGetter::GetDefaultDisplayResolution(Kernel::HLERequestContext&
 
     if (Settings::values.use_docked_mode) {
         rb.Push(static_cast<u32>(Service::VI::DisplayResolution::DockedWidth) *
-                static_cast<u32>(Settings::values.resolution_factor));
+                static_cast<u32>(Settings::values.resolution_factor.GetValue()));
         rb.Push(static_cast<u32>(Service::VI::DisplayResolution::DockedHeight) *
-                static_cast<u32>(Settings::values.resolution_factor));
+                static_cast<u32>(Settings::values.resolution_factor.GetValue()));
     } else {
         rb.Push(static_cast<u32>(Service::VI::DisplayResolution::UndockedWidth) *
-                static_cast<u32>(Settings::values.resolution_factor));
+                static_cast<u32>(Settings::values.resolution_factor.GetValue()));
         rb.Push(static_cast<u32>(Service::VI::DisplayResolution::UndockedHeight) *
-                static_cast<u32>(Settings::values.resolution_factor));
+                static_cast<u32>(Settings::values.resolution_factor.GetValue()));
     }
 }
 
@@ -842,7 +860,7 @@ public:
             {110, nullptr, "NeedsToExitProcess"},
             {120, nullptr, "GetLibraryAppletInfo"},
             {150, nullptr, "RequestForAppletToGetForeground"},
-            {160, nullptr, "GetIndirectLayerConsumerHandle"},
+            {160, &ILibraryAppletAccessor::GetIndirectLayerConsumerHandle, "GetIndirectLayerConsumerHandle"},
         };
         // clang-format on
 
@@ -961,6 +979,18 @@ private:
         rb.PushCopyObjects(applet->GetBroker().GetInteractiveDataEvent());
     }
 
+    void GetIndirectLayerConsumerHandle(Kernel::HLERequestContext& ctx) {
+        LOG_WARNING(Service_AM, "(STUBBED) called");
+
+        // A non-zero handle is required to be valid; 0xdeadbeef is a placeholder that lets us
+        // trace whether the handle is actually used anywhere.
+        constexpr u64 handle = 0xdeadbeef;
+
+        IPC::ResponseBuilder rb{ctx, 4};
+        rb.Push(RESULT_SUCCESS);
+        rb.Push(handle);
+    }
+
     std::shared_ptr<Applets::Applet> applet;
 };
 
@@ -1132,6 +1162,7 @@ IApplicationFunctions::IApplicationFunctions(Core::System& system_)
         {24, nullptr, "GetLaunchStorageInfoForDebug"},
         {25, &IApplicationFunctions::ExtendSaveData, "ExtendSaveData"},
         {26, &IApplicationFunctions::GetSaveDataSize, "GetSaveDataSize"},
+        {27, nullptr, "CreateCacheStorage"},
         {30, &IApplicationFunctions::BeginBlockingHomeButtonShortAndLongPressed, "BeginBlockingHomeButtonShortAndLongPressed"},
         {31, &IApplicationFunctions::EndBlockingHomeButtonShortAndLongPressed, "EndBlockingHomeButtonShortAndLongPressed"},
         {32, &IApplicationFunctions::BeginBlockingHomeButton, "BeginBlockingHomeButton"},
@@ -1157,6 +1188,8 @@ IApplicationFunctions::IApplicationFunctions(Core::System& system_)
         {120, nullptr, "ExecuteProgram"},
         {121, nullptr, "ClearUserChannel"},
         {122, nullptr, "UnpopToUserChannel"},
+        {123, nullptr, "GetPreviousProgramIndex"},
+        {124, nullptr, "EnableApplicationAllThreadDumpOnCrash"},
         {130, &IApplicationFunctions::GetGpuErrorDetectedSystemEvent, "GetGpuErrorDetectedSystemEvent"},
         {140, &IApplicationFunctions::GetFriendInvitationStorageChannelEvent, "GetFriendInvitationStorageChannelEvent"},
         {141, nullptr, "TryPopFromFriendInvitationStorageChannel"},
@@ -1339,14 +1372,25 @@ void IApplicationFunctions::GetDisplayVersion(Kernel::HLERequestContext& ctx) {
 
     std::array<u8, 0x10> version_string{};
 
-    FileSys::PatchManager pm{system.CurrentProcess()->GetTitleID()};
-    const auto res = pm.GetControlMetadata();
+    const auto res = [this] {
+        const auto title_id = system.CurrentProcess()->GetTitleID();
+
+        FileSys::PatchManager pm{title_id};
+        auto metadata = pm.GetControlMetadata();
+        if (metadata.first != nullptr) {
+            return metadata;
+        }
+        // Nothing found under the base title id: retry with the update title id.
+        FileSys::PatchManager pm_update{FileSys::GetUpdateTitleID(title_id)};
+        return pm_update.GetControlMetadata();
+    }();
+
     if (res.first != nullptr) {
         const auto& version = res.first->GetVersionString();
         std::copy(version.begin(), version.end(), version_string.begin());
     } else {
-        constexpr u128 default_version = {1, 0};
-        std::memcpy(version_string.data(), default_version.data(), sizeof(u128));
+        constexpr char default_version[]{"1.0.0"};
+        std::memcpy(version_string.data(), default_version, sizeof(default_version));
     }
 
     IPC::ResponseBuilder rb{ctx, 6};
diff --git a/src/core/hle/service/am/am.h b/src/core/hle/service/am/am.h
index 469f7f814..6cfb11b48 100644
--- a/src/core/hle/service/am/am.h
+++ b/src/core/hle/service/am/am.h
@@ -138,8 +138,9 @@ private:
     void SetFocusHandlingMode(Kernel::HLERequestContext& ctx);
     void SetRestartMessageEnabled(Kernel::HLERequestContext& ctx);
     void SetOutOfFocusSuspendingEnabled(Kernel::HLERequestContext& ctx);
-    void SetScreenShotImageOrientation(Kernel::HLERequestContext& ctx);
+    void SetAlbumImageOrientation(Kernel::HLERequestContext& ctx);
     void CreateManagedDisplayLayer(Kernel::HLERequestContext& ctx);
+    void CreateManagedDisplaySeparableLayer(Kernel::HLERequestContext& ctx);
     void SetHandlesRequestToDisplay(Kernel::HLERequestContext& ctx);
     void SetIdleTimeDetectionExtension(Kernel::HLERequestContext& ctx);
     void GetIdleTimeDetectionExtension(Kernel::HLERequestContext& ctx);
diff --git a/src/core/hle/service/am/applets/software_keyboard.cpp b/src/core/hle/service/am/applets/software_keyboard.cpp
index 54e63c138..fbe3686ae 100644
--- a/src/core/hle/service/am/applets/software_keyboard.cpp
+++ b/src/core/hle/service/am/applets/software_keyboard.cpp
@@ -30,7 +30,7 @@ static Core::Frontend::SoftwareKeyboardParameters ConvertToFrontendParameters(
                                                                        config.sub_text.size());
     params.guide_text = Common::UTF16StringFromFixedZeroTerminatedBuffer(config.guide_text.data(),
                                                                          config.guide_text.size());
-    params.initial_text = initial_text;
+    params.initial_text = std::move(initial_text);
     params.max_length = config.length_limit == 0 ? DEFAULT_MAX_LENGTH : config.length_limit;
     params.password = static_cast<bool>(config.is_password);
     params.cursor_at_beginning = static_cast<bool>(config.initial_cursor_position);
@@ -60,7 +60,7 @@ void SoftwareKeyboard::Initialize() {
     std::memcpy(&config, keyboard_config.data(), sizeof(KeyboardConfig));
 
     const auto work_buffer_storage = broker.PopNormalDataToApplet();
-    ASSERT(work_buffer_storage != nullptr);
+    ASSERT_OR_EXECUTE(work_buffer_storage != nullptr, { return; });
     const auto& work_buffer = work_buffer_storage->GetData();
 
     if (config.initial_string_size == 0)
@@ -109,7 +109,7 @@ void SoftwareKeyboard::Execute() {
 
     const auto parameters = ConvertToFrontendParameters(config, initial_text);
 
-    frontend.RequestText([this](std::optional<std::u16string> text) { WriteText(text); },
+    frontend.RequestText([this](std::optional<std::u16string> text) { WriteText(std::move(text)); },
                          parameters);
 }
 
diff --git a/src/core/hle/service/am/spsm.cpp b/src/core/hle/service/am/spsm.cpp
index 003ee8667..f27729ce7 100644
--- a/src/core/hle/service/am/spsm.cpp
+++ b/src/core/hle/service/am/spsm.cpp
@@ -10,17 +10,17 @@ SPSM::SPSM() : ServiceFramework{"spsm"} {
     // clang-format off
     static const FunctionInfo functions[] = {
         {0, nullptr, "GetState"},
-        {1, nullptr, "SleepSystemAndWaitAwake"},
-        {2, nullptr, "Unknown1"},
-        {3, nullptr, "Unknown2"},
+        {1, nullptr, "EnterSleep"},
+        {2, nullptr, "GetLastWakeReason"},
+        {3, nullptr, "Shutdown"},
         {4, nullptr, "GetNotificationMessageEventHandle"},
-        {5, nullptr, "Unknown3"},
-        {6, nullptr, "Unknown4"},
-        {7, nullptr, "Unknown5"},
+        {5, nullptr, "ReceiveNotificationMessage"},
+        {6, nullptr, "AnalyzeLogForLastSleepWakeSequence"},
+        {7, nullptr, "ResetEventLog"},
         {8, nullptr, "AnalyzePerformanceLogForLastSleepWakeSequence"},
         {9, nullptr, "ChangeHomeButtonLongPressingTime"},
-        {10, nullptr, "Unknown6"},
-        {11, nullptr, "Unknown7"},
+        {10, nullptr, "PutErrorState"},
+        {11, nullptr, "InvalidateCurrentHomeButtonPressing"},
     };
     // clang-format on
 
diff --git a/src/core/hle/service/aoc/aoc_u.cpp b/src/core/hle/service/aoc/aoc_u.cpp
index 4227a4adf..8e79f707b 100644
--- a/src/core/hle/service/aoc/aoc_u.cpp
+++ b/src/core/hle/service/aoc/aoc_u.cpp
@@ -60,6 +60,7 @@ AOC_U::AOC_U(Core::System& system)
         {6, nullptr, "PrepareAddOnContentByApplicationId"},
         {7, &AOC_U::PrepareAddOnContent, "PrepareAddOnContent"},
         {8, &AOC_U::GetAddOnContentListChangedEvent, "GetAddOnContentListChangedEvent"},
+        {9, nullptr, "GetAddOnContentLostErrorCode"},
         {100, nullptr, "CreateEcPurchasedEventManager"},
         {101, nullptr, "CreatePermanentEcPurchasedEventManager"},
     };
diff --git a/src/core/hle/service/bcat/bcat.cpp b/src/core/hle/service/bcat/bcat.cpp
index 8bb2528c9..b31766212 100644
--- a/src/core/hle/service/bcat/bcat.cpp
+++ b/src/core/hle/service/bcat/bcat.cpp
@@ -14,6 +14,8 @@ BCAT::BCAT(Core::System& system, std::shared_ptr<Module> module,
         {0, &BCAT::CreateBcatService, "CreateBcatService"},
         {1, &BCAT::CreateDeliveryCacheStorageService, "CreateDeliveryCacheStorageService"},
         {2, &BCAT::CreateDeliveryCacheStorageServiceWithApplicationId, "CreateDeliveryCacheStorageServiceWithApplicationId"},
+        {3, nullptr, "CreateDeliveryCacheProgressService"},
+        {4, nullptr, "CreateDeliveryCacheProgressServiceWithApplicationId"},
     };
     // clang-format on
     RegisterHandlers(functions);
diff --git a/src/core/hle/service/bcat/module.cpp b/src/core/hle/service/bcat/module.cpp
index 34aba7a27..603b64d4f 100644
--- a/src/core/hle/service/bcat/module.cpp
+++ b/src/core/hle/service/bcat/module.cpp
@@ -143,10 +143,13 @@ public:
             {20401, nullptr, "UnregisterSystemApplicationDeliveryTask"},
             {20410, nullptr, "SetSystemApplicationDeliveryTaskTimer"},
             {30100, &IBcatService::SetPassphrase, "SetPassphrase"},
+            {30101, nullptr, "Unknown"},
+            {30102, nullptr, "Unknown2"},
             {30200, nullptr, "RegisterBackgroundDeliveryTask"},
             {30201, nullptr, "UnregisterBackgroundDeliveryTask"},
             {30202, nullptr, "BlockDeliveryTask"},
             {30203, nullptr, "UnblockDeliveryTask"},
+            {30210, nullptr, "SetDeliveryTaskTimer"},
             {30300, nullptr, "RegisterSystemApplicationDeliveryTasks"},
             {90100, nullptr, "EnumerateBackgroundDeliveryTask"},
             {90200, nullptr, "GetDeliveryList"},
diff --git a/src/core/hle/service/bpc/bpc.cpp b/src/core/hle/service/bpc/bpc.cpp
index 1c1ecdb60..fac6b2f9c 100644
--- a/src/core/hle/service/bpc/bpc.cpp
+++ b/src/core/hle/service/bpc/bpc.cpp
@@ -23,9 +23,14 @@ public:
             {5, nullptr, "GetBoardPowerControlEvent"},
             {6, nullptr, "GetSleepButtonState"},
             {7, nullptr, "GetPowerEvent"},
-            {8, nullptr, "Unknown1"},
-            {9, nullptr, "Unknown2"},
-            {10, nullptr, "Unknown3"},
+            {8, nullptr, "CreateWakeupTimer"},
+            {9, nullptr, "CancelWakeupTimer"},
+            {10, nullptr, "EnableWakeupTimerOnDevice"},
+            {11, nullptr, "CreateWakeupTimerEx"},
+            {12, nullptr, "GetLastEnabledWakeupTimerType"},
+            {13, nullptr, "CleanAllWakeupTimers"},
+            {14, nullptr, "Unknown"},
+            {15, nullptr, "Unknown2"},
         };
         // clang-format on
 
@@ -38,10 +43,11 @@ public:
     explicit BPC_R() : ServiceFramework{"bpc:r"} {
         // clang-format off
         static const FunctionInfo functions[] = {
-            {0, nullptr, "GetExternalRtcValue"},
-            {1, nullptr, "SetExternalRtcValue"},
-            {2, nullptr, "ReadExternalRtcResetFlag"},
-            {3, nullptr, "ClearExternalRtcResetFlag"},
+            {0, nullptr, "GetRtcTime"},
+            {1, nullptr, "SetRtcTime"},
+            {2, nullptr, "GetRtcResetDetected"},
+            {3, nullptr, "ClearRtcResetDetected"},
+            {4, nullptr, "SetUpRtcResetOnShutdown"},
         };
         // clang-format on
 
diff --git a/src/core/hle/service/btdrv/btdrv.cpp b/src/core/hle/service/btdrv/btdrv.cpp
index 40a06c9fd..f311afa2f 100644
--- a/src/core/hle/service/btdrv/btdrv.cpp
+++ b/src/core/hle/service/btdrv/btdrv.cpp
@@ -58,102 +58,103 @@ public:
             {1, nullptr, "InitializeBluetooth"},
             {2, nullptr, "EnableBluetooth"},
             {3, nullptr, "DisableBluetooth"},
-            {4, nullptr, "CleanupBluetooth"},
+            {4, nullptr, "FinalizeBluetooth"},
             {5, nullptr, "GetAdapterProperties"},
             {6, nullptr, "GetAdapterProperty"},
             {7, nullptr, "SetAdapterProperty"},
-            {8, nullptr, "StartDiscovery"},
-            {9, nullptr, "CancelDiscovery"},
+            {8, nullptr, "StartInquiry"},
+            {9, nullptr, "StopInquiry"},
             {10, nullptr, "CreateBond"},
             {11, nullptr, "RemoveBond"},
             {12, nullptr, "CancelBond"},
-            {13, nullptr, "PinReply"},
-            {14, nullptr, "SspReply"},
+            {13, nullptr, "RespondToPinRequest"},
+            {14, nullptr, "RespondToSspRequest"},
             {15, nullptr, "GetEventInfo"},
             {16, nullptr, "InitializeHid"},
-            {17, nullptr, "HidConnect"},
-            {18, nullptr, "HidDisconnect"},
-            {19, nullptr, "HidSendData"},
-            {20, nullptr, "HidSendData2"},
-            {21, nullptr, "HidSetReport"},
-            {22, nullptr, "HidGetReport"},
-            {23, nullptr, "HidWakeController"},
-            {24, nullptr, "HidAddPairedDevice"},
-            {25, nullptr, "HidGetPairedDevice"},
-            {26, nullptr, "CleanupHid"},
-            {27, nullptr, "HidGetEventInfo"},
-            {28, nullptr, "ExtSetTsi"},
-            {29, nullptr, "ExtSetBurstMode"},
-            {30, nullptr, "ExtSetZeroRetran"},
-            {31, nullptr, "ExtSetMcMode"},
-            {32, nullptr, "ExtStartLlrMode"},
-            {33, nullptr, "ExtExitLlrMode"},
-            {34, nullptr, "ExtSetRadio"},
-            {35, nullptr, "ExtSetVisibility"},
-            {36, nullptr, "ExtSetTbfcScan"},
+            {17, nullptr, "OpenHidConnection"},
+            {18, nullptr, "CloseHidConnection"},
+            {19, nullptr, "WriteHidData"},
+            {20, nullptr, "WriteHidData2"},
+            {21, nullptr, "SetHidReport"},
+            {22, nullptr, "GetHidReport"},
+            {23, nullptr, "TriggerConnection"},
+            {24, nullptr, "AddPairedDeviceInfo"},
+            {25, nullptr, "GetPairedDeviceInfo"},
+            {26, nullptr, "FinalizeHid"},
+            {27, nullptr, "GetHidEventInfo"},
+            {28, nullptr, "SetTsi"},
+            {29, nullptr, "EnableBurstMode"},
+            {30, nullptr, "SetZeroRetransmission"},
+            {31, nullptr, "EnableMcMode"},
+            {32, nullptr, "EnableLlrScan"},
+            {33, nullptr, "DisableLlrScan"},
+            {34, nullptr, "EnableRadio"},
+            {35, nullptr, "SetVisibility"},
+            {36, nullptr, "EnableTbfcScan"},
             {37, nullptr, "RegisterHidReportEvent"},
-            {38, nullptr, "HidGetReportEventInfo"},
+            {38, nullptr, "GetHidReportEventInfo"},
             {39, nullptr, "GetLatestPlr"},
-            {40, nullptr, "ExtGetPendingConnections"},
+            {40, nullptr, "GetPendingConnections"},
             {41, nullptr, "GetChannelMap"},
-            {42, nullptr, "EnableBluetoothBoostSetting"},
-            {43, nullptr, "IsBluetoothBoostSettingEnabled"},
-            {44, nullptr, "EnableBluetoothAfhSetting"},
-            {45, nullptr, "IsBluetoothAfhSettingEnabled"},
-            {46, nullptr, "InitializeBluetoothLe"},
-            {47, nullptr, "EnableBluetoothLe"},
-            {48, nullptr, "DisableBluetoothLe"},
-            {49, nullptr, "CleanupBluetoothLe"},
-            {50, nullptr, "SetLeVisibility"},
-            {51, nullptr, "SetLeConnectionParameter"},
-            {52, nullptr, "SetLeDefaultConnectionParameter"},
-            {53, nullptr, "SetLeAdvertiseData"},
-            {54, nullptr, "SetLeAdvertiseParameter"},
-            {55, nullptr, "StartLeScan"},
-            {56, nullptr, "StopLeScan"},
-            {57, nullptr, "AddLeScanFilterCondition"},
-            {58, nullptr, "DeleteLeScanFilterCondition"},
-            {59, nullptr, "DeleteLeScanFilter"},
-            {60, nullptr, "ClearLeScanFilters"},
-            {61, nullptr, "EnableLeScanFilter"},
-            {62, nullptr, "RegisterLeClient"},
-            {63, nullptr, "UnregisterLeClient"},
-            {64, nullptr, "UnregisterLeClientAll"},
-            {65, nullptr, "LeClientConnect"},
-            {66, nullptr, "LeClientCancelConnection"},
-            {67, nullptr, "LeClientDisconnect"},
-            {68, nullptr, "LeClientGetAttributes"},
-            {69, nullptr, "LeClientDiscoverService"},
-            {70, nullptr, "LeClientConfigureMtu"},
-            {71, nullptr, "RegisterLeServer"},
-            {72, nullptr, "UnregisterLeServer"},
-            {73, nullptr, "LeServerConnect"},
-            {74, nullptr, "LeServerDisconnect"},
-            {75, nullptr, "CreateLeService"},
-            {76, nullptr, "StartLeService"},
-            {77, nullptr, "AddLeCharacteristic"},
-            {78, nullptr, "AddLeDescriptor"},
-            {79, nullptr, "GetLeCoreEventInfo"},
-            {80, nullptr, "LeGetFirstCharacteristic"},
-            {81, nullptr, "LeGetNextCharacteristic"},
-            {82, nullptr, "LeGetFirstDescriptor"},
-            {83, nullptr, "LeGetNextDescriptor"},
-            {84, nullptr, "RegisterLeCoreDataPath"},
-            {85, nullptr, "UnregisterLeCoreDataPath"},
-            {86, nullptr, "RegisterLeHidDataPath"},
-            {87, nullptr, "UnregisterLeHidDataPath"},
-            {88, nullptr, "RegisterLeDataPath"},
-            {89, nullptr, "UnregisterLeDataPath"},
-            {90, nullptr, "LeClientReadCharacteristic"},
-            {91, nullptr, "LeClientReadDescriptor"},
-            {92, nullptr, "LeClientWriteCharacteristic"},
-            {93, nullptr, "LeClientWriteDescriptor"},
-            {94, nullptr, "LeClientRegisterNotification"},
-            {95, nullptr, "LeClientDeregisterNotification"},
+            {42, nullptr, "EnableTxPowerBoostSetting"},
+            {43, nullptr, "IsTxPowerBoostSettingEnabled"},
+            {44, nullptr, "EnableAfhSetting"},
+            {45, nullptr, "IsAfhSettingEnabled"},
+            {46, nullptr, "InitializeBle"},
+            {47, nullptr, "EnableBle"},
+            {48, nullptr, "DisableBle"},
+            {49, nullptr, "FinalizeBle"},
+            {50, nullptr, "SetBleVisibility"},
+            {51, nullptr, "SetBleConnectionParameter"},
+            {52, nullptr, "SetBleDefaultConnectionParameter"},
+            {53, nullptr, "SetBleAdvertiseData"},
+            {54, nullptr, "SetBleAdvertiseParameter"},
+            {55, nullptr, "StartBleScan"},
+            {56, nullptr, "StopBleScan"},
+            {57, nullptr, "AddBleScanFilterCondition"},
+            {58, nullptr, "DeleteBleScanFilterCondition"},
+            {59, nullptr, "DeleteBleScanFilter"},
+            {60, nullptr, "ClearBleScanFilters"},
+            {61, nullptr, "EnableBleScanFilter"},
+            {62, nullptr, "RegisterGattClient"},
+            {63, nullptr, "UnregisterGattClient"},
+            {64, nullptr, "UnregisterAllGattClients"},
+            {65, nullptr, "ConnectGattServer"},
+            {66, nullptr, "CancelConnectGattServer"},
+            {67, nullptr, "DisconnectGattServer"},
+            {68, nullptr, "GetGattAttribute"},
+            {69, nullptr, "GetGattService"},
+            {70, nullptr, "ConfigureAttMtu"},
+            {71, nullptr, "RegisterGattServer"},
+            {72, nullptr, "UnregisterGattServer"},
+            {73, nullptr, "ConnectGattClient"},
+            {74, nullptr, "DisconnectGattClient"},
+            {75, nullptr, "AddGattService"},
+            {76, nullptr, "EnableGattService"},
+            {77, nullptr, "AddGattCharacteristic"},
+            {78, nullptr, "AddGattDescriptor"},
+            {79, nullptr, "GetBleManagedEventInfo"},
+            {80, nullptr, "GetGattFirstCharacteristic"},
+            {81, nullptr, "GetGattNextCharacteristic"},
+            {82, nullptr, "GetGattFirstDescriptor"},
+            {83, nullptr, "GetGattNextDescriptor"},
+            {84, nullptr, "RegisterGattManagedDataPath"},
+            {85, nullptr, "UnregisterGattManagedDataPath"},
+            {86, nullptr, "RegisterGattHidDataPath"},
+            {87, nullptr, "UnregisterGattHidDataPath"},
+            {88, nullptr, "RegisterGattDataPath"},
+            {89, nullptr, "UnregisterGattDataPath"},
+            {90, nullptr, "ReadGattCharacteristic"},
+            {91, nullptr, "ReadGattDescriptor"},
+            {92, nullptr, "WriteGattCharacteristic"},
+            {93, nullptr, "WriteGattDescriptor"},
+            {94, nullptr, "RegisterGattNotification"},
+            {95, nullptr, "UnregisterGattNotification"},
             {96, nullptr, "GetLeHidEventInfo"},
             {97, nullptr, "RegisterBleHidEvent"},
-            {98, nullptr, "SetLeScanParameter"},
-            {256, nullptr, "GetIsManufacturingMode"},
+            {98, nullptr, "SetBleScanParameter"},
+            {99, nullptr, "MoveToSecondaryPiconet"},
+            {256, nullptr, "IsManufacturingMode"},
             {257, nullptr, "EmulateBluetoothCrash"},
             {258, nullptr, "GetBleChannelMap"},
         };
diff --git a/src/core/hle/service/btm/btm.cpp b/src/core/hle/service/btm/btm.cpp
index 251b3c9df..0d251c6d0 100644
--- a/src/core/hle/service/btm/btm.cpp
+++ b/src/core/hle/service/btm/btm.cpp
@@ -132,66 +132,71 @@ public:
     explicit BTM() : ServiceFramework{"btm"} {
         // clang-format off
         static const FunctionInfo functions[] = {
-            {0, nullptr, "Unknown1"},
-            {1, nullptr, "Unknown2"},
-            {2, nullptr, "RegisterSystemEventForConnectedDeviceCondition"},
-            {3, nullptr, "Unknown3"},
-            {4, nullptr, "Unknown4"},
-            {5, nullptr, "Unknown5"},
-            {6, nullptr, "Unknown6"},
-            {7, nullptr, "Unknown7"},
-            {8, nullptr, "RegisterSystemEventForRegisteredDeviceInfo"},
-            {9, nullptr, "Unknown8"},
-            {10, nullptr, "Unknown9"},
-            {11, nullptr, "Unknown10"},
-            {12, nullptr, "Unknown11"},
-            {13, nullptr, "Unknown12"},
+            {0, nullptr, "GetState"},
+            {1, nullptr, "GetHostDeviceProperty"},
+            {2, nullptr, "AcquireDeviceConditionEvent"},
+            {3, nullptr, "GetDeviceCondition"},
+            {4, nullptr, "SetBurstMode"},
+            {5, nullptr, "SetSlotMode"},
+            {6, nullptr, "SetBluetoothMode"},
+            {7, nullptr, "SetWlanMode"},
+            {8, nullptr, "AcquireDeviceInfoEvent"},
+            {9, nullptr, "GetDeviceInfo"},
+            {10, nullptr, "AddDeviceInfo"},
+            {11, nullptr, "RemoveDeviceInfo"},
+            {12, nullptr, "IncreaseDeviceInfoOrder"},
+            {13, nullptr, "LlrNotify"},
             {14, nullptr, "EnableRadio"},
             {15, nullptr, "DisableRadio"},
-            {16, nullptr, "Unknown13"},
-            {17, nullptr, "Unknown14"},
-            {18, nullptr, "Unknown15"},
-            {19, nullptr, "Unknown16"},
-            {20, nullptr, "Unknown17"},
-            {21, nullptr, "Unknown18"},
-            {22, nullptr, "Unknown19"},
-            {23, nullptr, "Unknown20"},
-            {24, nullptr, "Unknown21"},
-            {25, nullptr, "Unknown22"},
-            {26, nullptr, "Unknown23"},
-            {27, nullptr, "Unknown24"},
-            {28, nullptr, "Unknown25"},
-            {29, nullptr, "Unknown26"},
-            {30, nullptr, "Unknown27"},
-            {31, nullptr, "Unknown28"},
-            {32, nullptr, "Unknown29"},
-            {33, nullptr, "Unknown30"},
-            {34, nullptr, "Unknown31"},
-            {35, nullptr, "Unknown32"},
-            {36, nullptr, "Unknown33"},
-            {37, nullptr, "Unknown34"},
-            {38, nullptr, "Unknown35"},
-            {39, nullptr, "Unknown36"},
-            {40, nullptr, "Unknown37"},
-            {41, nullptr, "Unknown38"},
-            {42, nullptr, "Unknown39"},
-            {43, nullptr, "Unknown40"},
-            {44, nullptr, "Unknown41"},
-            {45, nullptr, "Unknown42"},
-            {46, nullptr, "Unknown43"},
-            {47, nullptr, "Unknown44"},
-            {48, nullptr, "Unknown45"},
-            {49, nullptr, "Unknown46"},
-            {50, nullptr, "Unknown47"},
-            {51, nullptr, "Unknown48"},
-            {52, nullptr, "Unknown49"},
-            {53, nullptr, "Unknown50"},
-            {54, nullptr, "Unknown51"},
-            {55, nullptr, "Unknown52"},
-            {56, nullptr, "Unknown53"},
-            {57, nullptr, "Unknown54"},
-            {58, nullptr, "Unknown55"},
-            {59, nullptr, "Unknown56"},
+            {16, nullptr, "HidDisconnect"},
+            {17, nullptr, "HidSetRetransmissionMode"},
+            {18, nullptr, "AcquireAwakeReqEvent"},
+            {19, nullptr, "AcquireLlrStateEvent"},
+            {20, nullptr, "IsLlrStarted"},
+            {21, nullptr, "EnableSlotSaving"},
+            {22, nullptr, "ProtectDeviceInfo"},
+            {23, nullptr, "AcquireBleScanEvent"},
+            {24, nullptr, "GetBleScanParameterGeneral"},
+            {25, nullptr, "GetBleScanParameterSmartDevice"},
+            {26, nullptr, "StartBleScanForGeneral"},
+            {27, nullptr, "StopBleScanForGeneral"},
+            {28, nullptr, "GetBleScanResultsForGeneral"},
+            {29, nullptr, "StartBleScanForPairedDevice"},
+            {30, nullptr, "StopBleScanForPairedDevice"},
+            {31, nullptr, "StartBleScanForSmartDevice"},
+            {32, nullptr, "StopBleScanForSmartDevice"},
+            {33, nullptr, "GetBleScanResultsForSmartDevice"},
+            {34, nullptr, "AcquireBleConnectionEvent"},
+            {35, nullptr, "BleConnect"},
+            {36, nullptr, "BleOverrideConnection"},
+            {37, nullptr, "BleDisconnect"},
+            {38, nullptr, "BleGetConnectionState"},
+            {39, nullptr, "BleGetGattClientConditionList"},
+            {40, nullptr, "AcquireBlePairingEvent"},
+            {41, nullptr, "BlePairDevice"},
+            {42, nullptr, "BleUnpairDeviceOnBoth"},
+            {43, nullptr, "BleUnpairDevice"},
+            {44, nullptr, "BleGetPairedAddresses"},
+            {45, nullptr, "AcquireBleServiceDiscoveryEvent"},
+            {46, nullptr, "GetGattServices"},
+            {47, nullptr, "GetGattService"},
+            {48, nullptr, "GetGattIncludedServices"},
+            {49, nullptr, "GetBelongingService"},
+            {50, nullptr, "GetGattCharacteristics"},
+            {51, nullptr, "GetGattDescriptors"},
+            {52, nullptr, "AcquireBleMtuConfigEvent"},
+            {53, nullptr, "ConfigureBleMtu"},
+            {54, nullptr, "GetBleMtu"},
+            {55, nullptr, "RegisterBleGattDataPath"},
+            {56, nullptr, "UnregisterBleGattDataPath"},
+            {57, nullptr, "RegisterAppletResourceUserId"},
+            {58, nullptr, "UnregisterAppletResourceUserId"},
+            {59, nullptr, "SetAppletResourceUserId"},
+            {60, nullptr, "Unknown60"},
+            {61, nullptr, "Unknown61"},
+            {62, nullptr, "Unknown62"},
+            {63, nullptr, "Unknown63"},
+            {64, nullptr, "Unknown64"},
         };
         // clang-format on
 
@@ -204,19 +209,19 @@ public:
     explicit BTM_DBG() : ServiceFramework{"btm:dbg"} {
         // clang-format off
         static const FunctionInfo functions[] = {
-            {0, nullptr, "RegisterSystemEventForDiscovery"},
-            {1, nullptr, "Unknown1"},
-            {2, nullptr, "Unknown2"},
-            {3, nullptr, "Unknown3"},
-            {4, nullptr, "Unknown4"},
-            {5, nullptr, "Unknown5"},
-            {6, nullptr, "Unknown6"},
-            {7, nullptr, "Unknown7"},
-            {8, nullptr, "Unknown8"},
-            {9, nullptr, "Unknown9"},
-            {10, nullptr, "Unknown10"},
-            {11, nullptr, "Unknown11"},
-            {12, nullptr, "Unknown11"},
+            {0, nullptr, "AcquireDiscoveryEvent"},
+            {1, nullptr, "StartDiscovery"},
+            {2, nullptr, "CancelDiscovery"},
+            {3, nullptr, "GetDeviceProperty"},
+            {4, nullptr, "CreateBond"},
+            {5, nullptr, "CancelBond"},
+            {6, nullptr, "SetTsiMode"},
+            {7, nullptr, "GeneralTest"},
+            {8, nullptr, "HidConnect"},
+            {9, nullptr, "GeneralGet"},
+            {10, nullptr, "GetGattClientDisconnectionReason"},
+            {11, nullptr, "GetBleConnectionParameter"},
+            {12, nullptr, "GetBleConnectionParameterRequest"},
         };
         // clang-format on
 
diff --git a/src/core/hle/service/caps/caps.cpp b/src/core/hle/service/caps/caps.cpp
index 26c8a7081..ba5749b84 100644
--- a/src/core/hle/service/caps/caps.cpp
+++ b/src/core/hle/service/caps/caps.cpp
@@ -1,4 +1,4 @@
-// Copyright 2018 yuzu emulator team
+// Copyright 2018 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
diff --git a/src/core/hle/service/caps/caps.h b/src/core/hle/service/caps/caps.h
index fc70a4c27..b8c67b6e2 100644
--- a/src/core/hle/service/caps/caps.h
+++ b/src/core/hle/service/caps/caps.h
@@ -1,4 +1,4 @@
-// Copyright 2018 yuzu emulator team
+// Copyright 2018 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
@@ -12,73 +12,79 @@ class ServiceManager;
 
 namespace Service::Capture {
 
-enum AlbumImageOrientation {
+enum class AlbumImageOrientation {
     Orientation0 = 0,
     Orientation1 = 1,
     Orientation2 = 2,
     Orientation3 = 3,
 };
 
-enum AlbumReportOption {
+enum class AlbumReportOption {
     Disable = 0,
     Enable = 1,
 };
 
-enum ContentType : u8 {
+enum class ContentType : u8 {
     Screenshot = 0,
     Movie = 1,
     ExtraMovie = 3,
 };
 
-enum AlbumStorage : u8 {
+enum class AlbumStorage : u8 {
     NAND = 0,
     SD = 1,
 };
 
 struct AlbumFileDateTime {
-    u16 year;
-    u8 month;
-    u8 day;
-    u8 hour;
-    u8 minute;
-    u8 second;
-    u8 uid;
+    s16 year{};
+    s8 month{};
+    s8 day{};
+    s8 hour{};
+    s8 minute{};
+    s8 second{};
+    s8 uid{};
 };
+static_assert(sizeof(AlbumFileDateTime) == 0x8, "AlbumFileDateTime has incorrect size.");
 
 struct AlbumEntry {
-    u64 size;
-    u64 application_id;
-    AlbumFileDateTime datetime;
-    AlbumStorage storage;
-    ContentType content;
-    u8 padding[6];
+    u64 size{};
+    u64 application_id{};
+    AlbumFileDateTime datetime{};
+    AlbumStorage storage{};
+    ContentType content{};
+    INSERT_PADDING_BYTES(6);
 };
+static_assert(sizeof(AlbumEntry) == 0x20, "AlbumEntry has incorrect size.");
 
 struct AlbumFileEntry {
-    u64 size;
-    u64 hash;
-    AlbumFileDateTime datetime;
-    AlbumStorage storage;
-    ContentType content;
-    u8 padding[5];
-    u8 unknown;
+    u64 size{}; // Size of the entry
+    u64 hash{}; // AES256 with hardcoded key over AlbumEntry
+    AlbumFileDateTime datetime{};
+    AlbumStorage storage{};
+    ContentType content{};
+    INSERT_PADDING_BYTES(5);
+    u8 unknown{1}; // Set to 1 on official SW
 };
+static_assert(sizeof(AlbumFileEntry) == 0x20, "AlbumFileEntry has incorrect size.");
 
 struct ApplicationAlbumEntry {
-    u64 size;
-    u64 hash;
-    AlbumFileDateTime datetime;
-    AlbumStorage storage;
-    ContentType content;
-    u8 padding[5];
-    u8 unknown;
+    u64 size{}; // Size of the entry
+    u64 hash{}; // AES256 with hardcoded key over AlbumEntry
+    AlbumFileDateTime datetime{};
+    AlbumStorage storage{};
+    ContentType content{};
+    INSERT_PADDING_BYTES(5);
+    u8 unknown{1}; // Set to 1 on official SW
 };
+static_assert(sizeof(ApplicationAlbumEntry) == 0x20, "ApplicationAlbumEntry has incorrect size.");
 
 struct ApplicationAlbumFileEntry {
-    ApplicationAlbumEntry entry;
-    AlbumFileDateTime datetime;
-    u64 unknown;
+    ApplicationAlbumEntry entry{};
+    AlbumFileDateTime datetime{};
+    u64 unknown{};
 };
+static_assert(sizeof(ApplicationAlbumFileEntry) == 0x30,
+              "ApplicationAlbumFileEntry has incorrect size.");
 
 /// Registers all Capture services with the specified service manager.
 void InstallInterfaces(SM::ServiceManager& sm);
diff --git a/src/core/hle/service/caps/caps_a.cpp b/src/core/hle/service/caps/caps_a.cpp
index 88a3fdc05..a0a3b2ae3 100644
--- a/src/core/hle/service/caps/caps_a.cpp
+++ b/src/core/hle/service/caps/caps_a.cpp
@@ -1,4 +1,4 @@
-// Copyright 2020 yuzu emulator team
+// Copyright 2020 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
diff --git a/src/core/hle/service/caps/caps_a.h b/src/core/hle/service/caps/caps_a.h
index 8de832491..cb93aad5b 100644
--- a/src/core/hle/service/caps/caps_a.h
+++ b/src/core/hle/service/caps/caps_a.h
@@ -1,4 +1,4 @@
-// Copyright 2020 yuzu emulator team
+// Copyright 2020 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
diff --git a/src/core/hle/service/caps/caps_c.cpp b/src/core/hle/service/caps/caps_c.cpp
index ea6452ffa..ab17a187e 100644
--- a/src/core/hle/service/caps/caps_c.cpp
+++ b/src/core/hle/service/caps/caps_c.cpp
@@ -1,4 +1,4 @@
-// Copyright 2020 yuzu emulator team
+// Copyright 2020 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
diff --git a/src/core/hle/service/caps/caps_c.h b/src/core/hle/service/caps/caps_c.h
index d07cdb441..a9d028689 100644
--- a/src/core/hle/service/caps/caps_c.h
+++ b/src/core/hle/service/caps/caps_c.h
@@ -1,4 +1,4 @@
-// Copyright 2020 yuzu emulator team
+// Copyright 2020 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
diff --git a/src/core/hle/service/caps/caps_sc.cpp b/src/core/hle/service/caps/caps_sc.cpp
index d01a8a58e..822ee96c8 100644
--- a/src/core/hle/service/caps/caps_sc.cpp
+++ b/src/core/hle/service/caps/caps_sc.cpp
@@ -1,4 +1,4 @@
-// Copyright 2020 yuzu emulator team
+// Copyright 2020 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
diff --git a/src/core/hle/service/caps/caps_sc.h b/src/core/hle/service/caps/caps_sc.h
index 9ba372f7a..ac3e929ca 100644
--- a/src/core/hle/service/caps/caps_sc.h
+++ b/src/core/hle/service/caps/caps_sc.h
@@ -1,4 +1,4 @@
-// Copyright 2020 yuzu emulator team
+// Copyright 2020 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
diff --git a/src/core/hle/service/caps/caps_ss.cpp b/src/core/hle/service/caps/caps_ss.cpp
index eaa3a7494..24dc716e7 100644
--- a/src/core/hle/service/caps/caps_ss.cpp
+++ b/src/core/hle/service/caps/caps_ss.cpp
@@ -1,4 +1,4 @@
-// Copyright 2020 yuzu emulator team
+// Copyright 2020 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
diff --git a/src/core/hle/service/caps/caps_ss.h b/src/core/hle/service/caps/caps_ss.h
index e258a6925..450686e4f 100644
--- a/src/core/hle/service/caps/caps_ss.h
+++ b/src/core/hle/service/caps/caps_ss.h
@@ -1,4 +1,4 @@
-// Copyright 2020 yuzu emulator team
+// Copyright 2020 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
diff --git a/src/core/hle/service/caps/caps_su.cpp b/src/core/hle/service/caps/caps_su.cpp
index e8b0698e8..fffb2ecf9 100644
--- a/src/core/hle/service/caps/caps_su.cpp
+++ b/src/core/hle/service/caps/caps_su.cpp
@@ -1,4 +1,4 @@
-// Copyright 2020 yuzu emulator team
+// Copyright 2020 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
diff --git a/src/core/hle/service/caps/caps_su.h b/src/core/hle/service/caps/caps_su.h
index c494d7c84..62c9603a9 100644
--- a/src/core/hle/service/caps/caps_su.h
+++ b/src/core/hle/service/caps/caps_su.h
@@ -1,4 +1,4 @@
-// Copyright 2020 yuzu emulator team
+// Copyright 2020 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
diff --git a/src/core/hle/service/caps/caps_u.cpp b/src/core/hle/service/caps/caps_u.cpp
index 78bab6ed8..f36d8de2d 100644
--- a/src/core/hle/service/caps/caps_u.cpp
+++ b/src/core/hle/service/caps/caps_u.cpp
@@ -1,4 +1,4 @@
-// Copyright 2020 yuzu emulator team
+// Copyright 2020 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
@@ -58,19 +58,25 @@ void CAPS_U::GetAlbumContentsFileListForApplication(Kernel::HLERequestContext& c
     // u8 ContentType, two s64s, and an u64 AppletResourceUserId. Returns an output u64 for total
     // output entries (which is copied to a s32 by official SW).
     IPC::RequestParser rp{ctx};
-    [[maybe_unused]] const auto application_album_file_entries = rp.PopRaw<std::array<u8, 0x30>>();
-    const auto pid = rp.Pop<s32>();
-    const auto content_type = rp.PopRaw<ContentType>();
-    [[maybe_unused]] const auto start_datetime = rp.PopRaw<AlbumFileDateTime>();
-    [[maybe_unused]] const auto end_datetime = rp.PopRaw<AlbumFileDateTime>();
-    const auto applet_resource_user_id = rp.Pop<u64>();
+    const auto pid{rp.Pop<s32>()};
+    const auto content_type{rp.PopEnum<ContentType>()};
+    const auto start_posix_time{rp.Pop<s64>()};
+    const auto end_posix_time{rp.Pop<s64>()};
+    const auto applet_resource_user_id{rp.Pop<u64>()};
+
+    // TODO: Update this when we implement the album.
+    // Currently we do not have a method of accessing album entries, set this to 0 for now.
+    constexpr s32 total_entries{0};
+
     LOG_WARNING(Service_Capture,
-                "(STUBBED) called. pid={}, content_type={}, applet_resource_user_id={}", pid,
-                content_type, applet_resource_user_id);
+                "(STUBBED) called. pid={}, content_type={}, start_posix_time={}, "
+                "end_posix_time={}, applet_resource_user_id={}, total_entries={}",
+                pid, content_type, start_posix_time, end_posix_time, applet_resource_user_id,
+                total_entries);
 
     IPC::ResponseBuilder rb{ctx, 3};
     rb.Push(RESULT_SUCCESS);
-    rb.Push<s32>(0);
+    rb.Push(total_entries);
 }
 
 } // namespace Service::Capture
diff --git a/src/core/hle/service/caps/caps_u.h b/src/core/hle/service/caps/caps_u.h
index e6e0716ff..689364de4 100644
--- a/src/core/hle/service/caps/caps_u.h
+++ b/src/core/hle/service/caps/caps_u.h
@@ -1,4 +1,4 @@
-// Copyright 2020 yuzu emulator team
+// Copyright 2020 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
diff --git a/src/core/hle/service/es/es.cpp b/src/core/hle/service/es/es.cpp
index f8e9df4b1..a41c73c48 100644
--- a/src/core/hle/service/es/es.cpp
+++ b/src/core/hle/service/es/es.cpp
@@ -27,8 +27,8 @@ public:
             {8, &ETicket::GetTitleKey, "GetTitleKey"},
             {9, &ETicket::CountCommonTicket, "CountCommonTicket"},
             {10, &ETicket::CountPersonalizedTicket, "CountPersonalizedTicket"},
-            {11, &ETicket::ListCommonTicket, "ListCommonTicket"},
-            {12, &ETicket::ListPersonalizedTicket, "ListPersonalizedTicket"},
+            {11, &ETicket::ListCommonTicketRightsIds, "ListCommonTicketRightsIds"},
+            {12, &ETicket::ListPersonalizedTicketRightsIds, "ListPersonalizedTicketRightsIds"},
             {13, nullptr, "ListMissingPersonalizedTicket"},
             {14, &ETicket::GetCommonTicketSize, "GetCommonTicketSize"},
             {15, &ETicket::GetPersonalizedTicketSize, "GetPersonalizedTicketSize"},
@@ -55,7 +55,46 @@ public:
             {36, nullptr, "DeleteAllInactiveELicenseRequiredPersonalizedTicket"},
             {37, nullptr, "OwnTicket2"},
             {38, nullptr, "OwnTicket3"},
+            {501, nullptr, "Unknown501"},
+            {502, nullptr, "Unknown502"},
             {503, nullptr, "GetTitleKey"},
+            {504, nullptr, "Unknown504"},
+            {508, nullptr, "Unknown508"},
+            {509, nullptr, "Unknown509"},
+            {510, nullptr, "Unknown510"},
+            {511, nullptr, "Unknown511"},
+            {1001, nullptr, "Unknown1001"},
+            {1002, nullptr, "Unknown1002"},
+            {1003, nullptr, "Unknown1003"},
+            {1004, nullptr, "Unknown1004"},
+            {1005, nullptr, "Unknown1005"},
+            {1006, nullptr, "Unknown1006"},
+            {1007, nullptr, "Unknown1007"},
+            {1009, nullptr, "Unknown1009"},
+            {1010, nullptr, "Unknown1010"},
+            {1011, nullptr, "Unknown1011"},
+            {1012, nullptr, "Unknown1012"},
+            {1013, nullptr, "Unknown1013"},
+            {1014, nullptr, "Unknown1014"},
+            {1015, nullptr, "Unknown1015"},
+            {1016, nullptr, "Unknown1016"},
+            {1017, nullptr, "Unknown1017"},
+            {1018, nullptr, "Unknown1018"},
+            {1019, nullptr, "Unknown1019"},
+            {1020, nullptr, "Unknown1020"},
+            {1021, nullptr, "Unknown1021"},
+            {1501, nullptr, "Unknown1501"},
+            {1502, nullptr, "Unknown1502"},
+            {1503, nullptr, "Unknown1503"},
+            {1504, nullptr, "Unknown1504"},
+            {1505, nullptr, "Unknown1505"},
+            {2000, nullptr, "Unknown2000"},
+            {2001, nullptr, "Unknown2001"},
+            {2100, nullptr, "Unknown2100"},
+            {2501, nullptr, "Unknown2501"},
+            {2502, nullptr, "Unknown2502"},
+            {3001, nullptr, "Unknown3001"},
+            {3002, nullptr, "Unknown3002"},
         };
         // clang-format on
         RegisterHandlers(functions);
@@ -147,7 +186,7 @@ private:
         rb.Push<u32>(count);
     }
 
-    void ListCommonTicket(Kernel::HLERequestContext& ctx) {
+    void ListCommonTicketRightsIds(Kernel::HLERequestContext& ctx) {
         u32 out_entries;
         if (keys.GetCommonTickets().empty())
             out_entries = 0;
@@ -170,7 +209,7 @@ private:
         rb.Push<u32>(out_entries);
     }
 
-    void ListPersonalizedTicket(Kernel::HLERequestContext& ctx) {
+    void ListPersonalizedTicketRightsIds(Kernel::HLERequestContext& ctx) {
         u32 out_entries;
         if (keys.GetPersonalizedTickets().empty())
             out_entries = 0;
@@ -263,7 +302,7 @@ private:
         rb.Push<u64>(write_size);
     }
 
-    Core::Crypto::KeyManager keys;
+    Core::Crypto::KeyManager& keys = Core::Crypto::KeyManager::Instance();
 };
 
 void InstallInterfaces(SM::ServiceManager& service_manager) {
diff --git a/src/core/hle/service/eupld/eupld.cpp b/src/core/hle/service/eupld/eupld.cpp
index 2df30acee..0d6d244f4 100644
--- a/src/core/hle/service/eupld/eupld.cpp
+++ b/src/core/hle/service/eupld/eupld.cpp
@@ -19,6 +19,7 @@ public:
             {1, nullptr, "ImportCrt"},
             {2, nullptr, "ImportPki"},
             {3, nullptr, "SetAutoUpload"},
+            {4, nullptr, "GetAutoUpload"},
         };
         // clang-format on
 
diff --git a/src/core/hle/service/friend/friend.cpp b/src/core/hle/service/friend/friend.cpp
index 68f259b70..b7adaffc7 100644
--- a/src/core/hle/service/friend/friend.cpp
+++ b/src/core/hle/service/friend/friend.cpp
@@ -25,9 +25,13 @@ public:
             {10101, &IFriendService::GetFriendList, "GetFriendList"},
             {10102, nullptr, "UpdateFriendInfo"},
             {10110, nullptr, "GetFriendProfileImage"},
+            {10120, nullptr, "Unknown10120"},
+            {10121, nullptr, "Unknown10121"},
             {10200, nullptr, "SendFriendRequestForApplication"},
             {10211, nullptr, "AddFacedFriendRequestForApplication"},
             {10400, &IFriendService::GetBlockedUserListIds, "GetBlockedUserListIds"},
+            {10420, nullptr, "Unknown10420"},
+            {10421, nullptr, "Unknown10421"},
             {10500, nullptr, "GetProfileList"},
             {10600, nullptr, "DeclareOpenOnlinePlaySession"},
             {10601, &IFriendService::DeclareCloseOnlinePlaySession, "DeclareCloseOnlinePlaySession"},
@@ -97,6 +101,8 @@ public:
             {30900, nullptr, "SendFriendInvitation"},
             {30910, nullptr, "ReadFriendInvitation"},
             {30911, nullptr, "ReadAllFriendInvitations"},
+            {40100, nullptr, "Unknown40100"},
+            {40400, nullptr, "Unknown40400"},
             {49900, nullptr, "DeleteNetworkServiceAccountCache"},
         };
         // clang-format on
diff --git a/src/core/hle/service/grc/grc.cpp b/src/core/hle/service/grc/grc.cpp
index 24910ac6c..401e0b208 100644
--- a/src/core/hle/service/grc/grc.cpp
+++ b/src/core/hle/service/grc/grc.cpp
@@ -17,6 +17,9 @@ public:
         static const FunctionInfo functions[] = {
             {1, nullptr, "OpenContinuousRecorder"},
             {2, nullptr, "OpenGameMovieTrimmer"},
+            {3, nullptr, "OpenOffscreenRecorder"},
+            {101, nullptr, "CreateMovieMaker"},
+            {9903, nullptr, "SetOffscreenRecordingMarker"},
         };
         // clang-format on
 
diff --git a/src/core/hle/service/hid/controllers/debug_pad.cpp b/src/core/hle/service/hid/controllers/debug_pad.cpp
index 1f2131ec8..cb35919e9 100644
--- a/src/core/hle/service/hid/controllers/debug_pad.cpp
+++ b/src/core/hle/service/hid/controllers/debug_pad.cpp
@@ -23,7 +23,7 @@ void Controller_DebugPad::OnRelease() {}
 
 void Controller_DebugPad::OnUpdate(const Core::Timing::CoreTiming& core_timing, u8* data,
                                    std::size_t size) {
-    shared_memory.header.timestamp = core_timing.GetTicks();
+    shared_memory.header.timestamp = core_timing.GetCPUTicks();
     shared_memory.header.total_entry_count = 17;
 
     if (!IsControllerActivated()) {
diff --git a/src/core/hle/service/hid/controllers/gesture.cpp b/src/core/hle/service/hid/controllers/gesture.cpp
index 6e990dd00..b7b7bfeae 100644
--- a/src/core/hle/service/hid/controllers/gesture.cpp
+++ b/src/core/hle/service/hid/controllers/gesture.cpp
@@ -19,7 +19,7 @@ void Controller_Gesture::OnRelease() {}
 
 void Controller_Gesture::OnUpdate(const Core::Timing::CoreTiming& core_timing, u8* data,
                                   std::size_t size) {
-    shared_memory.header.timestamp = core_timing.GetTicks();
+    shared_memory.header.timestamp = core_timing.GetCPUTicks();
     shared_memory.header.total_entry_count = 17;
 
     if (!IsControllerActivated()) {
diff --git a/src/core/hle/service/hid/controllers/keyboard.cpp b/src/core/hle/service/hid/controllers/keyboard.cpp
index 9a8d354ba..feae89525 100644
--- a/src/core/hle/service/hid/controllers/keyboard.cpp
+++ b/src/core/hle/service/hid/controllers/keyboard.cpp
@@ -21,7 +21,7 @@ void Controller_Keyboard::OnRelease() {}
 
 void Controller_Keyboard::OnUpdate(const Core::Timing::CoreTiming& core_timing, u8* data,
                                    std::size_t size) {
-    shared_memory.header.timestamp = core_timing.GetTicks();
+    shared_memory.header.timestamp = core_timing.GetCPUTicks();
     shared_memory.header.total_entry_count = 17;
 
     if (!IsControllerActivated()) {
diff --git a/src/core/hle/service/hid/controllers/mouse.cpp b/src/core/hle/service/hid/controllers/mouse.cpp
index 93d88ea50..ac40989c5 100644
--- a/src/core/hle/service/hid/controllers/mouse.cpp
+++ b/src/core/hle/service/hid/controllers/mouse.cpp
@@ -19,7 +19,7 @@ void Controller_Mouse::OnRelease() {}
 
 void Controller_Mouse::OnUpdate(const Core::Timing::CoreTiming& core_timing, u8* data,
                                 std::size_t size) {
-    shared_memory.header.timestamp = core_timing.GetTicks();
+    shared_memory.header.timestamp = core_timing.GetCPUTicks();
     shared_memory.header.total_entry_count = 17;
 
     if (!IsControllerActivated()) {
diff --git a/src/core/hle/service/hid/controllers/npad.cpp b/src/core/hle/service/hid/controllers/npad.cpp
index c55d900e2..ef67ad690 100644
--- a/src/core/hle/service/hid/controllers/npad.cpp
+++ b/src/core/hle/service/hid/controllers/npad.cpp
@@ -328,7 +328,7 @@ void Controller_NPad::OnUpdate(const Core::Timing::CoreTiming& core_timing, u8*
             const auto& last_entry =
                 main_controller->npad[main_controller->common.last_entry_index];
 
-            main_controller->common.timestamp = core_timing.GetTicks();
+            main_controller->common.timestamp = core_timing.GetCPUTicks();
             main_controller->common.last_entry_index =
                 (main_controller->common.last_entry_index + 1) % 17;
 
@@ -566,6 +566,14 @@ void Controller_NPad::DisconnectNPad(u32 npad_id) {
     connected_controllers[NPadIdToIndex(npad_id)].is_connected = false;
 }
 
+void Controller_NPad::SetGyroscopeZeroDriftMode(GyroscopeZeroDriftMode drift_mode) {
+    gyroscope_zero_drift_mode = drift_mode;
+}
+
+Controller_NPad::GyroscopeZeroDriftMode Controller_NPad::GetGyroscopeZeroDriftMode() const {
+    return gyroscope_zero_drift_mode;
+}
+
 void Controller_NPad::StartLRAssignmentMode() {
     // Nothing internally is used for lr assignment mode. Since we have the ability to set the
     // controller types from boot, it doesn't really matter about showing a selection screen
diff --git a/src/core/hle/service/hid/controllers/npad.h b/src/core/hle/service/hid/controllers/npad.h
index 931f03430..5d4c58a43 100644
--- a/src/core/hle/service/hid/controllers/npad.h
+++ b/src/core/hle/service/hid/controllers/npad.h
@@ -58,6 +58,12 @@ public:
     };
     static_assert(sizeof(Vibration) == 0x10, "Vibration is an invalid size");
 
+    enum class GyroscopeZeroDriftMode : u32 {
+        Loose = 0,
+        Standard = 1,
+        Tight = 2,
+    };
+
     enum class NpadHoldType : u64 {
         Vertical = 0,
         Horizontal = 1,
@@ -117,6 +123,8 @@ public:
 
     void ConnectNPad(u32 npad_id);
     void DisconnectNPad(u32 npad_id);
+    void SetGyroscopeZeroDriftMode(GyroscopeZeroDriftMode drift_mode);
+    GyroscopeZeroDriftMode GetGyroscopeZeroDriftMode() const;
     LedPattern GetLedPattern(u32 npad_id);
     void SetVibrationEnabled(bool can_vibrate);
     bool IsVibrationEnabled() const;
@@ -324,8 +332,8 @@ private:
     std::array<Kernel::EventPair, 10> styleset_changed_events;
     Vibration last_processed_vibration{};
     std::array<ControllerHolder, 10> connected_controllers{};
+    GyroscopeZeroDriftMode gyroscope_zero_drift_mode{GyroscopeZeroDriftMode::Standard};
     bool can_controllers_vibrate{true};
-
     std::array<ControllerPad, 10> npad_pad_states{};
     bool is_in_lr_assignment_mode{false};
     Core::System& system;
diff --git a/src/core/hle/service/hid/controllers/stubbed.cpp b/src/core/hle/service/hid/controllers/stubbed.cpp
index 9e527d176..e7483bfa2 100644
--- a/src/core/hle/service/hid/controllers/stubbed.cpp
+++ b/src/core/hle/service/hid/controllers/stubbed.cpp
@@ -23,7 +23,7 @@ void Controller_Stubbed::OnUpdate(const Core::Timing::CoreTiming& core_timing, u
     }
 
     CommonHeader header{};
-    header.timestamp = core_timing.GetTicks();
+    header.timestamp = core_timing.GetCPUTicks();
     header.total_entry_count = 17;
     header.entry_count = 0;
     header.last_entry_index = 0;
diff --git a/src/core/hle/service/hid/controllers/touchscreen.cpp b/src/core/hle/service/hid/controllers/touchscreen.cpp
index 1c6e55566..e326f8f5c 100644
--- a/src/core/hle/service/hid/controllers/touchscreen.cpp
+++ b/src/core/hle/service/hid/controllers/touchscreen.cpp
@@ -22,7 +22,7 @@ void Controller_Touchscreen::OnRelease() {}
 
 void Controller_Touchscreen::OnUpdate(const Core::Timing::CoreTiming& core_timing, u8* data,
                                       std::size_t size) {
-    shared_memory.header.timestamp = core_timing.GetTicks();
+    shared_memory.header.timestamp = core_timing.GetCPUTicks();
     shared_memory.header.total_entry_count = 17;
 
     if (!IsControllerActivated()) {
@@ -49,7 +49,7 @@ void Controller_Touchscreen::OnUpdate(const Core::Timing::CoreTiming& core_timin
         touch_entry.diameter_x = Settings::values.touchscreen.diameter_x;
         touch_entry.diameter_y = Settings::values.touchscreen.diameter_y;
         touch_entry.rotation_angle = Settings::values.touchscreen.rotation_angle;
-        const u64 tick = core_timing.GetTicks();
+        const u64 tick = core_timing.GetCPUTicks();
         touch_entry.delta_time = tick - last_touch;
         last_touch = tick;
         touch_entry.finger = Settings::values.touchscreen.finger;
diff --git a/src/core/hle/service/hid/controllers/xpad.cpp b/src/core/hle/service/hid/controllers/xpad.cpp
index 27511b27b..2503ef241 100644
--- a/src/core/hle/service/hid/controllers/xpad.cpp
+++ b/src/core/hle/service/hid/controllers/xpad.cpp
@@ -20,7 +20,7 @@ void Controller_XPad::OnRelease() {}
 void Controller_XPad::OnUpdate(const Core::Timing::CoreTiming& core_timing, u8* data,
                                std::size_t size) {
     for (auto& xpad_entry : shared_memory.shared_memory_entries) {
-        xpad_entry.header.timestamp = core_timing.GetTicks();
+        xpad_entry.header.timestamp = core_timing.GetCPUTicks();
         xpad_entry.header.total_entry_count = 17;
 
         if (!IsControllerActivated()) {
diff --git a/src/core/hle/service/hid/hid.cpp b/src/core/hle/service/hid/hid.cpp
index c84cb1483..e9020e0dc 100644
--- a/src/core/hle/service/hid/hid.cpp
+++ b/src/core/hle/service/hid/hid.cpp
@@ -39,11 +39,9 @@ namespace Service::HID {
 
 // Updating period for each HID device.
 // TODO(ogniK): Find actual polling rate of hid
-constexpr s64 pad_update_ticks = static_cast<s64>(Core::Hardware::BASE_CLOCK_RATE / 66);
-[[maybe_unused]] constexpr s64 accelerometer_update_ticks =
-    static_cast<s64>(Core::Hardware::BASE_CLOCK_RATE / 100);
-[[maybe_unused]] constexpr s64 gyroscope_update_ticks =
-    static_cast<s64>(Core::Hardware::BASE_CLOCK_RATE / 100);
+constexpr s64 pad_update_ticks = static_cast<s64>(1000000000 / 66);
+[[maybe_unused]] constexpr s64 accelerometer_update_ticks = static_cast<s64>(1000000000 / 100);
+[[maybe_unused]] constexpr s64 gyroscope_update_ticks = static_cast<s64>(1000000000 / 100);
 constexpr std::size_t SHARED_MEMORY_SIZE = 0x40000;
 
 IAppletResource::IAppletResource(Core::System& system)
@@ -78,8 +76,8 @@ IAppletResource::IAppletResource(Core::System& system)
 
     // Register update callbacks
     pad_update_event =
-        Core::Timing::CreateEvent("HID::UpdatePadCallback", [this](u64 userdata, s64 cycles_late) {
-            UpdateControllers(userdata, cycles_late);
+        Core::Timing::CreateEvent("HID::UpdatePadCallback", [this](u64 userdata, s64 ns_late) {
+            UpdateControllers(userdata, ns_late);
         });
 
     // TODO(shinyquagsire23): Other update callbacks? (accel, gyro?)
@@ -109,7 +107,7 @@ void IAppletResource::GetSharedMemoryHandle(Kernel::HLERequestContext& ctx) {
     rb.PushCopyObjects(shared_mem);
 }
 
-void IAppletResource::UpdateControllers(u64 userdata, s64 cycles_late) {
+void IAppletResource::UpdateControllers(u64 userdata, s64 ns_late) {
     auto& core_timing = system.CoreTiming();
 
     const bool should_reload = Settings::values.is_device_reload_pending.exchange(false);
@@ -120,7 +118,7 @@ void IAppletResource::UpdateControllers(u64 userdata, s64 cycles_late) {
         controller->OnUpdate(core_timing, shared_mem->GetPointer(), SHARED_MEMORY_SIZE);
     }
 
-    core_timing.ScheduleEvent(pad_update_ticks - cycles_late, pad_update_event);
+    core_timing.ScheduleEvent(pad_update_ticks - ns_late, pad_update_event);
 }
 
 class IActiveVibrationDeviceList final : public ServiceFramework<IActiveVibrationDeviceList> {
@@ -161,7 +159,7 @@ Hid::Hid(Core::System& system) : ServiceFramework("hid"), system(system) {
         {40, nullptr, "AcquireXpadIdEventHandle"},
         {41, nullptr, "ReleaseXpadIdEventHandle"},
         {51, &Hid::ActivateXpad, "ActivateXpad"},
-        {55, nullptr, "GetXpadIds"},
+        {55, &Hid::GetXpadIDs, "GetXpadIds"},
         {56, nullptr, "ActivateJoyXpad"},
         {58, nullptr, "GetJoyXpadLifoHandle"},
         {59, nullptr, "GetJoyXpadIds"},
@@ -185,8 +183,8 @@ Hid::Hid(Core::System& system) : ServiceFramework("hid"), system(system) {
         {77, nullptr, "GetAccelerometerPlayMode"},
         {78, nullptr, "ResetAccelerometerPlayMode"},
         {79, &Hid::SetGyroscopeZeroDriftMode, "SetGyroscopeZeroDriftMode"},
-        {80, nullptr, "GetGyroscopeZeroDriftMode"},
-        {81, nullptr, "ResetGyroscopeZeroDriftMode"},
+        {80, &Hid::GetGyroscopeZeroDriftMode, "GetGyroscopeZeroDriftMode"},
+        {81, &Hid::ResetGyroscopeZeroDriftMode, "ResetGyroscopeZeroDriftMode"},
         {82, &Hid::IsSixAxisSensorAtRest, "IsSixAxisSensorAtRest"},
         {83, nullptr, "IsFirmwareUpdateAvailableForSixAxisSensor"},
         {91, &Hid::ActivateGesture, "ActivateGesture"},
@@ -230,15 +228,15 @@ Hid::Hid(Core::System& system) : ServiceFramework("hid"), system(system) {
         {211, nullptr, "IsVibrationDeviceMounted"},
         {300, &Hid::ActivateConsoleSixAxisSensor, "ActivateConsoleSixAxisSensor"},
         {301, &Hid::StartConsoleSixAxisSensor, "StartConsoleSixAxisSensor"},
-        {302, nullptr, "StopConsoleSixAxisSensor"},
-        {303, nullptr, "ActivateSevenSixAxisSensor"},
-        {304, nullptr, "StartSevenSixAxisSensor"},
+        {302, &Hid::StopConsoleSixAxisSensor, "StopConsoleSixAxisSensor"},
+        {303, &Hid::ActivateSevenSixAxisSensor, "ActivateSevenSixAxisSensor"},
+        {304, &Hid::StartSevenSixAxisSensor, "StartSevenSixAxisSensor"},
         {305, &Hid::StopSevenSixAxisSensor, "StopSevenSixAxisSensor"},
         {306, &Hid::InitializeSevenSixAxisSensor, "InitializeSevenSixAxisSensor"},
-        {307, nullptr, "FinalizeSevenSixAxisSensor"},
+        {307, &Hid::FinalizeSevenSixAxisSensor, "FinalizeSevenSixAxisSensor"},
         {308, nullptr, "SetSevenSixAxisSensorFusionStrength"},
         {309, nullptr, "GetSevenSixAxisSensorFusionStrength"},
-        {310, nullptr, "ResetSevenSixAxisSensorTimestamp"},
+        {310, &Hid::ResetSevenSixAxisSensorTimestamp, "ResetSevenSixAxisSensorTimestamp"},
         {400, nullptr, "IsUsbFullKeyControllerEnabled"},
         {401, nullptr, "EnableUsbFullKeyController"},
         {402, nullptr, "IsUsbFullKeyControllerConnected"},
@@ -319,6 +317,17 @@ void Hid::ActivateXpad(Kernel::HLERequestContext& ctx) {
     rb.Push(RESULT_SUCCESS);
 }
 
+void Hid::GetXpadIDs(Kernel::HLERequestContext& ctx) {
+    IPC::RequestParser rp{ctx};
+    const auto applet_resource_user_id{rp.Pop<u64>()};
+
+    LOG_DEBUG(Service_HID, "(STUBBED) called, applet_resource_user_id={}", applet_resource_user_id);
+
+    IPC::ResponseBuilder rb{ctx, 3};
+    rb.Push(RESULT_SUCCESS);
+    rb.Push(0);
+}
+
 void Hid::ActivateDebugPad(Kernel::HLERequestContext& ctx) {
     IPC::RequestParser rp{ctx};
     const auto applet_resource_user_id{rp.Pop<u64>()};
@@ -363,6 +372,15 @@ void Hid::ActivateKeyboard(Kernel::HLERequestContext& ctx) {
     rb.Push(RESULT_SUCCESS);
 }
 
+void Hid::SendKeyboardLockKeyEvent(Kernel::HLERequestContext& ctx) {
+    IPC::RequestParser rp{ctx};
+    const auto flags{rp.Pop<u32>()};
+    LOG_WARNING(Service_HID, "(STUBBED) called. flags={}", flags);
+
+    IPC::ResponseBuilder rb{ctx, 2};
+    rb.Push(RESULT_SUCCESS);
+}
+
 void Hid::ActivateGesture(Kernel::HLERequestContext& ctx) {
     IPC::RequestParser rp{ctx};
     const auto unknown{rp.Pop<u32>()};
@@ -402,15 +420,59 @@ void Hid::StartSixAxisSensor(Kernel::HLERequestContext& ctx) {
     rb.Push(RESULT_SUCCESS);
 }
 
+void Hid::StopSixAxisSensor(Kernel::HLERequestContext& ctx) {
+    IPC::RequestParser rp{ctx};
+    const auto handle{rp.Pop<u32>()};
+    const auto applet_resource_user_id{rp.Pop<u64>()};
+
+    LOG_WARNING(Service_HID, "(STUBBED) called, handle={}, applet_resource_user_id={}", handle,
+                applet_resource_user_id);
+
+    IPC::ResponseBuilder rb{ctx, 2};
+    rb.Push(RESULT_SUCCESS);
+}
+
 void Hid::SetGyroscopeZeroDriftMode(Kernel::HLERequestContext& ctx) {
     IPC::RequestParser rp{ctx};
     const auto handle{rp.Pop<u32>()};
     const auto drift_mode{rp.Pop<u32>()};
     const auto applet_resource_user_id{rp.Pop<u64>()};
 
-    LOG_WARNING(Service_HID,
-                "(STUBBED) called, handle={}, drift_mode={}, applet_resource_user_id={}", handle,
-                drift_mode, applet_resource_user_id);
+    applet_resource->GetController<Controller_NPad>(HidController::NPad)
+        .SetGyroscopeZeroDriftMode(Controller_NPad::GyroscopeZeroDriftMode{drift_mode});
+
+    LOG_DEBUG(Service_HID, "called, handle={}, drift_mode={}, applet_resource_user_id={}", handle,
+              drift_mode, applet_resource_user_id);
+
+    IPC::ResponseBuilder rb{ctx, 2};
+    rb.Push(RESULT_SUCCESS);
+}
+
+void Hid::GetGyroscopeZeroDriftMode(Kernel::HLERequestContext& ctx) {
+    IPC::RequestParser rp{ctx};
+    const auto handle{rp.Pop<u32>()};
+    const auto applet_resource_user_id{rp.Pop<u64>()};
+
+    LOG_DEBUG(Service_HID, "called, handle={}, applet_resource_user_id={}", handle,
+              applet_resource_user_id);
+
+    IPC::ResponseBuilder rb{ctx, 3};
+    rb.Push(RESULT_SUCCESS);
+    rb.Push<u32>(
+        static_cast<u32>(applet_resource->GetController<Controller_NPad>(HidController::NPad)
+                             .GetGyroscopeZeroDriftMode()));
+}
+
+void Hid::ResetGyroscopeZeroDriftMode(Kernel::HLERequestContext& ctx) {
+    IPC::RequestParser rp{ctx};
+    const auto handle{rp.Pop<u32>()};
+    const auto applet_resource_user_id{rp.Pop<u64>()};
+
+    applet_resource->GetController<Controller_NPad>(HidController::NPad)
+        .SetGyroscopeZeroDriftMode(Controller_NPad::GyroscopeZeroDriftMode::Standard);
+
+    LOG_DEBUG(Service_HID, "called, handle={}, applet_resource_user_id={}", handle,
+              applet_resource_user_id);
 
     IPC::ResponseBuilder rb{ctx, 2};
     rb.Push(RESULT_SUCCESS);
@@ -821,33 +883,35 @@ void Hid::StartConsoleSixAxisSensor(Kernel::HLERequestContext& ctx) {
     rb.Push(RESULT_SUCCESS);
 }
 
-void Hid::StopSixAxisSensor(Kernel::HLERequestContext& ctx) {
+void Hid::StopConsoleSixAxisSensor(Kernel::HLERequestContext& ctx) {
     IPC::RequestParser rp{ctx};
     const auto handle{rp.Pop<u32>()};
+    const auto applet_resource_user_id{rp.Pop<u64>()};
 
-    LOG_WARNING(Service_HID, "(STUBBED) called, handle={}", handle);
+    LOG_WARNING(Service_HID, "(STUBBED) called, handle={}, applet_resource_user_id={}", handle,
+                applet_resource_user_id);
 
     IPC::ResponseBuilder rb{ctx, 2};
     rb.Push(RESULT_SUCCESS);
 }
 
-void Hid::SetIsPalmaAllConnectable(Kernel::HLERequestContext& ctx) {
+void Hid::ActivateSevenSixAxisSensor(Kernel::HLERequestContext& ctx) {
     IPC::RequestParser rp{ctx};
     const auto applet_resource_user_id{rp.Pop<u64>()};
-    const auto unknown{rp.Pop<u32>()};
 
-    LOG_WARNING(Service_HID, "(STUBBED) called, applet_resource_user_id={}, unknown={}",
-                applet_resource_user_id, unknown);
+    LOG_WARNING(Service_HID, "(STUBBED) called, applet_resource_user_id={}",
+                applet_resource_user_id);
 
     IPC::ResponseBuilder rb{ctx, 2};
     rb.Push(RESULT_SUCCESS);
 }
 
-void Hid::SetPalmaBoostMode(Kernel::HLERequestContext& ctx) {
+void Hid::StartSevenSixAxisSensor(Kernel::HLERequestContext& ctx) {
     IPC::RequestParser rp{ctx};
-    const auto unknown{rp.Pop<u32>()};
+    const auto applet_resource_user_id{rp.Pop<u64>()};
 
-    LOG_WARNING(Service_HID, "(STUBBED) called, unknown={}", unknown);
+    LOG_WARNING(Service_HID, "(STUBBED) called, applet_resource_user_id={}",
+                applet_resource_user_id);
 
     IPC::ResponseBuilder rb{ctx, 2};
     rb.Push(RESULT_SUCCESS);
@@ -871,10 +935,46 @@ void Hid::InitializeSevenSixAxisSensor(Kernel::HLERequestContext& ctx) {
     rb.Push(RESULT_SUCCESS);
 }
 
-void Hid::SendKeyboardLockKeyEvent(Kernel::HLERequestContext& ctx) {
+void Hid::FinalizeSevenSixAxisSensor(Kernel::HLERequestContext& ctx) {
     IPC::RequestParser rp{ctx};
-    const auto flags{rp.Pop<u32>()};
-    LOG_WARNING(Service_HID, "(STUBBED) called. flags={}", flags);
+    const auto applet_resource_user_id{rp.Pop<u64>()};
+
+    LOG_WARNING(Service_HID, "(STUBBED) called, applet_resource_user_id={}",
+                applet_resource_user_id);
+
+    IPC::ResponseBuilder rb{ctx, 2};
+    rb.Push(RESULT_SUCCESS);
+}
+
+void Hid::ResetSevenSixAxisSensorTimestamp(Kernel::HLERequestContext& ctx) {
+    IPC::RequestParser rp{ctx};
+    const auto applet_resource_user_id{rp.Pop<u64>()};
+
+    LOG_WARNING(Service_HID, "(STUBBED) called, applet_resource_user_id={}",
+                applet_resource_user_id);
+
+    IPC::ResponseBuilder rb{ctx, 2};
+    rb.Push(RESULT_SUCCESS);
+}
+
+void Hid::SetIsPalmaAllConnectable(Kernel::HLERequestContext& ctx) {
+    IPC::RequestParser rp{ctx};
+    const auto applet_resource_user_id{rp.Pop<u64>()};
+    const auto is_palma_all_connectable{rp.Pop<bool>()};
+
+    LOG_WARNING(Service_HID,
+                "(STUBBED) called, applet_resource_user_id={}, is_palma_all_connectable={}",
+                applet_resource_user_id, is_palma_all_connectable);
+
+    IPC::ResponseBuilder rb{ctx, 2};
+    rb.Push(RESULT_SUCCESS);
+}
+
+void Hid::SetPalmaBoostMode(Kernel::HLERequestContext& ctx) {
+    IPC::RequestParser rp{ctx};
+    const auto palma_boost_mode{rp.Pop<bool>()};
+
+    LOG_WARNING(Service_HID, "(STUBBED) called, palma_boost_mode={}", palma_boost_mode);
 
     IPC::ResponseBuilder rb{ctx, 2};
     rb.Push(RESULT_SUCCESS);
diff --git a/src/core/hle/service/hid/hid.h b/src/core/hle/service/hid/hid.h
index c8ed4ad8b..6fb048360 100644
--- a/src/core/hle/service/hid/hid.h
+++ b/src/core/hle/service/hid/hid.h
@@ -86,14 +86,19 @@ public:
 private:
     void CreateAppletResource(Kernel::HLERequestContext& ctx);
     void ActivateXpad(Kernel::HLERequestContext& ctx);
+    void GetXpadIDs(Kernel::HLERequestContext& ctx);
     void ActivateDebugPad(Kernel::HLERequestContext& ctx);
     void ActivateTouchScreen(Kernel::HLERequestContext& ctx);
     void ActivateMouse(Kernel::HLERequestContext& ctx);
     void ActivateKeyboard(Kernel::HLERequestContext& ctx);
+    void SendKeyboardLockKeyEvent(Kernel::HLERequestContext& ctx);
     void ActivateGesture(Kernel::HLERequestContext& ctx);
     void ActivateNpadWithRevision(Kernel::HLERequestContext& ctx);
     void StartSixAxisSensor(Kernel::HLERequestContext& ctx);
+    void StopSixAxisSensor(Kernel::HLERequestContext& ctx);
     void SetGyroscopeZeroDriftMode(Kernel::HLERequestContext& ctx);
+    void GetGyroscopeZeroDriftMode(Kernel::HLERequestContext& ctx);
+    void ResetGyroscopeZeroDriftMode(Kernel::HLERequestContext& ctx);
     void IsSixAxisSensorAtRest(Kernel::HLERequestContext& ctx);
     void SetSupportedNpadStyleSet(Kernel::HLERequestContext& ctx);
     void GetSupportedNpadStyleSet(Kernel::HLERequestContext& ctx);
@@ -125,12 +130,15 @@ private:
     void IsVibrationPermitted(Kernel::HLERequestContext& ctx);
     void ActivateConsoleSixAxisSensor(Kernel::HLERequestContext& ctx);
     void StartConsoleSixAxisSensor(Kernel::HLERequestContext& ctx);
-    void StopSixAxisSensor(Kernel::HLERequestContext& ctx);
-    void SetIsPalmaAllConnectable(Kernel::HLERequestContext& ctx);
-    void SetPalmaBoostMode(Kernel::HLERequestContext& ctx);
+    void StopConsoleSixAxisSensor(Kernel::HLERequestContext& ctx);
+    void ActivateSevenSixAxisSensor(Kernel::HLERequestContext& ctx);
+    void StartSevenSixAxisSensor(Kernel::HLERequestContext& ctx);
     void StopSevenSixAxisSensor(Kernel::HLERequestContext& ctx);
     void InitializeSevenSixAxisSensor(Kernel::HLERequestContext& ctx);
-    void SendKeyboardLockKeyEvent(Kernel::HLERequestContext& ctx);
+    void FinalizeSevenSixAxisSensor(Kernel::HLERequestContext& ctx);
+    void ResetSevenSixAxisSensorTimestamp(Kernel::HLERequestContext& ctx);
+    void SetIsPalmaAllConnectable(Kernel::HLERequestContext& ctx);
+    void SetPalmaBoostMode(Kernel::HLERequestContext& ctx);
 
     std::shared_ptr<IAppletResource> applet_resource;
     Core::System& system;
diff --git a/src/core/hle/service/hid/irs.cpp b/src/core/hle/service/hid/irs.cpp
index 36ed6f7da..e82fd031b 100644
--- a/src/core/hle/service/hid/irs.cpp
+++ b/src/core/hle/service/hid/irs.cpp
@@ -98,7 +98,7 @@ void IRS::GetImageTransferProcessorState(Kernel::HLERequestContext& ctx) {
 
     IPC::ResponseBuilder rb{ctx, 5};
     rb.Push(RESULT_SUCCESS);
-    rb.PushRaw<u64>(system.CoreTiming().GetTicks());
+    rb.PushRaw<u64>(system.CoreTiming().GetCPUTicks());
     rb.PushRaw<u32>(0);
 }
 
diff --git a/src/core/hle/service/lbl/lbl.cpp b/src/core/hle/service/lbl/lbl.cpp
index e8f9f2d29..17350b403 100644
--- a/src/core/hle/service/lbl/lbl.cpp
+++ b/src/core/hle/service/lbl/lbl.cpp
@@ -47,6 +47,7 @@ public:
             {26, &LBL::EnableVrMode, "EnableVrMode"},
             {27, &LBL::DisableVrMode, "DisableVrMode"},
             {28, &LBL::IsVrModeEnabled, "IsVrModeEnabled"},
+            {29, nullptr, "IsAutoBrightnessControlSupported"},
         };
         // clang-format on
 
diff --git a/src/core/hle/service/ldn/ldn.cpp b/src/core/hle/service/ldn/ldn.cpp
index 92adde6d4..49972cd69 100644
--- a/src/core/hle/service/ldn/ldn.cpp
+++ b/src/core/hle/service/ldn/ldn.cpp
@@ -69,6 +69,7 @@ public:
             {101, nullptr, "GetNetworkInfoLatestUpdate"},
             {102, nullptr, "Scan"},
             {103, nullptr, "ScanPrivate"},
+            {104, nullptr, "SetWirelessControllerRestriction"},
             {200, nullptr, "OpenAccessPoint"},
             {201, nullptr, "CloseAccessPoint"},
             {202, nullptr, "CreateNetwork"},
diff --git a/src/core/hle/service/ldr/ldr.cpp b/src/core/hle/service/ldr/ldr.cpp
index 6ad3be1b3..64a526b9e 100644
--- a/src/core/hle/service/ldr/ldr.cpp
+++ b/src/core/hle/service/ldr/ldr.cpp
@@ -39,42 +39,61 @@ constexpr ResultCode ERROR_NOT_INITIALIZED{ErrorModule::Loader, 87};
 constexpr std::size_t MAXIMUM_LOADED_RO{0x40};
 constexpr std::size_t MAXIMUM_MAP_RETRIES{0x200};
 
+constexpr std::size_t TEXT_INDEX{0};
+constexpr std::size_t RO_INDEX{1};
+constexpr std::size_t DATA_INDEX{2};
+
+struct NRRCertification {
+    u64_le application_id_mask;
+    u64_le application_id_pattern;
+    INSERT_PADDING_BYTES(0x10);
+    std::array<u8, 0x100> public_key; // Also known as modulus
+    std::array<u8, 0x100> signature;
+};
+static_assert(sizeof(NRRCertification) == 0x220, "NRRCertification has invalid size.");
+
 struct NRRHeader {
     u32_le magic;
-    INSERT_PADDING_BYTES(12);
-    u64_le title_id_mask;
-    u64_le title_id_pattern;
-    INSERT_PADDING_BYTES(16);
-    std::array<u8, 0x100> modulus;
-    std::array<u8, 0x100> signature_1;
-    std::array<u8, 0x100> signature_2;
-    u64_le title_id;
+    u32_le certification_signature_key_generation; // 9.0.0+
+    INSERT_PADDING_WORDS(2);
+    NRRCertification certification;
+    std::array<u8, 0x100> signature;
+    u64_le application_id;
     u32_le size;
-    INSERT_PADDING_BYTES(4);
+    u8 nrr_kind; // 7.0.0+
+    INSERT_PADDING_BYTES(3);
     u32_le hash_offset;
     u32_le hash_count;
-    INSERT_PADDING_BYTES(8);
+    INSERT_PADDING_WORDS(2);
+};
+static_assert(sizeof(NRRHeader) == 0x350, "NRRHeader has invalid size.");
+
+struct SegmentHeader {
+    u32_le memory_offset;
+    u32_le memory_size;
 };
-static_assert(sizeof(NRRHeader) == 0x350, "NRRHeader has incorrect size.");
+static_assert(sizeof(SegmentHeader) == 0x8, "SegmentHeader has invalid size.");
 
 struct NROHeader {
+    // Switchbrew calls this "Start" (0x10)
     INSERT_PADDING_WORDS(1);
     u32_le mod_offset;
     INSERT_PADDING_WORDS(2);
+
+    // Switchbrew calls this "Header" (0x70)
     u32_le magic;
     u32_le version;
     u32_le nro_size;
     u32_le flags;
-    u32_le text_offset;
-    u32_le text_size;
-    u32_le ro_offset;
-    u32_le ro_size;
-    u32_le rw_offset;
-    u32_le rw_size;
+    // .text, .ro, .data
+    std::array<SegmentHeader, 3> segment_headers;
     u32_le bss_size;
     INSERT_PADDING_WORDS(1);
     std::array<u8, 0x20> build_id;
-    INSERT_PADDING_BYTES(0x20);
+    u32_le dso_handle_offset;
+    INSERT_PADDING_WORDS(1);
+    // .apiInfo, .dynstr, .dynsym
+    std::array<SegmentHeader, 3> segment_headers_2;
 };
 static_assert(sizeof(NROHeader) == 0x80, "NROHeader has invalid size.");
 
@@ -91,6 +110,7 @@ struct NROInfo {
     std::size_t data_size{};
     VAddr src_addr{};
 };
+static_assert(sizeof(NROInfo) == 0x60, "NROInfo has invalid size.");
 
 class DebugMonitor final : public ServiceFramework<DebugMonitor> {
 public:
@@ -226,11 +246,11 @@ public:
             return;
         }
 
-        if (system.CurrentProcess()->GetTitleID() != header.title_id) {
+        if (system.CurrentProcess()->GetTitleID() != header.application_id) {
             LOG_ERROR(Service_LDR,
                       "Attempting to load NRR with title ID other than current process. (actual "
                       "{:016X})!",
-                      header.title_id);
+                      header.application_id);
             IPC::ResponseBuilder rb{ctx, 2};
             rb.Push(ERROR_INVALID_NRR);
             return;
@@ -348,10 +368,10 @@ public:
 
     ResultCode LoadNro(Kernel::Process* process, const NROHeader& nro_header, VAddr nro_addr,
                        VAddr start) const {
-        const VAddr text_start{start + nro_header.text_offset};
-        const VAddr ro_start{start + nro_header.ro_offset};
-        const VAddr data_start{start + nro_header.rw_offset};
-        const VAddr bss_start{data_start + nro_header.rw_size};
+        const VAddr text_start{start + nro_header.segment_headers[TEXT_INDEX].memory_offset};
+        const VAddr ro_start{start + nro_header.segment_headers[RO_INDEX].memory_offset};
+        const VAddr data_start{start + nro_header.segment_headers[DATA_INDEX].memory_offset};
+        const VAddr bss_start{data_start + nro_header.segment_headers[DATA_INDEX].memory_size};
         const VAddr bss_end_addr{
             Common::AlignUp(bss_start + nro_header.bss_size, Kernel::Memory::PageSize)};
 
@@ -360,9 +380,12 @@ public:
             system.Memory().ReadBlock(src_addr, source_data.data(), source_data.size());
             system.Memory().WriteBlock(dst_addr, source_data.data(), source_data.size());
         }};
-        CopyCode(nro_addr + nro_header.text_offset, text_start, nro_header.text_size);
-        CopyCode(nro_addr + nro_header.ro_offset, ro_start, nro_header.ro_size);
-        CopyCode(nro_addr + nro_header.rw_offset, data_start, nro_header.rw_size);
+        CopyCode(nro_addr + nro_header.segment_headers[TEXT_INDEX].memory_offset, text_start,
+                 nro_header.segment_headers[TEXT_INDEX].memory_size);
+        CopyCode(nro_addr + nro_header.segment_headers[RO_INDEX].memory_offset, ro_start,
+                 nro_header.segment_headers[RO_INDEX].memory_size);
+        CopyCode(nro_addr + nro_header.segment_headers[DATA_INDEX].memory_offset, data_start,
+                 nro_header.segment_headers[DATA_INDEX].memory_size);
 
         CASCADE_CODE(process->PageTable().SetCodeMemoryPermission(
             text_start, ro_start - text_start, Kernel::Memory::MemoryPermission::ReadAndExecute));
@@ -484,9 +507,11 @@ public:
         }
 
         // Track the loaded NRO
-        nro.insert_or_assign(*map_result, NROInfo{hash, *map_result, nro_size, bss_address,
-                                                  bss_size, header.text_size, header.ro_size,
-                                                  header.rw_size, nro_address});
+        nro.insert_or_assign(*map_result,
+                             NROInfo{hash, *map_result, nro_size, bss_address, bss_size,
+                                     header.segment_headers[TEXT_INDEX].memory_size,
+                                     header.segment_headers[RO_INDEX].memory_size,
+                                     header.segment_headers[DATA_INDEX].memory_size, nro_address});
 
         // Invalidate JIT caches for the newly mapped process code
         system.InvalidateCpuInstructionCaches();
@@ -584,11 +609,21 @@ private:
     static bool IsValidNRO(const NROHeader& header, u64 nro_size, u64 bss_size) {
         return header.magic == Common::MakeMagic('N', 'R', 'O', '0') &&
                header.nro_size == nro_size && header.bss_size == bss_size &&
-               header.ro_offset == header.text_offset + header.text_size &&
-               header.rw_offset == header.ro_offset + header.ro_size &&
-               nro_size == header.rw_offset + header.rw_size &&
-               Common::Is4KBAligned(header.text_size) && Common::Is4KBAligned(header.ro_size) &&
-               Common::Is4KBAligned(header.rw_size);
+
+               header.segment_headers[RO_INDEX].memory_offset ==
+                   header.segment_headers[TEXT_INDEX].memory_offset +
+                       header.segment_headers[TEXT_INDEX].memory_size &&
+
+               header.segment_headers[DATA_INDEX].memory_offset ==
+                   header.segment_headers[RO_INDEX].memory_offset +
+                       header.segment_headers[RO_INDEX].memory_size &&
+
+               nro_size == header.segment_headers[DATA_INDEX].memory_offset +
+                               header.segment_headers[DATA_INDEX].memory_size &&
+
+               Common::Is4KBAligned(header.segment_headers[TEXT_INDEX].memory_size) &&
+               Common::Is4KBAligned(header.segment_headers[RO_INDEX].memory_size) &&
+               Common::Is4KBAligned(header.segment_headers[DATA_INDEX].memory_size);
     }
     Core::System& system;
 };
diff --git a/src/core/hle/service/lm/manager.cpp b/src/core/hle/service/lm/manager.cpp
index b67081b86..3ee2374e7 100644
--- a/src/core/hle/service/lm/manager.cpp
+++ b/src/core/hle/service/lm/manager.cpp
@@ -86,7 +86,8 @@ std::string FormatField(Field type, const std::vector<u8>& data) {
         return Common::StringFromFixedZeroTerminatedBuffer(
             reinterpret_cast<const char*>(data.data()), data.size());
     default:
-        UNIMPLEMENTED();
+        UNIMPLEMENTED_MSG("Unimplemented field type={}", type);
+        return "";
     }
 }
 
diff --git a/src/core/hle/service/mig/mig.cpp b/src/core/hle/service/mig/mig.cpp
index d16367f2c..113a4665c 100644
--- a/src/core/hle/service/mig/mig.cpp
+++ b/src/core/hle/service/mig/mig.cpp
@@ -20,6 +20,12 @@ public:
             {101, nullptr, "ResumeServer"},
             {200, nullptr, "CreateClient"},
             {201, nullptr, "ResumeClient"},
+            {1001, nullptr, "Unknown1001"},
+            {1010, nullptr, "Unknown1010"},
+            {1100, nullptr, "Unknown1100"},
+            {1101, nullptr, "Unknown1101"},
+            {1200, nullptr, "Unknown1200"},
+            {1201, nullptr, "Unknown1201"},
         };
         // clang-format on
 
diff --git a/src/core/hle/service/mm/mm_u.cpp b/src/core/hle/service/mm/mm_u.cpp
index def63dc8a..25c24e537 100644
--- a/src/core/hle/service/mm/mm_u.cpp
+++ b/src/core/hle/service/mm/mm_u.cpp
@@ -14,14 +14,14 @@ public:
     explicit MM_U() : ServiceFramework{"mm:u"} {
         // clang-format off
         static const FunctionInfo functions[] = {
-            {0, &MM_U::Initialize, "Initialize"},
-            {1, &MM_U::Finalize, "Finalize"},
-            {2, &MM_U::SetAndWait, "SetAndWait"},
-            {3, &MM_U::Get, "Get"},
-            {4, &MM_U::InitializeWithId, "InitializeWithId"},
-            {5, &MM_U::FinalizeWithId, "FinalizeWithId"},
-            {6, &MM_U::SetAndWaitWithId, "SetAndWaitWithId"},
-            {7, &MM_U::GetWithId, "GetWithId"},
+            {0, &MM_U::InitializeOld, "InitializeOld"},
+            {1, &MM_U::FinalizeOld, "FinalizeOld"},
+            {2, &MM_U::SetAndWaitOld, "SetAndWaitOld"},
+            {3, &MM_U::GetOld, "GetOld"},
+            {4, &MM_U::Initialize, "Initialize"},
+            {5, &MM_U::Finalize, "Finalize"},
+            {6, &MM_U::SetAndWait, "SetAndWait"},
+            {7, &MM_U::Get, "Get"},
         };
         // clang-format on
 
@@ -29,21 +29,21 @@ public:
     }
 
 private:
-    void Initialize(Kernel::HLERequestContext& ctx) {
+    void InitializeOld(Kernel::HLERequestContext& ctx) {
         LOG_WARNING(Service_MM, "(STUBBED) called");
 
         IPC::ResponseBuilder rb{ctx, 2};
         rb.Push(RESULT_SUCCESS);
     }
 
-    void Finalize(Kernel::HLERequestContext& ctx) {
+    void FinalizeOld(Kernel::HLERequestContext& ctx) {
         LOG_WARNING(Service_MM, "(STUBBED) called");
 
         IPC::ResponseBuilder rb{ctx, 2};
         rb.Push(RESULT_SUCCESS);
     }
 
-    void SetAndWait(Kernel::HLERequestContext& ctx) {
+    void SetAndWaitOld(Kernel::HLERequestContext& ctx) {
         IPC::RequestParser rp{ctx};
         min = rp.Pop<u32>();
         max = rp.Pop<u32>();
@@ -54,7 +54,7 @@ private:
         rb.Push(RESULT_SUCCESS);
     }
 
-    void Get(Kernel::HLERequestContext& ctx) {
+    void GetOld(Kernel::HLERequestContext& ctx) {
         LOG_WARNING(Service_MM, "(STUBBED) called");
 
         IPC::ResponseBuilder rb{ctx, 3};
@@ -62,7 +62,7 @@ private:
         rb.Push(current);
     }
 
-    void InitializeWithId(Kernel::HLERequestContext& ctx) {
+    void Initialize(Kernel::HLERequestContext& ctx) {
         LOG_WARNING(Service_MM, "(STUBBED) called");
 
         IPC::ResponseBuilder rb{ctx, 3};
@@ -70,14 +70,14 @@ private:
         rb.Push<u32>(id); // Any non zero value
     }
 
-    void FinalizeWithId(Kernel::HLERequestContext& ctx) {
+    void Finalize(Kernel::HLERequestContext& ctx) {
         LOG_WARNING(Service_MM, "(STUBBED) called");
 
         IPC::ResponseBuilder rb{ctx, 2};
         rb.Push(RESULT_SUCCESS);
     }
 
-    void SetAndWaitWithId(Kernel::HLERequestContext& ctx) {
+    void SetAndWait(Kernel::HLERequestContext& ctx) {
         IPC::RequestParser rp{ctx};
         u32 input_id = rp.Pop<u32>();
         min = rp.Pop<u32>();
@@ -90,7 +90,7 @@ private:
         rb.Push(RESULT_SUCCESS);
     }
 
-    void GetWithId(Kernel::HLERequestContext& ctx) {
+    void Get(Kernel::HLERequestContext& ctx) {
         LOG_WARNING(Service_MM, "(STUBBED) called");
 
         IPC::ResponseBuilder rb{ctx, 3};
diff --git a/src/core/hle/service/ncm/ncm.cpp b/src/core/hle/service/ncm/ncm.cpp
index ec9aae04a..e38dea1f4 100644
--- a/src/core/hle/service/ncm/ncm.cpp
+++ b/src/core/hle/service/ncm/ncm.cpp
@@ -28,16 +28,16 @@ public:
             {7, nullptr, "ResolveApplicationLegalInformationPath"},
             {8, nullptr, "RedirectApplicationLegalInformationPath"},
             {9, nullptr, "Refresh"},
-            {10, nullptr, "RedirectProgramPath2"},
-            {11, nullptr, "Refresh2"},
-            {12, nullptr, "DeleteProgramPath"},
-            {13, nullptr, "DeleteApplicationControlPath"},
-            {14, nullptr, "DeleteApplicationHtmlDocumentPath"},
-            {15, nullptr, "DeleteApplicationLegalInformationPath"},
-            {16, nullptr, ""},
-            {17, nullptr, ""},
-            {18, nullptr, ""},
-            {19, nullptr, ""},
+            {10, nullptr, "RedirectApplicationProgramPath"},
+            {11, nullptr, "ClearApplicationRedirection"},
+            {12, nullptr, "EraseProgramRedirection"},
+            {13, nullptr, "EraseApplicationControlRedirection"},
+            {14, nullptr, "EraseApplicationHtmlDocumentRedirection"},
+            {15, nullptr, "EraseApplicationLegalInformationRedirection"},
+            {16, nullptr, "ResolveProgramPathForDebug"},
+            {17, nullptr, "RedirectProgramPathForDebug"},
+            {18, nullptr, "RedirectApplicationProgramPathForDebug"},
+            {19, nullptr, "EraseProgramRedirectionForDebug"},
         };
         // clang-format on
 
diff --git a/src/core/hle/service/nfc/nfc.cpp b/src/core/hle/service/nfc/nfc.cpp
index b7b34ce7e..780ea30fe 100644
--- a/src/core/hle/service/nfc/nfc.cpp
+++ b/src/core/hle/service/nfc/nfc.cpp
@@ -198,9 +198,9 @@ public:
         static const FunctionInfo functions[] = {
             {0, nullptr, "Initialize"},
             {1, nullptr, "Finalize"},
-            {2, nullptr, "GetState"},
-            {3, nullptr, "IsNfcEnabled"},
-            {100, nullptr, "SetNfcEnabled"},
+            {2, nullptr, "GetStateOld"},
+            {3, nullptr, "IsNfcEnabledOld"},
+            {100, nullptr, "SetNfcEnabledOld"},
             {400, nullptr, "InitializeSystem"},
             {401, nullptr, "FinalizeSystem"},
             {402, nullptr, "GetState"},
diff --git a/src/core/hle/service/ns/ns.cpp b/src/core/hle/service/ns/ns.cpp
index 3e4dd2f7a..886450be2 100644
--- a/src/core/hle/service/ns/ns.cpp
+++ b/src/core/hle/service/ns/ns.cpp
@@ -366,7 +366,8 @@ ResultVal<u8> IApplicationManagerInterface::GetApplicationDesiredLanguage(
     LOG_DEBUG(Service_NS, "called with supported_languages={:08X}", supported_languages);
 
     // Get language code from settings
-    const auto language_code = Set::GetLanguageCodeFromIndex(Settings::values.language_index);
+    const auto language_code =
+        Set::GetLanguageCodeFromIndex(Settings::values.language_index.GetValue());
 
     // Convert to application language, get priority list
     const auto application_language = ConvertToApplicationLanguage(language_code);
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp
index cc2192e5c..fba89e7a6 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp
@@ -25,7 +25,7 @@ u32 nvhost_ctrl_gpu::ioctl(Ioctl command, const std::vector<u8>& input,
     case IoctlCommand::IocGetCharacteristicsCommand:
         return GetCharacteristics(input, output, output2, version);
     case IoctlCommand::IocGetTPCMasksCommand:
-        return GetTPCMasks(input, output);
+        return GetTPCMasks(input, output, output2, version);
     case IoctlCommand::IocGetActiveSlotMaskCommand:
         return GetActiveSlotMask(input, output);
     case IoctlCommand::IocZcullGetCtxSizeCommand:
@@ -98,17 +98,22 @@ u32 nvhost_ctrl_gpu::GetCharacteristics(const std::vector<u8>& input, std::vecto
     return 0;
 }
 
-u32 nvhost_ctrl_gpu::GetTPCMasks(const std::vector<u8>& input, std::vector<u8>& output) {
+u32 nvhost_ctrl_gpu::GetTPCMasks(const std::vector<u8>& input, std::vector<u8>& output,
+                                 std::vector<u8>& output2, IoctlVersion version) {
     IoctlGpuGetTpcMasksArgs params{};
     std::memcpy(&params, input.data(), input.size());
-    LOG_INFO(Service_NVDRV, "called, mask=0x{:X}, mask_buf_addr=0x{:X}", params.mask_buf_size,
-             params.mask_buf_addr);
-    // TODO(ogniK): Confirm value on hardware
-    if (params.mask_buf_size)
-        params.tpc_mask_size = 4 * 1; // 4 * num_gpc
-    else
-        params.tpc_mask_size = 0;
-    std::memcpy(output.data(), &params, sizeof(params));
+    LOG_DEBUG(Service_NVDRV, "called, mask_buffer_size=0x{:X}", params.mask_buffer_size);
+    if (params.mask_buffer_size != 0) {
+        params.tcp_mask = 3;
+    }
+
+    if (version == IoctlVersion::Version3) {
+        std::memcpy(output.data(), input.data(), output.size());
+        std::memcpy(output2.data(), &params.tcp_mask, output2.size());
+    } else {
+        std::memcpy(output.data(), &params, output.size());
+    }
+
     return 0;
 }
 
@@ -195,8 +200,7 @@ u32 nvhost_ctrl_gpu::GetGpuTime(const std::vector<u8>& input, std::vector<u8>& o
 
     IoctlGetGpuTime params{};
     std::memcpy(&params, input.data(), input.size());
-    const auto ns = Core::Timing::CyclesToNs(system.CoreTiming().GetTicks());
-    params.gpu_time = static_cast<u64_le>(ns.count());
+    params.gpu_time = static_cast<u64_le>(system.CoreTiming().GetGlobalTimeNs().count());
     std::memcpy(output.data(), &params, output.size());
     return 0;
 }
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h
index 07b644ec5..ef60f72ce 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h
@@ -92,16 +92,11 @@ private:
                   "IoctlCharacteristics is incorrect size");
 
     struct IoctlGpuGetTpcMasksArgs {
-        /// [in]  TPC mask buffer size reserved by userspace. Should be at least
-        /// sizeof(__u32) * fls(gpc_mask) to receive TPC mask for each GPC.
-        /// [out] full kernel buffer size
-        u32_le mask_buf_size;
-        u32_le reserved;
-
-        /// [in]  pointer to TPC mask buffer. It will receive one 32-bit TPC mask per GPC or 0 if
-        /// GPC is not enabled or not present. This parameter is ignored if mask_buf_size is 0.
-        u64_le mask_buf_addr;
-        u64_le tpc_mask_size; // Nintendo add this?
+        u32_le mask_buffer_size{};
+        INSERT_PADDING_WORDS(1);
+        u64_le mask_buffer_address{};
+        u32_le tcp_mask{};
+        INSERT_PADDING_WORDS(1);
     };
     static_assert(sizeof(IoctlGpuGetTpcMasksArgs) == 24,
                   "IoctlGpuGetTpcMasksArgs is incorrect size");
@@ -166,7 +161,8 @@ private:
 
     u32 GetCharacteristics(const std::vector<u8>& input, std::vector<u8>& output,
                            std::vector<u8>& output2, IoctlVersion version);
-    u32 GetTPCMasks(const std::vector<u8>& input, std::vector<u8>& output);
+    u32 GetTPCMasks(const std::vector<u8>& input, std::vector<u8>& output, std::vector<u8>& output2,
+                    IoctlVersion version);
     u32 GetActiveSlotMask(const std::vector<u8>& input, std::vector<u8>& output);
     u32 ZCullGetCtxSize(const std::vector<u8>& input, std::vector<u8>& output);
     u32 ZCullGetInfo(const std::vector<u8>& input, std::vector<u8>& output);
diff --git a/src/core/hle/service/nvflinger/nvflinger.cpp b/src/core/hle/service/nvflinger/nvflinger.cpp
index 437bc5dee..2f44d3779 100644
--- a/src/core/hle/service/nvflinger/nvflinger.cpp
+++ b/src/core/hle/service/nvflinger/nvflinger.cpp
@@ -9,6 +9,7 @@
 #include "common/logging/log.h"
 #include "common/microprofile.h"
 #include "common/scope_exit.h"
+#include "common/thread.h"
 #include "core/core.h"
 #include "core/core_timing.h"
 #include "core/core_timing_util.h"
@@ -27,8 +28,35 @@
 
 namespace Service::NVFlinger {
 
-constexpr s64 frame_ticks = static_cast<s64>(Core::Hardware::BASE_CLOCK_RATE / 60);
-constexpr s64 frame_ticks_30fps = static_cast<s64>(Core::Hardware::BASE_CLOCK_RATE / 30);
+constexpr s64 frame_ticks = static_cast<s64>(1000000000 / 60);
+constexpr s64 frame_ticks_30fps = static_cast<s64>(1000000000 / 30);
+
+void NVFlinger::VSyncThread(NVFlinger& nv_flinger) {
+    nv_flinger.SplitVSync();
+}
+
+void NVFlinger::SplitVSync() {
+    system.RegisterHostThread();
+    std::string name = "yuzu:VSyncThread";
+    MicroProfileOnThreadCreate(name.c_str());
+    Common::SetCurrentThreadName(name.c_str());
+    Common::SetCurrentThreadPriority(Common::ThreadPriority::High);
+    s64 delay = 0; // How long the previous wait overshot its requested duration, in ns
+    while (is_running) {
+        guard->lock(); // Compose() releases and re-takes this around its fence waits
+        const s64 time_start = system.CoreTiming().GetGlobalTimeNs().count();
+        Compose();
+        const auto ticks = GetNextTicks();
+        const s64 time_end = system.CoreTiming().GetGlobalTimeNs().count();
+        const s64 time_passed = time_end - time_start;
+        const s64 next_time = std::max<s64>(0, ticks - time_passed - delay);
+        guard->unlock();
+        if (next_time > 0) {
+            wait_event->WaitFor(std::chrono::nanoseconds{next_time});
+        }
+        delay = (system.CoreTiming().GetGlobalTimeNs().count() - time_end) - next_time;
+    }
+}
 
 NVFlinger::NVFlinger(Core::System& system) : system(system) {
     displays.emplace_back(0, "Default", system);
@@ -36,22 +64,36 @@ NVFlinger::NVFlinger(Core::System& system) : system(system) {
     displays.emplace_back(2, "Edid", system);
     displays.emplace_back(3, "Internal", system);
     displays.emplace_back(4, "Null", system);
+    guard = std::make_shared<std::mutex>();
 
     // Schedule the screen composition events
     composition_event =
-        Core::Timing::CreateEvent("ScreenComposition", [this](u64 userdata, s64 cycles_late) {
+        Core::Timing::CreateEvent("ScreenComposition", [this](u64 userdata, s64 ns_late) {
+            const auto lock_guard = Lock(); // Keep the guard held until the callback returns
             Compose();
-            const auto ticks =
-                Settings::values.force_30fps_mode ? frame_ticks_30fps : GetNextTicks();
-            this->system.CoreTiming().ScheduleEvent(std::max<s64>(0LL, ticks - cycles_late),
+            const auto ticks = GetNextTicks();
+            this->system.CoreTiming().ScheduleEvent(std::max<s64>(0LL, ticks - ns_late),
                                                     composition_event);
         });
-
-    system.CoreTiming().ScheduleEvent(frame_ticks, composition_event);
+    if (system.IsMulticore()) {
+        is_running = true;
+        wait_event = std::make_unique<Common::Event>();
+        vsync_thread = std::make_unique<std::thread>(VSyncThread, std::ref(*this));
+    } else {
+        system.CoreTiming().ScheduleEvent(frame_ticks, composition_event);
+    }
 }
 
 NVFlinger::~NVFlinger() {
-    system.CoreTiming().UnscheduleEvent(composition_event, 0);
+    if (system.IsMulticore()) {
+        is_running = false;
+        wait_event->Set();
+        vsync_thread->join();
+        vsync_thread.reset();
+        wait_event.reset();
+    } else {
+        system.CoreTiming().UnscheduleEvent(composition_event, 0);
+    }
 }
 
 void NVFlinger::SetNVDrvInstance(std::shared_ptr<Nvidia::Module> instance) {
@@ -199,10 +241,12 @@ void NVFlinger::Compose() {
 
         auto& gpu = system.GPU();
         const auto& multi_fence = buffer->get().multi_fence;
+        guard->unlock();
         for (u32 fence_id = 0; fence_id < multi_fence.num_fences; fence_id++) {
             const auto& fence = multi_fence.fences[fence_id];
             gpu.WaitFence(fence.id, fence.value);
         }
+        guard->lock();
 
         MicroProfileFlip();
 
@@ -223,7 +267,7 @@ void NVFlinger::Compose() {
 
 s64 NVFlinger::GetNextTicks() const {
     constexpr s64 max_hertz = 120LL;
-    return (Core::Hardware::BASE_CLOCK_RATE * (1LL << swap_interval)) / max_hertz;
+    return (1000000000 * (1LL << swap_interval)) / max_hertz;
 }
 
 } // namespace Service::NVFlinger
diff --git a/src/core/hle/service/nvflinger/nvflinger.h b/src/core/hle/service/nvflinger/nvflinger.h
index 57a21f33b..e4959a9af 100644
--- a/src/core/hle/service/nvflinger/nvflinger.h
+++ b/src/core/hle/service/nvflinger/nvflinger.h
@@ -4,15 +4,22 @@
 
 #pragma once
 
+#include <atomic>
 #include <memory>
+#include <mutex>
 #include <optional>
 #include <string>
 #include <string_view>
+#include <thread>
 #include <vector>
 
 #include "common/common_types.h"
 #include "core/hle/kernel/object.h"
 
+namespace Common {
+class Event;
+} // namespace Common
+
 namespace Core::Timing {
 class CoreTiming;
 struct EventType;
@@ -79,6 +86,10 @@ public:
 
     s64 GetNextTicks() const;
 
+    std::unique_lock<std::mutex> Lock() {
+        return std::unique_lock{*guard};
+    }
+
 private:
     /// Finds the display identified by the specified ID.
     VI::Display* FindDisplay(u64 display_id);
@@ -92,6 +103,10 @@ private:
     /// Finds the layer identified by the specified ID in the desired display.
     const VI::Layer* FindLayer(u64 display_id, u64 layer_id) const;
 
+    static void VSyncThread(NVFlinger& nv_flinger);
+
+    void SplitVSync();
+
     std::shared_ptr<Nvidia::Module> nvdrv;
 
     std::vector<VI::Display> displays;
@@ -108,7 +123,13 @@ private:
     /// Event that handles screen composition.
     std::shared_ptr<Core::Timing::EventType> composition_event;
 
+    std::shared_ptr<std::mutex> guard;
+
     Core::System& system;
+
+    std::unique_ptr<std::thread> vsync_thread;
+    std::unique_ptr<Common::Event> wait_event;
+    std::atomic<bool> is_running{};
 };
 
 } // namespace Service::NVFlinger
diff --git a/src/core/hle/service/prepo/prepo.cpp b/src/core/hle/service/prepo/prepo.cpp
index 9d36ea0d0..cde3312da 100644
--- a/src/core/hle/service/prepo/prepo.cpp
+++ b/src/core/hle/service/prepo/prepo.cpp
@@ -80,8 +80,13 @@ private:
         const auto user_id = rp.PopRaw<u128>();
         const auto process_id = rp.PopRaw<u64>();
         std::vector<std::vector<u8>> data{ctx.ReadBuffer(0)};
+
         if constexpr (Type == Core::Reporter::PlayReportType::Old2) {
-            data.emplace_back(ctx.ReadBuffer(1));
+            const auto read_buffer_count =
+                ctx.BufferDescriptorX().size() + ctx.BufferDescriptorA().size();
+            if (read_buffer_count > 1) {
+                data.emplace_back(ctx.ReadBuffer(1));
+            }
         }
 
         LOG_DEBUG(
diff --git a/src/core/hle/service/set/set.cpp b/src/core/hle/service/set/set.cpp
index f3b4b286c..34fe2fd82 100644
--- a/src/core/hle/service/set/set.cpp
+++ b/src/core/hle/service/set/set.cpp
@@ -3,6 +3,7 @@
 // Refer to the license.txt file included.
 
 #include <algorithm>
+#include <array>
 #include <chrono>
 #include "common/logging/log.h"
 #include "core/hle/ipc_helpers.h"
@@ -31,6 +32,44 @@ constexpr std::array<LanguageCode, 17> available_language_codes = {{
     LanguageCode::ZH_HANT,
 }};
 
+enum class KeyboardLayout : u64 {
+    Japanese = 0,
+    EnglishUs = 1,
+    EnglishUsInternational = 2,
+    EnglishUk = 3,
+    French = 4,
+    FrenchCa = 5,
+    Spanish = 6,
+    SpanishLatin = 7,
+    German = 8,
+    Italian = 9,
+    Portuguese = 10,
+    Russian = 11,
+    Korean = 12,
+    ChineseSimplified = 13,
+    ChineseTraditional = 14,
+};
+
+constexpr std::array<std::pair<LanguageCode, KeyboardLayout>, 17> language_to_layout{{
+    {LanguageCode::JA, KeyboardLayout::Japanese},
+    {LanguageCode::EN_US, KeyboardLayout::EnglishUs},
+    {LanguageCode::FR, KeyboardLayout::French},
+    {LanguageCode::DE, KeyboardLayout::German},
+    {LanguageCode::IT, KeyboardLayout::Italian},
+    {LanguageCode::ES, KeyboardLayout::Spanish},
+    {LanguageCode::ZH_CN, KeyboardLayout::ChineseSimplified},
+    {LanguageCode::KO, KeyboardLayout::Korean},
+    {LanguageCode::NL, KeyboardLayout::EnglishUsInternational},
+    {LanguageCode::PT, KeyboardLayout::Portuguese},
+    {LanguageCode::RU, KeyboardLayout::Russian},
+    {LanguageCode::ZH_TW, KeyboardLayout::ChineseTraditional},
+    {LanguageCode::EN_GB, KeyboardLayout::EnglishUk},
+    {LanguageCode::FR_CA, KeyboardLayout::FrenchCa},
+    {LanguageCode::ES_419, KeyboardLayout::SpanishLatin},
+    {LanguageCode::ZH_HANS, KeyboardLayout::ChineseSimplified},
+    {LanguageCode::ZH_HANT, KeyboardLayout::ChineseTraditional},
+}};
+
 constexpr std::size_t pre4_0_0_max_entries = 15;
 constexpr std::size_t post4_0_0_max_entries = 17;
 
@@ -50,6 +89,25 @@ void GetAvailableLanguageCodesImpl(Kernel::HLERequestContext& ctx, std::size_t m
     ctx.WriteBuffer(available_language_codes.data(), copy_size);
     PushResponseLanguageCode(ctx, copy_amount);
 }
+
+void GetKeyCodeMapImpl(Kernel::HLERequestContext& ctx) {
+    const auto language_code = available_language_codes[Settings::values.language_index.GetValue()];
+    const auto key_code =
+        std::find_if(language_to_layout.cbegin(), language_to_layout.cend(),
+                     [=](const auto& element) { return element.first == language_code; });
+    KeyboardLayout layout = KeyboardLayout::EnglishUs;
+    if (key_code == language_to_layout.cend()) {
+        LOG_ERROR(Service_SET,
+                  "Could not find keyboard layout for language index {}, defaulting to English us",
+                  Settings::values.language_index.GetValue());
+    } else {
+        layout = key_code->second;
+    }
+
+    IPC::ResponseBuilder rb{ctx, 2};
+    rb.Push(RESULT_SUCCESS);
+    ctx.WriteBuffer(&layout, sizeof(KeyboardLayout));
+}
 } // Anonymous namespace
 
 LanguageCode GetLanguageCodeFromIndex(std::size_t index) {
@@ -105,11 +163,11 @@ void SET::GetQuestFlag(Kernel::HLERequestContext& ctx) {
 }
 
 void SET::GetLanguageCode(Kernel::HLERequestContext& ctx) {
-    LOG_DEBUG(Service_SET, "called {}", Settings::values.language_index);
+    LOG_DEBUG(Service_SET, "called {}", Settings::values.language_index.GetValue());
 
     IPC::ResponseBuilder rb{ctx, 4};
     rb.Push(RESULT_SUCCESS);
-    rb.PushEnum(available_language_codes[Settings::values.language_index]);
+    rb.PushEnum(available_language_codes[Settings::values.language_index.GetValue()]);
 }
 
 void SET::GetRegionCode(Kernel::HLERequestContext& ctx) {
@@ -117,7 +175,17 @@ void SET::GetRegionCode(Kernel::HLERequestContext& ctx) {
 
     IPC::ResponseBuilder rb{ctx, 3};
     rb.Push(RESULT_SUCCESS);
-    rb.Push(Settings::values.region_index);
+    rb.Push(Settings::values.region_index.GetValue());
+}
+
+void SET::GetKeyCodeMap(Kernel::HLERequestContext& ctx) {
+    LOG_DEBUG(Service_SET, "Called {}", ctx.Description());
+    GetKeyCodeMapImpl(ctx);
+}
+
+void SET::GetKeyCodeMap2(Kernel::HLERequestContext& ctx) {
+    LOG_DEBUG(Service_SET, "Called {}", ctx.Description());
+    GetKeyCodeMapImpl(ctx);
 }
 
 SET::SET() : ServiceFramework("set") {
@@ -130,9 +198,9 @@ SET::SET() : ServiceFramework("set") {
         {4, &SET::GetRegionCode, "GetRegionCode"},
         {5, &SET::GetAvailableLanguageCodes2, "GetAvailableLanguageCodes2"},
         {6, &SET::GetAvailableLanguageCodeCount2, "GetAvailableLanguageCodeCount2"},
-        {7, nullptr, "GetKeyCodeMap"},
+        {7, &SET::GetKeyCodeMap, "GetKeyCodeMap"},
         {8, &SET::GetQuestFlag, "GetQuestFlag"},
-        {9, nullptr, "GetKeyCodeMap2"},
+        {9, &SET::GetKeyCodeMap2, "GetKeyCodeMap2"},
         {10, nullptr, "GetFirmwareVersionForDebug"},
     };
     // clang-format on
diff --git a/src/core/hle/service/set/set.h b/src/core/hle/service/set/set.h
index 6084b345d..8ac9c169d 100644
--- a/src/core/hle/service/set/set.h
+++ b/src/core/hle/service/set/set.h
@@ -44,6 +44,8 @@ private:
     void GetAvailableLanguageCodeCount2(Kernel::HLERequestContext& ctx);
     void GetQuestFlag(Kernel::HLERequestContext& ctx);
     void GetRegionCode(Kernel::HLERequestContext& ctx);
+    void GetKeyCodeMap(Kernel::HLERequestContext& ctx);
+    void GetKeyCodeMap2(Kernel::HLERequestContext& ctx);
 };
 
 } // namespace Service::Set
diff --git a/src/core/hle/service/sm/sm.cpp b/src/core/hle/service/sm/sm.cpp
index 6ada13be4..d872de16c 100644
--- a/src/core/hle/service/sm/sm.cpp
+++ b/src/core/hle/service/sm/sm.cpp
@@ -142,7 +142,7 @@ void SM::GetService(Kernel::HLERequestContext& ctx) {
     }
 
     // Wake the threads waiting on the ServerPort
-    server_port->WakeupAllWaitingThreads();
+    server_port->Signal();
 
     LOG_DEBUG(Service_SM, "called service={} -> session={}", name, client->GetObjectId());
     IPC::ResponseBuilder rb{ctx, 2, 0, 1, IPC::ResponseBuilder::Flags::AlwaysMoveHandles};
diff --git a/src/core/hle/service/spl/module.cpp b/src/core/hle/service/spl/module.cpp
index e724d4ab8..865ed3b91 100644
--- a/src/core/hle/service/spl/module.cpp
+++ b/src/core/hle/service/spl/module.cpp
@@ -19,7 +19,7 @@ namespace Service::SPL {
 
 Module::Interface::Interface(std::shared_ptr<Module> module, const char* name)
     : ServiceFramework(name), module(std::move(module)),
-      rng(Settings::values.rng_seed.value_or(std::time(nullptr))) {}
+      rng(Settings::values.rng_seed.GetValue().value_or(std::time(nullptr))) {}
 
 Module::Interface::~Interface() = default;
 
diff --git a/src/core/hle/service/time/standard_steady_clock_core.cpp b/src/core/hle/service/time/standard_steady_clock_core.cpp
index 1575f0b49..59a272f4a 100644
--- a/src/core/hle/service/time/standard_steady_clock_core.cpp
+++ b/src/core/hle/service/time/standard_steady_clock_core.cpp
@@ -11,9 +11,8 @@
 namespace Service::Time::Clock {
 
 TimeSpanType StandardSteadyClockCore::GetCurrentRawTimePoint(Core::System& system) {
-    const TimeSpanType ticks_time_span{TimeSpanType::FromTicks(
-        Core::Timing::CpuCyclesToClockCycles(system.CoreTiming().GetTicks()),
-        Core::Hardware::CNTFREQ)};
+    const TimeSpanType ticks_time_span{
+        TimeSpanType::FromTicks(system.CoreTiming().GetClockTicks(), Core::Hardware::CNTFREQ)};
     TimeSpanType raw_time_point{setup_value.nanoseconds + ticks_time_span.nanoseconds};
 
     if (raw_time_point.nanoseconds < cached_raw_time_point.nanoseconds) {
diff --git a/src/core/hle/service/time/tick_based_steady_clock_core.cpp b/src/core/hle/service/time/tick_based_steady_clock_core.cpp
index 44d5bc651..8baaa2a6a 100644
--- a/src/core/hle/service/time/tick_based_steady_clock_core.cpp
+++ b/src/core/hle/service/time/tick_based_steady_clock_core.cpp
@@ -11,9 +11,8 @@
 namespace Service::Time::Clock {
 
 SteadyClockTimePoint TickBasedSteadyClockCore::GetTimePoint(Core::System& system) {
-    const TimeSpanType ticks_time_span{TimeSpanType::FromTicks(
-        Core::Timing::CpuCyclesToClockCycles(system.CoreTiming().GetTicks()),
-        Core::Hardware::CNTFREQ)};
+    const TimeSpanType ticks_time_span{
+        TimeSpanType::FromTicks(system.CoreTiming().GetClockTicks(), Core::Hardware::CNTFREQ)};
 
     return {ticks_time_span.ToSeconds(), GetClockSourceId()};
 }
diff --git a/src/core/hle/service/time/time.cpp b/src/core/hle/service/time/time.cpp
index cc1dbd575..13e4b3818 100644
--- a/src/core/hle/service/time/time.cpp
+++ b/src/core/hle/service/time/time.cpp
@@ -241,9 +241,8 @@ void Module::Interface::CalculateMonotonicSystemClockBaseTimePoint(Kernel::HLERe
     const auto current_time_point{steady_clock_core.GetCurrentTimePoint(system)};
 
     if (current_time_point.clock_source_id == context.steady_time_point.clock_source_id) {
-        const auto ticks{Clock::TimeSpanType::FromTicks(
-            Core::Timing::CpuCyclesToClockCycles(system.CoreTiming().GetTicks()),
-            Core::Hardware::CNTFREQ)};
+        const auto ticks{Clock::TimeSpanType::FromTicks(system.CoreTiming().GetClockTicks(),
+                                                        Core::Hardware::CNTFREQ)};
         const s64 base_time_point{context.offset + current_time_point.time_point -
                                   ticks.ToSeconds()};
         IPC::ResponseBuilder rb{ctx, (sizeof(s64) / 4) + 2};
diff --git a/src/core/hle/service/time/time_sharedmemory.cpp b/src/core/hle/service/time/time_sharedmemory.cpp
index 999ec1e51..e0ae9f874 100644
--- a/src/core/hle/service/time/time_sharedmemory.cpp
+++ b/src/core/hle/service/time/time_sharedmemory.cpp
@@ -30,8 +30,7 @@ void SharedMemory::SetupStandardSteadyClock(Core::System& system,
                                             const Common::UUID& clock_source_id,
                                             Clock::TimeSpanType current_time_point) {
     const Clock::TimeSpanType ticks_time_span{Clock::TimeSpanType::FromTicks(
-        Core::Timing::CpuCyclesToClockCycles(system.CoreTiming().GetTicks()),
-        Core::Hardware::CNTFREQ)};
+        system.CoreTiming().GetClockTicks(), Core::Hardware::CNTFREQ)};
     const Clock::SteadyClockContext context{
         static_cast<u64>(current_time_point.nanoseconds - ticks_time_span.nanoseconds),
         clock_source_id};
diff --git a/src/core/hle/service/vi/vi.cpp b/src/core/hle/service/vi/vi.cpp
index 67b45e7c0..ea7b4ae13 100644
--- a/src/core/hle/service/vi/vi.cpp
+++ b/src/core/hle/service/vi/vi.cpp
@@ -511,6 +511,7 @@ private:
         LOG_DEBUG(Service_VI, "called. id=0x{:08X} transaction={:X}, flags=0x{:08X}", id,
                   static_cast<u32>(transaction), flags);
 
+        const auto guard = nv_flinger->Lock(); // Hold the lock until scope exit
         auto& buffer_queue = nv_flinger->FindBufferQueue(id);
 
         switch (transaction) {
@@ -518,9 +519,9 @@ private:
             IGBPConnectRequestParcel request{ctx.ReadBuffer()};
             IGBPConnectResponseParcel response{
                 static_cast<u32>(static_cast<u32>(DisplayResolution::UndockedWidth) *
-                                 Settings::values.resolution_factor),
+                                 Settings::values.resolution_factor.GetValue()),
                 static_cast<u32>(static_cast<u32>(DisplayResolution::UndockedHeight) *
-                                 Settings::values.resolution_factor)};
+                                 Settings::values.resolution_factor.GetValue())};
             ctx.WriteBuffer(response.Serialize());
             break;
         }
@@ -550,6 +551,7 @@ private:
                     [=](std::shared_ptr<Kernel::Thread> thread, Kernel::HLERequestContext& ctx,
                         Kernel::ThreadWakeupReason reason) {
                         // Repeat TransactParcel DequeueBuffer when a buffer is available
+                        const auto guard = nv_flinger->Lock(); // Hold the lock until scope exit
                         auto& buffer_queue = nv_flinger->FindBufferQueue(id);
                         auto result = buffer_queue.DequeueBuffer(width, height);
                         ASSERT_MSG(result != std::nullopt, "Could not dequeue buffer.");
@@ -747,14 +749,14 @@ private:
 
         if (Settings::values.use_docked_mode) {
             rb.Push(static_cast<u32>(Service::VI::DisplayResolution::DockedWidth) *
-                    static_cast<u32>(Settings::values.resolution_factor));
+                    static_cast<u32>(Settings::values.resolution_factor.GetValue()));
             rb.Push(static_cast<u32>(Service::VI::DisplayResolution::DockedHeight) *
-                    static_cast<u32>(Settings::values.resolution_factor));
+                    static_cast<u32>(Settings::values.resolution_factor.GetValue()));
         } else {
             rb.Push(static_cast<u32>(Service::VI::DisplayResolution::UndockedWidth) *
-                    static_cast<u32>(Settings::values.resolution_factor));
+                    static_cast<u32>(Settings::values.resolution_factor.GetValue()));
             rb.Push(static_cast<u32>(Service::VI::DisplayResolution::UndockedHeight) *
-                    static_cast<u32>(Settings::values.resolution_factor));
+                    static_cast<u32>(Settings::values.resolution_factor.GetValue()));
         }
 
         rb.PushRaw<float>(60.0f); // This wouldn't seem to be correct for 30 fps games.
@@ -1029,9 +1031,9 @@ private:
         // between docked and undocked dimensions. We take the liberty of applying
         // the resolution scaling factor here.
         rb.Push(static_cast<u64>(DisplayResolution::UndockedWidth) *
-                static_cast<u32>(Settings::values.resolution_factor));
+                static_cast<u32>(Settings::values.resolution_factor.GetValue()));
         rb.Push(static_cast<u64>(DisplayResolution::UndockedHeight) *
-                static_cast<u32>(Settings::values.resolution_factor));
+                static_cast<u32>(Settings::values.resolution_factor.GetValue()));
     }
 
     void SetLayerScalingMode(Kernel::HLERequestContext& ctx) {
@@ -1064,8 +1066,8 @@ private:
         LOG_WARNING(Service_VI, "(STUBBED) called");
 
         DisplayInfo display_info;
-        display_info.width *= static_cast<u64>(Settings::values.resolution_factor);
-        display_info.height *= static_cast<u64>(Settings::values.resolution_factor);
+        display_info.width *= static_cast<u64>(Settings::values.resolution_factor.GetValue());
+        display_info.height *= static_cast<u64>(Settings::values.resolution_factor.GetValue());
         ctx.WriteBuffer(&display_info, sizeof(DisplayInfo));
         IPC::ResponseBuilder rb{ctx, 4};
         rb.Push(RESULT_SUCCESS);
diff --git a/src/core/memory.cpp b/src/core/memory.cpp
index 9d87045a0..2c5588933 100644
--- a/src/core/memory.cpp
+++ b/src/core/memory.cpp
@@ -8,6 +8,7 @@
 #include <utility>
 
 #include "common/assert.h"
+#include "common/atomic_ops.h"
 #include "common/common_types.h"
 #include "common/logging/log.h"
 #include "common/page_table.h"
@@ -29,15 +30,12 @@ namespace Core::Memory {
 struct Memory::Impl {
     explicit Impl(Core::System& system_) : system{system_} {}
 
-    void SetCurrentPageTable(Kernel::Process& process) {
+    void SetCurrentPageTable(Kernel::Process& process, u32 core_id) {
         current_page_table = &process.PageTable().PageTableImpl();
 
         const std::size_t address_space_width = process.PageTable().GetAddressSpaceWidth();
 
-        system.ArmInterface(0).PageTableChanged(*current_page_table, address_space_width);
-        system.ArmInterface(1).PageTableChanged(*current_page_table, address_space_width);
-        system.ArmInterface(2).PageTableChanged(*current_page_table, address_space_width);
-        system.ArmInterface(3).PageTableChanged(*current_page_table, address_space_width);
+        system.ArmInterface(core_id).PageTableChanged(*current_page_table, address_space_width);
     }
 
     void MapMemoryRegion(Common::PageTable& page_table, VAddr base, u64 size, PAddr target) {
@@ -179,6 +177,22 @@ struct Memory::Impl {
         }
     }
 
+    bool WriteExclusive8(const VAddr addr, const u8 data, const u8 expected) {
+        return WriteExclusive<u8>(addr, data, expected);
+    }
+
+    bool WriteExclusive16(const VAddr addr, const u16 data, const u16 expected) {
+        return WriteExclusive<u16_le>(addr, data, expected);
+    }
+
+    bool WriteExclusive32(const VAddr addr, const u32 data, const u32 expected) {
+        return WriteExclusive<u32_le>(addr, data, expected);
+    }
+
+    bool WriteExclusive64(const VAddr addr, const u64 data, const u64 expected) {
+        return WriteExclusive<u64_le>(addr, data, expected);
+    }
+
     std::string ReadCString(VAddr vaddr, std::size_t max_length) {
         std::string string;
         string.reserve(max_length);
@@ -534,9 +548,9 @@ struct Memory::Impl {
                         // longer exist, and we should just leave the pagetable entry blank.
                         page_type = Common::PageType::Unmapped;
                     } else {
-                        page_type = Common::PageType::Memory;
                         current_page_table->pointers[vaddr >> PAGE_BITS] =
                             pointer - (vaddr & ~PAGE_MASK);
+                        page_type = Common::PageType::Memory;
                     }
                     break;
                 }
@@ -577,9 +591,12 @@ struct Memory::Impl {
                    base + page_table.pointers.size());
 
         if (!target) {
+            ASSERT_MSG(type != Common::PageType::Memory,
+                       "Mapping memory page without a pointer @ {:016x}", base * PAGE_SIZE);
+
             while (base != end) {
-                page_table.pointers[base] = nullptr;
                 page_table.attributes[base] = type;
+                page_table.pointers[base] = nullptr;
                 page_table.backing_addr[base] = 0;
 
                 base += 1;
@@ -682,6 +699,67 @@ struct Memory::Impl {
         }
     }
 
+    template <typename T>
+    bool WriteExclusive(const VAddr vaddr, const T data, const T expected) {
+        u8* page_pointer = current_page_table->pointers[vaddr >> PAGE_BITS];
+        if (page_pointer != nullptr) {
+            // NOTE: Avoid adding any extra logic to this fast-path block
+            T volatile* pointer = reinterpret_cast<T volatile*>(&page_pointer[vaddr]);
+            return Common::AtomicCompareAndSwap(pointer, data, expected);
+        }
+
+        const Common::PageType type = current_page_table->attributes[vaddr >> PAGE_BITS];
+        switch (type) {
+        case Common::PageType::Unmapped:
+            LOG_ERROR(HW_Memory, "Unmapped Write{} 0x{:08X} @ 0x{:016X}", sizeof(data) * 8,
+                      static_cast<u32>(data), vaddr);
+            return true;
+        case Common::PageType::Memory:
+            ASSERT_MSG(false, "Mapped memory page without a pointer @ {:016X}", vaddr);
+            break;
+        case Common::PageType::RasterizerCachedMemory: {
+            u8* host_ptr{GetPointerFromRasterizerCachedMemory(vaddr)};
+            system.GPU().InvalidateRegion(vaddr, sizeof(T));
+            // Swap on the guest data host_ptr refers to, not on the local pointer variable
+            T volatile* pointer = reinterpret_cast<T volatile*>(host_ptr);
+            return Common::AtomicCompareAndSwap(pointer, data, expected);
+        }
+        default:
+            UNREACHABLE();
+        }
+        return true;
+    }
+
+    bool WriteExclusive128(const VAddr vaddr, const u128 data, const u128 expected) {
+        u8* const page_pointer = current_page_table->pointers[vaddr >> PAGE_BITS];
+        if (page_pointer != nullptr) {
+            // NOTE: Avoid adding any extra logic to this fast-path block
+            u64 volatile* pointer = reinterpret_cast<u64 volatile*>(&page_pointer[vaddr]);
+            return Common::AtomicCompareAndSwap(pointer, data, expected);
+        }
+
+        const Common::PageType type = current_page_table->attributes[vaddr >> PAGE_BITS];
+        switch (type) {
+        case Common::PageType::Unmapped:
+            LOG_ERROR(HW_Memory, "Unmapped Write{} 0x{:016X}{:016X} @ 0x{:016X}", sizeof(data) * 8,
+                      static_cast<u64>(data[1]), static_cast<u64>(data[0]), vaddr);
+            return true;
+        case Common::PageType::Memory:
+            ASSERT_MSG(false, "Mapped memory page without a pointer @ {:016X}", vaddr);
+            break;
+        case Common::PageType::RasterizerCachedMemory: {
+            u8* host_ptr{GetPointerFromRasterizerCachedMemory(vaddr)};
+            system.GPU().InvalidateRegion(vaddr, sizeof(u128));
+            // Swap on the guest data host_ptr refers to, not on the local pointer variable
+            u64 volatile* pointer = reinterpret_cast<u64 volatile*>(host_ptr);
+            return Common::AtomicCompareAndSwap(pointer, data, expected);
+        }
+        default:
+            UNREACHABLE();
+        }
+        return true;
+    }
+
     Common::PageTable* current_page_table = nullptr;
     Core::System& system;
 };
@@ -689,8 +767,8 @@ struct Memory::Impl {
 Memory::Memory(Core::System& system) : impl{std::make_unique<Impl>(system)} {}
 Memory::~Memory() = default;
 
-void Memory::SetCurrentPageTable(Kernel::Process& process) {
-    impl->SetCurrentPageTable(process);
+void Memory::SetCurrentPageTable(Kernel::Process& process, u32 core_id) {
+    impl->SetCurrentPageTable(process, core_id);
 }
 
 void Memory::MapMemoryRegion(Common::PageTable& page_table, VAddr base, u64 size, PAddr target) {
@@ -764,6 +842,26 @@ void Memory::Write64(VAddr addr, u64 data) {
     impl->Write64(addr, data);
 }
 
+bool Memory::WriteExclusive8(VAddr addr, u8 data, u8 expected) {
+    return impl->WriteExclusive8(addr, data, expected);
+}
+
+bool Memory::WriteExclusive16(VAddr addr, u16 data, u16 expected) {
+    return impl->WriteExclusive16(addr, data, expected);
+}
+
+bool Memory::WriteExclusive32(VAddr addr, u32 data, u32 expected) {
+    return impl->WriteExclusive32(addr, data, expected);
+}
+
+bool Memory::WriteExclusive64(VAddr addr, u64 data, u64 expected) {
+    return impl->WriteExclusive64(addr, data, expected);
+}
+
+bool Memory::WriteExclusive128(VAddr addr, u128 data, u128 expected) {
+    return impl->WriteExclusive128(addr, data, expected);
+}
+
 std::string Memory::ReadCString(VAddr vaddr, std::size_t max_length) {
     return impl->ReadCString(vaddr, max_length);
 }
diff --git a/src/core/memory.h b/src/core/memory.h
index 9292f3b0a..4a1cc63f4 100644
--- a/src/core/memory.h
+++ b/src/core/memory.h
@@ -64,7 +64,7 @@ public:
      *
      * @param process The process to use the page table of.
      */
-    void SetCurrentPageTable(Kernel::Process& process);
+    void SetCurrentPageTable(Kernel::Process& process, u32 core_id);
 
     /**
      * Maps an allocated buffer onto a region of the emulated process address space.
@@ -245,6 +245,71 @@ public:
     void Write64(VAddr addr, u64 data);
 
     /**
+     * Writes an 8-bit unsigned integer to the given virtual address in
+     * the current process' address space if and only if the address contains
+     * the expected value. This operation is atomic.
+     *
+     * @param addr The virtual address to write the 8-bit unsigned integer to.
+     * @param data The 8-bit unsigned integer to write to the given virtual address.
+     * @param expected The 8-bit unsigned integer to check against the given virtual address.
+     *
+     * @post If the expected value matched, [addr, addr + sizeof(data)) contains the data value.
+     */
+    bool WriteExclusive8(VAddr addr, u8 data, u8 expected);
+
+    /**
+     * Writes a 16-bit unsigned integer to the given virtual address in
+     * the current process' address space if and only if the address contains
+     * the expected value. This operation is atomic.
+     *
+     * @param addr The virtual address to write the 16-bit unsigned integer to.
+     * @param data The 16-bit unsigned integer to write to the given virtual address.
+     * @param expected The 16-bit unsigned integer to check against the given virtual address.
+     *
+     * @post If the expected value matched, [addr, addr + sizeof(data)) contains the data value.
+     */
+    bool WriteExclusive16(VAddr addr, u16 data, u16 expected);
+
+    /**
+     * Writes a 32-bit unsigned integer to the given virtual address in
+     * the current process' address space if and only if the address contains
+     * the expected value. This operation is atomic.
+     *
+     * @param addr The virtual address to write the 32-bit unsigned integer to.
+     * @param data The 32-bit unsigned integer to write to the given virtual address.
+     * @param expected The 32-bit unsigned integer to check against the given virtual address.
+     *
+     * @post If the expected value matched, [addr, addr + sizeof(data)) contains the data value.
+     */
+    bool WriteExclusive32(VAddr addr, u32 data, u32 expected);
+
+    /**
+     * Writes a 64-bit unsigned integer to the given virtual address in
+     * the current process' address space if and only if the address contains
+     * the expected value. This operation is atomic.
+     *
+     * @param addr The virtual address to write the 64-bit unsigned integer to.
+     * @param data The 64-bit unsigned integer to write to the given virtual address.
+     * @param expected The 64-bit unsigned integer to check against the given virtual address.
+     *
+     * @post If the expected value matched, [addr, addr + sizeof(data)) contains the data value.
+     */
+    bool WriteExclusive64(VAddr addr, u64 data, u64 expected);
+
+    /**
+     * Writes a 128-bit unsigned integer to the given virtual address in
+     * the current process' address space if and only if the address contains
+     * the expected value. This operation is atomic.
+     *
+     * @param addr The virtual address to write the 128-bit unsigned integer to.
+     * @param data The 128-bit unsigned integer to write to the given virtual address.
+     * @param expected The 128-bit unsigned integer to check against the given virtual address.
+     *
+     * @post If the expected value matched, [addr, addr + sizeof(data)) contains the data value.
+     */
+    bool WriteExclusive128(VAddr addr, u128 data, u128 expected);
+
+    /**
      * Reads a null-terminated string from the given virtual address.
      * This function will continually read characters until either:
      *
diff --git a/src/core/memory/cheat_engine.cpp b/src/core/memory/cheat_engine.cpp
index b139e8465..53d27859b 100644
--- a/src/core/memory/cheat_engine.cpp
+++ b/src/core/memory/cheat_engine.cpp
@@ -20,7 +20,7 @@
 
 namespace Core::Memory {
 
-constexpr s64 CHEAT_ENGINE_TICKS = static_cast<s64>(Core::Hardware::BASE_CLOCK_RATE / 12);
+constexpr s64 CHEAT_ENGINE_TICKS = static_cast<s64>(1000000000 / 12);
 constexpr u32 KEYPAD_BITMASK = 0x3FFFFFF;
 
 StandardVmCallbacks::StandardVmCallbacks(Core::System& system, const CheatProcessMetadata& metadata)
@@ -190,7 +190,7 @@ CheatEngine::~CheatEngine() {
 void CheatEngine::Initialize() {
     event = Core::Timing::CreateEvent(
         "CheatEngine::FrameCallback::" + Common::HexToString(metadata.main_nso_build_id),
-        [this](u64 userdata, s64 cycles_late) { FrameCallback(userdata, cycles_late); });
+        [this](u64 userdata, s64 ns_late) { FrameCallback(userdata, ns_late); });
     core_timing.ScheduleEvent(CHEAT_ENGINE_TICKS, event);
 
     metadata.process_id = system.CurrentProcess()->GetProcessID();
@@ -217,7 +217,7 @@ void CheatEngine::Reload(std::vector<CheatEntry> cheats) {
 
 MICROPROFILE_DEFINE(Cheat_Engine, "Add-Ons", "Cheat Engine", MP_RGB(70, 200, 70));
 
-void CheatEngine::FrameCallback(u64 userdata, s64 cycles_late) {
+void CheatEngine::FrameCallback(u64 userdata, s64 ns_late) {
     if (is_pending_reload.exchange(false)) {
         vm.LoadProgram(cheats);
     }
@@ -230,7 +230,7 @@ void CheatEngine::FrameCallback(u64 userdata, s64 cycles_late) {
 
     vm.Execute(metadata);
 
-    core_timing.ScheduleEvent(CHEAT_ENGINE_TICKS - cycles_late, event);
+    core_timing.ScheduleEvent(CHEAT_ENGINE_TICKS - ns_late, event);
 }
 
 } // namespace Core::Memory
diff --git a/src/core/perf_stats.cpp b/src/core/perf_stats.cpp
index f1ae9d4df..29339ead7 100644
--- a/src/core/perf_stats.cpp
+++ b/src/core/perf_stats.cpp
@@ -119,13 +119,14 @@ double PerfStats::GetLastFrameTimeScale() {
 }
 
 void FrameLimiter::DoFrameLimiting(microseconds current_system_time_us) {
-    if (!Settings::values.use_frame_limit) {
+    if (!Settings::values.use_frame_limit.GetValue() ||
+        Settings::values.use_multi_core.GetValue()) {
         return;
     }
 
     auto now = Clock::now();
 
-    const double sleep_scale = Settings::values.frame_limit / 100.0;
+    const double sleep_scale = Settings::values.frame_limit.GetValue() / 100.0;
 
     // Max lag caused by slow frames. Shouldn't be more than the length of a frame at the current
     // speed percent or it will clamp too much and prevent this from properly limiting to that
diff --git a/src/core/settings.cpp b/src/core/settings.cpp
index 4edff9cd8..d3886c4ec 100644
--- a/src/core/settings.cpp
+++ b/src/core/settings.cpp
@@ -62,6 +62,7 @@ const std::array<const char*, NumMouseButtons> mapping = {{
 }
 
 Values values = {};
+bool configuring_global = true;
 
 std::string GetTimeZoneString() {
     static constexpr std::array<const char*, 46> timezones{{
@@ -73,9 +74,9 @@ std::string GetTimeZoneString() {
         "UCT",       "Universal", "UTC", "W-SU",    "WET",     "Zulu",
     }};
 
-    ASSERT(Settings::values.time_zone_index < timezones.size());
+    ASSERT(Settings::values.time_zone_index.GetValue() < timezones.size());
 
-    return timezones[Settings::values.time_zone_index];
+    return timezones[Settings::values.time_zone_index.GetValue()];
 }
 
 void Apply() {
@@ -97,25 +98,25 @@ void LogSetting(const std::string& name, const T& value) {
 
 void LogSettings() {
     LOG_INFO(Config, "yuzu Configuration:");
-    LogSetting("System_UseDockedMode", Settings::values.use_docked_mode);
-    LogSetting("System_RngSeed", Settings::values.rng_seed.value_or(0));
+    LogSetting("Controls_UseDockedMode", Settings::values.use_docked_mode);
+    LogSetting("System_RngSeed", Settings::values.rng_seed.GetValue().value_or(0));
     LogSetting("System_CurrentUser", Settings::values.current_user);
-    LogSetting("System_LanguageIndex", Settings::values.language_index);
-    LogSetting("System_RegionIndex", Settings::values.region_index);
-    LogSetting("System_TimeZoneIndex", Settings::values.time_zone_index);
-    LogSetting("Core_UseMultiCore", Settings::values.use_multi_core);
-    LogSetting("Renderer_UseResolutionFactor", Settings::values.resolution_factor);
-    LogSetting("Renderer_UseFrameLimit", Settings::values.use_frame_limit);
-    LogSetting("Renderer_FrameLimit", Settings::values.frame_limit);
-    LogSetting("Renderer_UseDiskShaderCache", Settings::values.use_disk_shader_cache);
-    LogSetting("Renderer_GPUAccuracyLevel", Settings::values.gpu_accuracy);
+    LogSetting("System_LanguageIndex", Settings::values.language_index.GetValue());
+    LogSetting("System_RegionIndex", Settings::values.region_index.GetValue());
+    LogSetting("System_TimeZoneIndex", Settings::values.time_zone_index.GetValue());
+    LogSetting("Core_UseMultiCore", Settings::values.use_multi_core.GetValue());
+    LogSetting("Renderer_UseResolutionFactor", Settings::values.resolution_factor.GetValue());
+    LogSetting("Renderer_UseFrameLimit", Settings::values.use_frame_limit.GetValue());
+    LogSetting("Renderer_FrameLimit", Settings::values.frame_limit.GetValue());
+    LogSetting("Renderer_UseDiskShaderCache", Settings::values.use_disk_shader_cache.GetValue());
+    LogSetting("Renderer_GPUAccuracyLevel", Settings::values.gpu_accuracy.GetValue());
     LogSetting("Renderer_UseAsynchronousGpuEmulation",
-               Settings::values.use_asynchronous_gpu_emulation);
-    LogSetting("Renderer_UseVsync", Settings::values.use_vsync);
-    LogSetting("Renderer_UseAssemblyShaders", Settings::values.use_assembly_shaders);
-    LogSetting("Renderer_AnisotropicFilteringLevel", Settings::values.max_anisotropy);
+               Settings::values.use_asynchronous_gpu_emulation.GetValue());
+    LogSetting("Renderer_UseVsync", Settings::values.use_vsync.GetValue());
+    LogSetting("Renderer_UseAssemblyShaders", Settings::values.use_assembly_shaders.GetValue());
+    LogSetting("Renderer_AnisotropicFilteringLevel", Settings::values.max_anisotropy.GetValue());
     LogSetting("Audio_OutputEngine", Settings::values.sink_id);
-    LogSetting("Audio_EnableAudioStretching", Settings::values.enable_audio_stretching);
+    LogSetting("Audio_EnableAudioStretching", Settings::values.enable_audio_stretching.GetValue());
     LogSetting("Audio_OutputDevice", Settings::values.audio_device_id);
     LogSetting("DataStorage_UseVirtualSd", Settings::values.use_virtual_sd);
     LogSetting("DataStorage_NandDir", FileUtil::GetUserPath(FileUtil::UserPath::NANDDir));
@@ -127,12 +128,60 @@ void LogSettings() {
     LogSetting("Services_BCATBoxcatLocal", Settings::values.bcat_boxcat_local);
 }
 
+float Volume() {
+    // Muting overrides whatever volume is currently configured.
+    const bool muted = values.audio_muted;
+    const float configured = values.volume.GetValue();
+    return muted ? 0.0f : configured;
+}
+
 bool IsGPULevelExtreme() {
-    return values.gpu_accuracy == GPUAccuracy::Extreme;
+    return values.gpu_accuracy.GetValue() == GPUAccuracy::Extreme;
 }
 
 bool IsGPULevelHigh() {
-    return values.gpu_accuracy == GPUAccuracy::Extreme || values.gpu_accuracy == GPUAccuracy::High;
+    return values.gpu_accuracy.GetValue() == GPUAccuracy::Extreme ||
+           values.gpu_accuracy.GetValue() == GPUAccuracy::High;
+}
+
+void RestoreGlobalState() {
+    // If a game is running, DO NOT restore the global settings state
+    if (Core::System::GetInstance().IsPoweredOn()) {
+        return;
+    }
+
+    // Audio
+    values.enable_audio_stretching.SetGlobal(true);
+    values.volume.SetGlobal(true);
+
+    // Core
+    values.use_multi_core.SetGlobal(true);
+
+    // Renderer
+    values.renderer_backend.SetGlobal(true);
+    values.vulkan_device.SetGlobal(true);
+    values.aspect_ratio.SetGlobal(true);
+    values.max_anisotropy.SetGlobal(true);
+    values.use_frame_limit.SetGlobal(true);
+    values.frame_limit.SetGlobal(true);
+    values.use_disk_shader_cache.SetGlobal(true);
+    values.gpu_accuracy.SetGlobal(true);
+    values.use_asynchronous_gpu_emulation.SetGlobal(true);
+    values.use_vsync.SetGlobal(true);
+    values.use_assembly_shaders.SetGlobal(true);
+    values.use_fast_gpu_time.SetGlobal(true);
+    values.force_30fps_mode.SetGlobal(true);
+    values.bg_red.SetGlobal(true);
+    values.bg_green.SetGlobal(true);
+    values.bg_blue.SetGlobal(true);
+
+    // System
+    values.language_index.SetGlobal(true);
+    values.region_index.SetGlobal(true);
+    values.time_zone_index.SetGlobal(true);
+    values.rng_seed.SetGlobal(true);
+    values.custom_rtc.SetGlobal(true);
+    values.sound_index.SetGlobal(true);
 }
 
 } // namespace Settings
diff --git a/src/core/settings.h b/src/core/settings.h
index 78eb33737..850ca4072 100644
--- a/src/core/settings.h
+++ b/src/core/settings.h
@@ -382,20 +382,85 @@ enum class GPUAccuracy : u32 {
     Extreme = 2,
 };
 
+extern bool configuring_global;
+
+template <typename Type>
+class Setting final { // A value stored twice: a global copy and a local copy
+public:
+    Setting() = default;
+    explicit Setting(Type val) : global{val} {} // Seeds only the global copy
+    ~Setting() = default;
+    void SetGlobal(bool to_global) { // Selects which copy GetValue()/SetValue() use
+        use_global = to_global;
+    }
+    bool UsingGlobal() const {
+        return use_global;
+    }
+    Type GetValue(bool need_global = false) const { // need_global forces the global copy
+        if (use_global || need_global) {
+            return global;
+        }
+        return local;
+    }
+    void SetValue(const Type& value) { // Writes to whichever copy is currently selected
+        if (use_global) {
+            global = value;
+        } else {
+            local = value;
+        }
+    }
+
+private:
+    bool use_global = true; // Defaults to the global copy
+    Type global{};
+    Type local{};
+};
+
 struct Values {
+    // Audio
+    std::string audio_device_id;
+    std::string sink_id;
+    bool audio_muted;
+    Setting<bool> enable_audio_stretching;
+    Setting<float> volume;
+
+    // Core
+    Setting<bool> use_multi_core;
+
+    // Renderer
+    Setting<RendererBackend> renderer_backend;
+    bool renderer_debug;
+    Setting<int> vulkan_device;
+
+    Setting<u16> resolution_factor = Setting(static_cast<u16>(1));
+    Setting<int> aspect_ratio;
+    Setting<int> max_anisotropy;
+    Setting<bool> use_frame_limit;
+    Setting<u16> frame_limit;
+    Setting<bool> use_disk_shader_cache;
+    Setting<GPUAccuracy> gpu_accuracy;
+    Setting<bool> use_asynchronous_gpu_emulation;
+    Setting<bool> use_vsync;
+    Setting<bool> use_assembly_shaders;
+    Setting<bool> force_30fps_mode;
+    Setting<bool> use_fast_gpu_time;
+
+    Setting<float> bg_red;
+    Setting<float> bg_green;
+    Setting<float> bg_blue;
+
     // System
-    bool use_docked_mode;
-    std::optional<u32> rng_seed;
+    Setting<std::optional<u32>> rng_seed;
     // Measured in seconds since epoch
-    std::optional<std::chrono::seconds> custom_rtc;
+    Setting<std::optional<std::chrono::seconds>> custom_rtc;
     // Set on game boot, reset on stop. Seconds difference between current time and `custom_rtc`
     std::chrono::seconds custom_rtc_differential;
 
     s32 current_user;
-    s32 language_index;
-    s32 region_index;
-    s32 time_zone_index;
-    s32 sound_index;
+    Setting<s32> language_index;
+    Setting<s32> region_index;
+    Setting<s32> time_zone_index;
+    Setting<s32> sound_index;
 
     // Controls
     std::array<PlayerInput, 10> players;
@@ -419,8 +484,7 @@ struct Values {
     u16 udp_input_port;
     u8 udp_pad_index;
 
-    // Core
-    bool use_multi_core;
+    bool use_docked_mode;
 
     // Data Storage
     bool use_virtual_sd;
@@ -432,38 +496,6 @@ struct Values {
     NANDUserSize nand_user_size;
     SDMCSize sdmc_size;
 
-    // Renderer
-    RendererBackend renderer_backend;
-    bool renderer_debug;
-    int vulkan_device;
-
-    float resolution_factor;
-    int aspect_ratio;
-    int max_anisotropy;
-    bool use_frame_limit;
-    u16 frame_limit;
-    bool use_disk_shader_cache;
-    GPUAccuracy gpu_accuracy;
-    bool use_asynchronous_gpu_emulation;
-    bool use_vsync;
-    bool use_assembly_shaders;
-    bool force_30fps_mode;
-    bool use_fast_gpu_time;
-
-    float bg_red;
-    float bg_green;
-    float bg_blue;
-
-    std::string log_filter;
-
-    bool use_dev_keys;
-
-    // Audio
-    std::string sink_id;
-    bool enable_audio_stretching;
-    std::string audio_device_id;
-    float volume;
-
     // Debugging
     bool record_frame_times;
     bool use_gdbstub;
@@ -474,8 +506,13 @@ struct Values {
     bool reporting_services;
     bool quest_flag;
     bool disable_cpu_opt;
+    bool disable_macro_jit;
 
-    // BCAT
+    // Miscellaneous
+    std::string log_filter;
+    bool use_dev_keys;
+
+    // Services
     std::string bcat_backend;
     bool bcat_boxcat_local;
 
@@ -489,6 +526,8 @@ struct Values {
     std::map<u64, std::vector<std::string>> disabled_addons;
 } extern values;
 
+float Volume();
+
 bool IsGPULevelExtreme();
 bool IsGPULevelHigh();
 
@@ -497,4 +536,7 @@ std::string GetTimeZoneString();
 void Apply();
 void LogSettings();
 
+// Restore the global state of all applicable settings in the Values struct
+void RestoreGlobalState();
+
 } // namespace Settings
diff --git a/src/core/telemetry_session.cpp b/src/core/telemetry_session.cpp
index c781b3cfc..78915e6db 100644
--- a/src/core/telemetry_session.cpp
+++ b/src/core/telemetry_session.cpp
@@ -189,19 +189,24 @@ void TelemetrySession::AddInitialInfo(Loader::AppLoader& app_loader) {
     // Log user configuration information
     constexpr auto field_type = Telemetry::FieldType::UserConfig;
     AddField(field_type, "Audio_SinkId", Settings::values.sink_id);
-    AddField(field_type, "Audio_EnableAudioStretching", Settings::values.enable_audio_stretching);
-    AddField(field_type, "Core_UseMultiCore", Settings::values.use_multi_core);
-    AddField(field_type, "Renderer_Backend", TranslateRenderer(Settings::values.renderer_backend));
-    AddField(field_type, "Renderer_ResolutionFactor", Settings::values.resolution_factor);
-    AddField(field_type, "Renderer_UseFrameLimit", Settings::values.use_frame_limit);
-    AddField(field_type, "Renderer_FrameLimit", Settings::values.frame_limit);
-    AddField(field_type, "Renderer_UseDiskShaderCache", Settings::values.use_disk_shader_cache);
+    AddField(field_type, "Audio_EnableAudioStretching",
+             Settings::values.enable_audio_stretching.GetValue());
+    AddField(field_type, "Core_UseMultiCore", Settings::values.use_multi_core.GetValue());
+    AddField(field_type, "Renderer_Backend",
+             TranslateRenderer(Settings::values.renderer_backend.GetValue()));
+    AddField(field_type, "Renderer_ResolutionFactor",
+             Settings::values.resolution_factor.GetValue());
+    AddField(field_type, "Renderer_UseFrameLimit", Settings::values.use_frame_limit.GetValue());
+    AddField(field_type, "Renderer_FrameLimit", Settings::values.frame_limit.GetValue());
+    AddField(field_type, "Renderer_UseDiskShaderCache",
+             Settings::values.use_disk_shader_cache.GetValue());
     AddField(field_type, "Renderer_GPUAccuracyLevel",
-             TranslateGPUAccuracyLevel(Settings::values.gpu_accuracy));
+             TranslateGPUAccuracyLevel(Settings::values.gpu_accuracy.GetValue()));
     AddField(field_type, "Renderer_UseAsynchronousGpuEmulation",
-             Settings::values.use_asynchronous_gpu_emulation);
-    AddField(field_type, "Renderer_UseVsync", Settings::values.use_vsync);
-    AddField(field_type, "Renderer_UseAssemblyShaders", Settings::values.use_assembly_shaders);
+             Settings::values.use_asynchronous_gpu_emulation.GetValue());
+    AddField(field_type, "Renderer_UseVsync", Settings::values.use_vsync.GetValue());
+    AddField(field_type, "Renderer_UseAssemblyShaders",
+             Settings::values.use_assembly_shaders.GetValue());
     AddField(field_type, "System_UseDockedMode", Settings::values.use_docked_mode);
 }
 
diff --git a/src/core/tools/freezer.cpp b/src/core/tools/freezer.cpp
index b2c6c537e..8b0c50d11 100644
--- a/src/core/tools/freezer.cpp
+++ b/src/core/tools/freezer.cpp
@@ -14,7 +14,7 @@
 namespace Tools {
 namespace {
 
-constexpr s64 MEMORY_FREEZER_TICKS = static_cast<s64>(Core::Hardware::BASE_CLOCK_RATE / 60);
+constexpr s64 MEMORY_FREEZER_TICKS = static_cast<s64>(1000000000 / 60);
 
 u64 MemoryReadWidth(Core::Memory::Memory& memory, u32 width, VAddr addr) {
     switch (width) {
@@ -57,7 +57,7 @@ Freezer::Freezer(Core::Timing::CoreTiming& core_timing_, Core::Memory::Memory& m
     : core_timing{core_timing_}, memory{memory_} {
     event = Core::Timing::CreateEvent(
         "MemoryFreezer::FrameCallback",
-        [this](u64 userdata, s64 cycles_late) { FrameCallback(userdata, cycles_late); });
+        [this](u64 userdata, s64 ns_late) { FrameCallback(userdata, ns_late); });
     core_timing.ScheduleEvent(MEMORY_FREEZER_TICKS, event);
 }
 
@@ -158,7 +158,7 @@ std::vector<Freezer::Entry> Freezer::GetEntries() const {
     return entries;
 }
 
-void Freezer::FrameCallback(u64 userdata, s64 cycles_late) {
+void Freezer::FrameCallback(u64 userdata, s64 ns_late) {
     if (!IsActive()) {
         LOG_DEBUG(Common_Memory, "Memory freezer has been deactivated, ending callback events.");
         return;
@@ -173,7 +173,7 @@ void Freezer::FrameCallback(u64 userdata, s64 cycles_late) {
         MemoryWriteWidth(memory, entry.width, entry.address, entry.value);
     }
 
-    core_timing.ScheduleEvent(MEMORY_FREEZER_TICKS - cycles_late, event);
+    core_timing.ScheduleEvent(MEMORY_FREEZER_TICKS - ns_late, event);
 }
 
 void Freezer::FillEntryReads() {
diff --git a/src/input_common/CMakeLists.txt b/src/input_common/CMakeLists.txt
index a9c2392b1..3bd76dd23 100644
--- a/src/input_common/CMakeLists.txt
+++ b/src/input_common/CMakeLists.txt
@@ -7,6 +7,10 @@ add_library(input_common STATIC
     main.h
     motion_emu.cpp
     motion_emu.h
+    gcadapter/gc_adapter.cpp
+    gcadapter/gc_adapter.h
+    gcadapter/gc_poller.cpp
+    gcadapter/gc_poller.h
     sdl/sdl.cpp
     sdl/sdl.h
     udp/client.cpp
@@ -26,5 +30,7 @@ if(SDL2_FOUND)
     target_compile_definitions(input_common PRIVATE HAVE_SDL2)
 endif()
 
+target_link_libraries(input_common PUBLIC ${LIBUSB_LIBRARIES})
+
 create_target_directory_groups(input_common)
 target_link_libraries(input_common PUBLIC core PRIVATE common Boost::boost)
diff --git a/src/input_common/gcadapter/gc_adapter.cpp b/src/input_common/gcadapter/gc_adapter.cpp
new file mode 100644
index 000000000..6d9f4d9eb
--- /dev/null
+++ b/src/input_common/gcadapter/gc_adapter.cpp
@@ -0,0 +1,398 @@
+// Copyright 2014 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license.txt file included.
+
+#include <chrono>
+#include <thread>
+#include "common/logging/log.h"
+#include "input_common/gcadapter/gc_adapter.h"
+
+namespace GCAdapter {
+
+/// Used to loop through and assign button in poller
+constexpr std::array<PadButton, 12> PadButtonArray{
+    PadButton::PAD_BUTTON_LEFT, PadButton::PAD_BUTTON_RIGHT, PadButton::PAD_BUTTON_DOWN,
+    PadButton::PAD_BUTTON_UP,   PadButton::PAD_TRIGGER_Z,    PadButton::PAD_TRIGGER_R,
+    PadButton::PAD_TRIGGER_L,   PadButton::PAD_BUTTON_A,     PadButton::PAD_BUTTON_B,
+    PadButton::PAD_BUTTON_X,    PadButton::PAD_BUTTON_Y,     PadButton::PAD_BUTTON_START,
+};
+
+Adapter::Adapter() {
+    if (usb_adapter_handle != nullptr) {
+        return;
+    }
+    LOG_INFO(Input, "GC Adapter Initialization started");
+
+    current_status = NO_ADAPTER_DETECTED;
+
+    const int init_res = libusb_init(&libusb_ctx);
+    if (init_res == LIBUSB_SUCCESS) {
+        StartScanThread();
+    } else {
+        LOG_ERROR(Input, "libusb could not be initialized. failed with error = {}", init_res);
+    }
+}
+
+GCPadStatus Adapter::GetPadStatus(int port, const std::array<u8, 37>& adapter_payload) {
+    GCPadStatus pad = {};
+    bool get_origin = false;
+
+    ControllerTypes type = ControllerTypes(adapter_payload[1 + (9 * port)] >> 4);
+    if (type != ControllerTypes::None) {
+        get_origin = true;
+    }
+
+    adapter_controllers_status[port] = type;
+
+    static constexpr std::array<PadButton, 8> b1_buttons{
+        PadButton::PAD_BUTTON_A,    PadButton::PAD_BUTTON_B,    PadButton::PAD_BUTTON_X,
+        PadButton::PAD_BUTTON_Y,    PadButton::PAD_BUTTON_LEFT, PadButton::PAD_BUTTON_RIGHT,
+        PadButton::PAD_BUTTON_DOWN, PadButton::PAD_BUTTON_UP,
+    };
+
+    static constexpr std::array<PadButton, 4> b2_buttons{
+        PadButton::PAD_BUTTON_START,
+        PadButton::PAD_TRIGGER_Z,
+        PadButton::PAD_TRIGGER_R,
+        PadButton::PAD_TRIGGER_L,
+    };
+
+    if (adapter_controllers_status[port] != ControllerTypes::None) {
+        const u8 b1 = adapter_payload[1 + (9 * port) + 1];
+        const u8 b2 = adapter_payload[1 + (9 * port) + 2];
+
+        for (std::size_t i = 0; i < b1_buttons.size(); ++i) {
+            if ((b1 & (1U << i)) != 0) {
+                pad.button |= static_cast<u16>(b1_buttons[i]);
+            }
+        }
+
+        for (std::size_t j = 0; j < b2_buttons.size(); ++j) {
+            if ((b2 & (1U << j)) != 0) {
+                pad.button |= static_cast<u16>(b2_buttons[j]);
+            }
+        }
+
+        if (get_origin) {
+            pad.button |= PAD_GET_ORIGIN;
+        }
+
+        pad.stick_x = adapter_payload[1 + (9 * port) + 3];
+        pad.stick_y = adapter_payload[1 + (9 * port) + 4];
+        pad.substick_x = adapter_payload[1 + (9 * port) + 5];
+        pad.substick_y = adapter_payload[1 + (9 * port) + 6];
+        pad.trigger_left = adapter_payload[1 + (9 * port) + 7];
+        pad.trigger_right = adapter_payload[1 + (9 * port) + 8];
+    }
+    return pad;
+}
+
+void Adapter::PadToState(const GCPadStatus& pad, GCState& state) {
+    for (const auto& button : PadButtonArray) {
+        const u16 button_value = static_cast<u16>(button);
+        state.buttons.insert_or_assign(button_value, pad.button & button_value);
+    }
+
+    state.axes.insert_or_assign(static_cast<u8>(PadAxes::StickX), pad.stick_x);
+    state.axes.insert_or_assign(static_cast<u8>(PadAxes::StickY), pad.stick_y);
+    state.axes.insert_or_assign(static_cast<u8>(PadAxes::SubstickX), pad.substick_x);
+    state.axes.insert_or_assign(static_cast<u8>(PadAxes::SubstickY), pad.substick_y);
+    state.axes.insert_or_assign(static_cast<u8>(PadAxes::TriggerLeft), pad.trigger_left);
+    state.axes.insert_or_assign(static_cast<u8>(PadAxes::TriggerRight), pad.trigger_right);
+}
+
+void Adapter::Read() {
+    LOG_DEBUG(Input, "GC Adapter Read() thread started");
+    // Poll loop: read one 37-byte report, validate it, then publish the state of each port.
+    int payload_size_in = 0, payload_size_copy = 0;
+    std::array<u8, 37> adapter_payload{};
+    std::array<u8, 37> adapter_payload_copy{};
+    std::array<GCPadStatus, 4> pads;
+
+    while (adapter_thread_running) {
+        payload_size_in = 0; // libusb may leave this untouched when the transfer fails early
+        libusb_interrupt_transfer(usb_adapter_handle, input_endpoint, adapter_payload.data(),
+                                  sizeof(adapter_payload), &payload_size_in, 16);
+        // this mutex might be redundant?
+        {
+            std::lock_guard<std::mutex> lk(s_mutex);
+            std::copy(std::begin(adapter_payload), std::end(adapter_payload),
+                      std::begin(adapter_payload_copy));
+            payload_size_copy = payload_size_in;
+        }
+
+        if (payload_size_copy != sizeof(adapter_payload_copy) ||
+            adapter_payload_copy[0] != LIBUSB_DT_HID) {
+            LOG_ERROR(Input, "error reading payload (size: {}, type: {:02x})", payload_size_copy,
+                      adapter_payload_copy[0]);
+            adapter_thread_running = false; // error reading from adapter, stop reading.
+            break;
+        }
+        for (std::size_t port = 0; port < pads.size(); ++port) {
+            pads[port] = GetPadStatus(port, adapter_payload_copy);
+            if (DeviceConnected(port) && configuring) {
+                if (pads[port].button != PAD_GET_ORIGIN) {
+                    pad_queue[port].Push(pads[port]);
+                }
+
+                // Accounting for a threshold here because of some controller variance
+                if (pads[port].stick_x > pads[port].MAIN_STICK_CENTER_X + pads[port].THRESHOLD ||
+                    pads[port].stick_x < pads[port].MAIN_STICK_CENTER_X - pads[port].THRESHOLD) {
+                    pads[port].axis = GCAdapter::PadAxes::StickX;
+                    pads[port].axis_value = pads[port].stick_x;
+                    pad_queue[port].Push(pads[port]);
+                }
+                if (pads[port].stick_y > pads[port].MAIN_STICK_CENTER_Y + pads[port].THRESHOLD ||
+                    pads[port].stick_y < pads[port].MAIN_STICK_CENTER_Y - pads[port].THRESHOLD) {
+                    pads[port].axis = GCAdapter::PadAxes::StickY;
+                    pads[port].axis_value = pads[port].stick_y;
+                    pad_queue[port].Push(pads[port]);
+                }
+                if (pads[port].substick_x > pads[port].C_STICK_CENTER_X + pads[port].THRESHOLD ||
+                    pads[port].substick_x < pads[port].C_STICK_CENTER_X - pads[port].THRESHOLD) {
+                    pads[port].axis = GCAdapter::PadAxes::SubstickX;
+                    pads[port].axis_value = pads[port].substick_x;
+                    pad_queue[port].Push(pads[port]);
+                }
+                if (pads[port].substick_y > pads[port].C_STICK_CENTER_Y + pads[port].THRESHOLD ||
+                    pads[port].substick_y < pads[port].C_STICK_CENTER_Y - pads[port].THRESHOLD) {
+                    pads[port].axis = GCAdapter::PadAxes::SubstickY;
+                    pads[port].axis_value = pads[port].substick_y;
+                    pad_queue[port].Push(pads[port]);
+                }
+                if (pads[port].trigger_left > pads[port].TRIGGER_THRESHOLD) {
+                    pads[port].axis = GCAdapter::PadAxes::TriggerLeft;
+                    pads[port].axis_value = pads[port].trigger_left;
+                    pad_queue[port].Push(pads[port]);
+                }
+                if (pads[port].trigger_right > pads[port].TRIGGER_THRESHOLD) {
+                    pads[port].axis = GCAdapter::PadAxes::TriggerRight;
+                    pads[port].axis_value = pads[port].trigger_right;
+                    pad_queue[port].Push(pads[port]);
+                }
+            }
+            PadToState(pads[port], state[port]);
+        }
+        std::this_thread::yield();
+    }
+}
+
+void Adapter::ScanThreadFunc() {
+    LOG_INFO(Input, "GC Adapter scanning thread started");
+
+    while (detect_thread_running) {
+        if (usb_adapter_handle == nullptr) {
+            std::lock_guard<std::mutex> lk(initialization_mutex);
+            Setup();
+        }
+        std::this_thread::sleep_for(std::chrono::milliseconds(500));
+    }
+}
+
+void Adapter::StartScanThread() {
+    if (detect_thread_running) {
+        return;
+    }
+    if (!libusb_ctx) {
+        return;
+    }
+    // Raise the flag before launching so the scan loop's first check sees it set.
+    detect_thread_running = true;
+    detect_thread = std::thread([this] { ScanThreadFunc(); });
+}
+
+void Adapter::StopScanThread() {
+    detect_thread_running = false; // the scan loop re-checks this after each 500 ms sleep
+    if (detect_thread.joinable()) { detect_thread.join(); } // never started if libusb_init failed
+}
+
+void Adapter::Setup() {
+    // Reset the error status in case the adapter gets unplugged
+    if (current_status < 0) {
+        current_status = NO_ADAPTER_DETECTED;
+    }
+
+    adapter_controllers_status.fill(ControllerTypes::None);
+
+    // pointer to list of connected usb devices
+    libusb_device** devices{};
+
+    // populate the list of devices, get the count
+    const ssize_t device_count = libusb_get_device_list(libusb_ctx, &devices);
+    if (device_count < 0) {
+        LOG_ERROR(Input, "libusb_get_device_list failed with error: {}", device_count);
+        detect_thread_running = false; // Stop the loop constantly checking for gc adapter
+        // TODO: For hotplug+gc adapter checkbox implementation, revert this.
+        return;
+    }
+
+    if (devices != nullptr) {
+        for (std::size_t index = 0; index < device_count; ++index) {
+            if (CheckDeviceAccess(devices[index])) {
+                // GC Adapter found and accessible, registering it
+                GetGCEndpoint(devices[index]);
+                break;
+            }
+        }
+        libusb_free_device_list(devices, 1);
+    }
+}
+
+bool Adapter::CheckDeviceAccess(libusb_device* device) {
+    libusb_device_descriptor desc;
+    const int get_descriptor_error = libusb_get_device_descriptor(device, &desc);
+    if (get_descriptor_error) {
+        // could not acquire the descriptor, no point in trying to use it.
+        LOG_ERROR(Input, "libusb_get_device_descriptor failed with error: {}",
+                  get_descriptor_error);
+        return false;
+    }
+
+    if (desc.idVendor != 0x057e || desc.idProduct != 0x0337) {
+        // This isn't the device we are looking for.
+        return false;
+    }
+    const int open_error = libusb_open(device, &usb_adapter_handle);
+
+    if (open_error == LIBUSB_ERROR_ACCESS) {
+        LOG_ERROR(Input, "Yuzu can not gain access to this device: ID {:04X}:{:04X}.",
+                  desc.idVendor, desc.idProduct);
+        return false;
+    }
+    if (open_error) {
+        LOG_ERROR(Input, "libusb_open failed to open device with error = {}", open_error);
+        return false;
+    }
+
+    int kernel_driver_error = libusb_kernel_driver_active(usb_adapter_handle, 0);
+    if (kernel_driver_error == 1) {
+        kernel_driver_error = libusb_detach_kernel_driver(usb_adapter_handle, 0);
+        if (kernel_driver_error != 0 && kernel_driver_error != LIBUSB_ERROR_NOT_SUPPORTED) {
+            LOG_ERROR(Input, "libusb_detach_kernel_driver failed with error = {}",
+                      kernel_driver_error);
+        }
+    }
+
+    if (kernel_driver_error && kernel_driver_error != LIBUSB_ERROR_NOT_SUPPORTED) {
+        libusb_close(usb_adapter_handle);
+        usb_adapter_handle = nullptr;
+        return false;
+    }
+
+    const int interface_claim_error = libusb_claim_interface(usb_adapter_handle, 0);
+    if (interface_claim_error) {
+        LOG_ERROR(Input, "libusb_claim_interface failed with error = {}", interface_claim_error);
+        libusb_close(usb_adapter_handle);
+        usb_adapter_handle = nullptr;
+        return false;
+    }
+
+    return true;
+}
+
+void Adapter::GetGCEndpoint(libusb_device* device) {
+    libusb_config_descriptor* config = nullptr;
+    const int config_descriptor_return = libusb_get_config_descriptor(device, 0, &config);
+    if (config_descriptor_return != LIBUSB_SUCCESS) {
+        LOG_ERROR(Input, "libusb_get_config_descriptor failed with error = {}",
+                  config_descriptor_return);
+        return;
+    }
+
+    for (u8 ic = 0; ic < config->bNumInterfaces; ic++) {
+        const libusb_interface* interfaceContainer = &config->interface[ic];
+        for (int i = 0; i < interfaceContainer->num_altsetting; i++) {
+            const libusb_interface_descriptor* interface = &interfaceContainer->altsetting[i];
+            for (u8 e = 0; e < interface->bNumEndpoints; e++) {
+                const libusb_endpoint_descriptor* endpoint = &interface->endpoint[e];
+                if (endpoint->bEndpointAddress & LIBUSB_ENDPOINT_IN) {
+                    input_endpoint = endpoint->bEndpointAddress;
+                } else {
+                    output_endpoint = endpoint->bEndpointAddress;
+                }
+            }
+        }
+    }
+    libusb_free_config_descriptor(config); // the caller owns the descriptor and must free it
+    // This transfer seems to be responsible for clearing the state of the adapter
+    // Used to clear the "busy" state of when the device is unexpectedly unplugged
+    unsigned char clear_payload = 0x13;
+    libusb_interrupt_transfer(usb_adapter_handle, output_endpoint, &clear_payload,
+                              sizeof(clear_payload), nullptr, 16);
+    adapter_thread_running = true;
+    current_status = ADAPTER_DETECTED;
+    adapter_input_thread = std::thread([this] { Read(); }); // Read input
+}
+
+Adapter::~Adapter() {
+    StopScanThread();
+    Reset();
+}
+
+void Adapter::Reset() {
+    // Skip the teardown if another thread currently holds the initialization lock.
+    std::unique_lock<std::mutex> lock(initialization_mutex, std::defer_lock);
+    if (!lock.try_lock()) {
+        return;
+    }
+    if (current_status != ADAPTER_DETECTED) {
+        return;
+    }
+
+    // Ask the input thread to leave its loop, then wait for it to finish.
+    adapter_thread_running = false;
+    adapter_input_thread.join();
+
+    adapter_controllers_status.fill(ControllerTypes::None);
+    current_status = NO_ADAPTER_DETECTED;
+
+    if (usb_adapter_handle) {
+        libusb_release_interface(usb_adapter_handle, 0); // matches the claim in CheckDeviceAccess
+        libusb_close(usb_adapter_handle);
+        usb_adapter_handle = nullptr;
+    }
+
+    if (libusb_ctx) {
+        libusb_exit(libusb_ctx);
+    }
+}
+
+bool Adapter::DeviceConnected(int port) {
+    return adapter_controllers_status[port] != ControllerTypes::None;
+}
+
+void Adapter::ResetDeviceType(int port) {
+    adapter_controllers_status[port] = ControllerTypes::None;
+}
+
+void Adapter::BeginConfiguration() {
+    for (auto& pq : pad_queue) {
+        pq.Clear();
+    }
+    configuring = true;
+}
+
+void Adapter::EndConfiguration() {
+    for (auto& pq : pad_queue) {
+        pq.Clear();
+    }
+    configuring = false;
+}
+
+std::array<Common::SPSCQueue<GCPadStatus>, 4>& Adapter::GetPadQueue() {
+    return pad_queue;
+}
+
+const std::array<Common::SPSCQueue<GCPadStatus>, 4>& Adapter::GetPadQueue() const {
+    return pad_queue;
+}
+
+std::array<GCState, 4>& Adapter::GetPadState() {
+    return state;
+}
+
+const std::array<GCState, 4>& Adapter::GetPadState() const {
+    return state;
+}
+
+} // namespace GCAdapter
diff --git a/src/input_common/gcadapter/gc_adapter.h b/src/input_common/gcadapter/gc_adapter.h
new file mode 100644
index 000000000..b1c2a1958
--- /dev/null
+++ b/src/input_common/gcadapter/gc_adapter.h
@@ -0,0 +1,161 @@
+// Copyright 2014 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license.txt file included.
+
+#pragma once
+#include <algorithm>
+#include <functional>
+#include <mutex>
+#include <thread>
+#include <unordered_map>
+#include <libusb.h>
+#include "common/common_types.h"
+#include "common/threadsafe_queue.h"
+
+namespace GCAdapter {
+
+enum {
+    PAD_USE_ORIGIN = 0x0080,
+    PAD_GET_ORIGIN = 0x2000,
+    PAD_ERR_STATUS = 0x8000,
+};
+
+enum class PadButton {
+    PAD_BUTTON_LEFT = 0x0001,
+    PAD_BUTTON_RIGHT = 0x0002,
+    PAD_BUTTON_DOWN = 0x0004,
+    PAD_BUTTON_UP = 0x0008,
+    PAD_TRIGGER_Z = 0x0010,
+    PAD_TRIGGER_R = 0x0020,
+    PAD_TRIGGER_L = 0x0040,
+    PAD_BUTTON_A = 0x0100,
+    PAD_BUTTON_B = 0x0200,
+    PAD_BUTTON_X = 0x0400,
+    PAD_BUTTON_Y = 0x0800,
+    PAD_BUTTON_START = 0x1000,
+    // Below is for compatibility with "AxisButton" type
+    PAD_STICK = 0x2000,
+};
+
+extern const std::array<PadButton, 12> PadButtonArray;
+
+enum class PadAxes : u8 {
+    StickX,
+    StickY,
+    SubstickX,
+    SubstickY,
+    TriggerLeft,
+    TriggerRight,
+    Undefined,
+};
+
+struct GCPadStatus {
+    u16 button{};       // Or-ed PAD_BUTTON_* and PAD_TRIGGER_* bits
+    u8 stick_x{};       // 0 <= stick_x       <= 255
+    u8 stick_y{};       // 0 <= stick_y       <= 255
+    u8 substick_x{};    // 0 <= substick_x    <= 255
+    u8 substick_y{};    // 0 <= substick_y    <= 255
+    u8 trigger_left{};  // 0 <= trigger_left  <= 255
+    u8 trigger_right{}; // 0 <= trigger_right <= 255
+
+    static constexpr u8 MAIN_STICK_CENTER_X = 0x80;
+    static constexpr u8 MAIN_STICK_CENTER_Y = 0x80;
+    static constexpr u8 MAIN_STICK_RADIUS = 0x7f;
+    static constexpr u8 C_STICK_CENTER_X = 0x80;
+    static constexpr u8 C_STICK_CENTER_Y = 0x80;
+    static constexpr u8 C_STICK_RADIUS = 0x7f;
+    static constexpr u8 THRESHOLD = 10;
+
+    // 256/4, at least a quarter press to count as a press. For polling mostly
+    static constexpr u8 TRIGGER_THRESHOLD = 64;
+
+    u8 port{};
+    PadAxes axis{PadAxes::Undefined};
+    u8 axis_value{255};
+};
+
+struct GCState {
+    std::unordered_map<int, bool> buttons;
+    std::unordered_map<int, u16> axes;
+};
+
+enum class ControllerTypes { None, Wired, Wireless };
+
+enum {
+    NO_ADAPTER_DETECTED = 0,
+    ADAPTER_DETECTED = 1,
+};
+
+class Adapter {
+public:
+    /// Initialize libusb and start scanning for the GC Adapter
+    Adapter();
+
+    /// Stop the scan thread, then close the adapter read thread and release the adapter
+    ~Adapter();
+    /// Used for polling; while configuring, the read thread pushes inputs onto the pad queues
+    void BeginConfiguration();
+    void EndConfiguration();
+
+    std::array<Common::SPSCQueue<GCPadStatus>, 4>& GetPadQueue();
+    const std::array<Common::SPSCQueue<GCPadStatus>, 4>& GetPadQueue() const;
+
+    std::array<GCState, 4>& GetPadState();
+    const std::array<GCState, 4>& GetPadState() const;
+
+private:
+    GCPadStatus GetPadStatus(int port, const std::array<u8, 37>& adapter_payload);
+
+    void PadToState(const GCPadStatus& pad, GCState& state);
+
+    void Read();           // body of adapter_input_thread
+    void ScanThreadFunc(); // body of detect_thread
+    /// Begin scanning for the GC Adapter.
+    void StartScanThread();
+
+    /// Stop scanning for the adapter
+    void StopScanThread();
+
+    /// Returns true if there is a device connected to port
+    bool DeviceConnected(int port);
+
+    /// Resets status of device connected to port
+    void ResetDeviceType(int port);
+
+    /// Returns true if we successfully gain access to GC Adapter
+    bool CheckDeviceAccess(libusb_device* device);
+
+    /// Captures the GC Adapter endpoint addresses and starts the read thread
+    void GetGCEndpoint(libusb_device* device);
+
+    /// For shutting down, clear all data, join all threads, release usb
+    void Reset();
+
+    /// For use in initialization, querying devices to find the adapter
+    void Setup();
+
+    int current_status = NO_ADAPTER_DETECTED;
+    libusb_device_handle* usb_adapter_handle = nullptr;
+    std::array<ControllerTypes, 4> adapter_controllers_status{};
+
+    std::mutex s_mutex;
+
+    std::thread adapter_input_thread;
+    bool adapter_thread_running = false;
+
+    std::mutex initialization_mutex;
+    std::thread detect_thread;
+    bool detect_thread_running = false;
+
+    libusb_context* libusb_ctx = nullptr; // stays null when libusb_init fails
+
+    u8 input_endpoint = 0;
+    u8 output_endpoint = 0;
+
+    bool configuring = false;
+
+    std::array<Common::SPSCQueue<GCPadStatus>, 4> pad_queue;
+    std::array<GCState, 4> state;
+};
+
+} // namespace GCAdapter
diff --git a/src/input_common/gcadapter/gc_poller.cpp b/src/input_common/gcadapter/gc_poller.cpp
new file mode 100644
index 000000000..385ce8430
--- /dev/null
+++ b/src/input_common/gcadapter/gc_poller.cpp
@@ -0,0 +1,272 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <atomic>
+#include <list>
+#include <mutex>
+#include <utility>
+#include "common/threadsafe_queue.h"
+#include "input_common/gcadapter/gc_adapter.h"
+#include "input_common/gcadapter/gc_poller.h"
+
+namespace InputCommon {
+
+class GCButton final : public Input::ButtonDevice {
+public:
+    explicit GCButton(int port_, int button_, GCAdapter::Adapter* adapter)
+        : port(port_), button(button_), gcadapter(adapter) {}
+
+    ~GCButton() override;
+
+    bool GetStatus() const override {
+        return gcadapter->GetPadState()[port].buttons.at(button);
+    }
+
+private:
+    const int port;
+    const int button;
+    GCAdapter::Adapter* gcadapter;
+};
+
+class GCAxisButton final : public Input::ButtonDevice {
+public:
+    explicit GCAxisButton(int port_, int axis_, float threshold_, bool trigger_if_greater_,
+                          GCAdapter::Adapter* adapter)
+        : port(port_), axis(axis_), threshold(threshold_), trigger_if_greater(trigger_if_greater_),
+          gcadapter(adapter) {
+        // L/R triggers range is only in positive direction beginning near 0
+        // 0.0 threshold equates to near half trigger press, but threshold accounts for variability.
+        if (axis > 3) {
+            threshold *= -0.5;
+        }
+    }
+
+    bool GetStatus() const override {
+        const float axis_value = (gcadapter->GetPadState()[port].axes.at(axis) - 128.0f) / 128.0f;
+        if (trigger_if_greater) {
+            // TODO: Might be worthwhile to set a slider for the trigger threshold. It is currently
+            // always set to 0.5 in configure_input_player.cpp ZL/ZR HandleClick
+            return axis_value > threshold;
+        }
+        return axis_value < -threshold;
+    }
+
+private:
+    const int port;
+    const int axis;
+    float threshold;
+    bool trigger_if_greater;
+    GCAdapter::Adapter* gcadapter;
+};
+
+GCButtonFactory::GCButtonFactory(std::shared_ptr<GCAdapter::Adapter> adapter_)
+    : adapter(std::move(adapter_)) {}
+
+GCButton::~GCButton() = default;
+
+std::unique_ptr<Input::ButtonDevice> GCButtonFactory::Create(const Common::ParamPackage& params) {
+    const int button_id = params.Get("button", 0);
+    const int port = params.Get("port", 0);
+
+    constexpr int PAD_STICK_ID = static_cast<u16>(GCAdapter::PadButton::PAD_STICK);
+
+    // button is not an axis/stick button
+    if (button_id != PAD_STICK_ID) {
+        return std::make_unique<GCButton>(port, button_id, adapter.get());
+    }
+
+    // For Axis buttons, used by the binary sticks. This is the only case left once the check
+    // above has failed, so it is handled unconditionally and every path returns a device.
+    const int axis = params.Get("axis", 0);
+    const float threshold = params.Get("threshold", 0.25f);
+    const std::string direction_name = params.Get("direction", "");
+    bool trigger_if_greater;
+    if (direction_name == "+") {
+        trigger_if_greater = true;
+    } else if (direction_name == "-") {
+        trigger_if_greater = false;
+    } else {
+        // Unrecognised direction: log it and fall back to the "+" behaviour.
+        trigger_if_greater = true;
+        LOG_ERROR(Input, "Unknown direction {}", direction_name);
+    }
+
+    return std::make_unique<GCAxisButton>(port, axis, threshold, trigger_if_greater,
+                                          adapter.get());
+}
+
+Common::ParamPackage GCButtonFactory::GetNextInput() {
+    Common::ParamPackage params;
+    GCAdapter::GCPadStatus pad;
+    auto& queue = adapter->GetPadQueue();
+    for (std::size_t port = 0; port < queue.size(); ++port) {
+        while (queue[port].Pop(pad)) {
+            // This while loop will break on the earliest detected button
+            params.Set("engine", "gcpad");
+            params.Set("port", static_cast<int>(port));
+            for (const auto& button : GCAdapter::PadButtonArray) {
+                const u16 button_value = static_cast<u16>(button);
+                if (pad.button & button_value) {
+                    params.Set("button", button_value);
+                    break;
+                }
+            }
+
+            // For Axis button implementation
+            if (pad.axis != GCAdapter::PadAxes::Undefined) {
+                params.Set("axis", static_cast<u8>(pad.axis));
+                params.Set("button", static_cast<u16>(GCAdapter::PadButton::PAD_STICK));
+                if (pad.axis_value > 128) {
+                    params.Set("direction", "+");
+                    params.Set("threshold", "0.25");
+                } else {
+                    params.Set("direction", "-");
+                    params.Set("threshold", "-0.25");
+                }
+                break;
+            }
+        }
+    }
+    return params;
+}
+
+void GCButtonFactory::BeginConfiguration() {
+    polling = true;
+    adapter->BeginConfiguration();
+}
+
+void GCButtonFactory::EndConfiguration() {
+    polling = false;
+    adapter->EndConfiguration();
+}
+
+class GCAnalog final : public Input::AnalogDevice {
+public:
+    GCAnalog(int port_, int axis_x_, int axis_y_, float deadzone_, GCAdapter::Adapter* adapter)
+        : port(port_), axis_x(axis_x_), axis_y(axis_y_), deadzone(deadzone_), gcadapter(adapter) {}
+
+    float GetAxis(int axis) const {
+        std::lock_guard lock{mutex};
+        // division is not by a perfect 128 to account for some variance in center location
+        // e.g. my device idled at 131 in X, 120 in Y, and full range of motion was in range
+        // [20-230]
+        return (gcadapter->GetPadState()[port].axes.at(axis) - 128.0f) / 95.0f;
+    }
+
+    std::pair<float, float> GetAnalog(int axis_x, int axis_y) const {
+        float x = GetAxis(axis_x);
+        float y = GetAxis(axis_y);
+
+        // Make sure the coordinates are in the unit circle,
+        // otherwise normalize it.
+        float r = x * x + y * y;
+        if (r > 1.0f) {
+            r = std::sqrt(r);
+            x /= r;
+            y /= r;
+        }
+
+        return {x, y};
+    }
+
+    std::tuple<float, float> GetStatus() const override {
+        const auto [x, y] = GetAnalog(axis_x, axis_y);
+        const float r = std::sqrt((x * x) + (y * y));
+        if (r > deadzone) {
+            return {x / r * (r - deadzone) / (1 - deadzone),
+                    y / r * (r - deadzone) / (1 - deadzone)};
+        }
+        return {0.0f, 0.0f};
+    }
+
+    bool GetAnalogDirectionStatus(Input::AnalogDirection direction) const override {
+        const auto [x, y] = GetStatus();
+        const float directional_deadzone = 0.4f;
+        switch (direction) {
+        case Input::AnalogDirection::RIGHT:
+            return x > directional_deadzone;
+        case Input::AnalogDirection::LEFT:
+            return x < -directional_deadzone;
+        case Input::AnalogDirection::UP:
+            return y > directional_deadzone;
+        case Input::AnalogDirection::DOWN:
+            return y < -directional_deadzone;
+        }
+        return false;
+    }
+
+private:
+    const int port;
+    const int axis_x;
+    const int axis_y;
+    const float deadzone;
+    mutable std::mutex mutex;
+    GCAdapter::Adapter* gcadapter;
+};
+
+/// An analog device factory that creates analog devices from GC Adapter
+GCAnalogFactory::GCAnalogFactory(std::shared_ptr<GCAdapter::Adapter> adapter_)
+    : adapter(std::move(adapter_)) {}
+
+/**
+ * Creates analog device from joystick axes
+ * @param params contains parameters for creating the device:
+ *     - "port": the nth gcpad on the adapter
+ *     - "axis_x": the index of the axis to be bind as x-axis
+ *     - "axis_y": the index of the axis to be bind as y-axis
+ */
+std::unique_ptr<Input::AnalogDevice> GCAnalogFactory::Create(const Common::ParamPackage& params) {
+    const int port = params.Get("port", 0);
+    const int axis_x = params.Get("axis_x", 0);
+    const int axis_y = params.Get("axis_y", 1);
+    const float deadzone = std::clamp(params.Get("deadzone", 0.0f), 0.0f, .99f);
+
+    return std::make_unique<GCAnalog>(port, axis_x, axis_y, deadzone, adapter.get());
+}
+
+void GCAnalogFactory::BeginConfiguration() {
+    polling = true;
+    adapter->BeginConfiguration();
+}
+
+void GCAnalogFactory::EndConfiguration() {
+    polling = false;
+    adapter->EndConfiguration();
+}
+
+Common::ParamPackage GCAnalogFactory::GetNextInput() {
+    GCAdapter::GCPadStatus pad;
+    auto& queue = adapter->GetPadQueue();
+    for (std::size_t port = 0; port < queue.size(); ++port) {
+        while (queue[port].Pop(pad)) {
+            if (pad.axis == GCAdapter::PadAxes::Undefined ||
+                std::abs((pad.axis_value - 128.0f) / 128.0f) < 0.1) {
+                continue;
+            }
+            // An analog device needs two axes, so we need to store the axis for later and wait for
+            // a second input event. The axes also must be from the same joystick.
+            const u8 axis = static_cast<u8>(pad.axis);
+            if (analog_x_axis == -1) {
+                analog_x_axis = axis;
+                controller_number = port;
+            } else if (analog_y_axis == -1 && analog_x_axis != axis && controller_number == port) {
+                analog_y_axis = axis;
+            }
+        }
+    }
+    Common::ParamPackage params;
+    if (analog_x_axis != -1 && analog_y_axis != -1) {
+        params.Set("engine", "gcpad");
+        params.Set("port", controller_number);
+        params.Set("axis_x", analog_x_axis);
+        params.Set("axis_y", analog_y_axis);
+        analog_x_axis = -1;
+        analog_y_axis = -1;
+        controller_number = -1;
+        return params;
+    }
+    return params;
+}
+
+} // namespace InputCommon
diff --git a/src/input_common/gcadapter/gc_poller.h b/src/input_common/gcadapter/gc_poller.h
new file mode 100644
index 000000000..e96af7d51
--- /dev/null
+++ b/src/input_common/gcadapter/gc_poller.h
@@ -0,0 +1,67 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include "core/frontend/input.h"
+#include "input_common/gcadapter/gc_adapter.h"
+
+namespace InputCommon {
+
+/**
+ * A button device factory representing a gcpad. It receives gcpad events and forwards them
+ * to all the button devices it has created.
+ */
+class GCButtonFactory final : public Input::Factory<Input::ButtonDevice> {
+public:
+    explicit GCButtonFactory(std::shared_ptr<GCAdapter::Adapter> adapter_);
+
+    /**
+     * Creates a button device from a button press
+     * @param params contains parameters for creating the device:
+     *     - "code": the code of the key to bind with the button
+     */
+    std::unique_ptr<Input::ButtonDevice> Create(const Common::ParamPackage& params) override;
+
+    Common::ParamPackage GetNextInput();
+
+    /// For device input configuration/polling
+    void BeginConfiguration();
+    void EndConfiguration();
+
+    bool IsPolling() const {
+        return polling;
+    }
+
+private:
+    std::shared_ptr<GCAdapter::Adapter> adapter;
+    bool polling = false;
+};
+
+/// An analog device factory that creates analog devices from a GC adapter
+class GCAnalogFactory final : public Input::Factory<Input::AnalogDevice> {
+public:
+    explicit GCAnalogFactory(std::shared_ptr<GCAdapter::Adapter> adapter_);
+
+    std::unique_ptr<Input::AnalogDevice> Create(const Common::ParamPackage& params) override;
+    Common::ParamPackage GetNextInput(); // Returns a "gcpad" mapping once two axes were seen
+
+    /// Toggle the polling flag and forward the call to the adapter
+    void BeginConfiguration();
+    void EndConfiguration();
+
+    bool IsPolling() const { // True between BeginConfiguration() and EndConfiguration()
+        return polling;
+    }
+
+private:
+    std::shared_ptr<GCAdapter::Adapter> adapter; // Source of the pad queues read while mapping
+    int analog_x_axis = -1;     // First axis captured by GetNextInput(), -1 while unset
+    int analog_y_axis = -1;     // Second, different axis from the same port, -1 while unset
+    int controller_number = -1; // Port that supplied analog_x_axis, -1 while unset
+    bool polling = false;       // Set by BeginConfiguration(), cleared by EndConfiguration()
+};
+
+} // namespace InputCommon
diff --git a/src/input_common/keyboard.cpp b/src/input_common/keyboard.cpp
index 078374be5..afb8e6612 100644
--- a/src/input_common/keyboard.cpp
+++ b/src/input_common/keyboard.cpp
@@ -76,7 +76,7 @@ std::unique_ptr<Input::ButtonDevice> Keyboard::Create(const Common::ParamPackage
     int key_code = params.Get("code", 0);
     std::unique_ptr<KeyButton> button = std::make_unique<KeyButton>(key_button_list);
     key_button_list->AddKeyButton(key_code, button.get());
-    return std::move(button);
+    return button;
 }
 
 void Keyboard::PressKey(int key_code) {
diff --git a/src/input_common/main.cpp b/src/input_common/main.cpp
index 95e351e24..fd0af1019 100644
--- a/src/input_common/main.cpp
+++ b/src/input_common/main.cpp
@@ -4,8 +4,11 @@
 
 #include <memory>
 #include <thread>
+#include <libusb.h>
 #include "common/param_package.h"
 #include "input_common/analog_from_button.h"
+#include "input_common/gcadapter/gc_adapter.h"
+#include "input_common/gcadapter/gc_poller.h"
 #include "input_common/keyboard.h"
 #include "input_common/main.h"
 #include "input_common/motion_emu.h"
@@ -22,8 +25,16 @@ static std::shared_ptr<MotionEmu> motion_emu;
 static std::unique_ptr<SDL::State> sdl;
 #endif
 static std::unique_ptr<CemuhookUDP::State> udp;
+static std::shared_ptr<GCButtonFactory> gcbuttons;
+static std::shared_ptr<GCAnalogFactory> gcanalog;
 
 void Init() {
+    auto gcadapter = std::make_shared<GCAdapter::Adapter>();
+    gcbuttons = std::make_shared<GCButtonFactory>(gcadapter);
+    Input::RegisterFactory<Input::ButtonDevice>("gcpad", gcbuttons);
+    gcanalog = std::make_shared<GCAnalogFactory>(gcadapter);
+    Input::RegisterFactory<Input::AnalogDevice>("gcpad", gcanalog);
+
     keyboard = std::make_shared<Keyboard>();
     Input::RegisterFactory<Input::ButtonDevice>("keyboard", keyboard);
     Input::RegisterFactory<Input::AnalogDevice>("analog_from_button",
@@ -48,6 +59,11 @@ void Shutdown() {
     sdl.reset();
 #endif
     udp.reset();
+    Input::UnregisterFactory<Input::ButtonDevice>("gcpad");
+    Input::UnregisterFactory<Input::AnalogDevice>("gcpad");
+
+    gcbuttons.reset();
+    gcanalog.reset();
 }
 
 Keyboard* GetKeyboard() {
@@ -58,6 +74,14 @@ MotionEmu* GetMotionEmu() {
     return motion_emu.get();
 }
 
+GCButtonFactory* GetGCButtons() {
+    return gcbuttons.get();
+}
+
+GCAnalogFactory* GetGCAnalogs() {
+    return gcanalog.get();
+}
+
 std::string GenerateKeyboardParam(int key_code) {
     Common::ParamPackage param{
         {"engine", "keyboard"},
diff --git a/src/input_common/main.h b/src/input_common/main.h
index 77a0ce90b..0e32856f6 100644
--- a/src/input_common/main.h
+++ b/src/input_common/main.h
@@ -7,6 +7,7 @@
 #include <memory>
 #include <string>
 #include <vector>
+#include "input_common/gcadapter/gc_poller.h"
 
 namespace Common {
 class ParamPackage;
@@ -30,6 +31,10 @@ class MotionEmu;
 /// Gets the motion emulation factory.
 MotionEmu* GetMotionEmu();
 
+GCButtonFactory* GetGCButtons();
+
+GCAnalogFactory* GetGCAnalogs();
+
 /// Generates a serialized param package for creating a keyboard button device
 std::string GenerateKeyboardParam(int key_code);
 
diff --git a/src/input_common/motion_emu.cpp b/src/input_common/motion_emu.cpp
index 868251628..d4cdf76a3 100644
--- a/src/input_common/motion_emu.cpp
+++ b/src/input_common/motion_emu.cpp
@@ -145,7 +145,7 @@ std::unique_ptr<Input::MotionDevice> MotionEmu::Create(const Common::ParamPackag
     // Previously created device is disconnected here. Having two motion devices for 3DS is not
     // expected.
     current_device = device_wrapper->device;
-    return std::move(device_wrapper);
+    return device_wrapper;
 }
 
 void MotionEmu::BeginTilt(int x, int y) {
diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt
index c7038b217..47ef30aa9 100644
--- a/src/tests/CMakeLists.txt
+++ b/src/tests/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_executable(tests
     common/bit_field.cpp
     common/bit_utils.cpp
+    common/fibers.cpp
     common/multi_level_queue.cpp
     common/param_package.cpp
     common/ring_buffer.cpp
diff --git a/src/tests/common/fibers.cpp b/src/tests/common/fibers.cpp
new file mode 100644
index 000000000..4fd92428f
--- /dev/null
+++ b/src/tests/common/fibers.cpp
@@ -0,0 +1,358 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <atomic>
+#include <cstdlib>
+#include <functional>
+#include <memory>
+#include <thread>
+#include <unordered_map>
+#include <vector>
+
+#include <catch2/catch.hpp>
+#include <math.h>
+#include "common/common_types.h"
+#include "common/fiber.h"
+#include "common/spin_lock.h"
+
+namespace Common {
+
+class TestControl1 {
+public:
+    TestControl1() = default;
+
+    void DoWork();
+
+    void ExecuteThread(u32 id);
+
+    std::unordered_map<std::thread::id, u32> ids;
+    std::vector<std::shared_ptr<Common::Fiber>> thread_fibers;
+    std::vector<std::shared_ptr<Common::Fiber>> work_fibers;
+    std::vector<u32> items;
+    std::vector<u32> results;
+};
+
+static void WorkControl1(void* control) {
+    auto* test_control = static_cast<TestControl1*>(control);
+    test_control->DoWork();
+}
+
+void TestControl1::DoWork() { // Body of each work fiber, entered through WorkControl1
+    std::thread::id this_id = std::this_thread::get_id();
+    u32 id = ids[this_id]; // Index registered for this thread by ExecuteThread()
+    u32 value = items[id];
+    for (u32 i = 0; i < id; i++) {
+        value++;
+    }
+    results[id] = value; // items[id] + id, which the Fibers::Setup test verifies
+    Fiber::YieldTo(work_fibers[id], thread_fibers[id]); // Hand control back to the thread fiber
+}
+
+void TestControl1::ExecuteThread(u32 id) { // Entry point of every test thread
+    std::thread::id this_id = std::this_thread::get_id();
+    ids[this_id] = id; // NOTE(review): shared map written here by every thread without a lock
+    auto thread_fiber = Fiber::ThreadToFiber();
+    thread_fibers[id] = thread_fiber;
+    work_fibers[id] = std::make_shared<Fiber>(std::function<void(void*)>{WorkControl1}, this);
+    items[id] = rand() % 256; // Random input in [0, 255]; DoWork() adds id to it
+    Fiber::YieldTo(thread_fibers[id], work_fibers[id]); // Hand control to the work fiber
+    thread_fibers[id]->Exit();
+}
+
+static void ThreadStart1(u32 id, TestControl1& test_control) {
+    test_control.ExecuteThread(id);
+}
+
+/** This test checks for fiber setup configuration and validates that fibers are
+ *  doing all the work required.
+ */
+TEST_CASE("Fibers::Setup", "[common]") {
+    constexpr std::size_t num_threads = 7;
+    TestControl1 test_control{};
+    test_control.thread_fibers.resize(num_threads);
+    test_control.work_fibers.resize(num_threads);
+    test_control.items.resize(num_threads, 0);
+    test_control.results.resize(num_threads, 0);
+    std::vector<std::thread> threads;
+    for (u32 i = 0; i < num_threads; i++) {
+        threads.emplace_back(ThreadStart1, i, std::ref(test_control));
+    }
+    for (u32 i = 0; i < num_threads; i++) {
+        threads[i].join();
+    }
+    for (u32 i = 0; i < num_threads; i++) {
+        REQUIRE(test_control.items[i] + i == test_control.results[i]);
+    }
+}
+
+class TestControl2 {
+public:
+    TestControl2() = default;
+
+    void DoWork1() {
+        trap2 = false;
+        while (trap.load())
+            ;
+        for (u32 i = 0; i < 12000; i++) {
+            value1 += i;
+        }
+        Fiber::YieldTo(fiber1, fiber3);
+        std::thread::id this_id = std::this_thread::get_id();
+        u32 id = ids[this_id];
+        assert1 = id == 1;
+        value2 += 5000;
+        Fiber::YieldTo(fiber1, thread_fibers[id]);
+    }
+
+    void DoWork2() {
+        while (trap2.load())
+            ;
+        value2 = 2000;
+        trap = false;
+        Fiber::YieldTo(fiber2, fiber1);
+        assert3 = false;
+    }
+
+    void DoWork3() {
+        std::thread::id this_id = std::this_thread::get_id();
+        u32 id = ids[this_id];
+        assert2 = id == 0;
+        value1 += 1000;
+        Fiber::YieldTo(fiber3, thread_fibers[id]);
+    }
+
+    void ExecuteThread(u32 id);
+
+    void CallFiber1() {
+        std::thread::id this_id = std::this_thread::get_id();
+        u32 id = ids[this_id];
+        Fiber::YieldTo(thread_fibers[id], fiber1);
+    }
+
+    void CallFiber2() {
+        std::thread::id this_id = std::this_thread::get_id();
+        u32 id = ids[this_id];
+        Fiber::YieldTo(thread_fibers[id], fiber2);
+    }
+
+    void Exit();
+
+    bool assert1{};
+    bool assert2{};
+    bool assert3{true};
+    u32 value1{};
+    u32 value2{};
+    std::atomic<bool> trap{true};
+    std::atomic<bool> trap2{true};
+    std::unordered_map<std::thread::id, u32> ids;
+    std::vector<std::shared_ptr<Common::Fiber>> thread_fibers;
+    std::shared_ptr<Common::Fiber> fiber1;
+    std::shared_ptr<Common::Fiber> fiber2;
+    std::shared_ptr<Common::Fiber> fiber3;
+};
+
+static void WorkControl2_1(void* control) {
+    auto* test_control = static_cast<TestControl2*>(control);
+    test_control->DoWork1();
+}
+
+static void WorkControl2_2(void* control) {
+    auto* test_control = static_cast<TestControl2*>(control);
+    test_control->DoWork2();
+}
+
+static void WorkControl2_3(void* control) {
+    auto* test_control = static_cast<TestControl2*>(control);
+    test_control->DoWork3();
+}
+
+void TestControl2::ExecuteThread(u32 id) { // Registers the calling thread under index id
+    std::thread::id this_id = std::this_thread::get_id();
+    ids[this_id] = id; // NOTE(review): shared map written here by both threads without a lock
+    auto thread_fiber = Fiber::ThreadToFiber();
+    thread_fibers[id] = thread_fiber; // Looked up later by the DoWork*/CallFiber*/Exit helpers
+}
+
+void TestControl2::Exit() {
+    std::thread::id this_id = std::this_thread::get_id();
+    u32 id = ids[this_id];
+    thread_fibers[id]->Exit();
+}
+
+static void ThreadStart2_1(u32 id, TestControl2& test_control) {
+    test_control.ExecuteThread(id);
+    test_control.CallFiber1();
+    test_control.Exit();
+}
+
+static void ThreadStart2_2(u32 id, TestControl2& test_control) {
+    test_control.ExecuteThread(id);
+    test_control.CallFiber2();
+    test_control.Exit();
+}
+
+/** This test checks the fiber thread exchange configuration and validates that a fiber
+ *  has been successfully transferred from one thread to another and that the TLS
+ *  region of the thread is kept while changing fibers.
+ */
+TEST_CASE("Fibers::InterExchange", "[common]") {
+    TestControl2 test_control{};
+    test_control.thread_fibers.resize(2);
+    test_control.fiber1 =
+        std::make_shared<Fiber>(std::function<void(void*)>{WorkControl2_1}, &test_control);
+    test_control.fiber2 =
+        std::make_shared<Fiber>(std::function<void(void*)>{WorkControl2_2}, &test_control);
+    test_control.fiber3 =
+        std::make_shared<Fiber>(std::function<void(void*)>{WorkControl2_3}, &test_control);
+    std::thread thread1(ThreadStart2_1, 0, std::ref(test_control));
+    std::thread thread2(ThreadStart2_2, 1, std::ref(test_control));
+    thread1.join();
+    thread2.join();
+    REQUIRE(test_control.assert1);
+    REQUIRE(test_control.assert2);
+    REQUIRE(test_control.assert3);
+    REQUIRE(test_control.value2 == 7000);
+    u32 cal_value = 0;
+    for (u32 i = 0; i < 12000; i++) {
+        cal_value += i;
+    }
+    cal_value += 1000;
+    REQUIRE(test_control.value1 == cal_value);
+}
+
+class TestControl3 {
+public:
+    TestControl3() = default;
+
+    void DoWork1() {
+        value1 += 1;
+        Fiber::YieldTo(fiber1, fiber2);
+        std::thread::id this_id = std::this_thread::get_id();
+        u32 id = ids[this_id];
+        value3 += 1;
+        Fiber::YieldTo(fiber1, thread_fibers[id]);
+    }
+
+    void DoWork2() {
+        value2 += 1;
+        std::thread::id this_id = std::this_thread::get_id();
+        u32 id = ids[this_id];
+        Fiber::YieldTo(fiber2, thread_fibers[id]);
+    }
+
+    void ExecuteThread(u32 id);
+
+    void CallFiber1() {
+        std::thread::id this_id = std::this_thread::get_id();
+        u32 id = ids[this_id];
+        Fiber::YieldTo(thread_fibers[id], fiber1);
+    }
+
+    void Exit();
+
+    u32 value1{};
+    u32 value2{};
+    u32 value3{};
+    std::unordered_map<std::thread::id, u32> ids;
+    std::vector<std::shared_ptr<Common::Fiber>> thread_fibers;
+    std::shared_ptr<Common::Fiber> fiber1;
+    std::shared_ptr<Common::Fiber> fiber2;
+};
+
+static void WorkControl3_1(void* control) {
+    auto* test_control = static_cast<TestControl3*>(control);
+    test_control->DoWork1();
+}
+
+static void WorkControl3_2(void* control) {
+    auto* test_control = static_cast<TestControl3*>(control);
+    test_control->DoWork2();
+}
+
+void TestControl3::ExecuteThread(u32 id) { // Registers the calling thread under index id
+    std::thread::id this_id = std::this_thread::get_id();
+    ids[this_id] = id; // NOTE(review): shared map written here by both threads without a lock
+    auto thread_fiber = Fiber::ThreadToFiber();
+    thread_fibers[id] = thread_fiber; // Looked up later by DoWork1/DoWork2/CallFiber1/Exit
+}
+
+void TestControl3::Exit() {
+    std::thread::id this_id = std::this_thread::get_id();
+    u32 id = ids[this_id];
+    thread_fibers[id]->Exit();
+}
+
+static void ThreadStart3(u32 id, TestControl3& test_control) {
+    test_control.ExecuteThread(id);
+    test_control.CallFiber1();
+    test_control.Exit();
+}
+
+/** This test checks for two threads racing to start the same fiber.
+ *  It checks that execution occurred in an ordered manner and that at no time
+ *  were there two contexts at the same time.
+ */
+TEST_CASE("Fibers::StartRace", "[common]") {
+    TestControl3 test_control{};
+    test_control.thread_fibers.resize(2);
+    test_control.fiber1 =
+        std::make_shared<Fiber>(std::function<void(void*)>{WorkControl3_1}, &test_control);
+    test_control.fiber2 =
+        std::make_shared<Fiber>(std::function<void(void*)>{WorkControl3_2}, &test_control);
+    std::thread thread1(ThreadStart3, 0, std::ref(test_control));
+    std::thread thread2(ThreadStart3, 1, std::ref(test_control));
+    thread1.join();
+    thread2.join();
+    REQUIRE(test_control.value1 == 1);
+    REQUIRE(test_control.value2 == 1);
+    REQUIRE(test_control.value3 == 1);
+}
+
+class TestControl4;
+
+static void WorkControl4(void* control);
+
+class TestControl4 { // Fixture for the Fibers::Rewind test
+public:
+    TestControl4() {
+        fiber1 = std::make_shared<Fiber>(std::function<void(void*)>{WorkControl4}, this);
+        goal_reached = false;
+        rewinded = false;
+    }
+
+    void Execute() { // Runs fiber1 from the calling thread, then exits the thread fiber
+        thread_fiber = Fiber::ThreadToFiber();
+        Fiber::YieldTo(thread_fiber, fiber1);
+        thread_fiber->Exit();
+    }
+
+    void DoWork() { // Body of fiber1, entered through WorkControl4
+        fiber1->SetRewindPoint(std::function<void(void*)>{WorkControl4}, this);
+        if (rewinded) { // Only true once an earlier pass got as far as Rewind()
+            goal_reached = true;
+            Fiber::YieldTo(fiber1, thread_fiber); // Hand control back to Execute()
+        }
+        rewinded = true;
+        fiber1->Rewind(); // presumably re-enters the rewind point set above - TODO confirm
+    }
+
+    std::shared_ptr<Common::Fiber> fiber1;
+    std::shared_ptr<Common::Fiber> thread_fiber;
+    bool goal_reached; // Set when DoWork() runs with rewinded already true
+    bool rewinded;     // Set just before Rewind() is requested
+};
+
+static void WorkControl4(void* control) {
+    auto* test_control = static_cast<TestControl4*>(control);
+    test_control->DoWork();
+}
+
+TEST_CASE("Fibers::Rewind", "[common]") {
+    TestControl4 test_control{};
+    test_control.Execute();
+    REQUIRE(test_control.goal_reached);
+    REQUIRE(test_control.rewinded);
+}
+
+} // namespace Common
diff --git a/src/tests/core/core_timing.cpp b/src/tests/core/core_timing.cpp
index ff2d11cc8..e66db1940 100644
--- a/src/tests/core/core_timing.cpp
+++ b/src/tests/core/core_timing.cpp
@@ -18,29 +18,26 @@ namespace {
 // Numbers are chosen randomly to make sure the correct one is given.
 constexpr std::array<u64, 5> CB_IDS{{42, 144, 93, 1026, UINT64_C(0xFFFF7FFFF7FFFF)}};
 constexpr int MAX_SLICE_LENGTH = 10000; // Copied from CoreTiming internals
+constexpr std::array<u64, 5> calls_order{{2, 0, 1, 4, 3}};
+std::array<s64, 5> delays{};
 
 std::bitset<CB_IDS.size()> callbacks_ran_flags;
 u64 expected_callback = 0;
-s64 lateness = 0;
 
 template <unsigned int IDX>
-void CallbackTemplate(u64 userdata, s64 cycles_late) {
+void HostCallbackTemplate(u64 userdata, s64 nanoseconds_late) {
     static_assert(IDX < CB_IDS.size(), "IDX out of range");
     callbacks_ran_flags.set(IDX);
     REQUIRE(CB_IDS[IDX] == userdata);
-    REQUIRE(CB_IDS[IDX] == expected_callback);
-    REQUIRE(lateness == cycles_late);
-}
-
-u64 callbacks_done = 0;
-
-void EmptyCallback(u64 userdata, s64 cycles_late) {
-    ++callbacks_done;
+    REQUIRE(CB_IDS[IDX] == CB_IDS[calls_order[expected_callback]]);
+    delays[IDX] = nanoseconds_late;
+    ++expected_callback;
 }
 
 struct ScopeInit final {
     ScopeInit() {
-        core_timing.Initialize();
+        core_timing.SetMulticore(true);
+        core_timing.Initialize([]() {});
     }
     ~ScopeInit() {
         core_timing.Shutdown();
@@ -49,110 +46,101 @@ struct ScopeInit final {
     Core::Timing::CoreTiming core_timing;
 };
 
-void AdvanceAndCheck(Core::Timing::CoreTiming& core_timing, u32 idx, u32 context = 0,
-                     int expected_lateness = 0, int cpu_downcount = 0) {
-    callbacks_ran_flags = 0;
-    expected_callback = CB_IDS[idx];
-    lateness = expected_lateness;
-
-    // Pretend we executed X cycles of instructions.
-    core_timing.SwitchContext(context);
-    core_timing.AddTicks(core_timing.GetDowncount() - cpu_downcount);
-    core_timing.Advance();
-    core_timing.SwitchContext((context + 1) % 4);
+#pragma optimize("", off)
 
-    REQUIRE(decltype(callbacks_ran_flags)().set(idx) == callbacks_ran_flags);
+u64 TestTimerSpeed(Core::Timing::CoreTiming& core_timing) { // Times 1000 clock reads
+    u64 start = core_timing.GetGlobalTimeNs().count();
+    u64 placebo = 0; // Sink for the sampled values; never read afterwards
+    for (std::size_t i = 0; i < 1000; i++) {
+        placebo += core_timing.GetGlobalTimeNs().count();
+    }
+    u64 end = core_timing.GetGlobalTimeNs().count();
+    return (end - start); // Difference of two GetGlobalTimeNs() readings around the loop
+}
+
+#pragma optimize("", on)
+
 } // Anonymous namespace
 
 TEST_CASE("CoreTiming[BasicOrder]", "[core]") {
     ScopeInit guard;
     auto& core_timing = guard.core_timing;
+    std::vector<std::shared_ptr<Core::Timing::EventType>> events{
+        Core::Timing::CreateEvent("callbackA", HostCallbackTemplate<0>),
+        Core::Timing::CreateEvent("callbackB", HostCallbackTemplate<1>),
+        Core::Timing::CreateEvent("callbackC", HostCallbackTemplate<2>),
+        Core::Timing::CreateEvent("callbackD", HostCallbackTemplate<3>),
+        Core::Timing::CreateEvent("callbackE", HostCallbackTemplate<4>),
+    };
+
+    expected_callback = 0;
+
+    core_timing.SyncPause(true);
+
+    u64 one_micro = 1000U;
+    for (std::size_t i = 0; i < events.size(); i++) {
+        u64 order = calls_order[i];
+        core_timing.ScheduleEvent(i * one_micro + 100U, events[order], CB_IDS[order]);
+    }
+    /// test pause
+    REQUIRE(callbacks_ran_flags.none());
 
-    std::shared_ptr<Core::Timing::EventType> cb_a =
-        Core::Timing::CreateEvent("callbackA", CallbackTemplate<0>);
-    std::shared_ptr<Core::Timing::EventType> cb_b =
-        Core::Timing::CreateEvent("callbackB", CallbackTemplate<1>);
-    std::shared_ptr<Core::Timing::EventType> cb_c =
-        Core::Timing::CreateEvent("callbackC", CallbackTemplate<2>);
-    std::shared_ptr<Core::Timing::EventType> cb_d =
-        Core::Timing::CreateEvent("callbackD", CallbackTemplate<3>);
-    std::shared_ptr<Core::Timing::EventType> cb_e =
-        Core::Timing::CreateEvent("callbackE", CallbackTemplate<4>);
-
-    // Enter slice 0
-    core_timing.ResetRun();
-
-    // D -> B -> C -> A -> E
-    core_timing.SwitchContext(0);
-    core_timing.ScheduleEvent(1000, cb_a, CB_IDS[0]);
-    REQUIRE(1000 == core_timing.GetDowncount());
-    core_timing.ScheduleEvent(500, cb_b, CB_IDS[1]);
-    REQUIRE(500 == core_timing.GetDowncount());
-    core_timing.ScheduleEvent(800, cb_c, CB_IDS[2]);
-    REQUIRE(500 == core_timing.GetDowncount());
-    core_timing.ScheduleEvent(100, cb_d, CB_IDS[3]);
-    REQUIRE(100 == core_timing.GetDowncount());
-    core_timing.ScheduleEvent(1200, cb_e, CB_IDS[4]);
-    REQUIRE(100 == core_timing.GetDowncount());
-
-    AdvanceAndCheck(core_timing, 3, 0);
-    AdvanceAndCheck(core_timing, 1, 1);
-    AdvanceAndCheck(core_timing, 2, 2);
-    AdvanceAndCheck(core_timing, 0, 3);
-    AdvanceAndCheck(core_timing, 4, 0);
-}
-
-TEST_CASE("CoreTiming[FairSharing]", "[core]") {
+    core_timing.Pause(false); // No need to sync
 
-    ScopeInit guard;
-    auto& core_timing = guard.core_timing;
+    while (core_timing.HasPendingEvents())
+        ;
 
-    std::shared_ptr<Core::Timing::EventType> empty_callback =
-        Core::Timing::CreateEvent("empty_callback", EmptyCallback);
+    REQUIRE(callbacks_ran_flags.all());
 
-    callbacks_done = 0;
-    u64 MAX_CALLBACKS = 10;
-    for (std::size_t i = 0; i < 10; i++) {
-        core_timing.ScheduleEvent(i * 3333U, empty_callback, 0);
+    for (std::size_t i = 0; i < delays.size(); i++) {
+        const double delay = static_cast<double>(delays[i]);
+        const double micro = delay / 1000.0f;
+        const double mili = micro / 1000.0f;
+        printf("HostTimer Pausing Delay[%zu]: %.3f %.6f\n", i, micro, mili);
     }
-
-    const s64 advances = MAX_SLICE_LENGTH / 10;
-    core_timing.ResetRun();
-    u64 current_time = core_timing.GetTicks();
-    bool keep_running{};
-    do {
-        keep_running = false;
-        for (u32 active_core = 0; active_core < 4; ++active_core) {
-            core_timing.SwitchContext(active_core);
-            if (core_timing.CanCurrentContextRun()) {
-                core_timing.AddTicks(std::min<s64>(advances, core_timing.GetDowncount()));
-                core_timing.Advance();
-            }
-            keep_running |= core_timing.CanCurrentContextRun();
-        }
-    } while (keep_running);
-    u64 current_time_2 = core_timing.GetTicks();
-
-    REQUIRE(MAX_CALLBACKS == callbacks_done);
-    REQUIRE(current_time_2 == current_time + MAX_SLICE_LENGTH * 4);
 }
 
-TEST_CASE("Core::Timing[PredictableLateness]", "[core]") {
+TEST_CASE("CoreTiming[BasicOrderNoPausing]", "[core]") { // Schedules while the timer is running
     ScopeInit guard;
     auto& core_timing = guard.core_timing;
+    std::vector<std::shared_ptr<Core::Timing::EventType>> events{
+        Core::Timing::CreateEvent("callbackA", HostCallbackTemplate<0>),
+        Core::Timing::CreateEvent("callbackB", HostCallbackTemplate<1>),
+        Core::Timing::CreateEvent("callbackC", HostCallbackTemplate<2>),
+        Core::Timing::CreateEvent("callbackD", HostCallbackTemplate<3>),
+        Core::Timing::CreateEvent("callbackE", HostCallbackTemplate<4>),
+    };
+
+    core_timing.SyncPause(true);
+    core_timing.SyncPause(false);
+
+    expected_callback = 0;
+    callbacks_ran_flags.reset(); // Global flags: drop whatever an earlier test case left set
+    u64 start = core_timing.GetGlobalTimeNs().count();
+    u64 one_micro = 1000U;
+    for (std::size_t i = 0; i < events.size(); i++) {
+        u64 order = calls_order[i]; // Index of the event expected to fire i-th
+        core_timing.ScheduleEvent(i * one_micro + 100U, events[order], CB_IDS[order]);
+    }
+    u64 end = core_timing.GetGlobalTimeNs().count();
+    const double scheduling_time = static_cast<double>(end - start);
+    const double timer_time = static_cast<double>(TestTimerSpeed(core_timing));
 
-    std::shared_ptr<Core::Timing::EventType> cb_a =
-        Core::Timing::CreateEvent("callbackA", CallbackTemplate<0>);
-    std::shared_ptr<Core::Timing::EventType> cb_b =
-        Core::Timing::CreateEvent("callbackB", CallbackTemplate<1>);
+    while (core_timing.HasPendingEvents()) // Busy-wait while events are still pending
+        ;
 
-    // Enter slice 0
-    core_timing.ResetRun();
+    REQUIRE(callbacks_ran_flags.all()); // Every HostCallbackTemplate<IDX> must have fired
 
-    core_timing.ScheduleEvent(100, cb_a, CB_IDS[0]);
-    core_timing.ScheduleEvent(200, cb_b, CB_IDS[1]);
+    for (std::size_t i = 0; i < delays.size(); i++) {
+        const double delay = static_cast<double>(delays[i]);
+        const double micro = delay / 1000.0f;
+        const double mili = micro / 1000.0f;
+        printf("HostTimer No Pausing Delay[%zu]: %.3f %.6f\n", i, micro, mili);
+    }
 
-    AdvanceAndCheck(core_timing, 0, 0, 10, -10); // (100 - 10)
-    AdvanceAndCheck(core_timing, 1, 1, 50, -50);
+    const double micro = scheduling_time / 1000.0f;
+    const double mili = micro / 1000.0f;
+    printf("HostTimer No Pausing Scheduling Time: %.3f %.6f\n", micro, mili);
+    printf("HostTimer No Pausing Timer Time: %.3f %.6f\n", timer_time / 1000.f,
+           timer_time / 1000000.f);
 }
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index d6ee82836..21c46a567 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -3,6 +3,8 @@ add_library(video_core STATIC
     buffer_cache/buffer_cache.h
     buffer_cache/map_interval.cpp
     buffer_cache/map_interval.h
+    compatible_formats.cpp
+    compatible_formats.h
     dirty_flags.cpp
     dirty_flags.h
     dma_pusher.cpp
@@ -25,6 +27,14 @@ add_library(video_core STATIC
     engines/shader_bytecode.h
     engines/shader_header.h
     engines/shader_type.h
+    macro/macro.cpp
+    macro/macro.h
+    macro/macro_hle.cpp
+    macro/macro_hle.h
+    macro/macro_interpreter.cpp
+    macro/macro_interpreter.h
+    macro/macro_jit_x64.cpp
+    macro/macro_jit_x64.h
     fence_manager.h
     gpu.cpp
     gpu.h
@@ -36,8 +46,6 @@ add_library(video_core STATIC
     gpu_thread.h
     guest_driver.cpp
     guest_driver.h
-    macro_interpreter.cpp
-    macro_interpreter.h
     memory_manager.cpp
     memory_manager.h
     morton.cpp
@@ -45,11 +53,11 @@ add_library(video_core STATIC
     query_cache.h
     rasterizer_accelerated.cpp
     rasterizer_accelerated.h
-    rasterizer_cache.cpp
-    rasterizer_cache.h
     rasterizer_interface.h
     renderer_base.cpp
     renderer_base.h
+    renderer_opengl/gl_arb_decompiler.cpp
+    renderer_opengl/gl_arb_decompiler.h
     renderer_opengl/gl_buffer_cache.cpp
     renderer_opengl/gl_buffer_cache.h
     renderer_opengl/gl_device.cpp
@@ -89,6 +97,7 @@ add_library(video_core STATIC
     renderer_opengl/utils.h
     sampler_cache.cpp
     sampler_cache.h
+    shader_cache.h
     shader/decode/arithmetic.cpp
     shader/decode/arithmetic_immediate.cpp
     shader/decode/bfe.cpp
diff --git a/src/video_core/buffer_cache/buffer_block.h b/src/video_core/buffer_cache/buffer_block.h
index e35ee0b67..e64170e66 100644
--- a/src/video_core/buffer_cache/buffer_block.h
+++ b/src/video_core/buffer_cache/buffer_block.h
@@ -15,48 +15,47 @@ namespace VideoCommon {
 
 class BufferBlock {
 public:
-    bool Overlaps(const VAddr start, const VAddr end) const {
+    bool Overlaps(VAddr start, VAddr end) const {
         return (cpu_addr < end) && (cpu_addr_end > start);
     }
 
-    bool IsInside(const VAddr other_start, const VAddr other_end) const {
+    bool IsInside(VAddr other_start, VAddr other_end) const {
         return cpu_addr <= other_start && other_end <= cpu_addr_end;
     }
 
-    std::size_t GetOffset(const VAddr in_addr) {
+    std::size_t Offset(VAddr in_addr) const {
         return static_cast<std::size_t>(in_addr - cpu_addr);
     }
 
-    VAddr GetCpuAddr() const {
+    VAddr CpuAddr() const {
         return cpu_addr;
     }
 
-    VAddr GetCpuAddrEnd() const {
+    VAddr CpuAddrEnd() const {
         return cpu_addr_end;
     }
 
-    void SetCpuAddr(const VAddr new_addr) {
+    void SetCpuAddr(VAddr new_addr) {
         cpu_addr = new_addr;
         cpu_addr_end = new_addr + size;
     }
 
-    std::size_t GetSize() const {
+    std::size_t Size() const {
         return size;
     }
 
-    void SetEpoch(u64 new_epoch) {
-        epoch = new_epoch;
+    u64 Epoch() const {
+        return epoch;
     }
 
-    u64 GetEpoch() {
-        return epoch;
+    void SetEpoch(u64 new_epoch) {
+        epoch = new_epoch;
     }
 
 protected:
-    explicit BufferBlock(VAddr cpu_addr, const std::size_t size) : size{size} {
-        SetCpuAddr(cpu_addr);
+    explicit BufferBlock(VAddr cpu_addr_, std::size_t size_) : size{size_} {
+        SetCpuAddr(cpu_addr_);
     }
-    ~BufferBlock() = default;
 
 private:
     VAddr cpu_addr{};
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index d9a4a1b4d..dd7ce8c99 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -30,23 +30,31 @@
 
 namespace VideoCommon {
 
-template <typename OwnerBuffer, typename BufferType, typename StreamBuffer>
+template <typename Buffer, typename BufferType, typename StreamBuffer>
 class BufferCache {
     using IntervalSet = boost::icl::interval_set<VAddr>;
     using IntervalType = typename IntervalSet::interval_type;
     using VectorMapInterval = boost::container::small_vector<MapInterval*, 1>;
 
+    static constexpr u64 WRITE_PAGE_BIT = 11;
+    static constexpr u64 BLOCK_PAGE_BITS = 21;
+    static constexpr u64 BLOCK_PAGE_SIZE = 1ULL << BLOCK_PAGE_BITS;
+
 public:
-    using BufferInfo = std::pair<BufferType, u64>;
+    struct BufferInfo {
+        BufferType handle;
+        u64 offset;
+        u64 address;
+    };
 
     BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4,
                             bool is_written = false, bool use_fast_cbuf = false) {
         std::lock_guard lock{mutex};
 
-        const auto& memory_manager = system.GPU().MemoryManager();
+        auto& memory_manager = system.GPU().MemoryManager();
         const std::optional<VAddr> cpu_addr_opt = memory_manager.GpuToCpuAddress(gpu_addr);
         if (!cpu_addr_opt) {
-            return {GetEmptyBuffer(size), 0};
+            return GetEmptyBuffer(size);
         }
         const VAddr cpu_addr = *cpu_addr_opt;
 
@@ -55,37 +63,41 @@ public:
         constexpr std::size_t max_stream_size = 0x800;
         if (use_fast_cbuf || size < max_stream_size) {
             if (!is_written && !IsRegionWritten(cpu_addr, cpu_addr + size - 1)) {
-                auto& memory_manager = system.GPU().MemoryManager();
+                const bool is_granular = memory_manager.IsGranularRange(gpu_addr, size);
                 if (use_fast_cbuf) {
-                    if (memory_manager.IsGranularRange(gpu_addr, size)) {
-                        const auto host_ptr = memory_manager.GetPointer(gpu_addr);
-                        return ConstBufferUpload(host_ptr, size);
+                    u8* dest;
+                    if (is_granular) {
+                        dest = memory_manager.GetPointer(gpu_addr);
                     } else {
                         staging_buffer.resize(size);
-                        memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size);
-                        return ConstBufferUpload(staging_buffer.data(), size);
+                        dest = staging_buffer.data();
+                        memory_manager.ReadBlockUnsafe(gpu_addr, dest, size);
                     }
+                    return ConstBufferUpload(dest, size);
+                }
+                if (is_granular) {
+                    u8* const host_ptr = memory_manager.GetPointer(gpu_addr);
+                    return StreamBufferUpload(size, alignment, [host_ptr, size](u8* dest) {
+                        std::memcpy(dest, host_ptr, size);
+                    });
                 } else {
-                    if (memory_manager.IsGranularRange(gpu_addr, size)) {
-                        const auto host_ptr = memory_manager.GetPointer(gpu_addr);
-                        return StreamBufferUpload(host_ptr, size, alignment);
-                    } else {
-                        staging_buffer.resize(size);
-                        memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size);
-                        return StreamBufferUpload(staging_buffer.data(), size, alignment);
-                    }
+                    return StreamBufferUpload(
+                        size, alignment, [&memory_manager, gpu_addr, size](u8* dest) {
+                            memory_manager.ReadBlockUnsafe(gpu_addr, dest, size);
+                        });
                 }
             }
         }
 
-        OwnerBuffer block = GetBlock(cpu_addr, size);
+        Buffer* const block = GetBlock(cpu_addr, size);
         MapInterval* const map = MapAddress(block, gpu_addr, cpu_addr, size);
         if (!map) {
-            return {GetEmptyBuffer(size), 0};
+            return GetEmptyBuffer(size);
         }
         if (is_written) {
             map->MarkAsModified(true, GetModifiedTicks());
-            if (Settings::IsGPULevelHigh() && Settings::values.use_asynchronous_gpu_emulation) {
+            if (Settings::IsGPULevelHigh() &&
+                Settings::values.use_asynchronous_gpu_emulation.GetValue()) {
                 MarkForAsyncFlush(map);
             }
             if (!map->is_written) {
@@ -94,41 +106,49 @@ public:
             }
         }
 
-        return {ToHandle(block), static_cast<u64>(block->GetOffset(cpu_addr))};
+        return BufferInfo{block->Handle(), block->Offset(cpu_addr), block->Address()};
     }
 
     /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset.
     BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size,
                                 std::size_t alignment = 4) {
         std::lock_guard lock{mutex};
-        return StreamBufferUpload(raw_pointer, size, alignment);
+        return StreamBufferUpload(size, alignment, [raw_pointer, size](u8* dest) {
+            std::memcpy(dest, raw_pointer, size);
+        });
     }
 
-    void Map(std::size_t max_size) {
+    /// Prepares the buffer cache for data uploading
+    /// @param max_size Maximum number of bytes that will be uploaded
+    /// @return True when a stream buffer invalidation was required, false otherwise
+    bool Map(std::size_t max_size) {
         std::lock_guard lock{mutex};
 
+        bool invalidated;
         std::tie(buffer_ptr, buffer_offset_base, invalidated) = stream_buffer->Map(max_size, 4);
         buffer_offset = buffer_offset_base;
+
+        return invalidated;
     }
 
-    /// Finishes the upload stream, returns true on bindings invalidation.
-    bool Unmap() {
+    /// Finishes the upload stream
+    void Unmap() {
         std::lock_guard lock{mutex};
-
         stream_buffer->Unmap(buffer_offset - buffer_offset_base);
-        return std::exchange(invalidated, false);
     }
 
+    /// Function called at the end of each frame, intended for deferred operations
     void TickFrame() {
         ++epoch;
+
         while (!pending_destruction.empty()) {
             // Delay at least 4 frames before destruction.
             // This is due to triple buffering happening on some drivers.
             static constexpr u64 epochs_to_destroy = 5;
-            if (pending_destruction.front()->GetEpoch() + epochs_to_destroy > epoch) {
+            if (pending_destruction.front()->Epoch() + epochs_to_destroy > epoch) {
                 break;
             }
-            pending_destruction.pop_front();
+            pending_destruction.pop();
         }
     }
 
@@ -239,28 +259,16 @@ public:
         committed_flushes.pop_front();
     }
 
-    virtual BufferType GetEmptyBuffer(std::size_t size) = 0;
+    virtual BufferInfo GetEmptyBuffer(std::size_t size) = 0;
 
 protected:
     explicit BufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
                          std::unique_ptr<StreamBuffer> stream_buffer)
-        : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer)},
-          stream_buffer_handle{this->stream_buffer->GetHandle()} {}
+        : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer)} {}
 
     ~BufferCache() = default;
 
-    virtual BufferType ToHandle(const OwnerBuffer& storage) = 0;
-
-    virtual OwnerBuffer CreateBlock(VAddr cpu_addr, std::size_t size) = 0;
-
-    virtual void UploadBlockData(const OwnerBuffer& buffer, std::size_t offset, std::size_t size,
-                                 const u8* data) = 0;
-
-    virtual void DownloadBlockData(const OwnerBuffer& buffer, std::size_t offset, std::size_t size,
-                                   u8* data) = 0;
-
-    virtual void CopyBlock(const OwnerBuffer& src, const OwnerBuffer& dst, std::size_t src_offset,
-                           std::size_t dst_offset, std::size_t size) = 0;
+    virtual std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) = 0;
 
     virtual BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) {
         return {};
@@ -315,19 +323,18 @@ protected:
     }
 
 private:
-    MapInterval* MapAddress(const OwnerBuffer& block, GPUVAddr gpu_addr, VAddr cpu_addr,
-                            std::size_t size) {
+    MapInterval* MapAddress(Buffer* block, GPUVAddr gpu_addr, VAddr cpu_addr, std::size_t size) {
         const VectorMapInterval overlaps = GetMapsInRange(cpu_addr, size);
         if (overlaps.empty()) {
             auto& memory_manager = system.GPU().MemoryManager();
             const VAddr cpu_addr_end = cpu_addr + size;
             if (memory_manager.IsGranularRange(gpu_addr, size)) {
                 u8* host_ptr = memory_manager.GetPointer(gpu_addr);
-                UploadBlockData(block, block->GetOffset(cpu_addr), size, host_ptr);
+                block->Upload(block->Offset(cpu_addr), size, host_ptr);
             } else {
                 staging_buffer.resize(size);
                 memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size);
-                UploadBlockData(block, block->GetOffset(cpu_addr), size, staging_buffer.data());
+                block->Upload(block->Offset(cpu_addr), size, staging_buffer.data());
             }
             return Register(MapInterval(cpu_addr, cpu_addr_end, gpu_addr));
         }
@@ -363,15 +370,15 @@ private:
         }
         if (modified_inheritance) {
             map->MarkAsModified(true, GetModifiedTicks());
-            if (Settings::IsGPULevelHigh() && Settings::values.use_asynchronous_gpu_emulation) {
+            if (Settings::IsGPULevelHigh() &&
+                Settings::values.use_asynchronous_gpu_emulation.GetValue()) {
                 MarkForAsyncFlush(map);
             }
         }
         return map;
     }
 
-    void UpdateBlock(const OwnerBuffer& block, VAddr start, VAddr end,
-                     const VectorMapInterval& overlaps) {
+    void UpdateBlock(Buffer* block, VAddr start, VAddr end, const VectorMapInterval& overlaps) {
         const IntervalType base_interval{start, end};
         IntervalSet interval_set{};
         interval_set.add(base_interval);
@@ -380,13 +387,13 @@ private:
             interval_set.subtract(subtract);
         }
         for (auto& interval : interval_set) {
-            std::size_t size = interval.upper() - interval.lower();
-            if (size > 0) {
-                staging_buffer.resize(size);
-                system.Memory().ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size);
-                UploadBlockData(block, block->GetOffset(interval.lower()), size,
-                                staging_buffer.data());
+            const std::size_t size = interval.upper() - interval.lower();
+            if (size == 0) {
+                continue;
             }
+            staging_buffer.resize(size);
+            system.Memory().ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size);
+            block->Upload(block->Offset(interval.lower()), size, staging_buffer.data());
         }
     }
 
@@ -416,23 +423,27 @@ private:
     }
 
     void FlushMap(MapInterval* map) {
+        const auto it = blocks.find(map->start >> BLOCK_PAGE_BITS);
+        ASSERT_OR_EXECUTE(it != blocks.end(), return;);
+
+        std::shared_ptr<Buffer> block = it->second;
+
         const std::size_t size = map->end - map->start;
-        OwnerBuffer block = blocks[map->start >> block_page_bits];
         staging_buffer.resize(size);
-        DownloadBlockData(block, block->GetOffset(map->start), size, staging_buffer.data());
+        block->Download(block->Offset(map->start), size, staging_buffer.data());
         system.Memory().WriteBlockUnsafe(map->start, staging_buffer.data(), size);
         map->MarkAsModified(false, 0);
     }
 
-    BufferInfo StreamBufferUpload(const void* raw_pointer, std::size_t size,
-                                  std::size_t alignment) {
+    template <typename Callable>
+    BufferInfo StreamBufferUpload(std::size_t size, std::size_t alignment, Callable&& callable) {
         AlignBuffer(alignment);
         const std::size_t uploaded_offset = buffer_offset;
-        std::memcpy(buffer_ptr, raw_pointer, size);
+        callable(buffer_ptr);
 
         buffer_ptr += size;
         buffer_offset += size;
-        return {stream_buffer_handle, uploaded_offset};
+        return BufferInfo{stream_buffer->Handle(), uploaded_offset, stream_buffer->Address()};
     }
 
     void AlignBuffer(std::size_t alignment) {
@@ -442,97 +453,89 @@ private:
         buffer_offset = offset_aligned;
     }
 
-    OwnerBuffer EnlargeBlock(OwnerBuffer buffer) {
-        const std::size_t old_size = buffer->GetSize();
-        const std::size_t new_size = old_size + block_page_size;
-        const VAddr cpu_addr = buffer->GetCpuAddr();
-        OwnerBuffer new_buffer = CreateBlock(cpu_addr, new_size);
-        CopyBlock(buffer, new_buffer, 0, 0, old_size);
-        buffer->SetEpoch(epoch);
-        pending_destruction.push_back(buffer);
+    std::shared_ptr<Buffer> EnlargeBlock(std::shared_ptr<Buffer> buffer) {
+        const std::size_t old_size = buffer->Size();
+        const std::size_t new_size = old_size + BLOCK_PAGE_SIZE;
+        const VAddr cpu_addr = buffer->CpuAddr();
+        std::shared_ptr<Buffer> new_buffer = CreateBlock(cpu_addr, new_size);
+        new_buffer->CopyFrom(*buffer, 0, 0, old_size);
+        QueueDestruction(std::move(buffer));
+
         const VAddr cpu_addr_end = cpu_addr + new_size - 1;
-        u64 page_start = cpu_addr >> block_page_bits;
-        const u64 page_end = cpu_addr_end >> block_page_bits;
-        while (page_start <= page_end) {
-            blocks[page_start] = new_buffer;
-            ++page_start;
+        const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
+        for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
+            blocks.insert_or_assign(page_start, new_buffer);
         }
+
         return new_buffer;
     }
 
-    OwnerBuffer MergeBlocks(OwnerBuffer first, OwnerBuffer second) {
-        const std::size_t size_1 = first->GetSize();
-        const std::size_t size_2 = second->GetSize();
-        const VAddr first_addr = first->GetCpuAddr();
-        const VAddr second_addr = second->GetCpuAddr();
+    std::shared_ptr<Buffer> MergeBlocks(std::shared_ptr<Buffer> first,
+                                        std::shared_ptr<Buffer> second) {
+        const std::size_t size_1 = first->Size();
+        const std::size_t size_2 = second->Size();
+        const VAddr first_addr = first->CpuAddr();
+        const VAddr second_addr = second->CpuAddr();
         const VAddr new_addr = std::min(first_addr, second_addr);
         const std::size_t new_size = size_1 + size_2;
-        OwnerBuffer new_buffer = CreateBlock(new_addr, new_size);
-        CopyBlock(first, new_buffer, 0, new_buffer->GetOffset(first_addr), size_1);
-        CopyBlock(second, new_buffer, 0, new_buffer->GetOffset(second_addr), size_2);
-        first->SetEpoch(epoch);
-        second->SetEpoch(epoch);
-        pending_destruction.push_back(first);
-        pending_destruction.push_back(second);
+
+        std::shared_ptr<Buffer> new_buffer = CreateBlock(new_addr, new_size);
+        new_buffer->CopyFrom(*first, 0, new_buffer->Offset(first_addr), size_1);
+        new_buffer->CopyFrom(*second, 0, new_buffer->Offset(second_addr), size_2);
+        QueueDestruction(std::move(first));
+        QueueDestruction(std::move(second));
+
         const VAddr cpu_addr_end = new_addr + new_size - 1;
-        u64 page_start = new_addr >> block_page_bits;
-        const u64 page_end = cpu_addr_end >> block_page_bits;
-        while (page_start <= page_end) {
-            blocks[page_start] = new_buffer;
-            ++page_start;
+        const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
+        for (u64 page_start = new_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
+            blocks.insert_or_assign(page_start, new_buffer);
         }
         return new_buffer;
     }
 
-    OwnerBuffer GetBlock(const VAddr cpu_addr, const std::size_t size) {
-        OwnerBuffer found;
+    Buffer* GetBlock(VAddr cpu_addr, std::size_t size) {
+        std::shared_ptr<Buffer> found;
+
         const VAddr cpu_addr_end = cpu_addr + size - 1;
-        u64 page_start = cpu_addr >> block_page_bits;
-        const u64 page_end = cpu_addr_end >> block_page_bits;
-        while (page_start <= page_end) {
+        const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
+        for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
             auto it = blocks.find(page_start);
             if (it == blocks.end()) {
                 if (found) {
                     found = EnlargeBlock(found);
-                } else {
-                    const VAddr start_addr = (page_start << block_page_bits);
-                    found = CreateBlock(start_addr, block_page_size);
-                    blocks[page_start] = found;
-                }
-            } else {
-                if (found) {
-                    if (found == it->second) {
-                        ++page_start;
-                        continue;
-                    }
-                    found = MergeBlocks(found, it->second);
-                } else {
-                    found = it->second;
+                    continue;
                 }
+                const VAddr start_addr = page_start << BLOCK_PAGE_BITS;
+                found = CreateBlock(start_addr, BLOCK_PAGE_SIZE);
+                blocks.insert_or_assign(page_start, found);
+                continue;
+            }
+            if (!found) {
+                found = it->second;
+                continue;
+            }
+            if (found != it->second) {
+                found = MergeBlocks(std::move(found), it->second);
             }
-            ++page_start;
         }
-        return found;
+        return found.get();
     }
 
-    void MarkRegionAsWritten(const VAddr start, const VAddr end) {
-        u64 page_start = start >> write_page_bit;
-        const u64 page_end = end >> write_page_bit;
-        while (page_start <= page_end) {
+    void MarkRegionAsWritten(VAddr start, VAddr end) {
+        const u64 page_end = end >> WRITE_PAGE_BIT;
+        for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
             auto it = written_pages.find(page_start);
             if (it != written_pages.end()) {
                 it->second = it->second + 1;
             } else {
-                written_pages[page_start] = 1;
+                written_pages.insert_or_assign(page_start, 1);
             }
-            ++page_start;
         }
     }
 
-    void UnmarkRegionAsWritten(const VAddr start, const VAddr end) {
-        u64 page_start = start >> write_page_bit;
-        const u64 page_end = end >> write_page_bit;
-        while (page_start <= page_end) {
+    void UnmarkRegionAsWritten(VAddr start, VAddr end) {
+        const u64 page_end = end >> WRITE_PAGE_BIT;
+        for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
             auto it = written_pages.find(page_start);
             if (it != written_pages.end()) {
                 if (it->second > 1) {
@@ -541,22 +544,24 @@ private:
                     written_pages.erase(it);
                 }
             }
-            ++page_start;
         }
     }
 
-    bool IsRegionWritten(const VAddr start, const VAddr end) const {
-        u64 page_start = start >> write_page_bit;
-        const u64 page_end = end >> write_page_bit;
-        while (page_start <= page_end) {
+    bool IsRegionWritten(VAddr start, VAddr end) const {
+        const u64 page_end = end >> WRITE_PAGE_BIT;
+        for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
             if (written_pages.count(page_start) > 0) {
                 return true;
             }
-            ++page_start;
         }
         return false;
     }
 
+    void QueueDestruction(std::shared_ptr<Buffer> buffer) {
+        buffer->SetEpoch(epoch);
+        pending_destruction.push(std::move(buffer));
+    }
+
     void MarkForAsyncFlush(MapInterval* map) {
         if (!uncommitted_flushes) {
             uncommitted_flushes = std::make_shared<std::unordered_set<MapInterval*>>();
@@ -568,9 +573,7 @@ private:
     Core::System& system;
 
     std::unique_ptr<StreamBuffer> stream_buffer;
-    BufferType stream_buffer_handle{};
-
-    bool invalidated = false;
+    BufferType stream_buffer_handle;
 
     u8* buffer_ptr = nullptr;
     u64 buffer_offset = 0;
@@ -580,18 +583,15 @@ private:
     boost::intrusive::set<MapInterval, boost::intrusive::compare<MapIntervalCompare>>
         mapped_addresses;
 
-    static constexpr u64 write_page_bit = 11;
     std::unordered_map<u64, u32> written_pages;
+    std::unordered_map<u64, std::shared_ptr<Buffer>> blocks;
 
-    static constexpr u64 block_page_bits = 21;
-    static constexpr u64 block_page_size = 1ULL << block_page_bits;
-    std::unordered_map<u64, OwnerBuffer> blocks;
-
-    std::list<OwnerBuffer> pending_destruction;
+    std::queue<std::shared_ptr<Buffer>> pending_destruction;
     u64 epoch = 0;
     u64 modified_ticks = 0;
 
     std::vector<u8> staging_buffer;
+
     std::list<MapInterval*> marked_for_unregister;
 
     std::shared_ptr<std::unordered_set<MapInterval*>> uncommitted_flushes;
diff --git a/src/video_core/compatible_formats.cpp b/src/video_core/compatible_formats.cpp
new file mode 100644
index 000000000..6c426b035
--- /dev/null
+++ b/src/video_core/compatible_formats.cpp
@@ -0,0 +1,162 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <array>
+#include <bitset>
+#include <cstddef>
+
+#include "video_core/compatible_formats.h"
+#include "video_core/surface.h"
+
+namespace VideoCore::Surface {
+
+namespace {
+
+// Compatibility table taken from Table 3.X.2 in:
+// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_view.txt
+
+constexpr std::array VIEW_CLASS_128_BITS = {
+    PixelFormat::RGBA32F,
+    PixelFormat::RGBA32UI,
+};
+// Missing formats:
+// PixelFormat::RGBA32I
+
+constexpr std::array VIEW_CLASS_96_BITS = {
+    PixelFormat::RGB32F,
+};
+// Missing formats:
+// PixelFormat::RGB32UI,
+// PixelFormat::RGB32I,
+
+constexpr std::array VIEW_CLASS_64_BITS = {
+    PixelFormat::RGBA16F, PixelFormat::RG32F,   PixelFormat::RGBA16UI, PixelFormat::RG32UI,
+    PixelFormat::RGBA16U, PixelFormat::RGBA16S,
+};
+// Missing formats:
+// PixelFormat::RGBA16I
+// PixelFormat::RG32I
+
+// TODO: How should we handle 48 bits?
+
+constexpr std::array VIEW_CLASS_32_BITS = {
+    PixelFormat::RG16F,        PixelFormat::R11FG11FB10F, PixelFormat::R32F,
+    PixelFormat::A2B10G10R10U, PixelFormat::RG16UI,       PixelFormat::R32UI,
+    PixelFormat::RG16I,        PixelFormat::R32I,         PixelFormat::ABGR8U,
+    PixelFormat::RG16,         PixelFormat::ABGR8S,       PixelFormat::RG16S,
+    PixelFormat::RGBA8_SRGB,   PixelFormat::E5B9G9R9F,    PixelFormat::BGRA8,
+    PixelFormat::BGRA8_SRGB,
+};
+// Missing formats:
+// PixelFormat::RGBA8UI
+// PixelFormat::RGBA8I
+// PixelFormat::RGB10_A2_UI
+
+// TODO: How should we handle 24 bits?
+
+constexpr std::array VIEW_CLASS_16_BITS = {
+    PixelFormat::R16F, PixelFormat::RG8UI, PixelFormat::R16UI, PixelFormat::R16I,
+    PixelFormat::RG8U, PixelFormat::R16U,  PixelFormat::RG8S,  PixelFormat::R16S,
+};
+// Missing formats:
+// PixelFormat::RG8I
+
+constexpr std::array VIEW_CLASS_8_BITS = {
+    PixelFormat::R8UI,
+    PixelFormat::R8U,
+};
+// Missing formats:
+// PixelFormat::R8I
+// PixelFormat::R8S
+
+constexpr std::array VIEW_CLASS_RGTC1_RED = {
+    PixelFormat::DXN1,
+};
+// Missing formats:
+// COMPRESSED_SIGNED_RED_RGTC1
+
+constexpr std::array VIEW_CLASS_RGTC2_RG = {
+    PixelFormat::DXN2UNORM,
+    PixelFormat::DXN2SNORM,
+};
+
+constexpr std::array VIEW_CLASS_BPTC_UNORM = {
+    PixelFormat::BC7U,
+    PixelFormat::BC7U_SRGB,
+};
+
+constexpr std::array VIEW_CLASS_BPTC_FLOAT = {
+    PixelFormat::BC6H_SF16,
+    PixelFormat::BC6H_UF16,
+};
+
+// Compatibility table taken from Table 4.X.1 in:
+// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_copy_image.txt
+
+constexpr std::array COPY_CLASS_128_BITS = {
+    PixelFormat::RGBA32UI,   PixelFormat::RGBA32F,   PixelFormat::DXT23,
+    PixelFormat::DXT23_SRGB, PixelFormat::DXT45,     PixelFormat::DXT45_SRGB,
+    PixelFormat::DXN2SNORM,  PixelFormat::BC7U,      PixelFormat::BC7U_SRGB,
+    PixelFormat::BC6H_SF16,  PixelFormat::BC6H_UF16,
+};
+// Missing formats:
+// PixelFormat::RGBA32I
+// COMPRESSED_RG_RGTC2
+
+constexpr std::array COPY_CLASS_64_BITS = {
+    PixelFormat::RGBA16F, PixelFormat::RG32F,   PixelFormat::RGBA16UI,  PixelFormat::RG32UI,
+    PixelFormat::RGBA16U, PixelFormat::RGBA16S, PixelFormat::DXT1_SRGB, PixelFormat::DXT1,
+
+};
+// Missing formats:
+// PixelFormat::RGBA16I
+// PixelFormat::RG32I,
+// COMPRESSED_RGB_S3TC_DXT1_EXT
+// COMPRESSED_SRGB_S3TC_DXT1_EXT
+// COMPRESSED_RGBA_S3TC_DXT1_EXT
+// COMPRESSED_SIGNED_RED_RGTC1
+
+void Enable(FormatCompatibility::Table& compatibility, size_t format_a, size_t format_b) {
+    compatibility[format_a][format_b] = true;
+    compatibility[format_b][format_a] = true;
+}
+
+void Enable(FormatCompatibility::Table& compatibility, PixelFormat format_a, PixelFormat format_b) {
+    Enable(compatibility, static_cast<size_t>(format_a), static_cast<size_t>(format_b));
+}
+
+template <typename Range>
+void EnableRange(FormatCompatibility::Table& compatibility, const Range& range) {
+    for (auto it_a = range.begin(); it_a != range.end(); ++it_a) {
+        for (auto it_b = it_a; it_b != range.end(); ++it_b) {
+            Enable(compatibility, *it_a, *it_b);
+        }
+    }
+}
+
+} // Anonymous namespace
+
+FormatCompatibility::FormatCompatibility() {
+    for (size_t i = 0; i < MaxPixelFormat; ++i) {
+        // Identity is allowed
+        Enable(view, i, i);
+    }
+
+    EnableRange(view, VIEW_CLASS_128_BITS);
+    EnableRange(view, VIEW_CLASS_96_BITS);
+    EnableRange(view, VIEW_CLASS_64_BITS);
+    EnableRange(view, VIEW_CLASS_32_BITS);
+    EnableRange(view, VIEW_CLASS_16_BITS);
+    EnableRange(view, VIEW_CLASS_8_BITS);
+    EnableRange(view, VIEW_CLASS_RGTC1_RED);
+    EnableRange(view, VIEW_CLASS_RGTC2_RG);
+    EnableRange(view, VIEW_CLASS_BPTC_UNORM);
+    EnableRange(view, VIEW_CLASS_BPTC_FLOAT);
+
+    copy = view;
+    EnableRange(copy, COPY_CLASS_128_BITS);
+    EnableRange(copy, COPY_CLASS_64_BITS);
+}
+
+} // namespace VideoCore::Surface
diff --git a/src/video_core/compatible_formats.h b/src/video_core/compatible_formats.h
new file mode 100644
index 000000000..d1082566d
--- /dev/null
+++ b/src/video_core/compatible_formats.h
@@ -0,0 +1,37 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <bitset>
+#include <cstddef>
+
+#include "video_core/surface.h"
+
+namespace VideoCore::Surface {
+
+/// Lookup tables describing which pixel format pairs are compatible for views and copies.
+class FormatCompatibility {
+public:
+    using Table = std::array<std::bitset<MaxPixelFormat>, MaxPixelFormat>;
+
+    explicit FormatCompatibility();
+
+    /// Returns the entry of the view table for the given pair of formats.
+    bool TestView(PixelFormat format_a, PixelFormat format_b) const noexcept {
+        return view[static_cast<size_t>(format_a)][static_cast<size_t>(format_b)];
+    }
+
+    /// Returns the entry of the copy table for the given pair of formats.
+    bool TestCopy(PixelFormat format_a, PixelFormat format_b) const noexcept {
+        return copy[static_cast<size_t>(format_a)][static_cast<size_t>(format_b)];
+    }
+
+private:
+    Table view;
+    Table copy;
+};
+
+} // namespace VideoCore::Surface
diff --git a/src/video_core/engines/const_buffer_engine_interface.h b/src/video_core/engines/const_buffer_engine_interface.h
index ebe139504..f46e81bb7 100644
--- a/src/video_core/engines/const_buffer_engine_interface.h
+++ b/src/video_core/engines/const_buffer_engine_interface.h
@@ -93,6 +93,7 @@ public:
     virtual SamplerDescriptor AccessBoundSampler(ShaderType stage, u64 offset) const = 0;
     virtual SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer,
                                                     u64 offset) const = 0;
+    virtual SamplerDescriptor AccessSampler(u32 handle) const = 0;
     virtual u32 GetBoundBuffer() const = 0;
 
     virtual VideoCore::GuestDriverProfile& AccessGuestDriverProfile() = 0;
diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp
index f6237fc6a..a82b06a38 100644
--- a/src/video_core/engines/kepler_compute.cpp
+++ b/src/video_core/engines/kepler_compute.cpp
@@ -92,8 +92,11 @@ SamplerDescriptor KeplerCompute::AccessBindlessSampler(ShaderType stage, u64 con
     ASSERT(stage == ShaderType::Compute);
     const auto& tex_info_buffer = launch_description.const_buffer_config[const_buffer];
     const GPUVAddr tex_info_address = tex_info_buffer.Address() + offset;
+    return AccessSampler(memory_manager.Read<u32>(tex_info_address));
+}
 
-    const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)};
+SamplerDescriptor KeplerCompute::AccessSampler(u32 handle) const {
+    const Texture::TextureHandle tex_handle{handle};
     const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle);
     SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic);
     result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value());
diff --git a/src/video_core/engines/kepler_compute.h b/src/video_core/engines/kepler_compute.h
index 18ceedfaf..b7f668d88 100644
--- a/src/video_core/engines/kepler_compute.h
+++ b/src/video_core/engines/kepler_compute.h
@@ -219,6 +219,8 @@ public:
     SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer,
                                             u64 offset) const override;
 
+    SamplerDescriptor AccessSampler(u32 handle) const override;
+
     u32 GetBoundBuffer() const override {
         return regs.tex_cb_index;
     }
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 004f6b261..c01436295 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -25,9 +25,8 @@ constexpr u32 MacroRegistersStart = 0xE00;
 Maxwell3D::Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
                      MemoryManager& memory_manager)
     : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager},
-      macro_interpreter{*this}, upload_state{memory_manager, regs.upload} {
+      macro_engine{GetMacroEngine(*this)}, upload_state{memory_manager, regs.upload} {
     dirty.flags.flip();
-
     InitializeRegisterDefaults();
 }
 
@@ -106,7 +105,11 @@ void Maxwell3D::InitializeRegisterDefaults() {
     regs.rasterize_enable = 1;
     regs.rt_separate_frag_data = 1;
     regs.framebuffer_srgb = 1;
+    regs.line_width_aliased = 1.0f;
+    regs.line_width_smooth = 1.0f;
     regs.front_face = Maxwell3D::Regs::FrontFace::ClockWise;
+    regs.polygon_mode_back = Maxwell3D::Regs::PolygonMode::Fill;
+    regs.polygon_mode_front = Maxwell3D::Regs::PolygonMode::Fill;
 
     shadow_state = regs;
 
@@ -116,7 +119,7 @@ void Maxwell3D::InitializeRegisterDefaults() {
     mme_inline[MAXWELL3D_REG_INDEX(index_array.count)] = true;
 }
 
-void Maxwell3D::CallMacroMethod(u32 method, std::size_t num_parameters, const u32* parameters) {
+void Maxwell3D::CallMacroMethod(u32 method, const std::vector<u32>& parameters) {
     // Reset the current macro.
     executing_macro = 0;
 
@@ -125,7 +128,7 @@ void Maxwell3D::CallMacroMethod(u32 method, std::size_t num_parameters, const u3
         ((method - MacroRegistersStart) >> 1) % static_cast<u32>(macro_positions.size());
 
     // Execute the current macro.
-    macro_interpreter.Execute(macro_positions[entry], num_parameters, parameters);
+    macro_engine->Execute(*this, macro_positions[entry], parameters);
     if (mme_draw.current_mode != MMEDrawMode::Undefined) {
         FlushMMEInlineDraw();
     }
@@ -161,7 +164,7 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
 
         // Call the macro when there are no more parameters in the command buffer
         if (is_last_call) {
-            CallMacroMethod(executing_macro, macro_params.size(), macro_params.data());
+            CallMacroMethod(executing_macro, macro_params);
             macro_params.clear();
         }
         return;
@@ -197,7 +200,7 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
         break;
     }
     case MAXWELL3D_REG_INDEX(macros.data): {
-        ProcessMacroUpload(arg);
+        macro_engine->AddCode(regs.macros.upload_address, arg);
         break;
     }
     case MAXWELL3D_REG_INDEX(macros.bind): {
@@ -306,7 +309,7 @@ void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
 
         // Call the macro when there are no more parameters in the command buffer
         if (amount == methods_pending) {
-            CallMacroMethod(executing_macro, macro_params.size(), macro_params.data());
+            CallMacroMethod(executing_macro, macro_params);
             macro_params.clear();
         }
         return;
@@ -420,9 +423,7 @@ void Maxwell3D::FlushMMEInlineDraw() {
 }
 
 void Maxwell3D::ProcessMacroUpload(u32 data) {
-    ASSERT_MSG(regs.macros.upload_address < macro_memory.size(),
-               "upload_address exceeded macro_memory size!");
-    macro_memory[regs.macros.upload_address++] = data;
+    macro_engine->AddCode(regs.macros.upload_address++, data);
 }
 
 void Maxwell3D::ProcessMacroBind(u32 data) {
@@ -739,8 +740,11 @@ SamplerDescriptor Maxwell3D::AccessBindlessSampler(ShaderType stage, u64 const_b
     const auto& shader = state.shader_stages[static_cast<std::size_t>(stage)];
     const auto& tex_info_buffer = shader.const_buffers[const_buffer];
     const GPUVAddr tex_info_address = tex_info_buffer.address + offset;
+    return AccessSampler(memory_manager.Read<u32>(tex_info_address));
+}
 
-    const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)};
+SamplerDescriptor Maxwell3D::AccessSampler(u32 handle) const {
+    const Texture::TextureHandle tex_handle{handle};
     const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle);
     SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic);
     result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value());
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 05dd6b39b..ef1618990 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -23,7 +23,7 @@
 #include "video_core/engines/engine_upload.h"
 #include "video_core/engines/shader_type.h"
 #include "video_core/gpu.h"
-#include "video_core/macro_interpreter.h"
+#include "video_core/macro/macro.h"
 #include "video_core/textures/texture.h"
 
 namespace Core {
@@ -598,6 +598,7 @@ public:
                 BitField<4, 3, u32> block_height;
                 BitField<8, 3, u32> block_depth;
                 BitField<12, 1, InvMemoryLayout> type;
+                BitField<16, 1, u32> is_3d;
             } memory_layout;
             union {
                 BitField<0, 16, u32> layers;
@@ -1403,6 +1404,8 @@ public:
     SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer,
                                             u64 offset) const override;
 
+    SamplerDescriptor AccessSampler(u32 handle) const override;
+
     u32 GetBoundBuffer() const override {
         return regs.tex_cb_index;
     }
@@ -1411,17 +1414,16 @@ public:
 
     const VideoCore::GuestDriverProfile& AccessGuestDriverProfile() const override;
 
-    /// Memory for macro code - it's undetermined how big this is, however 1MB is much larger than
-    /// we've seen used.
-    using MacroMemory = std::array<u32, 0x40000>;
+    bool ShouldExecute() const {
+        return execute_on;
+    }
 
-    /// Gets a reference to macro memory.
-    const MacroMemory& GetMacroMemory() const {
-        return macro_memory;
+    VideoCore::RasterizerInterface& GetRasterizer() {
+        return rasterizer;
     }
 
-    bool ShouldExecute() const {
-        return execute_on;
+    const VideoCore::RasterizerInterface& GetRasterizer() const {
+        return rasterizer;
     }
 
     /// Notify a memory write has happened.
@@ -1468,16 +1470,13 @@ private:
 
     std::array<bool, Regs::NUM_REGS> mme_inline{};
 
-    /// Memory for macro code
-    MacroMemory macro_memory;
-
     /// Macro method that is currently being executed / being fed parameters.
     u32 executing_macro = 0;
     /// Parameters that have been submitted to the macro call so far.
     std::vector<u32> macro_params;
 
     /// Interpreter for the macro codes uploaded to the GPU.
-    MacroInterpreter macro_interpreter;
+    std::unique_ptr<MacroEngine> macro_engine;
 
     static constexpr u32 null_cb_data = 0xFFFFFFFF;
     struct {
@@ -1506,7 +1505,7 @@ private:
      * @param num_parameters Number of arguments
      * @param parameters Arguments to the method call
      */
-    void CallMacroMethod(u32 method, std::size_t num_parameters, const u32* parameters);
+    void CallMacroMethod(u32 method, const std::vector<u32>& parameters);
 
     /// Handles writes to the macro uploading register.
     void ProcessMacroUpload(u32 data);
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index e7cb87589..d374b73cf 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -661,6 +661,10 @@ union Instruction {
     constexpr Instruction(u64 value) : value{value} {}
     constexpr Instruction(const Instruction& instr) : value(instr.value) {}
 
+    constexpr bool Bit(u64 offset) const {
+        return ((value >> offset) & 1) != 0;
+    }
+
     BitField<0, 8, Register> gpr0;
     BitField<8, 8, Register> gpr8;
     union {
@@ -1874,7 +1878,9 @@ public:
         HSETP2_C,
         HSETP2_R,
         HSETP2_IMM,
+        HSET2_C,
         HSET2_R,
+        HSET2_IMM,
         POPC_C,
         POPC_R,
         POPC_IMM,
@@ -2194,7 +2200,9 @@ private:
             INST("0111111-1-------", Id::HSETP2_C, Type::HalfSetPredicate, "HSETP2_C"),
             INST("0101110100100---", Id::HSETP2_R, Type::HalfSetPredicate, "HSETP2_R"),
             INST("0111111-0-------", Id::HSETP2_IMM, Type::HalfSetPredicate, "HSETP2_IMM"),
+            INST("0111110-1-------", Id::HSET2_C, Type::HalfSet, "HSET2_C"),
             INST("0101110100011---", Id::HSET2_R, Type::HalfSet, "HSET2_R"),
+            INST("0111110-0-------", Id::HSET2_IMM, Type::HalfSet, "HSET2_IMM"),
             INST("010110111010----", Id::FCMP_RR, Type::Arithmetic, "FCMP_RR"),
             INST("010010111010----", Id::FCMP_RC, Type::Arithmetic, "FCMP_RC"),
             INST("0101000010000---", Id::MUFU, Type::Arithmetic, "MUFU"),
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 8eb017f65..758bfe148 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -2,6 +2,8 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <chrono>
+
 #include "common/assert.h"
 #include "common/microprofile.h"
 #include "core/core.h"
@@ -154,9 +156,8 @@ u64 GPU::GetTicks() const {
     constexpr u64 gpu_ticks_num = 384;
     constexpr u64 gpu_ticks_den = 625;
 
-    const u64 cpu_ticks = system.CoreTiming().GetTicks();
-    u64 nanoseconds = Core::Timing::CyclesToNs(cpu_ticks).count();
-    if (Settings::values.use_fast_gpu_time) {
+    u64 nanoseconds = system.CoreTiming().GetGlobalTimeNs().count();
+    if (Settings::values.use_fast_gpu_time.GetValue()) {
         nanoseconds /= 256;
     }
     const u64 nanoseconds_num = nanoseconds / gpu_ticks_den;
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index a1b4c305c..2c42483bd 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -284,6 +284,12 @@ public:
     /// core timing events.
     virtual void Start() = 0;
 
+    /// Obtain the CPU Context
+    virtual void ObtainContext() = 0;
+
+    /// Release the CPU Context
+    virtual void ReleaseContext() = 0;
+
     /// Push GPU command entries to be processed
     virtual void PushGPUEntries(Tegra::CommandList&& entries) = 0;
 
diff --git a/src/video_core/gpu_asynch.cpp b/src/video_core/gpu_asynch.cpp
index 53305ab43..7b855f63e 100644
--- a/src/video_core/gpu_asynch.cpp
+++ b/src/video_core/gpu_asynch.cpp
@@ -19,10 +19,17 @@ GPUAsynch::GPUAsynch(Core::System& system, std::unique_ptr<VideoCore::RendererBa
 GPUAsynch::~GPUAsynch() = default;
 
 void GPUAsynch::Start() {
-    cpu_context->MakeCurrent();
     gpu_thread.StartThread(*renderer, *gpu_context, *dma_pusher);
 }
 
+void GPUAsynch::ObtainContext() {
+    cpu_context->MakeCurrent();
+}
+
+void GPUAsynch::ReleaseContext() {
+    cpu_context->DoneCurrent();
+}
+
 void GPUAsynch::PushGPUEntries(Tegra::CommandList&& entries) {
     gpu_thread.SubmitList(std::move(entries));
 }
diff --git a/src/video_core/gpu_asynch.h b/src/video_core/gpu_asynch.h
index 517658612..15e9f1d38 100644
--- a/src/video_core/gpu_asynch.h
+++ b/src/video_core/gpu_asynch.h
@@ -25,6 +25,8 @@ public:
     ~GPUAsynch() override;
 
     void Start() override;
+    void ObtainContext() override;
+    void ReleaseContext() override;
     void PushGPUEntries(Tegra::CommandList&& entries) override;
     void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;
     void FlushRegion(VAddr addr, u64 size) override;
diff --git a/src/video_core/gpu_synch.cpp b/src/video_core/gpu_synch.cpp
index 6f38a672a..aaeb9811d 100644
--- a/src/video_core/gpu_synch.cpp
+++ b/src/video_core/gpu_synch.cpp
@@ -13,10 +13,16 @@ GPUSynch::GPUSynch(Core::System& system, std::unique_ptr<VideoCore::RendererBase
 
 GPUSynch::~GPUSynch() = default;
 
-void GPUSynch::Start() {
+void GPUSynch::Start() {}
+
+void GPUSynch::ObtainContext() {
     context->MakeCurrent();
 }
 
+void GPUSynch::ReleaseContext() {
+    context->DoneCurrent();
+}
+
 void GPUSynch::PushGPUEntries(Tegra::CommandList&& entries) {
     dma_pusher->Push(std::move(entries));
     dma_pusher->DispatchCalls();
diff --git a/src/video_core/gpu_synch.h b/src/video_core/gpu_synch.h
index 4a6e9a01d..762c20aa5 100644
--- a/src/video_core/gpu_synch.h
+++ b/src/video_core/gpu_synch.h
@@ -24,6 +24,8 @@ public:
     ~GPUSynch() override;
 
     void Start() override;
+    void ObtainContext() override;
+    void ReleaseContext() override;
     void PushGPUEntries(Tegra::CommandList&& entries) override;
     void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;
     void FlushRegion(VAddr addr, u64 size) override;
diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index c3bb4fe06..738c6f0c1 100644
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -4,6 +4,7 @@
 
 #include "common/assert.h"
 #include "common/microprofile.h"
+#include "common/thread.h"
 #include "core/core.h"
 #include "core/frontend/emu_window.h"
 #include "core/settings.h"
@@ -18,7 +19,11 @@ namespace VideoCommon::GPUThread {
 static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
                       Core::Frontend::GraphicsContext& context, Tegra::DmaPusher& dma_pusher,
                       SynchState& state) {
-    MicroProfileOnThreadCreate("GpuThread");
+    std::string name = "yuzu:GPU";
+    MicroProfileOnThreadCreate(name.c_str());
+    Common::SetCurrentThreadName(name.c_str());
+    Common::SetCurrentThreadPriority(Common::ThreadPriority::High);
+    system.RegisterHostThread();
 
     // Wait for first GPU command before acquiring the window context
     while (state.queue.Empty())
diff --git a/src/video_core/macro/macro.cpp b/src/video_core/macro/macro.cpp
new file mode 100644
index 000000000..a50e7b4e0
--- /dev/null
+++ b/src/video_core/macro/macro.cpp
@@ -0,0 +1,91 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <optional>
+#include <boost/container_hash/hash.hpp>
+#include "common/assert.h"
+#include "common/logging/log.h"
+#include "core/settings.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/macro/macro.h"
+#include "video_core/macro/macro_hle.h"
+#include "video_core/macro/macro_interpreter.h"
+#include "video_core/macro/macro_jit_x64.h"
+
+namespace Tegra {
+
+MacroEngine::MacroEngine(Engines::Maxwell3D& maxwell3d)
+    : hle_macros{std::make_unique<Tegra::HLEMacro>(maxwell3d)} {}
+
+MacroEngine::~MacroEngine() = default;
+
+void MacroEngine::AddCode(u32 method, u32 data) {
+    uploaded_macro_code[method].push_back(data);
+}
+
+void MacroEngine::Execute(Engines::Maxwell3D& maxwell3d, u32 method,
+                          const std::vector<u32>& parameters) {
+    // Fast path: this entry point was already compiled by an earlier call.
+    if (const auto cached = macro_cache.find(method); cached != macro_cache.end()) {
+        const auto& cache_info = cached->second;
+        if (cache_info.has_hle_program) {
+            cache_info.hle_program->Execute(parameters, method);
+        } else {
+            cache_info.lle_program->Execute(parameters, method);
+        }
+    } else {
+        // Not compiled yet: use the upload keyed by `method`, or one whose range contains it.
+        std::optional<u32> mid_method = std::nullopt;
+        const auto macro_code = uploaded_macro_code.find(method);
+        if (macro_code == uploaded_macro_code.end()) {
+            for (const auto& [method_base, code] : uploaded_macro_code) {
+                if (method >= method_base && (method - method_base) < code.size()) {
+                    mid_method = method_base;
+                    break;
+                }
+            }
+            if (!mid_method.has_value()) {
+                UNREACHABLE_MSG("Macro 0x{0:x} was not uploaded", method);
+                return;
+            }
+        }
+        auto& cache_info = macro_cache[method];
+        // Compile the LLE program and record the code hash used for the HLE lookup below.
+        if (!mid_method.has_value()) {
+            cache_info.lle_program = Compile(macro_code->second);
+            cache_info.hash = boost::hash_value(macro_code->second);
+        } else {
+            const auto& macro_cached = uploaded_macro_code[mid_method.value()];
+            const auto rebased_method = method - mid_method.value();
+            auto& code = uploaded_macro_code[method];
+            // Store the tail of the enclosing upload (from `method` onwards) under its own key.
+            // assign() copies the range without std::memcpy, whose <cstring> is not included.
+            code.assign(macro_cached.begin() + rebased_method, macro_cached.end());
+            cache_info.hash = boost::hash_value(code);
+            cache_info.lle_program = Compile(code);
+        }
+        // A known hash selects the HLE replacement; otherwise run the compiled program.
+        auto hle_program = hle_macros->GetHLEProgram(cache_info.hash);
+        if (hle_program.has_value()) {
+            cache_info.has_hle_program = true;
+            cache_info.hle_program = std::move(hle_program.value());
+            cache_info.hle_program->Execute(parameters, method);
+        } else {
+            cache_info.lle_program->Execute(parameters, method);
+        }
+    }
+}
+
+std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d) {
+    // The JIT engine is only selected on x86-64 builds, and only while the
+    // disable_macro_jit setting is off; every other case uses the interpreter.
+#ifdef ARCHITECTURE_x86_64
+    if (!Settings::values.disable_macro_jit) {
+        return std::make_unique<MacroJITx64>(maxwell3d);
+    }
+#endif
+    return std::make_unique<MacroInterpreter>(maxwell3d);
+}
+
+} // namespace Tegra
diff --git a/src/video_core/macro/macro.h b/src/video_core/macro/macro.h
new file mode 100644
index 000000000..4d00b84b0
--- /dev/null
+++ b/src/video_core/macro/macro.h
@@ -0,0 +1,141 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include <unordered_map>
+#include <vector>
+#include "common/bit_field.h"
+#include "common/common_types.h"
+
+namespace Tegra {
+
+namespace Engines {
+class Maxwell3D;
+}
+
+namespace Macro {
+constexpr std::size_t NUM_MACRO_REGISTERS = 8;
+enum class Operation : u32 {
+    ALU = 0,
+    AddImmediate = 1,
+    ExtractInsert = 2,
+    ExtractShiftLeftImmediate = 3,
+    ExtractShiftLeftRegister = 4,
+    Read = 5,
+    Unused = 6, // This operation doesn't seem to be a valid encoding.
+    Branch = 7,
+};
+
+enum class ALUOperation : u32 {
+    Add = 0,
+    AddWithCarry = 1,
+    Subtract = 2,
+    SubtractWithBorrow = 3,
+    // Operations 4-7 don't seem to be valid encodings.
+    Xor = 8,
+    Or = 9,
+    And = 10,
+    AndNot = 11,
+    Nand = 12
+};
+
+enum class ResultOperation : u32 {
+    IgnoreAndFetch = 0,
+    Move = 1,
+    MoveAndSetMethod = 2,
+    FetchAndSend = 3,
+    MoveAndSend = 4,
+    FetchAndSetMethod = 5,
+    MoveAndSetMethodFetchAndSend = 6,
+    MoveAndSetMethodSend = 7
+};
+
+enum class BranchCondition : u32 {
+    Zero = 0,
+    NotZero = 1,
+};
+
+union Opcode {
+    u32 raw;
+    BitField<0, 3, Operation> operation;
+    BitField<4, 3, ResultOperation> result_operation;
+    BitField<4, 1, BranchCondition> branch_condition;
+    // If set on a branch, then the branch doesn't have a delay slot.
+    BitField<5, 1, u32> branch_annul;
+    BitField<7, 1, u32> is_exit;
+    BitField<8, 3, u32> dst;
+    BitField<11, 3, u32> src_a;
+    BitField<14, 3, u32> src_b;
+    // The signed immediate overlaps the second source operand and the alu operation.
+    BitField<14, 18, s32> immediate;
+
+    BitField<17, 5, ALUOperation> alu_operation;
+
+    // Bitfield instructions data
+    BitField<17, 5, u32> bf_src_bit;
+    BitField<22, 5, u32> bf_size;
+    BitField<27, 5, u32> bf_dst_bit;
+    // Mask with the low bf_size bits set; unsigned math keeps bf_size == 31 well defined.
+    u32 GetBitfieldMask() const {
+        return (1U << bf_size) - 1U;
+    }
+    // Returns the signed immediate multiplied by sizeof(u32).
+    s32 GetBranchTarget() const {
+        return static_cast<s32>(immediate * sizeof(u32));
+    }
+};
+
+union MethodAddress {
+    u32 raw;
+    BitField<0, 12, u32> address;
+    BitField<12, 6, u32> increment;
+};
+
+} // namespace Macro
+
+class HLEMacro;
+
+class CachedMacro {
+public:
+    virtual ~CachedMacro() = default;
+    /**
+     * Executes the macro code with the specified input parameters.
+     * @param parameters The parameters of the macro
+     * @param method The value MacroEngine::Execute was called with for this macro
+     */
+    virtual void Execute(const std::vector<u32>& parameters, u32 method) = 0;
+};
+
+class MacroEngine {
+public:
+    explicit MacroEngine(Engines::Maxwell3D& maxwell3d);
+    virtual ~MacroEngine();
+
+    // Appends one word of uploaded macro code to the code stored under `method`.
+    void AddCode(u32 method, u32 data);
+
+    // Compiles the macro if it's not in the cache, and executes the compiled macro
+    void Execute(Engines::Maxwell3D& maxwell3d, u32 method, const std::vector<u32>& parameters);
+
+protected:
+    virtual std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) = 0;
+
+private:
+    struct CacheInfo {
+        std::unique_ptr<CachedMacro> lle_program{};
+        std::unique_ptr<CachedMacro> hle_program{};
+        u64 hash{}; // boost::hash_value of the macro code; key for the HLE lookup
+        bool has_hle_program{}; // when set, Execute runs hle_program instead of lle_program
+    };
+
+    std::unordered_map<u32, CacheInfo> macro_cache; // filled by Execute, keyed by `method`
+    std::unordered_map<u32, std::vector<u32>> uploaded_macro_code; // words stored by AddCode
+    std::unique_ptr<HLEMacro> hle_macros;
+};
+
+std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d);
+
+} // namespace Tegra
diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp
new file mode 100644
index 000000000..410f99018
--- /dev/null
+++ b/src/video_core/macro/macro_hle.cpp
@@ -0,0 +1,113 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <array>
+#include <vector>
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/macro/macro_hle.h"
+#include "video_core/rasterizer_interface.h"
+
+namespace Tegra {
+
+namespace {
+// HLE'd functions
+static void HLE_771BB18C62444DA0(Engines::Maxwell3D& maxwell3d,
+                                 const std::vector<u32>& parameters) {
+    const u32 instance_count = parameters[2] & maxwell3d.GetRegisterValue(0xD1B);
+    // Keep the low 26 bits. Written as a plain mask: ~(0x3ffffff << 26) overflows a signed int.
+    const u32 topology = parameters[0] & 0x3ffffff;
+    maxwell3d.regs.draw.topology.Assign(
+        static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(topology));
+    maxwell3d.regs.vb_base_instance = parameters[5];
+    maxwell3d.mme_draw.instance_count = instance_count;
+    maxwell3d.regs.vb_element_base = parameters[3];
+    maxwell3d.regs.index_array.count = parameters[1];
+    maxwell3d.regs.index_array.first = parameters[4];
+    // Draw right away, then clear the per-draw state that was set up above.
+    if (maxwell3d.ShouldExecute()) {
+        maxwell3d.GetRasterizer().Draw(true, true);
+    }
+    maxwell3d.regs.index_array.count = 0;
+    maxwell3d.mme_draw.instance_count = 0;
+    maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined;
+}
+
+static void HLE_0D61FC9FAAC9FCAD(Engines::Maxwell3D& maxwell3d,
+                                 const std::vector<u32>& parameters) {
+    using Topology = Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology;
+    auto& regs = maxwell3d.regs;
+    // The instance count is parameters[2] masked with the current value of register 0xD1B.
+    const u32 instances = parameters[2] & maxwell3d.GetRegisterValue(0xD1B);
+    regs.vertex_buffer.first = parameters[3];
+    regs.vertex_buffer.count = parameters[1];
+    regs.vb_base_instance = parameters[4];
+    regs.draw.topology.Assign(static_cast<Topology>(parameters[0]));
+    maxwell3d.mme_draw.instance_count = instances;
+    if (maxwell3d.ShouldExecute()) {
+        maxwell3d.GetRasterizer().Draw(false, true);
+    }
+    regs.vertex_buffer.count = 0;
+    maxwell3d.mme_draw.instance_count = 0;
+    maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined;
+}
+
+static void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d,
+                                 const std::vector<u32>& parameters) {
+    const u32 instance_count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]);
+    const u32 element_base = parameters[4];
+    const u32 base_instance = parameters[5];
+    maxwell3d.regs.index_array.first = parameters[3];
+    maxwell3d.regs.reg_array[0x446] = element_base; // vertex id base?
+    maxwell3d.regs.index_array.count = parameters[1];
+    maxwell3d.regs.vb_element_base = element_base;
+    maxwell3d.regs.vb_base_instance = base_instance;
+    maxwell3d.mme_draw.instance_count = instance_count;
+    maxwell3d.CallMethodFromMME(0x8e3, 0x640); // NOTE(review): raw method ids; confirm targets
+    maxwell3d.CallMethodFromMME(0x8e4, element_base);
+    maxwell3d.CallMethodFromMME(0x8e5, base_instance);
+    maxwell3d.regs.draw.topology.Assign(
+        static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0]));
+    if (maxwell3d.ShouldExecute()) {
+        maxwell3d.GetRasterizer().Draw(true, true);
+    }
+    maxwell3d.regs.reg_array[0x446] = 0x0; // vertex id base?
+    maxwell3d.regs.index_array.count = 0; // from here on: zero the state written before the draw
+    maxwell3d.regs.vb_element_base = 0x0;
+    maxwell3d.regs.vb_base_instance = 0x0;
+    maxwell3d.mme_draw.instance_count = 0;
+    maxwell3d.CallMethodFromMME(0x8e3, 0x640); // same three methods as above, now with zeros
+    maxwell3d.CallMethodFromMME(0x8e4, 0x0);
+    maxwell3d.CallMethodFromMME(0x8e5, 0x0);
+    maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined;
+}
+} // namespace
+
+constexpr std::array<std::pair<u64, HLEFunction>, 3> hle_funcs{{
+    std::make_pair<u64, HLEFunction>(0x771BB18C62444DA0, &HLE_771BB18C62444DA0),
+    std::make_pair<u64, HLEFunction>(0x0D61FC9FAAC9FCAD, &HLE_0D61FC9FAAC9FCAD),
+    std::make_pair<u64, HLEFunction>(0x0217920100488FF7, &HLE_0217920100488FF7),
+}};
+
+HLEMacro::HLEMacro(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}
+HLEMacro::~HLEMacro() = default;
+
+std::optional<std::unique_ptr<CachedMacro>> HLEMacro::GetHLEProgram(u64 hash) const {
+    for (const auto& [func_hash, func] : hle_funcs) { // plain scan; no <algorithm> needed
+        if (func_hash == hash) {
+            return std::make_unique<HLEMacroImpl>(maxwell3d, func);
+        }
+    }
+    return std::nullopt;
+}
+
+HLEMacroImpl::~HLEMacroImpl() = default;
+
+HLEMacroImpl::HLEMacroImpl(Engines::Maxwell3D& maxwell3d, HLEFunction func)
+    : maxwell3d(maxwell3d), func(func) {}
+
+void HLEMacroImpl::Execute(const std::vector<u32>& parameters, u32 method) {
+    func(maxwell3d, parameters);
+}
+
+} // namespace Tegra
diff --git a/src/video_core/macro/macro_hle.h b/src/video_core/macro/macro_hle.h
new file mode 100644
index 000000000..37af875a0
--- /dev/null
+++ b/src/video_core/macro/macro_hle.h
@@ -0,0 +1,44 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include <optional>
+#include <vector>
+#include "common/common_types.h"
+#include "video_core/macro/macro.h"
+
+namespace Tegra {
+
+namespace Engines {
+class Maxwell3D;
+}
+
+using HLEFunction = void (*)(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters);
+
+class HLEMacro {
+public:
+    explicit HLEMacro(Engines::Maxwell3D& maxwell3d);
+    ~HLEMacro();
+
+    std::optional<std::unique_ptr<CachedMacro>> GetHLEProgram(u64 hash) const;
+
+private:
+    Engines::Maxwell3D& maxwell3d;
+};
+
+class HLEMacroImpl : public CachedMacro {
+public:
+    explicit HLEMacroImpl(Engines::Maxwell3D& maxwell3d, HLEFunction func);
+    ~HLEMacroImpl() override;
+    // Runs `func` with the stored engine reference and the given parameters.
+    void Execute(const std::vector<u32>& parameters, u32 method) override;
+
+private:
+    Engines::Maxwell3D& maxwell3d;
+    HLEFunction func;
+};
+
+} // namespace Tegra
diff --git a/src/video_core/macro_interpreter.cpp b/src/video_core/macro/macro_interpreter.cpp
index 947364928..aa5256419 100644
--- a/src/video_core/macro_interpreter.cpp
+++ b/src/video_core/macro/macro_interpreter.cpp
@@ -1,4 +1,4 @@
-// Copyright 2018 yuzu Emulator Project
+// Copyright 2020 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
@@ -6,109 +6,47 @@
 #include "common/logging/log.h"
 #include "common/microprofile.h"
 #include "video_core/engines/maxwell_3d.h"
-#include "video_core/macro_interpreter.h"
+#include "video_core/macro/macro_interpreter.h"
 
 MICROPROFILE_DEFINE(MacroInterp, "GPU", "Execute macro interpreter", MP_RGB(128, 128, 192));
 
 namespace Tegra {
-namespace {
-enum class Operation : u32 {
-    ALU = 0,
-    AddImmediate = 1,
-    ExtractInsert = 2,
-    ExtractShiftLeftImmediate = 3,
-    ExtractShiftLeftRegister = 4,
-    Read = 5,
-    Unused = 6, // This operation doesn't seem to be a valid encoding.
-    Branch = 7,
-};
-} // Anonymous namespace
-
-enum class MacroInterpreter::ALUOperation : u32 {
-    Add = 0,
-    AddWithCarry = 1,
-    Subtract = 2,
-    SubtractWithBorrow = 3,
-    // Operations 4-7 don't seem to be valid encodings.
-    Xor = 8,
-    Or = 9,
-    And = 10,
-    AndNot = 11,
-    Nand = 12
-};
-
-enum class MacroInterpreter::ResultOperation : u32 {
-    IgnoreAndFetch = 0,
-    Move = 1,
-    MoveAndSetMethod = 2,
-    FetchAndSend = 3,
-    MoveAndSend = 4,
-    FetchAndSetMethod = 5,
-    MoveAndSetMethodFetchAndSend = 6,
-    MoveAndSetMethodSend = 7
-};
-
-enum class MacroInterpreter::BranchCondition : u32 {
-    Zero = 0,
-    NotZero = 1,
-};
-
-union MacroInterpreter::Opcode {
-    u32 raw;
-    BitField<0, 3, Operation> operation;
-    BitField<4, 3, ResultOperation> result_operation;
-    BitField<4, 1, BranchCondition> branch_condition;
-    // If set on a branch, then the branch doesn't have a delay slot.
-    BitField<5, 1, u32> branch_annul;
-    BitField<7, 1, u32> is_exit;
-    BitField<8, 3, u32> dst;
-    BitField<11, 3, u32> src_a;
-    BitField<14, 3, u32> src_b;
-    // The signed immediate overlaps the second source operand and the alu operation.
-    BitField<14, 18, s32> immediate;
-
-    BitField<17, 5, ALUOperation> alu_operation;
-
-    // Bitfield instructions data
-    BitField<17, 5, u32> bf_src_bit;
-    BitField<22, 5, u32> bf_size;
-    BitField<27, 5, u32> bf_dst_bit;
-
-    u32 GetBitfieldMask() const {
-        return (1 << bf_size) - 1;
-    }
+MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d)
+    : MacroEngine::MacroEngine(maxwell3d), maxwell3d(maxwell3d) {}
 
-    s32 GetBranchTarget() const {
-        return static_cast<s32>(immediate * sizeof(u32));
-    }
-};
+std::unique_ptr<CachedMacro> MacroInterpreter::Compile(const std::vector<u32>& code) {
+    return std::make_unique<MacroInterpreterImpl>(maxwell3d, code);
+}
 
-MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}
+MacroInterpreterImpl::MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d,
+                                           const std::vector<u32>& code)
+    : maxwell3d(maxwell3d), code(code) {}
 
-void MacroInterpreter::Execute(u32 offset, std::size_t num_parameters, const u32* parameters) {
+void MacroInterpreterImpl::Execute(const std::vector<u32>& parameters, u32 method) {
     MICROPROFILE_SCOPE(MacroInterp);
     Reset();
 
     registers[1] = parameters[0];
+    num_parameters = parameters.size();
 
     if (num_parameters > parameters_capacity) {
         parameters_capacity = num_parameters;
         this->parameters = std::make_unique<u32[]>(num_parameters);
     }
-    std::memcpy(this->parameters.get(), parameters, num_parameters * sizeof(u32));
+    std::memcpy(this->parameters.get(), parameters.data(), num_parameters * sizeof(u32));
     this->num_parameters = num_parameters;
 
     // Execute the code until we hit an exit condition.
     bool keep_executing = true;
     while (keep_executing) {
-        keep_executing = Step(offset, false);
+        keep_executing = Step(false);
     }
 
     // Assert the the macro used all the input parameters
     ASSERT(next_parameter_index == num_parameters);
 }
 
-void MacroInterpreter::Reset() {
+void MacroInterpreterImpl::Reset() {
     registers = {};
     pc = 0;
     delayed_pc = {};
@@ -120,10 +58,10 @@ void MacroInterpreter::Reset() {
     carry_flag = false;
 }
 
-bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
+bool MacroInterpreterImpl::Step(bool is_delay_slot) {
     u32 base_address = pc;
 
-    Opcode opcode = GetOpcode(offset);
+    Macro::Opcode opcode = GetOpcode();
     pc += 4;
 
     // Update the program counter if we were delayed
@@ -134,18 +72,18 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
     }
 
     switch (opcode.operation) {
-    case Operation::ALU: {
+    case Macro::Operation::ALU: {
         u32 result = GetALUResult(opcode.alu_operation, GetRegister(opcode.src_a),
                                   GetRegister(opcode.src_b));
         ProcessResult(opcode.result_operation, opcode.dst, result);
         break;
     }
-    case Operation::AddImmediate: {
+    case Macro::Operation::AddImmediate: {
         ProcessResult(opcode.result_operation, opcode.dst,
                       GetRegister(opcode.src_a) + opcode.immediate);
         break;
     }
-    case Operation::ExtractInsert: {
+    case Macro::Operation::ExtractInsert: {
         u32 dst = GetRegister(opcode.src_a);
         u32 src = GetRegister(opcode.src_b);
 
@@ -155,7 +93,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
         ProcessResult(opcode.result_operation, opcode.dst, dst);
         break;
     }
-    case Operation::ExtractShiftLeftImmediate: {
+    case Macro::Operation::ExtractShiftLeftImmediate: {
         u32 dst = GetRegister(opcode.src_a);
         u32 src = GetRegister(opcode.src_b);
 
@@ -164,7 +102,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
         ProcessResult(opcode.result_operation, opcode.dst, result);
         break;
     }
-    case Operation::ExtractShiftLeftRegister: {
+    case Macro::Operation::ExtractShiftLeftRegister: {
         u32 dst = GetRegister(opcode.src_a);
         u32 src = GetRegister(opcode.src_b);
 
@@ -173,12 +111,12 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
         ProcessResult(opcode.result_operation, opcode.dst, result);
         break;
     }
-    case Operation::Read: {
+    case Macro::Operation::Read: {
         u32 result = Read(GetRegister(opcode.src_a) + opcode.immediate);
         ProcessResult(opcode.result_operation, opcode.dst, result);
         break;
     }
-    case Operation::Branch: {
+    case Macro::Operation::Branch: {
         ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid");
         u32 value = GetRegister(opcode.src_a);
         bool taken = EvaluateBranchCondition(opcode.branch_condition, value);
@@ -191,7 +129,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
 
             delayed_pc = base_address + opcode.GetBranchTarget();
             // Execute one more instruction due to the delay slot.
-            return Step(offset, true);
+            return Step(true);
         }
         break;
     }
@@ -204,51 +142,44 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
     // cause an exit if it's executed inside a delay slot.
     if (opcode.is_exit && !is_delay_slot) {
         // Exit has a delay slot, execute the next instruction
-        Step(offset, true);
+        Step(true);
         return false;
     }
 
     return true;
 }
 
-MacroInterpreter::Opcode MacroInterpreter::GetOpcode(u32 offset) const {
-    const auto& macro_memory{maxwell3d.GetMacroMemory()};
-    ASSERT((pc % sizeof(u32)) == 0);
-    ASSERT((pc + offset) < macro_memory.size() * sizeof(u32));
-    return {macro_memory[offset + pc / sizeof(u32)]};
-}
-
-u32 MacroInterpreter::GetALUResult(ALUOperation operation, u32 src_a, u32 src_b) {
+u32 MacroInterpreterImpl::GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b) {
     switch (operation) {
-    case ALUOperation::Add: {
+    case Macro::ALUOperation::Add: {
         const u64 result{static_cast<u64>(src_a) + src_b};
         carry_flag = result > 0xffffffff;
         return static_cast<u32>(result);
     }
-    case ALUOperation::AddWithCarry: {
+    case Macro::ALUOperation::AddWithCarry: {
         const u64 result{static_cast<u64>(src_a) + src_b + (carry_flag ? 1ULL : 0ULL)};
         carry_flag = result > 0xffffffff;
         return static_cast<u32>(result);
     }
-    case ALUOperation::Subtract: {
+    case Macro::ALUOperation::Subtract: {
         const u64 result{static_cast<u64>(src_a) - src_b};
         carry_flag = result < 0x100000000;
         return static_cast<u32>(result);
     }
-    case ALUOperation::SubtractWithBorrow: {
+    case Macro::ALUOperation::SubtractWithBorrow: {
         const u64 result{static_cast<u64>(src_a) - src_b - (carry_flag ? 0ULL : 1ULL)};
         carry_flag = result < 0x100000000;
         return static_cast<u32>(result);
     }
-    case ALUOperation::Xor:
+    case Macro::ALUOperation::Xor:
         return src_a ^ src_b;
-    case ALUOperation::Or:
+    case Macro::ALUOperation::Or:
         return src_a | src_b;
-    case ALUOperation::And:
+    case Macro::ALUOperation::And:
         return src_a & src_b;
-    case ALUOperation::AndNot:
+    case Macro::ALUOperation::AndNot:
         return src_a & ~src_b;
-    case ALUOperation::Nand:
+    case Macro::ALUOperation::Nand:
         return ~(src_a & src_b);
 
     default:
@@ -257,43 +188,43 @@ u32 MacroInterpreter::GetALUResult(ALUOperation operation, u32 src_a, u32 src_b)
     }
 }
 
-void MacroInterpreter::ProcessResult(ResultOperation operation, u32 reg, u32 result) {
+void MacroInterpreterImpl::ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result) {
     switch (operation) {
-    case ResultOperation::IgnoreAndFetch:
+    case Macro::ResultOperation::IgnoreAndFetch:
         // Fetch parameter and ignore result.
         SetRegister(reg, FetchParameter());
         break;
-    case ResultOperation::Move:
+    case Macro::ResultOperation::Move:
         // Move result.
         SetRegister(reg, result);
         break;
-    case ResultOperation::MoveAndSetMethod:
+    case Macro::ResultOperation::MoveAndSetMethod:
         // Move result and use as Method Address.
         SetRegister(reg, result);
         SetMethodAddress(result);
         break;
-    case ResultOperation::FetchAndSend:
+    case Macro::ResultOperation::FetchAndSend:
         // Fetch parameter and send result.
         SetRegister(reg, FetchParameter());
         Send(result);
         break;
-    case ResultOperation::MoveAndSend:
+    case Macro::ResultOperation::MoveAndSend:
         // Move and send result.
         SetRegister(reg, result);
         Send(result);
         break;
-    case ResultOperation::FetchAndSetMethod:
+    case Macro::ResultOperation::FetchAndSetMethod:
         // Fetch parameter and use result as Method Address.
         SetRegister(reg, FetchParameter());
         SetMethodAddress(result);
         break;
-    case ResultOperation::MoveAndSetMethodFetchAndSend:
+    case Macro::ResultOperation::MoveAndSetMethodFetchAndSend:
         // Move result and use as Method Address, then fetch and send parameter.
         SetRegister(reg, result);
         SetMethodAddress(result);
         Send(FetchParameter());
         break;
-    case ResultOperation::MoveAndSetMethodSend:
+    case Macro::ResultOperation::MoveAndSetMethodSend:
         // Move result and use as Method Address, then send bits 12:17 of result.
         SetRegister(reg, result);
         SetMethodAddress(result);
@@ -304,16 +235,28 @@ void MacroInterpreter::ProcessResult(ResultOperation operation, u32 reg, u32 res
     }
 }
 
-u32 MacroInterpreter::FetchParameter() {
-    ASSERT(next_parameter_index < num_parameters);
-    return parameters[next_parameter_index++];
+bool MacroInterpreterImpl::EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const {
+    switch (cond) {
+    case Macro::BranchCondition::Zero:
+        return value == 0; // Branch is taken when the tested value is zero
+    case Macro::BranchCondition::NotZero:
+        return value != 0; // Branch is taken when the tested value is non-zero
+    }
+    UNREACHABLE(); // cond matched neither enumerator handled above
+    return true;
+}
+
+Macro::Opcode MacroInterpreterImpl::GetOpcode() const {
+    ASSERT((pc % sizeof(u32)) == 0);
+    ASSERT(pc < code.size() * sizeof(u32));
+    return {code[pc / sizeof(u32)]};
 }
 
-u32 MacroInterpreter::GetRegister(u32 register_id) const {
+u32 MacroInterpreterImpl::GetRegister(u32 register_id) const {
     return registers.at(register_id);
 }
 
-void MacroInterpreter::SetRegister(u32 register_id, u32 value) {
+void MacroInterpreterImpl::SetRegister(u32 register_id, u32 value) {
     // Register 0 is hardwired as the zero register.
     // Ensure no writes to it actually occur.
     if (register_id == 0) {
@@ -323,30 +266,24 @@ void MacroInterpreter::SetRegister(u32 register_id, u32 value) {
     registers.at(register_id) = value;
 }
 
-void MacroInterpreter::SetMethodAddress(u32 address) {
+void MacroInterpreterImpl::SetMethodAddress(u32 address) {
     method_address.raw = address;
 }
 
-void MacroInterpreter::Send(u32 value) {
+void MacroInterpreterImpl::Send(u32 value) {
     maxwell3d.CallMethodFromMME(method_address.address, value);
     // Increment the method address by the method increment.
     method_address.address.Assign(method_address.address.Value() +
                                   method_address.increment.Value());
 }
 
-u32 MacroInterpreter::Read(u32 method) const {
+u32 MacroInterpreterImpl::Read(u32 method) const {
     return maxwell3d.GetRegisterValue(method);
 }
 
-bool MacroInterpreter::EvaluateBranchCondition(BranchCondition cond, u32 value) const {
-    switch (cond) {
-    case BranchCondition::Zero:
-        return value == 0;
-    case BranchCondition::NotZero:
-        return value != 0;
-    }
-    UNREACHABLE();
-    return true;
+u32 MacroInterpreterImpl::FetchParameter() {
+    ASSERT(next_parameter_index < num_parameters);
+    return parameters[next_parameter_index++];
 }
 
 } // namespace Tegra
diff --git a/src/video_core/macro_interpreter.h b/src/video_core/macro/macro_interpreter.h
index 631146d89..90217fc89 100644
--- a/src/video_core/macro_interpreter.h
+++ b/src/video_core/macro/macro_interpreter.h
@@ -1,44 +1,37 @@
-// Copyright 2018 yuzu Emulator Project
+// Copyright 2020 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
 #pragma once
-
 #include <array>
 #include <optional>
-
+#include <vector>
 #include "common/bit_field.h"
 #include "common/common_types.h"
+#include "video_core/macro/macro.h"
 
 namespace Tegra {
 namespace Engines {
 class Maxwell3D;
 }
 
-class MacroInterpreter final {
+class MacroInterpreter final : public MacroEngine {
 public:
     explicit MacroInterpreter(Engines::Maxwell3D& maxwell3d);
 
-    /**
-     * Executes the macro code with the specified input parameters.
-     * @param offset Offset to start execution at.
-     * @param parameters The parameters of the macro.
-     */
-    void Execute(u32 offset, std::size_t num_parameters, const u32* parameters);
+protected:
+    std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override;
 
 private:
-    enum class ALUOperation : u32;
-    enum class BranchCondition : u32;
-    enum class ResultOperation : u32;
-
-    union Opcode;
+    Engines::Maxwell3D& maxwell3d;
+};
 
-    union MethodAddress {
-        u32 raw;
-        BitField<0, 12, u32> address;
-        BitField<12, 6, u32> increment;
-    };
+class MacroInterpreterImpl : public CachedMacro {
+public:
+    MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code);
+    void Execute(const std::vector<u32>& parameters, u32 method) override;
 
+private:
     /// Resets the execution engine state, zeroing registers, etc.
     void Reset();
 
@@ -49,20 +42,20 @@ private:
      * @param is_delay_slot Whether the current step is being executed due to a delay slot in a
      * previous instruction.
      */
-    bool Step(u32 offset, bool is_delay_slot);
+    bool Step(bool is_delay_slot);
 
     /// Calculates the result of an ALU operation. src_a OP src_b;
-    u32 GetALUResult(ALUOperation operation, u32 src_a, u32 src_b);
+    u32 GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b);
 
     /// Performs the result operation on the input result and stores it in the specified register
     /// (if necessary).
-    void ProcessResult(ResultOperation operation, u32 reg, u32 result);
+    void ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result);
 
     /// Evaluates the branch condition and returns whether the branch should be taken or not.
-    bool EvaluateBranchCondition(BranchCondition cond, u32 value) const;
+    bool EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const;
 
     /// Reads an opcode at the current program counter location.
-    Opcode GetOpcode(u32 offset) const;
+    Macro::Opcode GetOpcode() const;
 
     /// Returns the specified register's value. Register 0 is hardcoded to always return 0.
     u32 GetRegister(u32 register_id) const;
@@ -89,13 +82,11 @@ private:
     /// Program counter to execute at after the delay slot is executed.
     std::optional<u32> delayed_pc;
 
-    static constexpr std::size_t NumMacroRegisters = 8;
-
     /// General purpose macro registers.
-    std::array<u32, NumMacroRegisters> registers = {};
+    std::array<u32, Macro::NUM_MACRO_REGISTERS> registers = {};
 
     /// Method address to use for the next Send instruction.
-    MethodAddress method_address = {};
+    Macro::MethodAddress method_address = {};
 
     /// Input parameters of the current macro.
     std::unique_ptr<u32[]> parameters;
@@ -105,5 +96,7 @@ private:
     u32 next_parameter_index = 0;
 
     bool carry_flag = false;
+    const std::vector<u32>& code;
 };
+
 } // namespace Tegra
diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp
new file mode 100644
index 000000000..07292702f
--- /dev/null
+++ b/src/video_core/macro/macro_jit_x64.cpp
@@ -0,0 +1,621 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/assert.h"
+#include "common/logging/log.h"
+#include "common/microprofile.h"
+#include "common/x64/xbyak_util.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/macro/macro_interpreter.h"
+#include "video_core/macro/macro_jit_x64.h"
+
+MICROPROFILE_DEFINE(MacroJitCompile, "GPU", "Compile macro JIT", MP_RGB(173, 255, 47));
+MICROPROFILE_DEFINE(MacroJitExecute, "GPU", "Execute macro JIT", MP_RGB(255, 255, 0));
+
+namespace Tegra {
+static const Xbyak::Reg64 STATE = Xbyak::util::rbx; // Pointer to the JITState of this run
+static const Xbyak::Reg32 RESULT = Xbyak::util::ebp; // Value produced by the current opcode
+static const Xbyak::Reg64 PARAMETERS = Xbyak::util::r12; // Cursor into the parameter array
+static const Xbyak::Reg32 METHOD_ADDRESS = Xbyak::util::r14d; // Raw method address for Send
+static const Xbyak::Reg64 BRANCH_HOLDER = Xbyak::util::r15; // Pending branch/exit, 0 if none
+
+static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({
+    STATE,
+    RESULT,
+    PARAMETERS,
+    METHOD_ADDRESS,
+    BRANCH_HOLDER,
+}); // Registers the generated code keeps live for the whole macro
+
+MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d)
+    : MacroEngine::MacroEngine(maxwell3d), maxwell3d(maxwell3d) {} // Kept for Compile()
+
+std::unique_ptr<CachedMacro> MacroJITx64::Compile(const std::vector<u32>& code) {
+    return std::make_unique<MacroJITx64Impl>(maxwell3d, code); // Code is generated in the ctor
+}
+
+MacroJITx64Impl::MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code)
+    : Xbyak::CodeGenerator(MAX_CODE_SIZE), code(code), maxwell3d(maxwell3d) {
+    Compile(); // Generate the x64 program for `code` right away
+}
+
+MacroJITx64Impl::~MacroJITx64Impl() = default;
+
+void MacroJITx64Impl::Execute(const std::vector<u32>& parameters, u32 method) {
+    MICROPROFILE_SCOPE(MacroJitExecute);
+    ASSERT_OR_EXECUTE(program != nullptr, { return; }); // No compiled program: do nothing
+    JITState state{}; // Fresh, value-initialised state for every run
+    state.maxwell3d = &maxwell3d;
+    state.registers = {};
+    program(&state, parameters.data()); // NOTE(review): prologue always reads parameters[0]
+}
+
+void MacroJITx64Impl::Compile_ALU(Macro::Opcode opcode) { // Emits code for one ALU opcode
+    const bool is_a_zero = opcode.src_a == 0;
+    const bool is_b_zero = opcode.src_b == 0;
+    const bool valid_operation = !is_a_zero && !is_b_zero; // Neither source is register 0
+    [[maybe_unused]] const bool is_move_operation = !is_a_zero && is_b_zero;
+    const bool has_zero_register = is_a_zero || is_b_zero;
+    const bool no_zero_reg_skip = opcode.alu_operation == Macro::ALUOperation::AddWithCarry ||
+                                  opcode.alu_operation == Macro::ALUOperation::SubtractWithBorrow;
+
+    Xbyak::Reg32 src_a;
+    Xbyak::Reg32 src_b;
+
+    if (!optimizer.zero_reg_skip || no_zero_reg_skip) { // Carry ops always load both sources
+        src_a = Compile_GetRegister(opcode.src_a, RESULT);
+        src_b = Compile_GetRegister(opcode.src_b, eax);
+    } else {
+        if (!is_a_zero) { // NOTE(review): if src_a is r0, RESULT keeps its previous contents
+            src_a = Compile_GetRegister(opcode.src_a, RESULT);
+        }
+        if (!is_b_zero) {
+            src_b = Compile_GetRegister(opcode.src_b, eax);
+        }
+    }
+
+    bool has_emitted = false;
+
+    switch (opcode.alu_operation) {
+    case Macro::ALUOperation::Add:
+        if (optimizer.zero_reg_skip) {
+            if (valid_operation) { // Emitted only when neither source is register 0
+                add(src_a, src_b);
+            }
+        } else {
+            add(src_a, src_b);
+        }
+
+        if (!optimizer.can_skip_carry) {
+            setc(byte[STATE + offsetof(JITState, carry_flag)]);
+        }
+        break;
+    case Macro::ALUOperation::AddWithCarry:
+        bt(dword[STATE + offsetof(JITState, carry_flag)], 0); // CF = saved carry flag
+        adc(src_a, src_b);
+        setc(byte[STATE + offsetof(JITState, carry_flag)]); // Save CF for later carry ops
+        break;
+    case Macro::ALUOperation::Subtract:
+        if (optimizer.zero_reg_skip) {
+            if (valid_operation) {
+                sub(src_a, src_b);
+                has_emitted = true;
+            }
+        } else {
+            sub(src_a, src_b);
+            has_emitted = true;
+        }
+        if (!optimizer.can_skip_carry && has_emitted) { // No sub emitted: carry untouched
+            setc(byte[STATE + offsetof(JITState, carry_flag)]);
+        }
+        break;
+    case Macro::ALUOperation::SubtractWithBorrow:
+        bt(dword[STATE + offsetof(JITState, carry_flag)], 0); // CF = saved carry flag
+        sbb(src_a, src_b);
+        setc(byte[STATE + offsetof(JITState, carry_flag)]); // Save CF for later carry ops
+        break;
+    case Macro::ALUOperation::Xor:
+        if (optimizer.zero_reg_skip) {
+            if (valid_operation) {
+                xor_(src_a, src_b);
+            }
+        } else {
+            xor_(src_a, src_b);
+        }
+        break;
+    case Macro::ALUOperation::Or:
+        if (optimizer.zero_reg_skip) {
+            if (valid_operation) {
+                or_(src_a, src_b);
+            }
+        } else {
+            or_(src_a, src_b);
+        }
+        break;
+    case Macro::ALUOperation::And:
+        if (optimizer.zero_reg_skip) {
+            if (!has_zero_register) { // NOTE(review): x & r0 emits nothing here — confirm
+                and_(src_a, src_b);
+            }
+        } else {
+            and_(src_a, src_b);
+        }
+        break;
+    case Macro::ALUOperation::AndNot:
+        if (optimizer.zero_reg_skip) {
+            if (!is_a_zero) { // NOTE(review): src_b is not loaded if it is r0 — confirm
+                not_(src_b); // src_a & ~src_b
+                and_(src_a, src_b);
+            }
+        } else {
+            not_(src_b); // src_a & ~src_b
+            and_(src_a, src_b);
+        }
+        break;
+    case Macro::ALUOperation::Nand:
+        if (optimizer.zero_reg_skip) {
+            if (!is_a_zero) { // NOTE(review): src_b is not loaded if it is r0 — confirm
+                and_(src_a, src_b);
+                not_(src_a); // ~(src_a & src_b)
+            }
+        } else {
+            and_(src_a, src_b);
+            not_(src_a); // ~(src_a & src_b)
+        }
+        break;
+    default:
+        UNIMPLEMENTED_MSG("Unimplemented ALU operation {}",
+                          static_cast<std::size_t>(opcode.alu_operation.Value()));
+        break;
+    }
+    Compile_ProcessResult(opcode.result_operation, opcode.dst); // Consumes the value in RESULT
+}
+
+/// Emits RESULT = reg[src_a] + immediate, followed by the opcode's result operation.
+void MacroJITx64Impl::Compile_AddImmediate(Macro::Opcode opcode) {
+    // Games tend to use this as an exit instruction placeholder. It's to encode an instruction
+    // without doing anything. In our case we can just not emit anything.
+    if (optimizer.skip_dummy_addimmediate &&
+        opcode.result_operation == Macro::ResultOperation::Move && opcode.dst == 0) {
+        return;
+    }
+    // Skip a method move when the very next instruction performs the same kind of move into
+    // the same destination register.
+    if (optimizer.optimize_for_method_move && next_opcode.has_value() &&
+        opcode.result_operation == Macro::ResultOperation::MoveAndSetMethod &&
+        next_opcode->result_operation == Macro::ResultOperation::MoveAndSetMethod &&
+        opcode.dst == next_opcode->dst) {
+        return;
+    }
+    if (optimizer.zero_reg_skip && opcode.src_a == 0) {
+        // Register 0 always reads as zero, so the sum is just the immediate.
+        if (opcode.immediate == 0) {
+            xor_(RESULT, RESULT);
+        } else {
+            mov(RESULT, opcode.immediate);
+        }
+    } else {
+        const auto result = Compile_GetRegister(opcode.src_a, RESULT);
+        // An immediate of zero needs no code; every other value, including 2, has to be
+        // applied to the loaded register.
+        if (opcode.immediate == 1) {
+            inc(result);
+        } else if (opcode.immediate > 1) {
+            add(result, opcode.immediate);
+        } else if (opcode.immediate < 0) {
+            sub(result, opcode.immediate * -1);
+        }
+    }
+    Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+/// Emits code that copies a bitfield of reg[src_b] into reg[src_a]:
+///   src = (reg[src_b] >> bf_src_bit) & GetBitfieldMask()
+///   dst = (reg[src_a] & ~(GetBitfieldMask() << bf_dst_bit)) | (src << bf_dst_bit)
+/// The combined value is left in RESULT for the opcode's result operation.
+void MacroJITx64Impl::Compile_ExtractInsert(Macro::Opcode opcode) {
+    auto dst = Compile_GetRegister(opcode.src_a, RESULT);
+    auto src = Compile_GetRegister(opcode.src_b, eax);
+
+    // A shift by zero is a no-op and can be skipped. Every other amount, including 31, has to
+    // be emitted: shifting by 31 still keeps one bit of the source, it does not clear it.
+    if (opcode.bf_src_bit != 0) {
+        shr(src, opcode.bf_src_bit);
+    }
+    // Always apply the field mask rather than special-casing individual field sizes.
+    and_(src, opcode.GetBitfieldMask());
+    if (opcode.bf_dst_bit != 0) {
+        shl(src, opcode.bf_dst_bit);
+    }
+
+    // Clear the destination field before merging; skipped when the mask would keep every bit.
+    const u32 mask = ~(opcode.GetBitfieldMask() << opcode.bf_dst_bit);
+    if (mask != 0xffffffff) {
+        and_(dst, mask);
+    }
+    or_(dst, src);
+
+    Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+/// Emits RESULT = ((reg[src_b] >> reg[src_a]) & GetBitfieldMask()) << bf_dst_bit.
+void MacroJITx64Impl::Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode) {
+    // The variable shift count is passed in cl, hence src_a is loaded into ecx.
+    const auto dst = Compile_GetRegister(opcode.src_a, ecx);
+    const auto src = Compile_GetRegister(opcode.src_b, RESULT);
+
+    shr(src, dst.cvt8());
+    // Always apply the field mask rather than special-casing individual field sizes.
+    and_(src, opcode.GetBitfieldMask());
+
+    // Only a shift by zero may be skipped; a shift by 31 moves bit 0 into bit 31 and must not
+    // be replaced by clearing the register.
+    if (opcode.bf_dst_bit != 0) {
+        shl(src, opcode.bf_dst_bit);
+    }
+
+    Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+/// Emits RESULT = ((reg[src_b] >> bf_src_bit) & GetBitfieldMask()) << reg[src_a].
+void MacroJITx64Impl::Compile_ExtractShiftLeftRegister(Macro::Opcode opcode) {
+    const auto dst = Compile_GetRegister(opcode.src_a, ecx);
+    const auto src = Compile_GetRegister(opcode.src_b, RESULT);
+
+    if (opcode.bf_src_bit != 0) {
+        shr(src, opcode.bf_src_bit);
+    }
+    // Always apply the field mask; a field size of 31 must not bypass it.
+    and_(src, opcode.GetBitfieldMask());
+    // The variable shift count is taken from cl, the low byte of the register holding src_a.
+    shl(src, dst.cvt8());
+
+    Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+/// Emits RESULT = regs.reg_array[reg[src_a] + immediate], i.e. a read of a Maxwell3D register.
+void MacroJITx64Impl::Compile_Read(Macro::Opcode opcode) {
+    if (optimizer.zero_reg_skip && opcode.src_a == 0) {
+        if (opcode.immediate == 0) {
+            xor_(RESULT, RESULT);
+        } else {
+            mov(RESULT, opcode.immediate);
+        }
+    } else {
+        const auto result = Compile_GetRegister(opcode.src_a, RESULT);
+        // Every non-zero immediate, including 2, has to be added to the register index.
+        if (opcode.immediate == 1) {
+            inc(result);
+        } else if (opcode.immediate > 1) {
+            add(result, opcode.immediate);
+        } else if (opcode.immediate < 0) {
+            sub(result, opcode.immediate * -1);
+        }
+    }
+    // Equivalent to Engines::Maxwell3D::GetRegisterValue:
+    if (optimizer.enable_asserts) {
+        Xbyak::Label pass_range_check;
+        cmp(RESULT, static_cast<u32>(Engines::Maxwell3D::Regs::NUM_REGS));
+        jb(pass_range_check);
+        int3();
+        L(pass_range_check);
+    }
+    mov(rax, qword[STATE]);
+    mov(RESULT,
+        dword[rax + offsetof(Engines::Maxwell3D, regs) +
+              offsetof(Engines::Maxwell3D::Regs, reg_array) + RESULT.cvt64() * sizeof(u32)]);
+    Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+static void Send(Engines::Maxwell3D* maxwell3d, Macro::MethodAddress method_address, u32 value) {
+    maxwell3d->CallMethodFromMME(method_address.address, value); // Called from generated code
+}
+
+void Tegra::MacroJITx64Impl::Compile_Send(Xbyak::Reg32 value) { // Emits a call to Send()
+    Common::X64::ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
+    mov(Common::X64::ABI_PARAM1, qword[STATE]); // First JITState member: the Maxwell3D pointer
+    mov(Common::X64::ABI_PARAM2, METHOD_ADDRESS);
+    mov(Common::X64::ABI_PARAM3, value);
+    Common::X64::CallFarFunction(*this, &Send);
+    Common::X64::ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
+
+    Xbyak::Label dont_process{};
+    // Get increment
+    test(METHOD_ADDRESS, 0x3f000);
+    // If zero, method address doesn't update
+    je(dont_process);
+
+    mov(ecx, METHOD_ADDRESS);
+    and_(METHOD_ADDRESS, 0xfff); // Keep only the address field (bits 0-11)
+    shr(ecx, 12);
+    and_(ecx, 0x3f); // ecx = increment field (bits 12-17)
+    lea(eax, ptr[rcx + METHOD_ADDRESS.cvt64()]); // NOTE(review): sum is not re-masked to 12 bits
+    sal(ecx, 12);
+    or_(eax, ecx); // Re-attach the unchanged increment field
+
+    mov(METHOD_ADDRESS, eax);
+
+    L(dont_process);
+}
+
+void Tegra::MacroJITx64Impl::Compile_Branch(Macro::Opcode opcode) { // Emits a branch opcode
+    ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid");
+    const s32 jump_address =
+        static_cast<s32>(pc) + static_cast<s32>(opcode.GetBranchTarget() / sizeof(s32));
+
+    Xbyak::Label end;
+    auto value = Compile_GetRegister(opcode.src_a, eax);
+    test(value, value); // ZF is set when the tested register is zero
+    if (optimizer.has_delayed_pc) { // At least one branch in this macro is not annulled
+        switch (opcode.branch_condition) {
+        case Macro::BranchCondition::Zero:
+            jne(end, T_NEAR); // Condition not met: fall through to the next instruction
+            break;
+        case Macro::BranchCondition::NotZero:
+            je(end, T_NEAR); // Condition not met: fall through to the next instruction
+            break;
+        }
+
+        if (opcode.branch_annul) { // Annulled branch: jump without running a delay slot
+            xor_(BRANCH_HOLDER, BRANCH_HOLDER);
+            jmp(labels[jump_address], T_NEAR);
+        } else {
+            Xbyak::Label handle_post_exit{};
+            Xbyak::Label skip{};
+            jmp(skip, T_NEAR); // The stub below is only reached through BRANCH_HOLDER
+            if (opcode.is_exit) {
+                L(handle_post_exit);
+                // Execute 1 instruction
+                mov(BRANCH_HOLDER, end_of_code);
+                // Jump to next instruction to skip delay slot check
+                jmp(labels[jump_address], T_NEAR);
+            } else {
+                L(handle_post_exit);
+                xor_(BRANCH_HOLDER, BRANCH_HOLDER);
+                jmp(labels[jump_address], T_NEAR);
+            }
+            L(skip);
+            mov(BRANCH_HOLDER, handle_post_exit); // Resume at the stub after the delay slot
+            jmp(delay_skip[pc], T_NEAR); // Bypass this instruction's own delay-slot check
+        }
+    } else { // No delay slots anywhere in this macro: emit a plain conditional jump
+        switch (opcode.branch_condition) {
+        case Macro::BranchCondition::Zero:
+            je(labels[jump_address], T_NEAR);
+            break;
+        case Macro::BranchCondition::NotZero:
+            jne(labels[jump_address], T_NEAR);
+            break;
+        }
+    }
+
+    L(end);
+}
+
+void Tegra::MacroJITx64Impl::Optimizer_ScanFlags() {
+    // Start from the most optimistic assumptions and let the scan below revoke them.
+    optimizer.can_skip_carry = true;
+    optimizer.has_delayed_pc = false;
+    for (const u32 word : code) {
+        Macro::Opcode decoded{};
+        decoded.raw = word;
+
+        const bool consumes_carry =
+            decoded.operation == Macro::Operation::ALU &&
+            (decoded.alu_operation == Macro::ALUOperation::AddWithCarry ||
+             decoded.alu_operation == Macro::ALUOperation::SubtractWithBorrow);
+        if (consumes_carry) {
+            // Something reads the carry flag, so the carry bookkeeping has to be emitted.
+            optimizer.can_skip_carry = false;
+        }
+
+        // A branch that is not annulled executes a delay slot and needs the delayed-pc logic.
+        if (decoded.operation == Macro::Operation::Branch && !decoded.branch_annul) {
+            optimizer.has_delayed_pc = true;
+        }
+    }
+}
+
+/// Generates the x64 program for `code`: prologue, one block per macro opcode, epilogue.
+void MacroJITx64Impl::Compile() {
+    MICROPROFILE_SCOPE(MacroJitCompile);
+    labels.fill(Xbyak::Label());
+
+    Common::X64::ABI_PushRegistersAndAdjustStack(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8);
+    // JIT state
+    mov(STATE, Common::X64::ABI_PARAM1);
+    mov(PARAMETERS, Common::X64::ABI_PARAM2);
+    xor_(RESULT, RESULT);
+    xor_(METHOD_ADDRESS, METHOD_ADDRESS);
+    xor_(BRANCH_HOLDER, BRANCH_HOLDER);
+
+    mov(dword[STATE + offsetof(JITState, registers) + 4], Compile_FetchParameter());
+
+    // Track get register for zero registers and mark it as no-op
+    optimizer.zero_reg_skip = true;
+
+    // AddImmediate tends to be used as a NOP instruction, if we detect this we can
+    // completely skip the entire code path and not emit anything
+    optimizer.skip_dummy_addimmediate = true;
+
+    // SMO tends to emit a lot of unnecessary method moves, we can mitigate this by only emitting
+    // one if our register isn't "dirty"
+    optimizer.optimize_for_method_move = true;
+
+    // Enable run-time assertions in JITted code
+    optimizer.enable_asserts = false;
+
+    // Check to see if we can skip emitting certain instructions
+    Optimizer_ScanFlags();
+
+    const u32 op_count = static_cast<u32>(code.size());
+    for (u32 i = 0; i < op_count; i++) {
+        if (i < op_count - 1) {
+            pc = i + 1;
+            next_opcode = GetOpCode();
+        } else {
+            next_opcode = {};
+        }
+        pc = i;
+        Compile_NextInstruction();
+    }
+
+    L(end_of_code);
+
+    Common::X64::ABI_PopRegistersAndAdjustStack(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8);
+    ret();
+    ready();
+    program = getCode<ProgramType>();
+}
+
+bool MacroJITx64Impl::Compile_NextInstruction() { // Emits code for the opcode at `pc`
+    const auto opcode = GetOpCode();
+    if (labels[pc].getAddress()) { // Code for this pc was already emitted
+        return false;
+    }
+
+    L(labels[pc]); // Branch target for this instruction
+
+    switch (opcode.operation) {
+    case Macro::Operation::ALU:
+        Compile_ALU(opcode);
+        break;
+    case Macro::Operation::AddImmediate:
+        Compile_AddImmediate(opcode);
+        break;
+    case Macro::Operation::ExtractInsert:
+        Compile_ExtractInsert(opcode);
+        break;
+    case Macro::Operation::ExtractShiftLeftImmediate:
+        Compile_ExtractShiftLeftImmediate(opcode);
+        break;
+    case Macro::Operation::ExtractShiftLeftRegister:
+        Compile_ExtractShiftLeftRegister(opcode);
+        break;
+    case Macro::Operation::Read:
+        Compile_Read(opcode);
+        break;
+    case Macro::Operation::Branch:
+        Compile_Branch(opcode);
+        break;
+    default:
+        UNIMPLEMENTED_MSG("Unimplemented opcode {}", opcode.operation.Value());
+        break;
+    }
+
+    if (optimizer.has_delayed_pc) {
+        if (opcode.is_exit) {
+            mov(rax, end_of_code);
+            test(BRANCH_HOLDER, BRANCH_HOLDER);
+            cmove(BRANCH_HOLDER, rax); // No branch pending: schedule the jump to end_of_code
+            // Jump to next instruction to skip delay slot check
+            je(labels[pc + 1], T_NEAR); // NOTE(review): assumes pc + 1 is a valid label index
+        } else {
+            // TODO(ogniK): Optimize delay slot branching
+            Xbyak::Label no_delay_slot{};
+            test(BRANCH_HOLDER, BRANCH_HOLDER);
+            je(no_delay_slot, T_NEAR);
+            mov(rax, BRANCH_HOLDER);
+            xor_(BRANCH_HOLDER, BRANCH_HOLDER); // Clear the pending target before taking it
+            jmp(rax);
+            L(no_delay_slot);
+        }
+        L(delay_skip[pc]); // Entry point used by Compile_Branch to bypass the check above
+        if (opcode.is_exit) {
+            return false;
+        }
+    } else {
+        test(BRANCH_HOLDER, BRANCH_HOLDER);
+        jne(end_of_code, T_NEAR); // An earlier exit opcode marked BRANCH_HOLDER: stop here
+        if (opcode.is_exit) {
+            inc(BRANCH_HOLDER); // Mark the exit; it is acted on after the next instruction
+            return false;
+        }
+    }
+    return true;
+}
+
+Xbyak::Reg32 Tegra::MacroJITx64Impl::Compile_FetchParameter() {
+    mov(eax, dword[PARAMETERS]); // Read the parameter word the cursor points at
+    add(PARAMETERS, sizeof(u32)); // Advance the cursor to the following parameter
+    return eax; // The fetched value is always delivered in eax
+}
+
+Xbyak::Reg32 MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg32 dst) {
+    if (index != 0) {
+        mov(dst, dword[STATE + offsetof(JITState, registers) + index * sizeof(u32)]);
+    } else {
+        // Register 0 is hardwired to zero, so clear the destination instead of loading it.
+        xor_(dst, dst);
+    }
+
+    return dst;
+}
+
+void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u32 reg) {
+    const auto SetRegister = [this](u32 reg, const Xbyak::Reg32& result) {
+        // Register 0 is supposed to always return 0. NOP is implemented as a store to the zero
+        // register.
+        if (reg == 0) {
+            return;
+        }
+        mov(dword[STATE + offsetof(JITState, registers) + reg * sizeof(u32)], result);
+    };
+    const auto SetMethodAddress = [this](const Xbyak::Reg32& reg) { mov(METHOD_ADDRESS, reg); };
+
+    switch (operation) {
+    case Macro::ResultOperation::IgnoreAndFetch:
+        SetRegister(reg, Compile_FetchParameter()); // RESULT is discarded
+        break;
+    case Macro::ResultOperation::Move:
+        SetRegister(reg, RESULT); // Store the result in the destination register
+        break;
+    case Macro::ResultOperation::MoveAndSetMethod:
+        SetRegister(reg, RESULT);
+        SetMethodAddress(RESULT); // The result also becomes the method address
+        break;
+    case Macro::ResultOperation::FetchAndSend:
+        // Fetch parameter and send result.
+        SetRegister(reg, Compile_FetchParameter());
+        Compile_Send(RESULT);
+        break;
+    case Macro::ResultOperation::MoveAndSend:
+        // Move and send result.
+        SetRegister(reg, RESULT);
+        Compile_Send(RESULT);
+        break;
+    case Macro::ResultOperation::FetchAndSetMethod:
+        // Fetch parameter and use result as Method Address.
+        SetRegister(reg, Compile_FetchParameter());
+        SetMethodAddress(RESULT);
+        break;
+    case Macro::ResultOperation::MoveAndSetMethodFetchAndSend:
+        // Move result and use as Method Address, then fetch and send parameter.
+        SetRegister(reg, RESULT);
+        SetMethodAddress(RESULT);
+        Compile_Send(Compile_FetchParameter());
+        break;
+    case Macro::ResultOperation::MoveAndSetMethodSend:
+        // Move result and use as Method Address, then send bits 12:17 of result.
+        SetRegister(reg, RESULT);
+        SetMethodAddress(RESULT);
+        shr(RESULT, 12); // RESULT was already stored above, so it can be clobbered here
+        and_(RESULT, 0b111111);
+        Compile_Send(RESULT);
+        break;
+    default:
+        UNIMPLEMENTED_MSG("Unimplemented macro operation {}", static_cast<std::size_t>(operation));
+    }
+}
+
+Macro::Opcode MacroJITx64Impl::GetOpCode() const {
+    ASSERT(pc < code.size()); // pc is an index into code, not a byte offset
+    return {code[pc]};
+}
+
+std::bitset<32> MacroJITx64Impl::PersistentCallerSavedRegs() const {
+    return PERSISTENT_REGISTERS & Common::X64::ABI_ALL_CALLER_SAVED; // Bitwise intersection
+}
+
+} // namespace Tegra
diff --git a/src/video_core/macro/macro_jit_x64.h b/src/video_core/macro/macro_jit_x64.h
new file mode 100644
index 000000000..a180e7428
--- /dev/null
+++ b/src/video_core/macro/macro_jit_x64.h
@@ -0,0 +1,98 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <bitset>
+#include <xbyak.h>
+#include "common/bit_field.h"
+#include "common/common_types.h"
+#include "common/x64/xbyak_abi.h"
+#include "video_core/macro/macro.h"
+
+namespace Tegra {
+
+namespace Engines {
+class Maxwell3D;
+}
+
+/// MAX_CODE_SIZE is arbitrarily chosen based on current booting games
+constexpr size_t MAX_CODE_SIZE = 0x10000;
+
+class MacroJITx64 final : public MacroEngine { // MacroEngine that overrides Compile().
+public:
+    explicit MacroJITx64(Engines::Maxwell3D& maxwell3d);
+
+protected:
+    std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override;
+
+private:
+    Engines::Maxwell3D& maxwell3d; // Reference member: the referent must outlive this object.
+};
+
+class MacroJITx64Impl : public Xbyak::CodeGenerator, public CachedMacro {
+public: // This class is both an Xbyak code generator and a CachedMacro (see the base list).
+    MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code);
+    ~MacroJITx64Impl();
+
+    void Execute(const std::vector<u32>& parameters, u32 method) override;
+
+    void Compile_ALU(Macro::Opcode opcode); // Each Compile_* below receives one Macro::Opcode.
+    void Compile_AddImmediate(Macro::Opcode opcode);
+    void Compile_ExtractInsert(Macro::Opcode opcode);
+    void Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode);
+    void Compile_ExtractShiftLeftRegister(Macro::Opcode opcode);
+    void Compile_Read(Macro::Opcode opcode);
+    void Compile_Branch(Macro::Opcode opcode);
+
+private:
+    void Optimizer_ScanFlags();
+
+    void Compile();
+    bool Compile_NextInstruction();
+
+    Xbyak::Reg32 Compile_FetchParameter();
+    Xbyak::Reg32 Compile_GetRegister(u32 index, Xbyak::Reg32 dst);
+
+    void Compile_ProcessResult(Macro::ResultOperation operation, u32 reg);
+    void Compile_Send(Xbyak::Reg32 value);
+
+    Macro::Opcode GetOpCode() const;
+    std::bitset<32> PersistentCallerSavedRegs() const;
+
+    struct JITState { // Passed by pointer as the first argument of a ProgramType function.
+        Engines::Maxwell3D* maxwell3d{};
+        std::array<u32, Macro::NUM_MACRO_REGISTERS> registers{};
+        u32 carry_flag{};
+    };
+    static_assert(offsetof(JITState, maxwell3d) == 0, "Maxwell3D is not at 0x0");
+    using ProgramType = void (*)(JITState*, const u32*); // Type of the `program` member below.
+
+    struct OptimizerState { // Boolean switches; every one defaults to false via `{}`.
+        bool can_skip_carry{};
+        bool has_delayed_pc{};
+        bool zero_reg_skip{};
+        bool skip_dummy_addimmediate{};
+        bool optimize_for_method_move{};
+        bool enable_asserts{};
+    };
+    OptimizerState optimizer{};
+
+    std::optional<Macro::Opcode> next_opcode{};
+    ProgramType program{nullptr};
+
+    std::array<Xbyak::Label, MAX_CODE_SIZE> labels; // MAX_CODE_SIZE labels in each array.
+    std::array<Xbyak::Label, MAX_CODE_SIZE> delay_skip;
+    Xbyak::Label end_of_code{};
+
+    bool is_delay_slot{};
+    u32 pc{}; // Index into `code` (see GetOpCode() const above).
+    std::optional<u32> delayed_pc;
+
+    const std::vector<u32>& code; // Reference member: the referent must outlive this object.
+    Engines::Maxwell3D& maxwell3d; // Reference member: the referent must outlive this object.
+};
+
+} // namespace Tegra
diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp
index dbee9f634..ff5505d12 100644
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@@ -210,10 +210,11 @@ bool MemoryManager::IsBlockContinuous(const GPUVAddr start, const std::size_t si
     return range == inner_size;
 }
 
-void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::size_t size) const {
+void MemoryManager::ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer,
+                              const std::size_t size) const {
     std::size_t remaining_size{size};
-    std::size_t page_index{src_addr >> page_bits};
-    std::size_t page_offset{src_addr & page_mask};
+    std::size_t page_index{gpu_src_addr >> page_bits};
+    std::size_t page_offset{gpu_src_addr & page_mask};
 
     auto& memory = system.Memory();
 
@@ -234,11 +235,11 @@ void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::s
     }
 }
 
-void MemoryManager::ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer,
+void MemoryManager::ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer,
                                     const std::size_t size) const {
     std::size_t remaining_size{size};
-    std::size_t page_index{src_addr >> page_bits};
-    std::size_t page_offset{src_addr & page_mask};
+    std::size_t page_index{gpu_src_addr >> page_bits};
+    std::size_t page_offset{gpu_src_addr & page_mask};
 
     auto& memory = system.Memory();
 
@@ -259,10 +260,11 @@ void MemoryManager::ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer,
     }
 }
 
-void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const std::size_t size) {
+void MemoryManager::WriteBlock(GPUVAddr gpu_dest_addr, const void* src_buffer,
+                               const std::size_t size) {
     std::size_t remaining_size{size};
-    std::size_t page_index{dest_addr >> page_bits};
-    std::size_t page_offset{dest_addr & page_mask};
+    std::size_t page_index{gpu_dest_addr >> page_bits};
+    std::size_t page_offset{gpu_dest_addr & page_mask};
 
     auto& memory = system.Memory();
 
@@ -283,11 +285,11 @@ void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const
     }
 }
 
-void MemoryManager::WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer,
+void MemoryManager::WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer,
                                      const std::size_t size) {
     std::size_t remaining_size{size};
-    std::size_t page_index{dest_addr >> page_bits};
-    std::size_t page_offset{dest_addr & page_mask};
+    std::size_t page_index{gpu_dest_addr >> page_bits};
+    std::size_t page_offset{gpu_dest_addr & page_mask};
 
     auto& memory = system.Memory();
 
@@ -306,16 +308,18 @@ void MemoryManager::WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer,
     }
 }
 
-void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size) {
+void MemoryManager::CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr,
+                              const std::size_t size) {
     std::vector<u8> tmp_buffer(size);
-    ReadBlock(src_addr, tmp_buffer.data(), size);
-    WriteBlock(dest_addr, tmp_buffer.data(), size);
+    ReadBlock(gpu_src_addr, tmp_buffer.data(), size);
+    WriteBlock(gpu_dest_addr, tmp_buffer.data(), size);
 }
 
-void MemoryManager::CopyBlockUnsafe(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size) {
+void MemoryManager::CopyBlockUnsafe(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr,
+                                    const std::size_t size) {
     std::vector<u8> tmp_buffer(size);
-    ReadBlockUnsafe(src_addr, tmp_buffer.data(), size);
-    WriteBlockUnsafe(dest_addr, tmp_buffer.data(), size);
+    ReadBlockUnsafe(gpu_src_addr, tmp_buffer.data(), size);
+    WriteBlockUnsafe(gpu_dest_addr, tmp_buffer.data(), size);
 }
 
 bool MemoryManager::IsGranularRange(GPUVAddr gpu_addr, std::size_t size) {
diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h
index 0ddd52d5a..87658e87a 100644
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -79,9 +79,9 @@ public:
      * in the Host Memory counterpart. Note: This functions cause Host GPU Memory
      * Flushes and Invalidations, respectively to each operation.
      */
-    void ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const;
-    void WriteBlock(GPUVAddr dest_addr, const void* src_buffer, std::size_t size);
-    void CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size);
+    void ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const;
+    void WriteBlock(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size);
+    void CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size);
 
     /**
      * ReadBlockUnsafe and WriteBlockUnsafe are special versions of ReadBlock and
@@ -93,9 +93,9 @@ public:
      * WriteBlockUnsafe instead of WriteBlock since it shouldn't invalidate the texture
      * being flushed.
      */
-    void ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const;
-    void WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer, std::size_t size);
-    void CopyBlockUnsafe(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size);
+    void ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const;
+    void WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size);
+    void CopyBlockUnsafe(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size);
 
     /**
      * IsGranularRange checks if a gpu region can be simply read with a pointer
diff --git a/src/video_core/query_cache.h b/src/video_core/query_cache.h
index 2f75f8801..0d3a88765 100644
--- a/src/video_core/query_cache.h
+++ b/src/video_core/query_cache.h
@@ -132,7 +132,7 @@ public:
         }
 
         query->BindCounter(Stream(type).Current(), timestamp);
-        if (Settings::values.use_asynchronous_gpu_emulation) {
+        if (Settings::values.use_asynchronous_gpu_emulation.GetValue()) {
             AsyncFlushQuery(cpu_addr);
         }
     }
@@ -220,8 +220,8 @@ private:
             return cache_begin < addr_end && addr_begin < cache_end;
         };
 
-        const u64 page_end = addr_end >> PAGE_SHIFT;
-        for (u64 page = addr_begin >> PAGE_SHIFT; page <= page_end; ++page) {
+        const u64 page_end = addr_end >> PAGE_BITS;
+        for (u64 page = addr_begin >> PAGE_BITS; page <= page_end; ++page) {
             const auto& it = cached_queries.find(page);
             if (it == std::end(cached_queries)) {
                 continue;
@@ -242,14 +242,14 @@ private:
     /// Registers the passed parameters as cached and returns a pointer to the stored cached query.
     CachedQuery* Register(VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr, bool timestamp) {
         rasterizer.UpdatePagesCachedCount(cpu_addr, CachedQuery::SizeInBytes(timestamp), 1);
-        const u64 page = static_cast<u64>(cpu_addr) >> PAGE_SHIFT;
+        const u64 page = static_cast<u64>(cpu_addr) >> PAGE_BITS;
         return &cached_queries[page].emplace_back(static_cast<QueryCache&>(*this), type, cpu_addr,
                                                   host_ptr);
     }
 
     /// Tries to a get a cached query. Returns nullptr on failure.
     CachedQuery* TryGet(VAddr addr) {
-        const u64 page = static_cast<u64>(addr) >> PAGE_SHIFT;
+        const u64 page = static_cast<u64>(addr) >> PAGE_BITS;
         const auto it = cached_queries.find(page);
         if (it == std::end(cached_queries)) {
             return nullptr;
@@ -268,7 +268,7 @@ private:
     }
 
     static constexpr std::uintptr_t PAGE_SIZE = 4096;
-    static constexpr unsigned PAGE_SHIFT = 12;
+    static constexpr unsigned PAGE_BITS = 12;
 
     Core::System& system;
     VideoCore::RasterizerInterface& rasterizer;
diff --git a/src/video_core/rasterizer_cache.cpp b/src/video_core/rasterizer_cache.cpp
deleted file mode 100644
index 093b2cdf4..000000000
--- a/src/video_core/rasterizer_cache.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-// Copyright 2018 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#include "video_core/rasterizer_cache.h"
-
-RasterizerCacheObject::~RasterizerCacheObject() = default;
diff --git a/src/video_core/rasterizer_cache.h b/src/video_core/rasterizer_cache.h
deleted file mode 100644
index 096ee337c..000000000
--- a/src/video_core/rasterizer_cache.h
+++ /dev/null
@@ -1,253 +0,0 @@
-// Copyright 2018 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#pragma once
-
-#include <mutex>
-#include <set>
-#include <unordered_map>
-
-#include <boost/icl/interval_map.hpp>
-#include <boost/range/iterator_range_core.hpp>
-
-#include "common/common_types.h"
-#include "core/settings.h"
-#include "video_core/gpu.h"
-#include "video_core/rasterizer_interface.h"
-
-class RasterizerCacheObject {
-public:
-    explicit RasterizerCacheObject(const VAddr cpu_addr) : cpu_addr{cpu_addr} {}
-
-    virtual ~RasterizerCacheObject();
-
-    VAddr GetCpuAddr() const {
-        return cpu_addr;
-    }
-
-    /// Gets the size of the shader in guest memory, required for cache management
-    virtual std::size_t GetSizeInBytes() const = 0;
-
-    /// Sets whether the cached object should be considered registered
-    void SetIsRegistered(bool registered) {
-        is_registered = registered;
-    }
-
-    /// Returns true if the cached object is registered
-    bool IsRegistered() const {
-        return is_registered;
-    }
-
-    /// Returns true if the cached object is dirty
-    bool IsDirty() const {
-        return is_dirty;
-    }
-
-    /// Returns ticks from when this cached object was last modified
-    u64 GetLastModifiedTicks() const {
-        return last_modified_ticks;
-    }
-
-    /// Marks an object as recently modified, used to specify whether it is clean or dirty
-    template <class T>
-    void MarkAsModified(bool dirty, T& cache) {
-        is_dirty = dirty;
-        last_modified_ticks = cache.GetModifiedTicks();
-    }
-
-    void SetMemoryMarked(bool is_memory_marked_) {
-        is_memory_marked = is_memory_marked_;
-    }
-
-    bool IsMemoryMarked() const {
-        return is_memory_marked;
-    }
-
-    void SetSyncPending(bool is_sync_pending_) {
-        is_sync_pending = is_sync_pending_;
-    }
-
-    bool IsSyncPending() const {
-        return is_sync_pending;
-    }
-
-private:
-    bool is_registered{};      ///< Whether the object is currently registered with the cache
-    bool is_dirty{};           ///< Whether the object is dirty (out of sync with guest memory)
-    bool is_memory_marked{};   ///< Whether the object is marking rasterizer memory.
-    bool is_sync_pending{};    ///< Whether the object is pending deletion.
-    u64 last_modified_ticks{}; ///< When the object was last modified, used for in-order flushing
-    VAddr cpu_addr{};          ///< Cpu address memory, unique from emulated virtual address space
-};
-
-template <class T>
-class RasterizerCache : NonCopyable {
-    friend class RasterizerCacheObject;
-
-public:
-    explicit RasterizerCache(VideoCore::RasterizerInterface& rasterizer) : rasterizer{rasterizer} {}
-
-    /// Write any cached resources overlapping the specified region back to memory
-    void FlushRegion(VAddr addr, std::size_t size) {
-        std::lock_guard lock{mutex};
-
-        const auto& objects{GetSortedObjectsFromRegion(addr, size)};
-        for (auto& object : objects) {
-            FlushObject(object);
-        }
-    }
-
-    /// Mark the specified region as being invalidated
-    void InvalidateRegion(VAddr addr, u64 size) {
-        std::lock_guard lock{mutex};
-
-        const auto& objects{GetSortedObjectsFromRegion(addr, size)};
-        for (auto& object : objects) {
-            if (!object->IsRegistered()) {
-                // Skip duplicates
-                continue;
-            }
-            Unregister(object);
-        }
-    }
-
-    void OnCPUWrite(VAddr addr, std::size_t size) {
-        std::lock_guard lock{mutex};
-
-        for (const auto& object : GetSortedObjectsFromRegion(addr, size)) {
-            if (object->IsRegistered()) {
-                UnmarkMemory(object);
-                object->SetSyncPending(true);
-                marked_for_unregister.emplace_back(object);
-            }
-        }
-    }
-
-    void SyncGuestHost() {
-        std::lock_guard lock{mutex};
-
-        for (const auto& object : marked_for_unregister) {
-            if (object->IsRegistered()) {
-                object->SetSyncPending(false);
-                Unregister(object);
-            }
-        }
-        marked_for_unregister.clear();
-    }
-
-    /// Invalidates everything in the cache
-    void InvalidateAll() {
-        std::lock_guard lock{mutex};
-
-        while (interval_cache.begin() != interval_cache.end()) {
-            Unregister(*interval_cache.begin()->second.begin());
-        }
-    }
-
-protected:
-    /// Tries to get an object from the cache with the specified cache address
-    T TryGet(VAddr addr) const {
-        const auto iter = map_cache.find(addr);
-        if (iter != map_cache.end())
-            return iter->second;
-        return nullptr;
-    }
-
-    /// Register an object into the cache
-    virtual void Register(const T& object) {
-        std::lock_guard lock{mutex};
-
-        object->SetIsRegistered(true);
-        interval_cache.add({GetInterval(object), ObjectSet{object}});
-        map_cache.insert({object->GetCpuAddr(), object});
-        rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), 1);
-        object->SetMemoryMarked(true);
-    }
-
-    /// Unregisters an object from the cache
-    virtual void Unregister(const T& object) {
-        std::lock_guard lock{mutex};
-
-        UnmarkMemory(object);
-        object->SetIsRegistered(false);
-        if (object->IsSyncPending()) {
-            marked_for_unregister.remove(object);
-            object->SetSyncPending(false);
-        }
-        const VAddr addr = object->GetCpuAddr();
-        interval_cache.subtract({GetInterval(object), ObjectSet{object}});
-        map_cache.erase(addr);
-    }
-
-    void UnmarkMemory(const T& object) {
-        if (!object->IsMemoryMarked()) {
-            return;
-        }
-        rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), -1);
-        object->SetMemoryMarked(false);
-    }
-
-    /// Returns a ticks counter used for tracking when cached objects were last modified
-    u64 GetModifiedTicks() {
-        std::lock_guard lock{mutex};
-
-        return ++modified_ticks;
-    }
-
-    virtual void FlushObjectInner(const T& object) = 0;
-
-    /// Flushes the specified object, updating appropriate cache state as needed
-    void FlushObject(const T& object) {
-        std::lock_guard lock{mutex};
-
-        if (!object->IsDirty()) {
-            return;
-        }
-        FlushObjectInner(object);
-        object->MarkAsModified(false, *this);
-    }
-
-    std::recursive_mutex mutex;
-
-private:
-    /// Returns a list of cached objects from the specified memory region, ordered by access time
-    std::vector<T> GetSortedObjectsFromRegion(VAddr addr, u64 size) {
-        if (size == 0) {
-            return {};
-        }
-
-        std::vector<T> objects;
-        const ObjectInterval interval{addr, addr + size};
-        for (auto& pair : boost::make_iterator_range(interval_cache.equal_range(interval))) {
-            for (auto& cached_object : pair.second) {
-                if (!cached_object) {
-                    continue;
-                }
-                objects.push_back(cached_object);
-            }
-        }
-
-        std::sort(objects.begin(), objects.end(), [](const T& a, const T& b) -> bool {
-            return a->GetLastModifiedTicks() < b->GetLastModifiedTicks();
-        });
-
-        return objects;
-    }
-
-    using ObjectSet = std::set<T>;
-    using ObjectCache = std::unordered_map<VAddr, T>;
-    using IntervalCache = boost::icl::interval_map<VAddr, ObjectSet>;
-    using ObjectInterval = typename IntervalCache::interval_type;
-
-    static auto GetInterval(const T& object) {
-        return ObjectInterval::right_open(object->GetCpuAddr(),
-                                          object->GetCpuAddr() + object->GetSizeInBytes());
-    }
-
-    ObjectCache map_cache;
-    IntervalCache interval_cache; ///< Cache of objects
-    u64 modified_ticks{};         ///< Counter of cache state ticks, used for in-order flushing
-    VideoCore::RasterizerInterface& rasterizer;
-    std::list<T> marked_for_unregister;
-};
diff --git a/src/video_core/renderer_base.cpp b/src/video_core/renderer_base.cpp
index 919d1f2d4..dfb06e87e 100644
--- a/src/video_core/renderer_base.cpp
+++ b/src/video_core/renderer_base.cpp
@@ -18,7 +18,7 @@ RendererBase::~RendererBase() = default;
 void RendererBase::RefreshBaseSettings() {
     UpdateCurrentFramebufferLayout();
 
-    renderer_settings.use_framelimiter = Settings::values.use_frame_limit;
+    renderer_settings.use_framelimiter = Settings::values.use_frame_limit.GetValue();
     renderer_settings.set_background_color = true;
 }
 
diff --git a/src/video_core/renderer_opengl/gl_arb_decompiler.cpp b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp
new file mode 100644
index 000000000..eb5158407
--- /dev/null
+++ b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp
@@ -0,0 +1,2073 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <variant>
+
+#include <fmt/format.h>
+
+#include "common/alignment.h"
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "video_core/renderer_opengl/gl_arb_decompiler.h"
+#include "video_core/renderer_opengl/gl_device.h"
+#include "video_core/shader/registry.h"
+#include "video_core/shader/shader_ir.h"
+
+// Predicates in the decompiled code follow the convention that -1 means true and 0 means false.
+// GLASM lacks booleans, so they have to be implemented as integers.
+// Using -1 for true is useful because both CMP.S and NOT.U can negate it, and CMP.S can be used to
+// select between two values, because -1 will be evaluated as true and 0 as false.
+
+namespace OpenGL {
+
+namespace {
+
+using Tegra::Engines::ShaderType;
+using Tegra::Shader::Attribute;
+using Tegra::Shader::PixelImap;
+using Tegra::Shader::Register;
+using namespace VideoCommon::Shader;
+using Operation = const OperationNode&;
+
+constexpr std::array INTERNAL_FLAG_NAMES = {"ZERO", "SIGN", "CARRY", "OVERFLOW"};
+
+char Swizzle(std::size_t component) { // Index 0, 1, 2, 3 selects 'x', 'y', 'z', 'w'.
+    ASSERT(component < 4);
+    return "xyzw"[component];
+}
+
+constexpr bool IsGenericAttribute(Attribute::Index index) { // Inclusive range check.
+    return Attribute::Index::Attribute_0 <= index && index <= Attribute::Index::Attribute_31;
+}
+
+u32 GetGenericAttributeIndex(Attribute::Index index) { // Offset of index from Attribute_0.
+    ASSERT(IsGenericAttribute(index)); // Only Attribute_0..Attribute_31 are expected here.
+    return static_cast<u32>(index) - static_cast<u32>(Attribute::Index::Attribute_0);
+}
+
+std::string_view Modifiers(Operation operation) { // ".PREC" for precise arithmetic, else "".
+    const auto* const arithmetic = std::get_if<MetaArithmetic>(&operation.GetMeta());
+    if (arithmetic != nullptr && arithmetic->precise) {
+        return ".PREC";
+    }
+    return "";
+}
+
+std::string_view GetInputFlags(PixelImap attribute) { // Non-empty results end with a space.
+    switch (attribute) {
+    case PixelImap::Perspective:
+        return "";
+    case PixelImap::Constant:
+        return "FLAT ";
+    case PixelImap::ScreenLinear:
+        return "NOPERSPECTIVE ";
+    case PixelImap::Unused:
+        break; // Leaves the switch so the report below the switch runs.
+    }
+    UNIMPLEMENTED_MSG("Unknown attribute usage index={}", static_cast<int>(attribute));
+    return {}; // Empty view as the fallback.
+}
+
+std::string_view ImageType(Tegra::Shader::ImageType image_type) { // Enum value -> text name.
+    switch (image_type) {
+    case Tegra::Shader::ImageType::Texture1D:
+        return "1D";
+    case Tegra::Shader::ImageType::TextureBuffer:
+        return "BUFFER";
+    case Tegra::Shader::ImageType::Texture1DArray:
+        return "ARRAY1D";
+    case Tegra::Shader::ImageType::Texture2D:
+        return "2D";
+    case Tegra::Shader::ImageType::Texture2DArray:
+        return "ARRAY2D";
+    case Tegra::Shader::ImageType::Texture3D:
+        return "3D";
+    }
+    UNREACHABLE(); // Only reached for a value that matches none of the cases above.
+    return {};
+}
+
+std::string_view StackName(MetaStackClass stack) { // "SSY" or "PBK" for the given stack class.
+    switch (stack) {
+    case MetaStackClass::Ssy:
+        return "SSY";
+    case MetaStackClass::Pbk:
+        return "PBK";
+    }
+    UNREACHABLE(); // Only reached for a value that matches neither case above.
+    return "";
+}
+
+std::string_view PrimitiveDescription(Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology topology) {
+    switch (topology) {
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Points:
+        return "POINTS";
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Lines:
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::LineStrip:
+        return "LINES"; // Line lists and line strips share one name.
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::LinesAdjacency:
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::LineStripAdjacency:
+        return "LINES_ADJACENCY";
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Triangles:
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleStrip:
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleFan:
+        return "TRIANGLES"; // Triangle lists, strips and fans share one name.
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TrianglesAdjacency:
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleStripAdjacency:
+        return "TRIANGLES_ADJACENCY";
+    default:
+        UNIMPLEMENTED_MSG("topology={}", static_cast<int>(topology));
+        return "POINTS"; // Fallback returned after reporting the unhandled topology.
+    }
+}
+
+std::string_view TopologyName(Tegra::Shader::OutputTopology topology) {
+    switch (topology) {
+    case Tegra::Shader::OutputTopology::PointList:
+        return "POINTS";
+    case Tegra::Shader::OutputTopology::LineStrip:
+        return "LINE_STRIP";
+    case Tegra::Shader::OutputTopology::TriangleStrip:
+        return "TRIANGLE_STRIP";
+    default:
+        UNIMPLEMENTED_MSG("Unknown output topology: {}", static_cast<u32>(topology));
+        return "POINTS"; // Fallback uses the same upper-case spelling as the PointList case.
+    }
+}
+
+std::string_view StageInputName(ShaderType stage) { // Text name associated with a stage.
+    switch (stage) {
+    case ShaderType::Vertex:
+    case ShaderType::Geometry:
+        return "vertex"; // Vertex and Geometry stages share this name.
+    case ShaderType::Fragment:
+        return "fragment";
+    case ShaderType::Compute:
+        return "invocation";
+    default:
+        UNREACHABLE(); // Any other stage value is not expected here.
+        return "";
+    }
+}
+
+// Builds the texture type name for a sampler: "BUFFER" for buffer samplers, otherwise an
+// optional "SHADOW" prefix, an optional "ARRAY" prefix and then the dimension ("1D", "2D", "3D"
+// or "CUBE"). A sampler type outside those four trips UNREACHABLE() and is named as "2D".
+std::string TextureType(const MetaTexture& meta) {
+    const auto& sampler = meta.sampler;
+    if (sampler.is_buffer) {
+        return "BUFFER";
+    }
+    // Prefixes accumulate in a fixed order: SHADOW first, then ARRAY.
+    std::string prefix = sampler.is_shadow ? "SHADOW" : "";
+    if (sampler.is_array) {
+        prefix += "ARRAY";
+    }
+    switch (sampler.type) {
+    case Tegra::Shader::TextureType::Texture1D:
+        return prefix + "1D";
+    case Tegra::Shader::TextureType::Texture2D:
+        return prefix + "2D";
+    case Tegra::Shader::TextureType::Texture3D:
+        return prefix + "3D";
+    case Tegra::Shader::TextureType::TextureCube:
+        return prefix + "CUBE";
+    }
+    // Not reached for the four sampler types handled above.
+    UNREACHABLE();
+    return prefix + "2D";
+}
+
+std::string GlobalMemoryName(const GlobalMemoryBase& base) { // "gmem<cbuf_index>_<cbuf_offset>"
+    return fmt::format("gmem{}_{}", base.cbuf_index, base.cbuf_offset);
+}
+
+class ARBDecompiler final {
+public:
+    explicit ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
+                           ShaderType stage, std::string_view identifier);
+
+    std::string Code() const { // Returns shader_source by value.
+        return shader_source;
+    }
+
+private:
+    void DeclareHeader();
+    void DeclareVertex();
+    void DeclareGeometry();
+    void DeclareFragment();
+    void DeclareCompute();
+    void DeclareInputAttributes();
+    void DeclareOutputAttributes();
+    void DeclareLocalMemory();
+    void DeclareGlobalMemory();
+    void DeclareConstantBuffers();
+    void DeclareRegisters();
+    void DeclareTemporaries();
+    void DeclarePredicates();
+    void DeclareInternalFlags();
+
+    void InitializeVariables();
+
+    void DecompileAST();
+    void DecompileBranchMode();
+
+    void VisitAST(const ASTNode& node);
+    std::string VisitExpression(const Expr& node);
+
+    void VisitBlock(const NodeBlock& bb);
+
+    std::string Visit(const Node& node);
+
+    std::pair<std::string, std::size_t> BuildCoords(Operation);
+    std::string BuildAoffi(Operation);
+    void Exit();
+
+    std::string Assign(Operation);
+    std::string Select(Operation);
+    std::string FClamp(Operation);
+    std::string FCastHalf0(Operation);
+    std::string FCastHalf1(Operation);
+    std::string FSqrt(Operation);
+    std::string FSwizzleAdd(Operation);
+    std::string HAdd2(Operation);
+    std::string HMul2(Operation);
+    std::string HFma2(Operation);
+    std::string HAbsolute(Operation);
+    std::string HNegate(Operation);
+    std::string HClamp(Operation);
+    std::string HCastFloat(Operation);
+    std::string HUnpack(Operation);
+    std::string HMergeF32(Operation);
+    std::string HMergeH0(Operation);
+    std::string HMergeH1(Operation);
+    std::string HPack2(Operation);
+    std::string LogicalAssign(Operation);
+    std::string LogicalPick2(Operation);
+    std::string LogicalAnd2(Operation);
+    std::string FloatOrdered(Operation);
+    std::string FloatUnordered(Operation);
+    std::string LogicalAddCarry(Operation);
+    std::string Texture(Operation);
+    std::string TextureGather(Operation);
+    std::string TextureQueryDimensions(Operation);
+    std::string TextureQueryLod(Operation);
+    std::string TexelFetch(Operation);
+    std::string TextureGradient(Operation);
+    std::string ImageLoad(Operation);
+    std::string ImageStore(Operation);
+    std::string Branch(Operation);
+    std::string BranchIndirect(Operation);
+    std::string PushFlowStack(Operation);
+    std::string PopFlowStack(Operation);
+    std::string Exit(Operation);
+    std::string Discard(Operation);
+    std::string EmitVertex(Operation);
+    std::string EndPrimitive(Operation);
+    std::string InvocationId(Operation);
+    std::string YNegate(Operation);
+    std::string ThreadId(Operation);
+    std::string ShuffleIndexed(Operation);
+    std::string Barrier(Operation);
+    std::string MemoryBarrierGroup(Operation);
+    std::string MemoryBarrierGlobal(Operation);
+
+    template <const std::string_view& op> // op is placed first in the line given to AddLine.
+    std::string Unary(Operation operation) {
+        std::string result = AllocTemporary();
+        AddLine("{}{} {}, {};", op, Modifiers(operation), result, Visit(operation[0]));
+        return result; // The allocated temporary is the value handed back to the caller.
+    }
+
+    template <const std::string_view& op> // Two-operand form: visits operation[0] and [1].
+    std::string Binary(Operation operation) {
+        std::string result = AllocTemporary();
+        AddLine("{}{} {}, {}, {};", op, Modifiers(operation), result, Visit(operation[0]),
+                Visit(operation[1])); // Both visits are call arguments: their order is unspecified.
+        return result;
+    }
+
+    template <const std::string_view& op> // Three-operand form: visits operation[0], [1], [2].
+    std::string Trinary(Operation operation) {
+        std::string result = AllocTemporary();
+        AddLine("{}{} {}, {}, {}, {};", op, Modifiers(operation), result, Visit(operation[0]),
+                Visit(operation[1]), Visit(operation[2])); // Visit order is unspecified by C++.
+        return result;
+    }
+
+    template <const std::string_view& op, bool unordered>
+    std::string FloatComparison(Operation operation) { // Returns a temporary set to 0 or -1.
+        std::string temporary = AllocTemporary();
+        AddLine("TRUNC.U.CC RC.x, {};", Binary<op>(operation)); // Binary visits both operands.
+        AddLine("MOV.S {}, 0;", temporary);
+        AddLine("MOV.S {} (NE.x), -1;", temporary);
+
+        const std::string op_a = Visit(operation[0]); // NOTE(review): operands are visited again.
+        const std::string op_b = Visit(operation[1]);
+        if constexpr (unordered) { // Also writes -1 when an operand compares unequal to itself.
+            AddLine("SNE.F RC.x, {}, {};", op_a, op_a); // Self-compare; presumably a NaN test.
+            AddLine("TRUNC.U.CC RC.x, RC.x;");
+            AddLine("MOV.S {} (NE.x), -1;", temporary);
+            AddLine("SNE.F RC.x, {}, {};", op_b, op_b);
+            AddLine("TRUNC.U.CC RC.x, RC.x;");
+            AddLine("MOV.S {} (NE.x), -1;", temporary);
+        } else if (op == SNE_F) { // Ordered SNE_F: writes 0 when an operand is unequal to itself.
+            AddLine("SNE.F RC.x, {}, {};", op_a, op_a);
+            AddLine("TRUNC.U.CC RC.x, RC.x;");
+            AddLine("MOV.S {} (NE.x), 0;", temporary);
+            AddLine("SNE.F RC.x, {}, {};", op_b, op_b);
+            AddLine("TRUNC.U.CC RC.x, RC.x;");
+            AddLine("MOV.S {} (NE.x), 0;", temporary);
+        }
+        return temporary;
+    }
+
+    template <const std::string_view& op, bool is_nan>
+    std::string HalfComparison(Operation operation) { // Returns tmp1; its .x/.y hold 0 or -1.
+        std::string tmp1 = AllocVectorTemporary();
+        const std::string tmp2 = AllocVectorTemporary();
+        const std::string op_a = Visit(operation[0]);
+        const std::string op_b = Visit(operation[1]);
+        AddLine("UP2H.F {}, {};", tmp1, op_a); // Presumably unpacks two halves of op_a into tmp1.
+        AddLine("UP2H.F {}, {};", tmp2, op_b);
+        AddLine("{} {}, {}, {};", op, tmp1, tmp1, tmp2); // Comparison result overwrites tmp1.
+        AddLine("TRUNC.U.CC RC.xy, {};", tmp1);
+        AddLine("MOV.S {}.xy, {{0, 0, 0, 0}};", tmp1); // "{{" and "}}" are escaped braces.
+        AddLine("MOV.S {}.x (NE.x), -1;", tmp1);
+        AddLine("MOV.S {}.y (NE.y), -1;", tmp1);
+        if constexpr (is_nan) { // Extra lines: op_a's NAN.x condition sets .x, op_b's sets .y.
+            AddLine("MOVC.F RC.x, {};", op_a);
+            AddLine("MOV.S {}.x (NAN.x), -1;", tmp1);
+            AddLine("MOVC.F RC.x, {};", op_b);
+            AddLine("MOV.S {}.y (NAN.x), -1;", tmp1);
+        }
+        return tmp1;
+    }
+
+    template <const std::string_view& op, const std::string_view& type>
+    std::string AtomicImage(Operation operation) {
+        // Gathers coordinates and values into vector temporaries, then emits a single ATOMIM.
+        const auto& image_meta = std::get<MetaImage>(operation.GetMeta());
+        const u32 binding = device.GetBaseBindings(stage).image + image_meta.image.index;
+        const std::size_t coord_count = operation.GetOperandsCount();
+        const std::size_t value_count = image_meta.values.size();
+        const std::string coords = AllocVectorTemporary();
+        const std::string values = AllocVectorTemporary();
+        for (std::size_t idx = 0; idx < coord_count; ++idx) {
+            AddLine("MOV.S {}.{}, {};", coords, Swizzle(idx), Visit(operation[idx]));
+        }
+        for (std::size_t idx = 0; idx < value_count; ++idx) {
+            AddLine("MOV.F {}.{}, {};", values, Swizzle(idx), Visit(image_meta.values[idx]));
+        }
+        // The instruction writes its result into coords.x, which is also what the caller gets.
+        AddLine("ATOMIM.{}.{} {}.x, {}, {}, image[{}], {};", op, type, coords, values, coords,
+                binding, ImageType(image_meta.image.type));
+        return coords + ".x";
+    }
+
+    template <const std::string_view& op, const std::string_view& type>
+    std::string Atomic(Operation operation) { // Emits ATOMB for a GmemNode, ATOMS for a SmemNode.
+        std::string result = AllocTemporary();
+        std::string target;
+        std::string_view mnemonic;
+        if (const auto global_node = std::get_if<GmemNode>(&*operation[0])) {
+            AddLine("SUB.U {}, {}, {};", result, Visit(global_node->GetRealAddress()),
+                    Visit(global_node->GetBaseAddress()));
+            target = fmt::format("{}[{}]", GlobalMemoryName(global_node->GetDescriptor()), result);
+            mnemonic = "ATOMB";
+        } else if (const auto shared_node = std::get_if<SmemNode>(&*operation[0])) {
+            target = fmt::format("shared_mem[{}]", Visit(shared_node->GetAddress()));
+            mnemonic = "ATOMS";
+        } else {
+            UNREACHABLE();
+            return "{0, 0, 0, 0}";
+        }
+        AddLine("{}.{}.{} {}, {}, {};", mnemonic, op, type, result, Visit(operation[1]), target);
+        return result;
+    }
+
+    /// Negates operand 0 into a fresh scalar temporary and returns that temporary.
+    template <char type>
+    std::string Negate(Operation operation) {
+        // Emits "MOV.<suffix> dst, -src;".
+        // The suffix is "F32" for type 'F'; any other type letter is used unchanged.
+        std::string negated = AllocTemporary();
+        const std::string suffix = type == 'F' ? std::string{"F32"} : std::string(1, type);
+        AddLine("MOV.{} {}, -{};", suffix, negated, Visit(operation[0]));
+        return negated;
+    }
+
+    template <char type>
+    std::string Absolute(Operation operation) { // Emits "MOV.<type> dst, |src|;" and returns dst.
+        std::string magnitude = AllocTemporary();
+        AddLine("MOV.{} {}, {};", type, magnitude, '|' + Visit(operation[0]) + '|');
+        return magnitude;
+    }
+
+    template <char type> // type: suffix letter appended to the MOV and BFI mnemonics
+    std::string BitfieldInsert(Operation operation) { // Loads operands 3 and 2 into .x/.y, then emits BFI.
+        const std::string temporary = AllocVectorTemporary();
+        AddLine("MOV.{} {}.x, {};", type, temporary, Visit(operation[3]));
+        AddLine("MOV.{} {}.y, {};", type, temporary, Visit(operation[2]));
+        AddLine("BFI.{} {}.x, {}, {}, {};", type, temporary, temporary, Visit(operation[1]),
+                Visit(operation[0])); // NOTE(review): order of these two Visit calls is unspecified
+        return fmt::format("{}.x", temporary); // the result is the .x lane of the vector temporary
+    }
+
+    template <char type>
+    std::string BitfieldExtract(Operation operation) { // Loads operands 2 and 1 into .x/.y, then BFE.
+        const std::string range = AllocVectorTemporary();
+        AddLine("MOV.{} {}.x, {};", type, range, Visit(operation[2]));
+        AddLine("MOV.{} {}.y, {};", type, range, Visit(operation[1]));
+        AddLine("BFE.{} {}.x, {}, {};", type, range, range, Visit(operation[0]));
+        return range + ".x"; // BFE overwrites the .x lane, which is what the caller receives
+    }
+
+    template <char swizzle>
+    std::string LocalInvocationId(Operation) { // Operand text only; no instruction is emitted.
+        return std::string{"invocation.localid."} + swizzle;
+    }
+
+    template <char swizzle>
+    std::string WorkGroupId(Operation) { // Operand text only; no instruction is emitted.
+        return std::string{"invocation.groupid."} + swizzle;
+    }
+
+    template <char c1, char c2>
+    std::string ThreadMask(Operation) { // Builds "<stage input>.thread<c1><c2>mask".
+        return fmt::format("{}.thread{}mask", StageInputName(stage), std::string{c1, c2});
+    }
+
+    template <typename... Args>
+    void AddExpression(std::string_view text, Args&&... args) { // Appends formatted text, no newline.
+        shader_source.append(fmt::format(text, std::forward<Args>(args)...));
+    }
+
+    template <typename... Args>
+    void AddLine(std::string_view text, Args&&... args) { // Appends formatted text plus '\n'.
+        AddExpression(text, std::forward<Args>(args)...);
+        shader_source.push_back('\n');
+    }
+
+    std::string AllocTemporary() { // Hands out the next temporary as a scalar operand, "T<n>.x".
+        if (num_temporaries + 1 > max_temporaries) { max_temporaries = num_temporaries + 1; }
+        return fmt::format("T{}.x", num_temporaries++);
+    }
+
+    std::string AllocVectorTemporary() { // Hands out the next temporary as a whole vector, "T<n>".
+        max_temporaries = max_temporaries > num_temporaries ? max_temporaries : num_temporaries + 1;
+        return fmt::format("T{}", num_temporaries++);
+    }
+
+    void ResetTemporaries() noexcept { // Later allocations start again at T0; max_temporaries is kept.
+        num_temporaries = {};
+    }
+
+    const Device& device; // bound from the constructor argument of the same name
+    const ShaderIR& ir; // the shader IR being decompiled
+    const Registry& registry; // queried for graphics/compute info by the Declare* helpers
+    const ShaderType stage; // selects which stage-specific declarations are emitted
+
+    std::size_t num_temporaries = 0; // next T<n> index to hand out; zeroed by ResetTemporaries()
+    std::size_t max_temporaries = 0; // high-water mark; DeclareTemporaries() declares this many
+
+    std::string shader_source; // program text accumulated by AddExpression()/AddLine()
+
+    static constexpr std::string_view ADD_F32 = "ADD.F32"; // Mnemonic text; these constants are passed by reference as template arguments below
+    static constexpr std::string_view ADD_S = "ADD.S";
+    static constexpr std::string_view ADD_U = "ADD.U";
+    static constexpr std::string_view MUL_F32 = "MUL.F32";
+    static constexpr std::string_view MUL_S = "MUL.S";
+    static constexpr std::string_view MUL_U = "MUL.U";
+    static constexpr std::string_view DIV_F32 = "DIV.F32";
+    static constexpr std::string_view DIV_S = "DIV.S";
+    static constexpr std::string_view DIV_U = "DIV.U";
+    static constexpr std::string_view MAD_F32 = "MAD.F32";
+    static constexpr std::string_view RSQ_F32 = "RSQ.F32";
+    static constexpr std::string_view COS_F32 = "COS.F32";
+    static constexpr std::string_view SIN_F32 = "SIN.F32";
+    static constexpr std::string_view EX2_F32 = "EX2.F32";
+    static constexpr std::string_view LG2_F32 = "LG2.F32";
+    static constexpr std::string_view SLT_F = "SLT.F32"; // note: the S??_F names carry the ".F32" suffix in their text
+    static constexpr std::string_view SLT_S = "SLT.S";
+    static constexpr std::string_view SLT_U = "SLT.U";
+    static constexpr std::string_view SEQ_F = "SEQ.F32";
+    static constexpr std::string_view SEQ_S = "SEQ.S";
+    static constexpr std::string_view SEQ_U = "SEQ.U";
+    static constexpr std::string_view SLE_F = "SLE.F32";
+    static constexpr std::string_view SLE_S = "SLE.S";
+    static constexpr std::string_view SLE_U = "SLE.U";
+    static constexpr std::string_view SGT_F = "SGT.F32";
+    static constexpr std::string_view SGT_S = "SGT.S";
+    static constexpr std::string_view SGT_U = "SGT.U";
+    static constexpr std::string_view SNE_F = "SNE.F32"; // also compared against at run time in FloatComparison
+    static constexpr std::string_view SNE_S = "SNE.S";
+    static constexpr std::string_view SNE_U = "SNE.U";
+    static constexpr std::string_view SGE_F = "SGE.F32";
+    static constexpr std::string_view SGE_S = "SGE.S";
+    static constexpr std::string_view SGE_U = "SGE.U";
+    static constexpr std::string_view AND_S = "AND.S";
+    static constexpr std::string_view AND_U = "AND.U";
+    static constexpr std::string_view TRUNC_F = "TRUNC.F";
+    static constexpr std::string_view TRUNC_S = "TRUNC.S";
+    static constexpr std::string_view TRUNC_U = "TRUNC.U";
+    static constexpr std::string_view SHL_S = "SHL.S";
+    static constexpr std::string_view SHL_U = "SHL.U";
+    static constexpr std::string_view SHR_S = "SHR.S";
+    static constexpr std::string_view SHR_U = "SHR.U";
+    static constexpr std::string_view OR_S = "OR.S";
+    static constexpr std::string_view OR_U = "OR.U";
+    static constexpr std::string_view XOR_S = "XOR.S";
+    static constexpr std::string_view XOR_U = "XOR.U";
+    static constexpr std::string_view NOT_S = "NOT.S";
+    static constexpr std::string_view NOT_U = "NOT.U";
+    static constexpr std::string_view BTC_S = "BTC.S";
+    static constexpr std::string_view BTC_U = "BTC.U";
+    static constexpr std::string_view BTFM_S = "BTFM.S";
+    static constexpr std::string_view BTFM_U = "BTFM.U";
+    static constexpr std::string_view ROUND_F = "ROUND.F";
+    static constexpr std::string_view CEIL_F = "CEIL.F";
+    static constexpr std::string_view FLR_F = "FLR.F";
+    static constexpr std::string_view I2F_S = "I2F.S";
+    static constexpr std::string_view I2F_U = "I2F.U";
+    static constexpr std::string_view MIN_F = "MIN.F";
+    static constexpr std::string_view MIN_S = "MIN.S";
+    static constexpr std::string_view MIN_U = "MIN.U";
+    static constexpr std::string_view MAX_F = "MAX.F";
+    static constexpr std::string_view MAX_S = "MAX.S";
+    static constexpr std::string_view MAX_U = "MAX.U";
+    static constexpr std::string_view MOV_U = "MOV.U";
+    static constexpr std::string_view TGBALLOT_U = "TGBALLOT.U";
+    static constexpr std::string_view TGALL_U = "TGALL.U";
+    static constexpr std::string_view TGANY_U = "TGANY.U";
+    static constexpr std::string_view TGEQ_U = "TGEQ.U";
+    static constexpr std::string_view EXCH = "EXCH"; // from here on: operation/type parts that Atomic()/AtomicImage() join with '.'
+    static constexpr std::string_view ADD = "ADD";
+    static constexpr std::string_view MIN = "MIN";
+    static constexpr std::string_view MAX = "MAX";
+    static constexpr std::string_view AND = "AND";
+    static constexpr std::string_view OR = "OR";
+    static constexpr std::string_view XOR = "XOR";
+    static constexpr std::string_view U32 = "U32";
+    static constexpr std::string_view S32 = "S32";
+
+    static constexpr std::size_t NUM_ENTRIES = static_cast<std::size_t>(OperationCode::Amount); // one slot per operation code
+    using DecompilerType = std::string (ARBDecompiler::*)(Operation); // member handler: emits code, returns the result operand text
+    static constexpr std::array<DecompilerType, NUM_ENTRIES> OPERATION_DECOMPILERS = { // indexed by static_cast<std::size_t>(OperationCode) in Visit(), so entry order must follow the enum
+        &ARBDecompiler::Assign,
+
+        &ARBDecompiler::Select,
+
+        &ARBDecompiler::Binary<ADD_F32>,
+        &ARBDecompiler::Binary<MUL_F32>,
+        &ARBDecompiler::Binary<DIV_F32>,
+        &ARBDecompiler::Trinary<MAD_F32>,
+        &ARBDecompiler::Negate<'F'>,
+        &ARBDecompiler::Absolute<'F'>,
+        &ARBDecompiler::FClamp,
+        &ARBDecompiler::FCastHalf0,
+        &ARBDecompiler::FCastHalf1,
+        &ARBDecompiler::Binary<MIN_F>,
+        &ARBDecompiler::Binary<MAX_F>,
+        &ARBDecompiler::Unary<COS_F32>,
+        &ARBDecompiler::Unary<SIN_F32>,
+        &ARBDecompiler::Unary<EX2_F32>,
+        &ARBDecompiler::Unary<LG2_F32>,
+        &ARBDecompiler::Unary<RSQ_F32>,
+        &ARBDecompiler::FSqrt,
+        &ARBDecompiler::Unary<ROUND_F>,
+        &ARBDecompiler::Unary<FLR_F>,
+        &ARBDecompiler::Unary<CEIL_F>,
+        &ARBDecompiler::Unary<TRUNC_F>,
+        &ARBDecompiler::Unary<I2F_S>,
+        &ARBDecompiler::Unary<I2F_U>,
+        &ARBDecompiler::FSwizzleAdd,
+
+        &ARBDecompiler::Binary<ADD_S>,
+        &ARBDecompiler::Binary<MUL_S>,
+        &ARBDecompiler::Binary<DIV_S>,
+        &ARBDecompiler::Negate<'S'>,
+        &ARBDecompiler::Absolute<'S'>,
+        &ARBDecompiler::Binary<MIN_S>,
+        &ARBDecompiler::Binary<MAX_S>,
+
+        &ARBDecompiler::Unary<TRUNC_S>,
+        &ARBDecompiler::Unary<MOV_U>,
+        &ARBDecompiler::Binary<SHL_S>,
+        &ARBDecompiler::Binary<SHR_U>,
+        &ARBDecompiler::Binary<SHR_S>,
+        &ARBDecompiler::Binary<AND_S>,
+        &ARBDecompiler::Binary<OR_S>,
+        &ARBDecompiler::Binary<XOR_S>,
+        &ARBDecompiler::Unary<NOT_S>,
+        &ARBDecompiler::BitfieldInsert<'S'>,
+        &ARBDecompiler::BitfieldExtract<'S'>,
+        &ARBDecompiler::Unary<BTC_S>,
+        &ARBDecompiler::Unary<BTFM_S>,
+
+        &ARBDecompiler::Binary<ADD_U>,
+        &ARBDecompiler::Binary<MUL_U>,
+        &ARBDecompiler::Binary<DIV_U>,
+        &ARBDecompiler::Binary<MIN_U>,
+        &ARBDecompiler::Binary<MAX_U>,
+        &ARBDecompiler::Unary<TRUNC_U>,
+        &ARBDecompiler::Unary<MOV_U>,
+        &ARBDecompiler::Binary<SHL_U>,
+        &ARBDecompiler::Binary<SHR_U>,
+        &ARBDecompiler::Binary<SHR_U>,
+        &ARBDecompiler::Binary<AND_U>,
+        &ARBDecompiler::Binary<OR_U>,
+        &ARBDecompiler::Binary<XOR_U>,
+        &ARBDecompiler::Unary<NOT_U>,
+        &ARBDecompiler::BitfieldInsert<'U'>,
+        &ARBDecompiler::BitfieldExtract<'U'>,
+        &ARBDecompiler::Unary<BTC_U>,
+        &ARBDecompiler::Unary<BTFM_U>,
+
+        &ARBDecompiler::HAdd2,
+        &ARBDecompiler::HMul2,
+        &ARBDecompiler::HFma2,
+        &ARBDecompiler::HAbsolute,
+        &ARBDecompiler::HNegate,
+        &ARBDecompiler::HClamp,
+        &ARBDecompiler::HCastFloat,
+        &ARBDecompiler::HUnpack,
+        &ARBDecompiler::HMergeF32,
+        &ARBDecompiler::HMergeH0,
+        &ARBDecompiler::HMergeH1,
+        &ARBDecompiler::HPack2,
+
+        &ARBDecompiler::LogicalAssign,
+        &ARBDecompiler::Binary<AND_U>,
+        &ARBDecompiler::Binary<OR_U>,
+        &ARBDecompiler::Binary<XOR_U>,
+        &ARBDecompiler::Unary<NOT_U>,
+        &ARBDecompiler::LogicalPick2,
+        &ARBDecompiler::LogicalAnd2,
+
+        &ARBDecompiler::FloatComparison<SLT_F, false>,
+        &ARBDecompiler::FloatComparison<SEQ_F, false>,
+        &ARBDecompiler::FloatComparison<SLE_F, false>,
+        &ARBDecompiler::FloatComparison<SGT_F, false>,
+        &ARBDecompiler::FloatComparison<SNE_F, false>,
+        &ARBDecompiler::FloatComparison<SGE_F, false>,
+        &ARBDecompiler::FloatOrdered,
+        &ARBDecompiler::FloatUnordered,
+        &ARBDecompiler::FloatComparison<SLT_F, true>,
+        &ARBDecompiler::FloatComparison<SEQ_F, true>,
+        &ARBDecompiler::FloatComparison<SLE_F, true>,
+        &ARBDecompiler::FloatComparison<SGT_F, true>,
+        &ARBDecompiler::FloatComparison<SNE_F, true>,
+        &ARBDecompiler::FloatComparison<SGE_F, true>,
+
+        &ARBDecompiler::Binary<SLT_S>,
+        &ARBDecompiler::Binary<SEQ_S>,
+        &ARBDecompiler::Binary<SLE_S>,
+        &ARBDecompiler::Binary<SGT_S>,
+        &ARBDecompiler::Binary<SNE_S>,
+        &ARBDecompiler::Binary<SGE_S>,
+
+        &ARBDecompiler::Binary<SLT_U>,
+        &ARBDecompiler::Binary<SEQ_U>,
+        &ARBDecompiler::Binary<SLE_U>,
+        &ARBDecompiler::Binary<SGT_U>,
+        &ARBDecompiler::Binary<SNE_U>,
+        &ARBDecompiler::Binary<SGE_U>,
+
+        &ARBDecompiler::LogicalAddCarry,
+
+        &ARBDecompiler::HalfComparison<SLT_F, false>,
+        &ARBDecompiler::HalfComparison<SEQ_F, false>,
+        &ARBDecompiler::HalfComparison<SLE_F, false>,
+        &ARBDecompiler::HalfComparison<SGT_F, false>,
+        &ARBDecompiler::HalfComparison<SNE_F, false>,
+        &ARBDecompiler::HalfComparison<SGE_F, false>,
+        &ARBDecompiler::HalfComparison<SLT_F, true>,
+        &ARBDecompiler::HalfComparison<SEQ_F, true>,
+        &ARBDecompiler::HalfComparison<SLE_F, true>,
+        &ARBDecompiler::HalfComparison<SGT_F, true>,
+        &ARBDecompiler::HalfComparison<SNE_F, true>,
+        &ARBDecompiler::HalfComparison<SGE_F, true>,
+
+        &ARBDecompiler::Texture,
+        &ARBDecompiler::Texture,
+        &ARBDecompiler::TextureGather,
+        &ARBDecompiler::TextureQueryDimensions,
+        &ARBDecompiler::TextureQueryLod,
+        &ARBDecompiler::TexelFetch,
+        &ARBDecompiler::TextureGradient,
+
+        &ARBDecompiler::ImageLoad,
+        &ARBDecompiler::ImageStore,
+
+        &ARBDecompiler::AtomicImage<ADD, U32>,
+        &ARBDecompiler::AtomicImage<AND, U32>,
+        &ARBDecompiler::AtomicImage<OR, U32>,
+        &ARBDecompiler::AtomicImage<XOR, U32>,
+        &ARBDecompiler::AtomicImage<EXCH, U32>,
+
+        &ARBDecompiler::Atomic<EXCH, U32>,
+        &ARBDecompiler::Atomic<ADD, U32>,
+        &ARBDecompiler::Atomic<MIN, U32>,
+        &ARBDecompiler::Atomic<MAX, U32>,
+        &ARBDecompiler::Atomic<AND, U32>,
+        &ARBDecompiler::Atomic<OR, U32>,
+        &ARBDecompiler::Atomic<XOR, U32>,
+
+        &ARBDecompiler::Atomic<EXCH, S32>,
+        &ARBDecompiler::Atomic<ADD, S32>,
+        &ARBDecompiler::Atomic<MIN, S32>,
+        &ARBDecompiler::Atomic<MAX, S32>,
+        &ARBDecompiler::Atomic<AND, S32>,
+        &ARBDecompiler::Atomic<OR, S32>,
+        &ARBDecompiler::Atomic<XOR, S32>,
+
+        &ARBDecompiler::Atomic<ADD, U32>,
+        &ARBDecompiler::Atomic<MIN, U32>,
+        &ARBDecompiler::Atomic<MAX, U32>,
+        &ARBDecompiler::Atomic<AND, U32>,
+        &ARBDecompiler::Atomic<OR, U32>,
+        &ARBDecompiler::Atomic<XOR, U32>,
+
+        &ARBDecompiler::Atomic<ADD, S32>,
+        &ARBDecompiler::Atomic<MIN, S32>,
+        &ARBDecompiler::Atomic<MAX, S32>,
+        &ARBDecompiler::Atomic<AND, S32>,
+        &ARBDecompiler::Atomic<OR, S32>,
+        &ARBDecompiler::Atomic<XOR, S32>,
+
+        &ARBDecompiler::Branch,
+        &ARBDecompiler::BranchIndirect,
+        &ARBDecompiler::PushFlowStack,
+        &ARBDecompiler::PopFlowStack,
+        &ARBDecompiler::Exit,
+        &ARBDecompiler::Discard,
+
+        &ARBDecompiler::EmitVertex,
+        &ARBDecompiler::EndPrimitive,
+
+        &ARBDecompiler::InvocationId,
+        &ARBDecompiler::YNegate,
+        &ARBDecompiler::LocalInvocationId<'x'>,
+        &ARBDecompiler::LocalInvocationId<'y'>,
+        &ARBDecompiler::LocalInvocationId<'z'>,
+        &ARBDecompiler::WorkGroupId<'x'>,
+        &ARBDecompiler::WorkGroupId<'y'>,
+        &ARBDecompiler::WorkGroupId<'z'>,
+
+        &ARBDecompiler::Unary<TGBALLOT_U>,
+        &ARBDecompiler::Unary<TGALL_U>,
+        &ARBDecompiler::Unary<TGANY_U>,
+        &ARBDecompiler::Unary<TGEQ_U>,
+
+        &ARBDecompiler::ThreadId,
+        &ARBDecompiler::ThreadMask<'e', 'q'>,
+        &ARBDecompiler::ThreadMask<'g', 'e'>,
+        &ARBDecompiler::ThreadMask<'g', 't'>,
+        &ARBDecompiler::ThreadMask<'l', 'e'>,
+        &ARBDecompiler::ThreadMask<'l', 't'>,
+        &ARBDecompiler::ShuffleIndexed,
+
+        &ARBDecompiler::Barrier,
+        &ARBDecompiler::MemoryBarrierGroup,
+        &ARBDecompiler::MemoryBarrierGlobal,
+    };
+};
+
+ARBDecompiler::ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
+                             ShaderType stage, std::string_view identifier)
+    : device{device}, ir{ir}, registry{registry}, stage{stage} { // NOTE: identifier is unused here
+    // Body first: the declarations emitted below depend on it (e.g. max_temporaries).
+    AddLine("TEMP RC;");
+    AddLine("TEMP FSWZA[4];");
+    AddLine("TEMP FSWZB[4];");
+    if (ir.IsDecompiled()) {
+        DecompileAST();
+    } else {
+        DecompileBranchMode();
+    }
+    AddLine("END");
+    const std::string code = std::move(shader_source); // set the generated body aside
+    shader_source.clear(); // a moved-from std::string is valid but unspecified; restart from empty
+    DeclareHeader();
+    DeclareVertex();
+    DeclareGeometry();
+    DeclareFragment();
+    DeclareCompute();
+    DeclareInputAttributes();
+    DeclareOutputAttributes();
+    DeclareLocalMemory();
+    DeclareGlobalMemory();
+    DeclareConstantBuffers();
+    DeclareRegisters();
+    DeclareTemporaries();
+    DeclarePredicates();
+    DeclareInternalFlags();
+    // Final program text: header and declarations, followed by the body generated above.
+    shader_source += code;
+}
+
+std::string_view HeaderStageName(ShaderType stage) {
+    // Two-letter program kind that DeclareHeader() inserts between "!!NV" and "5.0".
+    // Any stage other than the four handled below is reported through UNREACHABLE().
+    if (stage == ShaderType::Vertex) {
+        return "vp";
+    } else if (stage == ShaderType::Geometry) {
+        return "gp";
+    } else if (stage == ShaderType::Fragment) {
+        return "fp";
+    } else if (stage == ShaderType::Compute) {
+        return "cp";
+    }
+    UNREACHABLE();
+    return "";
+}
+
+void ARBDecompiler::DeclareHeader() {
+    // Program header line followed by the OPTION lines; the last four depend on stage/device.
+    AddLine("!!NV{}5.0", HeaderStageName(stage));
+    // NV_internal allows us to cheat on some instructions like TXL with SHADOWARRAY2D
+    AddLine("OPTION NV_internal;");
+    AddLine("OPTION NV_gpu_program_fp64;");
+    AddLine("OPTION NV_shader_storage_buffer;");
+    AddLine("OPTION NV_shader_thread_group;");
+    const bool wants_shuffle = ir.UsesWarps() && device.HasWarpIntrinsics();
+    if (wants_shuffle) {
+        AddLine("OPTION NV_shader_thread_shuffle;");
+    }
+    if (stage == ShaderType::Vertex && device.HasNvViewportArray2()) {
+        AddLine("OPTION NV_viewport_array2;");
+    }
+    if (stage == ShaderType::Fragment) {
+        AddLine("OPTION ARB_draw_buffers;");
+    }
+    if (device.HasImageLoadFormatted()) {
+        AddLine("OPTION EXT_shader_image_load_formatted;");
+    }
+}
+
+void ARBDecompiler::DeclareVertex() {
+    // Only the vertex stage declares the result_clip output array.
+    if (stage == ShaderType::Vertex) {
+        AddLine("OUTPUT result_clip[] = {{ result.clip[0..7] }};");
+    }
+}
+
+void ARBDecompiler::DeclareGeometry() {
+    // Geometry stage only: input/output primitive kinds, the output vertex count and position.
+    if (stage == ShaderType::Geometry) {
+        const auto& graphics = registry.GetGraphicsInfo();
+        const auto& program_header = ir.GetHeader();
+        AddLine("PRIMITIVE_IN {};", PrimitiveDescription(graphics.primitive_topology));
+        AddLine("PRIMITIVE_OUT {};", TopologyName(program_header.common3.output_topology));
+        AddLine("VERTICES_OUT {};", program_header.common4.max_output_vertices.Value());
+        AddLine("ATTRIB vertex_position = vertex.position;");
+    }
+}
+
+void ARBDecompiler::DeclareFragment() {
+    // Fragment stage only: names each color output result_color<N>, emitted from 7 down to 0.
+    if (stage != ShaderType::Fragment) {
+        return;
+    }
+
+    // Targets 7..1 use the indexed binding result.color[N].
+    for (u32 target = 7; target >= 1; --target) {
+        AddLine("OUTPUT result_color{} = result.color[{}];", target, target);
+    }
+    // Target 0 is bound to the unindexed result.color.
+    AddLine("OUTPUT result_color0 = result.color;");
+}
+
+void ARBDecompiler::DeclareCompute() { // Compute stage only: group size and shared memory.
+    if (stage != ShaderType::Compute) {
+        return;
+    }
+    const ComputeInfo& compute = registry.GetComputeInfo();
+    const auto& group = compute.workgroup_size;
+    AddLine("GROUP_SIZE {} {} {};", group[0], group[1], group[2]);
+    if (compute.shared_memory_size_in_words > 0) {
+        const u32 shared_bytes = compute.shared_memory_size_in_words * 4; // word count scaled by 4
+        AddLine("SHARED_MEMORY {};", shared_bytes);
+        AddLine("SHARED shared_mem[] = {{program.sharedmem}};");
+    }
+}
+
+void ARBDecompiler::DeclareInputAttributes() { // Declares in_attr<N> for each generic input.
+    if (stage == ShaderType::Compute) {
+        return;
+    }
+    const std::string_view stage_name = StageInputName(stage);
+    for (const auto attribute : ir.GetInputAttributes()) {
+        if (!IsGenericAttribute(attribute)) {
+            continue;
+        }
+        const u32 index = GetGenericAttributeIndex(attribute);
+        // Fragment inputs are prefixed with whatever GetInputFlags returns for their pixel imap.
+        std::string_view suffix;
+        if (stage == ShaderType::Fragment) {
+            const auto input_mode{ir.GetHeader().ps.GetPixelImap(index)};
+            if (input_mode == PixelImap::Unused) {
+                continue; // skip only this attribute; later attributes must still be declared
+            }
+            suffix = GetInputFlags(input_mode);
+        }
+        AddLine("{}ATTRIB in_attr{}[] = {{ {}.attrib[{}..{}] }};", suffix, index, stage_name, index,
+                index);
+    }
+}
+
+void ARBDecompiler::DeclareOutputAttributes() {
+    // Every generic output attribute becomes an OUTPUT array named out_attr<N>; not for compute.
+    if (stage == ShaderType::Compute) {
+        return;
+    }
+    for (const auto output : ir.GetOutputAttributes()) {
+        if (IsGenericAttribute(output)) {
+            const u32 slot = GetGenericAttributeIndex(output);
+            AddLine("OUTPUT out_attr{}[] = {{ result.attrib[{}..{}] }};", slot, slot, slot);
+        }
+    }
+}
+
+void ARBDecompiler::DeclareLocalMemory() {
+    // Compute shaders report local memory in words (scaled by 4 here); every other stage takes
+    // the size from the shader header.
+    const u64 size = stage == ShaderType::Compute
+                         ? static_cast<u64>(registry.GetComputeInfo().local_memory_size_in_words * 4ULL)
+                         : static_cast<u64>(ir.GetHeader().GetLocalMemorySize());
+
+    if (size != 0) {
+        // One lmem element is declared per 4 units of size, rounded up.
+        const u64 element_count = Common::AlignUp(size, 4) / 4;
+        AddLine("TEMP lmem[{}];", element_count);
+    }
+}
+
+void ARBDecompiler::DeclareGlobalMemory() {
+    // Each global memory entry becomes a STORAGE array bound to consecutive program.storage slots.
+    u32 slot = 0; // device.GetBaseBindings(stage).shader_storage_buffer;
+    for (const auto& entry : ir.GetGlobalMemory()) {
+        AddLine("STORAGE {}[] = {{ program.storage[{}] }};", GlobalMemoryName(entry.first),
+                slot++);
+    }
+}
+
+void ARBDecompiler::DeclareConstantBuffers() {
+    // cbuf<key> arrays are bound to program.buffer slots numbered from 0 in iteration order.
+    u32 slot = 0;
+    for (const auto& entry : ir.GetConstantBuffers()) {
+        AddLine("CBUFFER cbuf{}[] = {{ program.buffer[{}] }};", entry.first, slot++);
+    }
+}
+
+void ARBDecompiler::DeclareRegisters() { // One TEMP named R<index> per register the IR reports.
+    for (const u32 index : ir.GetRegisters()) {
+        AddLine("TEMP R{};", index);
+    }
+}
+
+void ARBDecompiler::DeclareTemporaries() { // Declares T0 up to T<max_temporaries - 1>.
+    for (std::size_t index = 0; index != max_temporaries; ++index) {
+        AddLine("TEMP T{};", index);
+    }
+}
+
+void ARBDecompiler::DeclarePredicates() { // One TEMP named P<value> per predicate the IR reports.
+    for (const Tegra::Shader::Pred predicate : ir.GetPredicates()) {
+        AddLine("TEMP P{};", static_cast<u64>(predicate));
+    }
+}
+
+void ARBDecompiler::DeclareInternalFlags() { // One TEMP per entry of INTERNAL_FLAG_NAMES.
+    for (const char* flag_name : INTERNAL_FLAG_NAMES) {
+        AddLine("TEMP {};", flag_name);
+    }
+}
+
+void ARBDecompiler::InitializeVariables() {
+    // Fills the FSWZA/FSWZB tables declared by the constructor, then seeds outputs and registers.
+    static constexpr int FSWZA_VALUES[] = {-1, 1, -1, 0};
+    static constexpr int FSWZB_VALUES[] = {-1, -1, 1, -1};
+    for (std::size_t i = 0; i < 4; ++i) {
+        AddLine("MOV.F32 FSWZA[{}], {};", i, FSWZA_VALUES[i]);
+    }
+    for (std::size_t i = 0; i < 4; ++i) {
+        AddLine("MOV.F32 FSWZB[{}], {};", i, FSWZB_VALUES[i]);
+    }
+    // Position and generic outputs start as (0, 0, 0, 1); registers and predicates start zeroed.
+    if (stage == ShaderType::Vertex || stage == ShaderType::Geometry) {
+        AddLine("MOV.F result.position, {{0, 0, 0, 1}};");
+    }
+    for (const auto output : ir.GetOutputAttributes()) {
+        if (IsGenericAttribute(output)) {
+            const u32 slot = GetGenericAttributeIndex(output);
+            AddLine("MOV.F result.attrib[{}], {{0, 0, 0, 1}};", slot);
+        }
+    }
+    for (const u32 index : ir.GetRegisters()) {
+        AddLine("MOV.F R{}, {{0, 0, 0, 0}};", index);
+    }
+    for (const Tegra::Shader::Pred predicate : ir.GetPredicates()) {
+        AddLine("MOV.U P{}, {{0, 0, 0, 0}};", static_cast<u64>(predicate));
+    }
+}
+
+void ARBDecompiler::DecompileAST() {
+    // Flow variables F<n> are all declared first, then all zeroed, before anything else.
+    const u32 flow_variable_count = ir.GetASTNumVariables();
+    for (u32 index = 0; index != flow_variable_count; ++index) {
+        AddLine("TEMP F{};", index);
+    }
+    for (u32 index = 0; index != flow_variable_count; ++index) {
+        AddLine("MOV.U F{}, {{0, 0, 0, 0}};", index);
+    }
+    InitializeVariables();
+    // The program body is emitted by walking the AST from its root node.
+    VisitAST(ir.GetASTProgram());
+}
+
+void ARBDecompiler::DecompileBranchMode() {
+    // Emulates jumps with a PC register: a REP loop re-dispatches on every iteration to the
+    // block whose address equals PC.x, using one nested IF/ELSE level per basic block.
+    static constexpr u32 FLOW_STACK_SIZE = 20;
+    if (!ir.IsFlowStackDisabled()) {
+        AddLine("TEMP SSY[{}];", FLOW_STACK_SIZE);
+        AddLine("TEMP PBK[{}];", FLOW_STACK_SIZE);
+        AddLine("TEMP SSY_TOP;");
+        AddLine("TEMP PBK_TOP;");
+    }
+    AddLine("TEMP PC;");
+    if (!ir.IsFlowStackDisabled()) {
+        AddLine("MOV.U SSY_TOP.x, 0;");
+        AddLine("MOV.U PBK_TOP.x, 0;");
+    }
+
+    InitializeVariables();
+
+    // NOTE(review): begin() is dereferenced unchecked; this assumes at least one basic block.
+    const auto basic_block_end = ir.GetBasicBlocks().end();
+    auto basic_block_it = ir.GetBasicBlocks().begin();
+    const u32 first_address = basic_block_it->first;
+    AddLine("MOV.U PC.x, {};", first_address);
+    AddLine("REP;");
+
+    std::size_t num_blocks = 0;
+    while (basic_block_it != basic_block_end) {
+        const auto& [address, bb] = *basic_block_it;
+        ++num_blocks;
+
+        AddLine("SEQ.S.CC RC.x, PC.x, {};", address);
+        AddLine("IF NE.x;");
+        VisitBlock(bb);
+        ++basic_block_it;
+
+        if (basic_block_it != basic_block_end) {
+            // Only inspect the last node when there is one; an empty block falls through.
+            const auto last_op =
+                bb.size() > 0 ? std::get_if<OperationNode>(&*bb[bb.size() - 1]) : nullptr;
+            if (!last_op || last_op->GetCode() != OperationCode::Branch) {
+                const u32 next_address = basic_block_it->first;
+                AddLine("MOV.U PC.x, {};", next_address);
+                AddLine("CONT;");
+            }
+        }
+
+        AddLine("ELSE;");
+    }
+    AddLine("RET;");
+    while (num_blocks--) {
+        AddLine("ENDIF;");
+    }
+
+    AddLine("ENDREP;");
+}
+
+void ARBDecompiler::VisitAST(const ASTNode& node) { // Emits code for one AST node and its children.
+    if (const auto ast = std::get_if<ASTProgram>(&*node->GetInnerData())) {
+        for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) {
+            VisitAST(current);
+        }
+    } else if (const auto ast = std::get_if<ASTIfThen>(&*node->GetInnerData())) {
+        const std::string condition = VisitExpression(ast->condition);
+        ResetTemporaries(); // temporary names may be reused from here on
+
+        AddLine("MOVC.U RC.x, {};", condition);
+        AddLine("IF NE.x;");
+        for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) {
+            VisitAST(current);
+        }
+        AddLine("ENDIF;");
+    } else if (const auto ast = std::get_if<ASTIfElse>(&*node->GetInnerData())) {
+        AddLine("ELSE;"); // NOTE(review): no ENDIF is emitted here, and ASTIfThen above closes its own IF - verify pairing
+        for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) {
+            VisitAST(current);
+        }
+    } else if (const auto ast = std::get_if<ASTBlockDecoded>(&*node->GetInnerData())) {
+        VisitBlock(ast->nodes);
+    } else if (const auto ast = std::get_if<ASTVarSet>(&*node->GetInnerData())) {
+        AddLine("MOV.U F{}, {};", ast->index, VisitExpression(ast->condition)); // stores the condition in flow variable F<index>
+        ResetTemporaries();
+    } else if (const auto ast = std::get_if<ASTDoWhile>(&*node->GetInnerData())) {
+        const std::string condition = VisitExpression(ast->condition); // NOTE(review): any code the condition needs is emitted once, before REP
+        ResetTemporaries();
+        AddLine("REP;");
+        for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) {
+            VisitAST(current);
+        }
+        AddLine("MOVC.U RC.x, {};", condition);
+        AddLine("BRK (NE.x);"); // NOTE(review): leaves the loop on the NE condition - confirm this is the intended do-while polarity
+        AddLine("ENDREP;");
+    } else if (const auto ast = std::get_if<ASTReturn>(&*node->GetInnerData())) {
+        const bool is_true = ExprIsTrue(ast->condition); // when true, the IF/ENDIF wrapper is omitted
+        if (!is_true) {
+            AddLine("MOVC.U RC.x, {};", VisitExpression(ast->condition));
+            AddLine("IF NE.x;");
+            ResetTemporaries();
+        }
+        if (ast->kills) {
+            AddLine("KIL TR;");
+        } else {
+            Exit();
+        }
+        if (!is_true) {
+            AddLine("ENDIF;");
+        }
+    } else if (const auto ast = std::get_if<ASTBreak>(&*node->GetInnerData())) {
+        if (ExprIsTrue(ast->condition)) {
+            AddLine("BRK;"); // unconditional break
+        } else {
+            AddLine("MOVC.U RC.x, {};", VisitExpression(ast->condition));
+            AddLine("BRK (NE.x);");
+            ResetTemporaries();
+        }
+    } else if (std::holds_alternative<ASTLabel>(*node->GetInnerData())) {
+        // Nothing to do
+    } else {
+        UNREACHABLE();
+    }
+}
+
+std::string ARBDecompiler::VisitExpression(const Expr& node) { // Returns operand text for a control-flow expression, emitting code where needed.
+    if (const auto expr = std::get_if<ExprAnd>(&*node)) {
+        std::string result = AllocTemporary();
+        AddLine("AND.U {}, {}, {};", result, VisitExpression(expr->operand1),
+                VisitExpression(expr->operand2)); // NOTE(review): order of the two nested calls is unspecified
+        return result;
+    }
+    if (const auto expr = std::get_if<ExprOr>(&*node)) {
+        std::string result = AllocTemporary();
+        AddLine("OR.U {}, {}, {};", result, VisitExpression(expr->operand1),
+                VisitExpression(expr->operand2)); // NOTE(review): order of the two nested calls is unspecified
+        return result;
+    }
+    if (const auto expr = std::get_if<ExprNot>(&*node)) {
+        std::string result = AllocTemporary();
+        AddLine("CMP.S {}, {}, 0, -1;", result, VisitExpression(expr->operand1)); // presumably maps -1 to 0 and 0 to -1; confirm CMP semantics
+        return result;
+    }
+    if (const auto expr = std::get_if<ExprPredicate>(&*node)) {
+        return fmt::format("P{}.x", static_cast<u64>(expr->predicate)); // operand text only, nothing emitted
+    }
+    if (const auto expr = std::get_if<ExprCondCode>(&*node)) {
+        return Visit(ir.GetConditionCode(expr->cc));
+    }
+    if (const auto expr = std::get_if<ExprVar>(&*node)) {
+        return fmt::format("F{}.x", expr->var_index); // flow variable declared by DecompileAST()
+    }
+    if (const auto expr = std::get_if<ExprBoolean>(&*node)) {
+        return expr->value ? "0xffffffff" : "0"; // literal operands, nothing emitted
+    }
+    if (const auto expr = std::get_if<ExprGprEqual>(&*node)) {
+        std::string result = AllocTemporary();
+        AddLine("SEQ.U {}, R{}.x, {};", result, expr->gpr, expr->value);
+        return result;
+    }
+    UNREACHABLE();
+    return "0";
+}
+
+/// Visits every node of a basic block in order; the values returned by Visit are discarded.
+void ARBDecompiler::VisitBlock(const NodeBlock& bb) {
+    for (const Node& statement : bb) {
+        Visit(statement);
+    }
+}
+
+/// Emits the assembly required to evaluate an IR node and returns the operand string that
+/// holds its value. Nodes that only produce side effects (conditionals, comments) and
+/// unsupported nodes return an empty string.
+std::string ARBDecompiler::Visit(const Node& node) {
+    if (const auto operation = std::get_if<OperationNode>(&*node)) {
+        // Amend nodes are visited before the operation they are attached to.
+        if (const auto amend_index = operation->GetAmendIndex()) {
+            Visit(ir.GetAmendNode(*amend_index));
+        }
+        // Dispatch through the per-opcode member function table.
+        const std::size_t index = static_cast<std::size_t>(operation->GetCode());
+        if (index >= OPERATION_DECOMPILERS.size()) {
+            UNREACHABLE_MSG("Out of bounds operation: {}", index);
+            return {};
+        }
+        const auto decompiler = OPERATION_DECOMPILERS[index];
+        if (decompiler == nullptr) {
+            UNREACHABLE_MSG("Undefined operation: {}", index);
+            return {};
+        }
+        return (this->*decompiler)(*operation);
+    }
+
+    if (const auto gpr = std::get_if<GprNode>(&*node)) {
+        const u32 index = gpr->GetIndex();
+        if (index == Register::ZeroIndex) {
+            // The zero register reads as a constant instead of a real register.
+            return "{0, 0, 0, 0}.x";
+        }
+        return fmt::format("R{}.x", index);
+    }
+
+    if (const auto cv = std::get_if<CustomVarNode>(&*node)) {
+        return fmt::format("CV{}.x", cv->GetIndex());
+    }
+
+    if (const auto immediate = std::get_if<ImmediateNode>(&*node)) {
+        std::string temporary = AllocTemporary();
+        AddLine("MOV.U {}, {};", temporary, immediate->GetValue());
+        return temporary;
+    }
+
+    if (const auto predicate = std::get_if<PredicateNode>(&*node)) {
+        std::string temporary = AllocTemporary();
+        switch (const auto index = predicate->GetIndex(); index) {
+        case Tegra::Shader::Pred::UnusedIndex:
+            AddLine("MOV.S {}, -1;", temporary);
+            break;
+        case Tegra::Shader::Pred::NeverExecute:
+            AddLine("MOV.S {}, 0;", temporary);
+            break;
+        default:
+            AddLine("MOV.S {}, P{}.x;", temporary, static_cast<u64>(index));
+            break;
+        }
+        if (predicate->IsNegated()) {
+            AddLine("CMP.S {}, {}, 0, -1;", temporary, temporary);
+        }
+        return temporary;
+    }
+
+    if (const auto abuf = std::get_if<AbufNode>(&*node)) {
+        if (abuf->IsPhysicalBuffer()) {
+            UNIMPLEMENTED_MSG("Physical buffers are not implemented");
+            return "{0, 0, 0, 0}.x";
+        }
+
+        const Attribute::Index index = abuf->GetIndex();
+        const u32 element = abuf->GetElement();
+        const char swizzle = Swizzle(element);
+        switch (index) {
+        case Attribute::Index::Position: {
+            // Geometry shader inputs are additionally indexed by the visited buffer node.
+            if (stage == ShaderType::Geometry) {
+                return fmt::format("{}_position[{}].{}", StageInputName(stage),
+                                   Visit(abuf->GetBuffer()), swizzle);
+            } else {
+                return fmt::format("{}.position.{}", StageInputName(stage), swizzle);
+            }
+        }
+        case Attribute::Index::TessCoordInstanceIDVertexID:
+            ASSERT(stage == ShaderType::Vertex);
+            switch (element) {
+            case 2:
+                return "vertex.instance";
+            case 3:
+                return "vertex.id";
+            }
+            UNIMPLEMENTED_MSG("Unmanaged TessCoordInstanceIDVertexID element={}", element);
+            break;
+        case Attribute::Index::PointCoord:
+            switch (element) {
+            case 0:
+                return "fragment.pointcoord.x";
+            case 1:
+                return "fragment.pointcoord.y";
+            }
+            UNIMPLEMENTED();
+            break;
+        case Attribute::Index::FrontFacing: {
+            ASSERT(stage == ShaderType::Fragment);
+            ASSERT(element == 3);
+            // Turn fragment.facing into 0 / -1 using the RC condition register.
+            const std::string temporary = AllocVectorTemporary();
+            AddLine("SGT.S RC.x, fragment.facing, {{0, 0, 0, 0}};");
+            AddLine("MOV.U.CC RC.x, -RC;");
+            AddLine("MOV.S {}.x, 0;", temporary);
+            AddLine("MOV.S {}.x (NE.x), -1;", temporary);
+            return fmt::format("{}.x", temporary);
+        }
+        default:
+            if (IsGenericAttribute(index)) {
+                if (stage == ShaderType::Geometry) {
+                    return fmt::format("in_attr{}[{}][0].{}", GetGenericAttributeIndex(index),
+                                       Visit(abuf->GetBuffer()), swizzle);
+                } else {
+                    return fmt::format("{}.attrib[{}].{}", StageInputName(stage),
+                                       GetGenericAttributeIndex(index), swizzle);
+                }
+            }
+            UNIMPLEMENTED_MSG("Unimplemented input attribute={}", static_cast<int>(index));
+            break;
+        }
+        // Unsupported attributes read as zero.
+        return "{0, 0, 0, 0}.x";
+    }
+
+    if (const auto cbuf = std::get_if<CbufNode>(&*node)) {
+        // Immediate offsets are written inline; anything else is evaluated first.
+        std::string offset_string;
+        const auto& offset = cbuf->GetOffset();
+        if (const auto imm = std::get_if<ImmediateNode>(&*offset)) {
+            offset_string = std::to_string(imm->GetValue());
+        } else {
+            offset_string = Visit(offset);
+        }
+        std::string temporary = AllocTemporary();
+        AddLine("LDC.F32 {}, cbuf{}[{}];", temporary, cbuf->GetIndex(), offset_string);
+        return temporary;
+    }
+
+    if (const auto gmem = std::get_if<GmemNode>(&*node)) {
+        // The buffer offset is the real address minus the descriptor's base address.
+        std::string temporary = AllocTemporary();
+        AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem->GetRealAddress()),
+                Visit(gmem->GetBaseAddress()));
+        AddLine("LDB.U32 {}, {}[{}];", temporary, GlobalMemoryName(gmem->GetDescriptor()),
+                temporary);
+        return temporary;
+    }
+
+    if (const auto lmem = std::get_if<LmemNode>(&*node)) {
+        // The visited address may name a guest register ("R{}.x") or a constant operand, so
+        // the shifted index and the loaded value go to a fresh temporary instead of being
+        // written back into the address operand.
+        const std::string address = Visit(lmem->GetAddress());
+        std::string temporary = AllocTemporary();
+        AddLine("SHR.U {}, {}, 2;", temporary, address);
+        AddLine("MOV.U {}, lmem[{}].x;", temporary, temporary);
+        return temporary;
+    }
+
+    if (const auto smem = std::get_if<SmemNode>(&*node)) {
+        // Load into a fresh temporary for the same reason as local memory above.
+        const std::string address = Visit(smem->GetAddress());
+        std::string temporary = AllocTemporary();
+        AddLine("LDS.U32 {}, shared_mem[{}];", temporary, address);
+        return temporary;
+    }
+
+    if (const auto internal_flag = std::get_if<InternalFlagNode>(&*node)) {
+        const std::size_t index = static_cast<std::size_t>(internal_flag->GetFlag());
+        return fmt::format("{}.x", INTERNAL_FLAG_NAMES[index]);
+    }
+
+    if (const auto conditional = std::get_if<ConditionalNode>(&*node)) {
+        if (const auto amend_index = conditional->GetAmendIndex()) {
+            Visit(ir.GetAmendNode(*amend_index));
+        }
+        AddLine("MOVC.U RC.x, {};", Visit(conditional->GetCondition()));
+        AddLine("IF NE.x;");
+        VisitBlock(conditional->GetCode());
+        AddLine("ENDIF;");
+        return {};
+    }
+
+    if (std::holds_alternative<CommentNode>(*node)) {
+        // Comments are intentionally dropped: emitting them would generate invalid code
+        // because GLASM lacks comments.
+        return {};
+    }
+
+    UNIMPLEMENTED();
+    return {};
+}
+
+/// Packs the texture coordinates of an operation into a vector temporary, followed by the
+/// array index and the depth compare value when the sampler uses them.
+/// Returns the temporary's name and the number of components that were written.
+std::pair<std::string, std::size_t> ARBDecompiler::BuildCoords(Operation operation) {
+    const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+    UNIMPLEMENTED_IF(meta.sampler.is_indexed);
+    UNIMPLEMENTED_IF(meta.sampler.is_shadow && meta.sampler.is_array &&
+                     meta.sampler.type == Tegra::Shader::TextureType::TextureCube);
+
+    const std::size_t num_operands = operation.GetOperandsCount();
+    std::string coords = AllocVectorTemporary();
+    std::size_t component = 0;
+    while (component < num_operands) {
+        const std::string value = Visit(operation[component]);
+        AddLine("MOV.F {}.{}, {};", coords, Swizzle(component), value);
+        ++component;
+    }
+    if (meta.sampler.is_array) {
+        // The array index is converted with I2F before being stored.
+        const std::string layer = Visit(meta.array);
+        AddLine("I2F.S {}.{}, {};", coords, Swizzle(component), layer);
+        ++component;
+    }
+    if (meta.sampler.is_shadow) {
+        const std::string reference = Visit(meta.depth_compare);
+        AddLine("MOV.F {}.{}, {};", coords, Swizzle(component), reference);
+        ++component;
+    }
+    return {std::move(coords), component};
+}
+
+/// Builds the trailing ", offset(...)" argument for a texture instruction from the
+/// operation's aoffi nodes. Returns an empty string when there are none.
+std::string ARBDecompiler::BuildAoffi(Operation operation) {
+    const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+    if (meta.aoffi.empty()) {
+        return {};
+    }
+    const std::string offsets = AllocVectorTemporary();
+    std::size_t component = 0;
+    for (const auto& offset_node : meta.aoffi) {
+        const std::string value = Visit(offset_node);
+        AddLine("MOV.S {}.{}, {};", offsets, Swizzle(component), value);
+        ++component;
+    }
+    return fmt::format(", offset({})", offsets);
+}
+
+/// Emits the end of the program. Fragment shaders first copy their enabled color outputs
+/// (and the depth, when enabled) from guest registers; every stage then emits RET.
+void ARBDecompiler::Exit() {
+    if (stage == ShaderType::Fragment) {
+        // Registers the IR never uses read as zero.
+        const auto register_or_zero = [this](u32 reg) -> std::string {
+            // TODO(Rodrigo): Replace with contains once C++20 releases
+            const auto& used_registers = ir.GetRegisters();
+            if (used_registers.find(reg) == used_registers.end()) {
+                return "{0, 0, 0, 0}.x";
+            }
+            return fmt::format("R{}.x", reg);
+        };
+
+        const auto& header = ir.GetHeader();
+        // Enabled color components consume consecutive registers starting at R0.
+        u32 next_reg = 0;
+        for (u32 rt = 0; rt < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; ++rt) {
+            for (u32 component = 0; component < 4; ++component) {
+                if (header.ps.IsColorComponentOutputEnabled(rt, component)) {
+                    AddLine("MOV.F result_color{}.{}, {};", rt, Swizzle(component),
+                            register_or_zero(next_reg));
+                    ++next_reg;
+                }
+            }
+        }
+        if (header.ps.omap.depth) {
+            // NOTE(review): depth is read one register past the next free one; presumably
+            // the register in between is reserved - confirm against the header layout.
+            AddLine("MOV.F result.depth.z, {};", register_or_zero(next_reg + 1));
+        }
+    }
+
+    AddLine("RET;");
+}
+
+/// Emits the store of operation[1] into the destination described by operation[0]
+/// (register, output attribute, local, shared or global memory) and releases the
+/// temporaries used by the statement. Always returns an empty string.
+std::string ARBDecompiler::Assign(Operation operation) {
+    const Node& dest = operation[0];
+    const Node& src = operation[1];
+
+    std::string dest_name;
+    if (const auto gpr = std::get_if<GprNode>(&*dest)) {
+        if (gpr->GetIndex() == Register::ZeroIndex) {
+            // Writing to Register::ZeroIndex is a no op
+            return {};
+        }
+        dest_name = fmt::format("R{}.x", gpr->GetIndex());
+    } else if (const auto abuf = std::get_if<AbufNode>(&*dest)) {
+        const u32 element = abuf->GetElement();
+        const char swizzle = Swizzle(element);
+        switch (const Attribute::Index index = abuf->GetIndex()) {
+        case Attribute::Index::Position:
+            dest_name = fmt::format("result.position.{}", swizzle);
+            break;
+        case Attribute::Index::LayerViewportPointSize:
+            switch (element) {
+            case 0:
+                UNIMPLEMENTED();
+                return {};
+            case 1:
+            case 2:
+                if (!device.HasNvViewportArray2()) {
+                    LOG_ERROR(
+                        Render_OpenGL,
+                        "NV_viewport_array2 is missing. Maxwell gen 2 or better is required.");
+                    return {};
+                }
+                dest_name = element == 1 ? "result.layer.x" : "result.viewport.x";
+                break;
+            case 3:
+                dest_name = "result.pointsize.x";
+                break;
+            default:
+                // Without this guard an unexpected element would leave dest_name empty and
+                // emit a malformed "MOV.U , ..." line.
+                UNREACHABLE_MSG("Invalid LayerViewportPointSize element={}", element);
+                return {};
+            }
+            break;
+        case Attribute::Index::ClipDistances0123:
+            dest_name = fmt::format("result.clip[{}].x", element);
+            break;
+        case Attribute::Index::ClipDistances4567:
+            dest_name = fmt::format("result.clip[{}].x", element + 4);
+            break;
+        default:
+            if (!IsGenericAttribute(index)) {
+                UNREACHABLE();
+                return {};
+            }
+            dest_name =
+                fmt::format("result.attrib[{}].{}", GetGenericAttributeIndex(index), swizzle);
+            break;
+        }
+    } else if (const auto lmem = std::get_if<LmemNode>(&*dest)) {
+        // Shift into a fresh temporary: the visited address may name a guest register or a
+        // constant operand, which must not be used as the SHR destination.
+        const std::string address = Visit(lmem->GetAddress());
+        const std::string word_index = AllocTemporary();
+        AddLine("SHR.U {}, {}, 2;", word_index, address);
+        dest_name = fmt::format("lmem[{}].x", word_index);
+    } else if (const auto smem = std::get_if<SmemNode>(&*dest)) {
+        AddLine("STS.U32 {}, shared_mem[{}];", Visit(src), Visit(smem->GetAddress()));
+        ResetTemporaries();
+        return {};
+    } else if (const auto gmem = std::get_if<GmemNode>(&*dest)) {
+        // The buffer offset is the real address minus the descriptor's base address.
+        const std::string temporary = AllocTemporary();
+        AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem->GetRealAddress()),
+                Visit(gmem->GetBaseAddress()));
+        AddLine("STB.U32 {}, {}[{}];", Visit(src), GlobalMemoryName(gmem->GetDescriptor()),
+                temporary);
+        ResetTemporaries();
+        return {};
+    } else {
+        UNREACHABLE();
+        ResetTemporaries();
+        return {};
+    }
+
+    AddLine("MOV.U {}, {};", dest_name, Visit(src));
+    ResetTemporaries();
+    return {};
+}
+
+/// Emits a CMP.S that picks between operation[1] and operation[2] based on operation[0].
+/// Returns the temporary holding the selected value.
+std::string ARBDecompiler::Select(Operation operation) {
+    std::string selected = AllocTemporary();
+    AddLine("CMP.S {}, {}, {}, {};", selected, Visit(operation[0]), Visit(operation[1]),
+            Visit(operation[2]));
+    return selected;
+}
+
+/// Clamps operation[0] between operation[1] (low) and operation[2] (high).
+/// When both bounds are the immediates 0 and 1.0f a single saturating move is emitted.
+std::string ARBDecompiler::FClamp(Operation operation) {
+    // 1.0f in hex, replace with std::bit_cast on C++20
+    static constexpr u32 POSITIVE_ONE = 0x3f800000;
+
+    std::string clamped = AllocTemporary();
+    const Node& value = operation[0];
+    const Node& low = operation[1];
+    const Node& high = operation[2];
+    const auto* const imm_low = std::get_if<ImmediateNode>(&*low);
+    const auto* const imm_high = std::get_if<ImmediateNode>(&*high);
+    const bool is_saturate = imm_low != nullptr && imm_high != nullptr &&
+                             imm_low->GetValue() == 0 && imm_high->GetValue() == POSITIVE_ONE;
+    if (!is_saturate) {
+        AddLine("MIN.F {}, {}, {};", clamped, Visit(value), Visit(high));
+        AddLine("MAX.F {}, {}, {};", clamped, clamped, Visit(low));
+        return clamped;
+    }
+    AddLine("MOV.F32.SAT {}, {};", clamped, Visit(value));
+    return clamped;
+}
+
+/// Emits UP2H.F of operation[0] into the x component of a vector temporary and returns
+/// that component.
+std::string ARBDecompiler::FCastHalf0(Operation operation) {
+    const std::string unpacked = AllocVectorTemporary();
+    const std::string source = Visit(operation[0]);
+    AddLine("UP2H.F {}.x, {};", unpacked, source);
+    return fmt::format("{}.x", unpacked);
+}
+
+/// Emits UP2H.F of operation[0] into the y component of a vector temporary, copies it to
+/// x and returns the x component.
+std::string ARBDecompiler::FCastHalf1(Operation operation) {
+    const std::string unpacked = AllocVectorTemporary();
+    const std::string source = Visit(operation[0]);
+    AddLine("UP2H.F {}.y, {};", unpacked, source);
+    AddLine("MOV {}.x, {}.y;", unpacked, unpacked);
+    return fmt::format("{}.x", unpacked);
+}
+
+/// Computes the square root of operation[0] as RCP(RSQ(x)) and returns the temporary
+/// holding the result.
+std::string ARBDecompiler::FSqrt(Operation operation) {
+    std::string root = AllocTemporary();
+    const std::string source = Visit(operation[0]);
+    AddLine("RSQ.F32 {}, {};", root, source);
+    AddLine("RCP.F32 {}, {};", root, root);
+    return root;
+}
+
+/// Emits a swizzled add: each operand is scaled by an FSWZA/FSWZB table entry chosen from
+/// the thread id and the mask in operation[2], then both products are added.
+/// Without warp intrinsics an error is logged and a plain ADD.F is emitted instead.
+std::string ARBDecompiler::FSwizzleAdd(Operation operation) {
+    const std::string work = AllocVectorTemporary();
+    if (!device.HasWarpIntrinsics()) {
+        LOG_ERROR(Render_OpenGL,
+                  "NV_shader_thread_shuffle is missing. Kepler or better is required.");
+        AddLine("ADD.F {}.x, {}, {};", work, Visit(operation[0]), Visit(operation[1]));
+        return fmt::format("{}.x", work);
+    }
+
+    // work.z = (mask >> ((threadid & 3) << 1)) & 3, used as the table index below.
+    AddLine("AND.U {}.z, {}.threadid, 3;", work, StageInputName(stage));
+    AddLine("SHL.U {}.z, {}.z, 1;", work, work);
+    const std::string mask = Visit(operation[2]);
+    AddLine("SHR.U {}.z, {}, {}.z;", work, mask, work);
+    AddLine("AND.U {}.z, {}.z, 3;", work, work);
+    const std::string lhs = Visit(operation[0]);
+    AddLine("MUL.F32 {}.x, {}, FSWZA[{}.z];", work, lhs, work);
+    const std::string rhs = Visit(operation[1]);
+    AddLine("MUL.F32 {}.y, {}, FSWZB[{}.z];", work, rhs, work);
+    AddLine("ADD.F32 {}.x, {}.x, {}.y;", work, work, work);
+    return fmt::format("{}.x", work);
+}
+
+/// Expands both operands with UP2H.F, adds them with ADD.F16 and repacks the sum with
+/// PK2H.F. Returns the x component holding the packed result.
+std::string ARBDecompiler::HAdd2(Operation operation) {
+    const std::string lhs = AllocVectorTemporary();
+    const std::string rhs = AllocVectorTemporary();
+    const std::string packed_a = Visit(operation[0]);
+    AddLine("UP2H.F {}.xy, {};", lhs, packed_a);
+    const std::string packed_b = Visit(operation[1]);
+    AddLine("UP2H.F {}.xy, {};", rhs, packed_b);
+    AddLine("ADD.F16 {}, {}, {};", lhs, lhs, rhs);
+    AddLine("PK2H.F {}.x, {};", lhs, lhs);
+    return fmt::format("{}.x", lhs);
+}
+
+/// Expands both operands with UP2H.F, multiplies them with MUL.F16 and repacks the product
+/// with PK2H.F. Returns the x component holding the packed result.
+std::string ARBDecompiler::HMul2(Operation operation) {
+    const std::string lhs = AllocVectorTemporary();
+    const std::string rhs = AllocVectorTemporary();
+    const std::string packed_a = Visit(operation[0]);
+    AddLine("UP2H.F {}.xy, {};", lhs, packed_a);
+    const std::string packed_b = Visit(operation[1]);
+    AddLine("UP2H.F {}.xy, {};", rhs, packed_b);
+    AddLine("MUL.F16 {}, {}, {};", lhs, lhs, rhs);
+    AddLine("PK2H.F {}.x, {};", lhs, lhs);
+    return fmt::format("{}.x", lhs);
+}
+
+/// Expands the three operands with UP2H.F, combines them with MAD.F16 and repacks the
+/// result with PK2H.F. Returns the x component holding the packed result.
+std::string ARBDecompiler::HFma2(Operation operation) {
+    const std::string factor_a = AllocVectorTemporary();
+    const std::string factor_b = AllocVectorTemporary();
+    const std::string addend = AllocVectorTemporary();
+    const std::string packed_a = Visit(operation[0]);
+    AddLine("UP2H.F {}.xy, {};", factor_a, packed_a);
+    const std::string packed_b = Visit(operation[1]);
+    AddLine("UP2H.F {}.xy, {};", factor_b, packed_b);
+    const std::string packed_c = Visit(operation[2]);
+    AddLine("UP2H.F {}.xy, {};", addend, packed_c);
+    AddLine("MAD.F16 {}, {}, {}, {};", factor_a, factor_a, factor_b, addend);
+    AddLine("PK2H.F {}.x, {};", factor_a, factor_a);
+    return fmt::format("{}.x", factor_a);
+}
+
+/// Expands operation[0] with UP2H.F and repacks its absolute value with PK2H.F.
+/// Returns the x component holding the packed result.
+std::string ARBDecompiler::HAbsolute(Operation operation) {
+    const std::string halves = AllocVectorTemporary();
+    const std::string packed = Visit(operation[0]);
+    AddLine("UP2H.F {}.xy, {};", halves, packed);
+    AddLine("PK2H.F {}.x, |{}|;", halves, halves);
+    return fmt::format("{}.x", halves);
+}
+
+/// Expands operation[0] with UP2H.F, negates the x component when operation[1] is non-zero
+/// and the y component when operation[2] is non-zero, then repacks with PK2H.F.
+std::string ARBDecompiler::HNegate(Operation operation) {
+    const std::string halves = AllocVectorTemporary();
+    const std::string packed = Visit(operation[0]);
+    AddLine("UP2H.F {}.xy, {};", halves, packed);
+    // Each flag is loaded into RC.x immediately before the conditional move that reads it.
+    const std::string negate_x = Visit(operation[1]);
+    AddLine("MOVC.S RC.x, {};", negate_x);
+    AddLine("MOV.F {}.x (NE.x), -{}.x;", halves, halves);
+    const std::string negate_y = Visit(operation[2]);
+    AddLine("MOVC.S RC.x, {};", negate_y);
+    AddLine("MOV.F {}.y (NE.x), -{}.y;", halves, halves);
+    AddLine("PK2H.F {}.x, {};", halves, halves);
+    return fmt::format("{}.x", halves);
+}
+
+/// Expands operation[0] with UP2H.F, applies MAX.F against operation[1] and MIN.F against
+/// operation[2] (each bound is copied to both x and y), then repacks with PK2H.F.
+std::string ARBDecompiler::HClamp(Operation operation) {
+    const std::string halves = AllocVectorTemporary();
+    const std::string bound = AllocVectorTemporary();
+    const std::string packed = Visit(operation[0]);
+    AddLine("UP2H.F {}.xy, {};", halves, packed);
+    const std::string low = Visit(operation[1]);
+    AddLine("MOV.U {}.x, {};", bound, low);
+    AddLine("MOV.U {}.y, {}.x;", bound, bound);
+    AddLine("MAX.F {}, {}, {};", halves, halves, bound);
+    const std::string high = Visit(operation[2]);
+    AddLine("MOV.U {}.x, {};", bound, high);
+    AddLine("MOV.U {}.y, {}.x;", bound, bound);
+    AddLine("MIN.F {}, {}, {};", halves, halves, bound);
+    AddLine("PK2H.F {}.x, {};", halves, halves);
+    return fmt::format("{}.x", halves);
+}
+
+/// Packs operation[0] together with a zero y component using PK2H.F.
+/// Returns the x component holding the packed result.
+std::string ARBDecompiler::HCastFloat(Operation operation) {
+    const std::string pair = AllocVectorTemporary();
+    AddLine("MOV.F {}.y, {{0, 0, 0, 0}};", pair);
+    const std::string value = Visit(operation[0]);
+    AddLine("MOV.F {}.x, {};", pair, value);
+    AddLine("PK2H.F {}.x, {};", pair, pair);
+    return fmt::format("{}.x", pair);
+}
+
+/// Rearranges operation[0] according to the HalfType stored in the operation's meta and
+/// returns an operand holding the packed pair. H0_H1 passes the operand through untouched.
+std::string ARBDecompiler::HUnpack(Operation operation) {
+    const std::string source = Visit(operation[0]);
+    const auto half_type = std::get<Tegra::Shader::HalfType>(operation.GetMeta());
+    switch (half_type) {
+    case Tegra::Shader::HalfType::H0_H1:
+        return source;
+    case Tegra::Shader::HalfType::F32: {
+        // Use the same value for both components and pack it.
+        const std::string pair = AllocVectorTemporary();
+        AddLine("MOV.U {}.x, {};", pair, source);
+        AddLine("MOV.U {}.y, {}.x;", pair, pair);
+        AddLine("PK2H.F {}.x, {};", pair, pair);
+        return fmt::format("{}.x", pair);
+    }
+    case Tegra::Shader::HalfType::H0_H0: {
+        // Duplicate the x component into y after expanding.
+        const std::string pair = AllocVectorTemporary();
+        AddLine("UP2H.F {}.xy, {};", pair, source);
+        AddLine("MOV.U {}.y, {}.x;", pair, pair);
+        AddLine("PK2H.F {}.x, {};", pair, pair);
+        return fmt::format("{}.x", pair);
+    }
+    case Tegra::Shader::HalfType::H1_H1: {
+        // Duplicate the y component into x after expanding.
+        const std::string pair = AllocVectorTemporary();
+        AddLine("UP2H.F {}.xy, {};", pair, source);
+        AddLine("MOV.U {}.x, {}.y;", pair, pair);
+        AddLine("PK2H.F {}.x, {};", pair, pair);
+        return fmt::format("{}.x", pair);
+    }
+    }
+    UNREACHABLE();
+    return "{0, 0, 0, 0}.x";
+}
+
+/// Expands operation[0] with UP2H.F and returns the x component of the result.
+std::string ARBDecompiler::HMergeF32(Operation operation) {
+    const std::string halves = AllocVectorTemporary();
+    const std::string packed = Visit(operation[0]);
+    AddLine("UP2H.F {}.xy, {};", halves, packed);
+    return fmt::format("{}.x", halves);
+}
+
+/// Expands operation[0] into xy and operation[1] into zw, replaces x with z and repacks
+/// xy with PK2H.F. Returns the x component holding the packed result.
+std::string ARBDecompiler::HMergeH0(Operation operation) {
+    const std::string halves = AllocVectorTemporary();
+    const std::string dest = Visit(operation[0]);
+    AddLine("UP2H.F {}.xy, {};", halves, dest);
+    const std::string source = Visit(operation[1]);
+    AddLine("UP2H.F {}.zw, {};", halves, source);
+    AddLine("MOV.U {}.x, {}.z;", halves, halves);
+    AddLine("PK2H.F {}.x, {};", halves, halves);
+    return fmt::format("{}.x", halves);
+}
+
+/// Expands operation[0] into xy and operation[1] into zw, replaces y with w and repacks
+/// xy with PK2H.F. Returns the x component holding the packed result.
+std::string ARBDecompiler::HMergeH1(Operation operation) {
+    const std::string halves = AllocVectorTemporary();
+    const std::string dest = Visit(operation[0]);
+    AddLine("UP2H.F {}.xy, {};", halves, dest);
+    const std::string source = Visit(operation[1]);
+    AddLine("UP2H.F {}.zw, {};", halves, source);
+    AddLine("MOV.U {}.y, {}.w;", halves, halves);
+    AddLine("PK2H.F {}.x, {};", halves, halves);
+    return fmt::format("{}.x", halves);
+}
+
+/// Moves operation[0] into x and operation[1] into y of a vector temporary and packs them
+/// with PK2H.F. Returns the x component holding the packed result.
+std::string ARBDecompiler::HPack2(Operation operation) {
+    const std::string pair = AllocVectorTemporary();
+    const std::string first = Visit(operation[0]);
+    AddLine("MOV.U {}.x, {};", pair, first);
+    const std::string second = Visit(operation[1]);
+    AddLine("MOV.U {}.y, {};", pair, second);
+    AddLine("PK2H.F {}.x, {};", pair, pair);
+    return fmt::format("{}.x", pair);
+}
+
+/// Emits the store of operation[1] into a predicate or an internal flag and releases the
+/// temporaries used by the statement. Always returns an empty string.
+std::string ARBDecompiler::LogicalAssign(Operation operation) {
+    const Node& dest = operation[0];
+    const Node& src = operation[1];
+
+    std::string target;
+
+    if (const auto pred = std::get_if<PredicateNode>(&*dest)) {
+        ASSERT_MSG(!pred->IsNegated(), "Negating logical assignment");
+
+        const Tegra::Shader::Pred index = pred->GetIndex();
+        const bool is_read_only = index == Tegra::Shader::Pred::NeverExecute ||
+                                  index == Tegra::Shader::Pred::UnusedIndex;
+        if (is_read_only) {
+            // Writing to these predicates is a no-op
+            return {};
+        }
+        target = fmt::format("P{}.x", static_cast<u64>(index));
+    } else if (const auto internal_flag = std::get_if<InternalFlagNode>(&*dest)) {
+        const std::size_t flag_index = static_cast<std::size_t>(internal_flag->GetFlag());
+        target = fmt::format("{}.x", INTERNAL_FLAG_NAMES[flag_index]);
+    } else {
+        UNREACHABLE();
+        ResetTemporaries();
+        return {};
+    }
+
+    const std::string value = Visit(src);
+    AddLine("MOV.U {}, {};", target, value);
+    ResetTemporaries();
+    return {};
+}
+
+/// Copies one component of the vector produced by operation[0] into a scalar temporary.
+/// The component is chosen by the immediate stored in operation[1].
+std::string ARBDecompiler::LogicalPick2(Operation operation) {
+    std::string picked = AllocTemporary();
+    const u32 component = std::get<ImmediateNode>(*operation[1]).GetValue();
+    const std::string pair = Visit(operation[0]);
+    AddLine("MOV.U {}, {}.{};", picked, pair, Swizzle(component));
+    return picked;
+}
+
+/// ANDs the x and y components of the vector produced by operation[0] into a scalar
+/// temporary and returns it.
+std::string ARBDecompiler::LogicalAnd2(Operation operation) {
+    std::string combined = AllocTemporary();
+    const std::string pair = Visit(operation[0]);
+    AddLine("AND.U {}, {}.x, {}.y;", combined, pair, pair);
+    return combined;
+}
+
+/// Returns a temporary that is -1 when neither operand is NaN and 0 otherwise.
+std::string ARBDecompiler::FloatOrdered(Operation operation) {
+    std::string temporary = AllocTemporary();
+    // Evaluate both operands before touching the condition register: the code emitted for
+    // the second operand may itself write RC.x (several emitters use it as scratch), which
+    // would otherwise overwrite the NaN state captured from the first operand.
+    const std::string lhs = Visit(operation[0]);
+    const std::string rhs = Visit(operation[1]);
+    AddLine("MOVC.F32 RC.x, {};", lhs);
+    AddLine("MOVC.F32 RC.y, {};", rhs);
+    AddLine("MOV.S {}, -1;", temporary);
+    AddLine("MOV.S {} (NAN.x), 0;", temporary);
+    AddLine("MOV.S {} (NAN.y), 0;", temporary);
+    return temporary;
+}
+
+/// Returns a temporary that is -1 when at least one operand is NaN and 0 otherwise.
+std::string ARBDecompiler::FloatUnordered(Operation operation) {
+    std::string temporary = AllocTemporary();
+    // Evaluate both operands before touching the condition register: the code emitted for
+    // the second operand may itself write RC.x (several emitters use it as scratch), which
+    // would otherwise overwrite the NaN state captured from the first operand.
+    const std::string lhs = Visit(operation[0]);
+    const std::string rhs = Visit(operation[1]);
+    AddLine("MOVC.F32 RC.x, {};", lhs);
+    AddLine("MOVC.F32 RC.y, {};", rhs);
+    AddLine("MOV.S {}, 0;", temporary);
+    AddLine("MOV.S {} (NAN.x), -1;", temporary);
+    AddLine("MOV.S {} (NAN.y), -1;", temporary);
+    return temporary;
+}
+
+/// Adds both operands with ADDC.U and returns a temporary that is -1 when the carry flag
+/// was set and 0 otherwise.
+std::string ARBDecompiler::LogicalAddCarry(Operation operation) {
+    std::string carry = AllocTemporary();
+    AddLine("ADDC.U RC, {}, {};", Visit(operation[0]), Visit(operation[1]));
+    // Default to false and overwrite inside the CF.x branch.
+    AddLine("MOV.S {}, 0;", carry);
+    AddLine("IF CF.x;");
+    AddLine("MOV.S {}, -1;", carry);
+    AddLine("ENDIF;");
+    return carry;
+}
+
+/// Emits a texture sample (TEX, or TXB / TXL when a bias / lod is present) and returns the
+/// x component of the result after moving the requested element into it.
+std::string ARBDecompiler::Texture(Operation operation) {
+    const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+    const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
+    const std::pair<std::string, std::size_t> coords = BuildCoords(operation);
+    const std::string& vector = coords.first;
+    const std::size_t used_components = coords.second;
+
+    std::string_view opcode = "TEX";
+    std::string extra;
+    // The level (bias or lod) goes in the w component when the coordinates leave room for
+    // it; otherwise it is passed as an extra operand through its own temporary.
+    const auto emit_level = [&](const Node& level) {
+        if (used_components < 4) {
+            AddLine("MOV.F {}.w, {};", vector, Visit(level));
+            return;
+        }
+        const std::string level_reg = AllocTemporary();
+        AddLine("MOV.F {}, {};", level_reg, Visit(level));
+        extra = fmt::format(" {},", level_reg);
+    };
+
+    if (meta.bias) {
+        ASSERT(!meta.lod);
+        opcode = "TXB";
+        emit_level(meta.bias);
+    }
+    if (meta.lod) {
+        ASSERT(!meta.bias);
+        opcode = "TXL";
+        emit_level(meta.lod);
+    }
+
+    AddLine("{}.F {}, {},{} texture[{}], {}{};", opcode, vector, vector, extra, sampler_id,
+            TextureType(meta), BuildAoffi(operation));
+    AddLine("MOV.U {}.x, {}.{};", vector, vector, Swizzle(meta.element));
+    return fmt::format("{}.x", vector);
+}
+
+/// Emits a TXG gather and returns the x component of the result after moving the requested
+/// element into it. Non-shadow samplers select the gathered component from an immediate.
+std::string ARBDecompiler::TextureGather(Operation operation) {
+    const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+    const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
+    const std::string vector = BuildCoords(operation).first;
+
+    std::string component_suffix;
+    if (!meta.sampler.is_shadow) {
+        const u32 component = std::get<ImmediateNode>(*meta.component).GetValue();
+        component_suffix = fmt::format(".{}", Swizzle(component));
+    }
+
+    AddLine("TXG.F {}, {}, texture[{}]{}, {}{};", vector, vector, sampler_id, component_suffix,
+            TextureType(meta), BuildAoffi(operation));
+    AddLine("MOV.U {}.x, {}.{};", vector, vector, Swizzle(meta.element));
+    return fmt::format("{}.x", vector);
+}
+
+/// Emits a TXQ query for the level given by operation[0] (level 0 when the operation has
+/// no operands) and returns the x component after moving the requested element into it.
+std::string ARBDecompiler::TextureQueryDimensions(Operation operation) {
+    const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+    const std::string dimensions = AllocVectorTemporary();
+    const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
+
+    ASSERT(!meta.sampler.is_array);
+
+    std::string level = "0";
+    if (operation.GetOperandsCount() > 0) {
+        level = Visit(operation[0]);
+    }
+    AddLine("TXQ {}, {}, texture[{}], {};", dimensions, level, sampler_id, TextureType(meta));
+    AddLine("MOV.U {}.x, {}.{};", dimensions, dimensions, Swizzle(meta.element));
+    return fmt::format("{}.x", dimensions);
+}
+
+/// Emits a LOD.F query for the given coordinates, scales the result by 256, truncates it
+/// to an integer and returns the x component after moving the requested element into it.
+std::string ARBDecompiler::TextureQueryLod(Operation operation) {
+    const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+    const std::string vector = AllocVectorTemporary();
+    const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
+
+    ASSERT(!meta.sampler.is_array);
+
+    const std::size_t num_coords = operation.GetOperandsCount();
+    for (std::size_t component = 0; component < num_coords; ++component) {
+        const std::string coord = Visit(operation[component]);
+        AddLine("MOV.F {}.{}, {};", vector, Swizzle(component), coord);
+    }
+    AddLine("LOD.F {}, {}, texture[{}], {};", vector, vector, sampler_id, TextureType(meta));
+    AddLine("MUL.F32 {}, {}, {{256, 256, 0, 0}};", vector, vector);
+    AddLine("TRUNC.S {}, {};", vector, vector);
+    AddLine("MOV.U {}.x, {}.{};", vector, vector, Swizzle(meta.element));
+    return fmt::format("{}.x", vector);
+}
+
+/// Emits a TXF fetch and returns the x component of the result after moving the requested
+/// element into it. Non-buffer samplers pass the lod in the w component of the coordinates.
+std::string ARBDecompiler::TexelFetch(Operation operation) {
+    const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+    const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
+    const std::pair<std::string, std::size_t> coords = BuildCoords(operation);
+    const std::string& vector = coords.first;
+
+    if (!meta.sampler.is_buffer) {
+        // The w component must still be free for the lod.
+        ASSERT(coords.second < 4);
+        const std::string lod = Visit(meta.lod);
+        AddLine("MOV.F {}.w, {};", vector, lod);
+    }
+    AddLine("TXF.F {}, {}, texture[{}], {}{};", vector, vector, sampler_id, TextureType(meta),
+            BuildAoffi(operation));
+    AddLine("MOV.U {}.x, {}.{};", vector, vector, Swizzle(meta.element));
+    return fmt::format("{}.x", vector);
+}
+
+/// Emits a TXD sample with explicit derivatives and returns the x component of the result
+/// after moving the requested element into it. meta.derivates holds the derivatives as
+/// interleaved (ddx, ddy) pairs, one pair per component.
+std::string ARBDecompiler::TextureGradient(Operation operation) {
+    const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+    const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
+    const std::string ddx = AllocVectorTemporary();
+    const std::string ddy = AllocVectorTemporary();
+    // The coordinate temporary doubles as the destination of the sample.
+    const std::string vector = BuildCoords(operation).first;
+
+    const std::size_t num_pairs = meta.derivates.size() / 2;
+    for (std::size_t pair = 0; pair < num_pairs; ++pair) {
+        const char component = Swizzle(pair);
+        const std::string dx = Visit(meta.derivates[pair * 2]);
+        AddLine("MOV.F {}.{}, {};", ddx, component, dx);
+        const std::string dy = Visit(meta.derivates[pair * 2 + 1]);
+        AddLine("MOV.F {}.{}, {};", ddy, component, dy);
+    }
+
+    AddLine("TXD.F {}, {}, {}, {}, texture[{}], {}{};", vector, vector, ddx, ddy, sampler_id,
+            TextureType(meta), BuildAoffi(operation));
+    AddLine("MOV.F {}.x, {}.{};", vector, vector, Swizzle(meta.element));
+    return fmt::format("{}.x", vector);
+}
+
+/// Emits a LOADIM.F from the image at the coordinates given by the operands and returns
+/// the x component of the result after moving the requested element into it.
+std::string ARBDecompiler::ImageLoad(Operation operation) {
+    const auto& meta = std::get<MetaImage>(operation.GetMeta());
+    const u32 image_id = device.GetBaseBindings(stage).image + meta.image.index;
+    const std::size_t num_coords = operation.GetOperandsCount();
+    const std::string_view image_type = ImageType(meta.image.type);
+
+    const std::string vector = AllocVectorTemporary();
+    for (std::size_t component = 0; component < num_coords; ++component) {
+        const std::string coord = Visit(operation[component]);
+        AddLine("MOV.S {}.{}, {};", vector, Swizzle(component), coord);
+    }
+    AddLine("LOADIM.F {}, {}, image[{}], {};", vector, vector, image_id, image_type);
+    AddLine("MOV.F {}.x, {}.{};", vector, vector, Swizzle(meta.element));
+    return fmt::format("{}.x", vector);
+}
+
+/// Emits a STOREIM.F writing meta.values to the image at the coordinates given by the
+/// operands. Always returns an empty string.
+std::string ARBDecompiler::ImageStore(Operation operation) {
+    const auto& meta = std::get<MetaImage>(operation.GetMeta());
+    const u32 image_id = device.GetBaseBindings(stage).image + meta.image.index;
+    const std::size_t coord_count = operation.GetOperandsCount();
+    const std::size_t value_count = meta.values.size();
+    const std::string_view image_type = ImageType(meta.image.type);
+
+    const std::string coords = AllocVectorTemporary();
+    const std::string texel = AllocVectorTemporary();
+    // Coordinates are gathered first, then the values to store.
+    for (std::size_t component = 0; component < coord_count; ++component) {
+        const std::string coord = Visit(operation[component]);
+        AddLine("MOV.S {}.{}, {};", coords, Swizzle(component), coord);
+    }
+    for (std::size_t component = 0; component < value_count; ++component) {
+        const std::string channel = Visit(meta.values[component]);
+        AddLine("MOV.F {}.{}, {};", texel, Swizzle(component), channel);
+    }
+    AddLine("STOREIM.F image[{}], {}, {}, {};", image_id, texel, coords, image_type);
+    return {};
+}
+
+std::string ARBDecompiler::Branch(Operation operation) {
+    // Direct jump: the first operand is an immediate that is written to PC, then CONT.
+    AddLine("MOV.U PC.x, {};", std::get<ImmediateNode>(*operation[0]).GetValue());
+    AddLine("CONT;");
+    return {};
+}
+
+std::string ARBDecompiler::BranchIndirect(Operation operation) {
+    AddLine("MOV.U PC.x, {};", Visit(operation[0])); // PC <- expression built from operand 0
+    AddLine("CONT;");
+    return {}; // no value expression is produced
+}
+
+std::string ARBDecompiler::PushFlowStack(Operation operation) {
+    const auto stack_class = std::get<MetaStackClass>(operation.GetMeta());
+    const u32 address = std::get<ImmediateNode>(*operation[0]).GetValue();
+    const std::string_view name = StackName(stack_class);
+    AddLine("MOV.U {}[{}_TOP.x].x, {};", name, name, address); // store at the current top
+    AddLine("ADD.S {}_TOP.x, {}_TOP.x, 1;", name, name);       // then advance the top index
+    return {};
+}
+
+std::string ARBDecompiler::PopFlowStack(Operation operation) {
+    const std::string_view name = StackName(std::get<MetaStackClass>(operation.GetMeta()));
+    // Step the top index back first, then load PC from the entry it now points at.
+    AddLine("SUB.S {}_TOP.x, {}_TOP.x, 1;", name, name);
+    AddLine("MOV.U PC.x, {}[{}_TOP.x].x;", name, name);
+    AddLine("CONT;");
+    return {};
+}
+
+std::string ARBDecompiler::Exit(Operation) {
+    Exit(); // forwards to the parameterless Exit() overload
+    return {};
+}
+
+std::string ARBDecompiler::Discard(Operation) {
+    AddLine("KIL TR;"); // KIL with the always-true condition: unconditional discard
+    return {};
+}
+
+std::string ARBDecompiler::EmitVertex(Operation) {
+    AddLine("EMIT;"); // geometry program instruction that emits the current vertex
+    return {};
+}
+
+std::string ARBDecompiler::EndPrimitive(Operation) {
+    AddLine("ENDPRIM;"); // geometry program instruction that ends the current primitive
+    return {};
+}
+
+std::string ARBDecompiler::InvocationId(Operation) {
+    return "primitive.invocation"; // returns a fixed operand name; emits no instructions
+}
+
+std::string ARBDecompiler::YNegate(Operation) {
+    LOG_WARNING(Render_OpenGL, "(STUBBED)");
+    const std::string temporary = AllocTemporary();
+    AddLine("MOV.F {}, 1;", temporary); // stub: the result is always the constant 1
+    return temporary;
+}
+
+std::string ARBDecompiler::ThreadId(Operation) {
+    return fmt::format("{}.threadid", StageInputName(stage)); // "<stage input>.threadid"
+}
+
+std::string ARBDecompiler::ShuffleIndexed(Operation operation) {
+    if (!device.HasWarpIntrinsics()) { // no shuffle support on this device
+        LOG_ERROR(Render_OpenGL,
+                  "NV_shader_thread_shuffle is missing. Kepler or better is required.");
+        return Visit(operation[0]); // fall back to the unshuffled value
+    }
+    const std::string temporary = AllocVectorTemporary();
+    AddLine("SHFIDX.U {}, {}, {}, {{31, 0, 0, 0}};", temporary, Visit(operation[0]),
+            Visit(operation[1])); // NOTE(review): order of the two Visit calls is unspecified
+    AddLine("MOV.U {}.x, {}.y;", temporary, temporary); // the .y component is the result used
+    return fmt::format("{}.x", temporary);
+}
+
+std::string ARBDecompiler::Barrier(Operation) {
+    if (ir.IsDecompiled()) {
+        AddLine("BAR;");
+    } else { // the barrier is dropped and only reported
+        LOG_ERROR(Render_OpenGL, "BAR used but shader is not decompiled");
+    }
+    return {};
+}
+
+std::string ARBDecompiler::MemoryBarrierGroup(Operation) {
+    AddLine("MEMBAR.CTA;"); // memory barrier with the .CTA (thread group) scope modifier
+    return {};
+}
+
+std::string ARBDecompiler::MemoryBarrierGlobal(Operation) {
+    AddLine("MEMBAR;"); // memory barrier without a scope modifier
+    return {};
+}
+
+} // Anonymous namespace
+
+std::string DecompileAssemblyShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
+                                    const VideoCommon::Shader::Registry& registry,
+                                    Tegra::Engines::ShaderType stage, std::string_view identifier) {
+    return ARBDecompiler(device, ir, registry, stage, identifier).Code(); // one-shot decompiler
+}
+
+} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_arb_decompiler.h b/src/video_core/renderer_opengl/gl_arb_decompiler.h
new file mode 100644
index 000000000..6afc87220
--- /dev/null
+++ b/src/video_core/renderer_opengl/gl_arb_decompiler.h
@@ -0,0 +1,29 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <string>
+#include <string_view>
+
+#include "common/common_types.h"
+
+namespace Tegra::Engines {
+enum class ShaderType : u32;
+}
+
+namespace VideoCommon::Shader {
+class ShaderIR;
+class Registry;
+} // namespace VideoCommon::Shader
+
+namespace OpenGL {
+
+class Device;
+
+std::string DecompileAssemblyShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
+                                    const VideoCommon::Shader::Registry& registry,
+                                    Tegra::Engines::ShaderType stage, std::string_view identifier);
+
+} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index 9964ea894..e461e4c70 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -22,22 +22,53 @@ using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 
 MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128));
 
-CachedBufferBlock::CachedBufferBlock(VAddr cpu_addr, const std::size_t size)
+Buffer::Buffer(const Device& device, VAddr cpu_addr, std::size_t size)
     : VideoCommon::BufferBlock{cpu_addr, size} {
     gl_buffer.Create();
     glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW);
+    if (device.HasVertexBufferUnifiedMemory()) { // NV_vertex_buffer_unified_memory path
+        glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_WRITE);
+        glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address);
+    } // otherwise gpu_address keeps its default of 0
+}
+
+Buffer::~Buffer() = default; // defined out of line; members are destroyed by their own dtors
+
+void Buffer::Upload(std::size_t offset, std::size_t size, const u8* data) {
+    const auto gl_offset = static_cast<GLintptr>(offset); // GL expects signed offset and size
+    glNamedBufferSubData(gl_buffer.handle, gl_offset, static_cast<GLsizeiptr>(size), data);
+}
+
+void Buffer::Download(std::size_t offset, std::size_t size, u8* data) {
+    MICROPROFILE_SCOPE(OpenGL_Buffer_Download);
+    const auto byte_count = static_cast<GLsizeiptr>(size);
+    const auto byte_offset = static_cast<GLintptr>(offset);
+    if (read_buffer.handle == 0) { // lazily create the staging buffer, sized with Size()
+        read_buffer.Create();
+        const auto capacity = static_cast<GLsizeiptr>(Size());
+        glNamedBufferData(read_buffer.handle, capacity, nullptr, GL_STREAM_READ);
+    }
+    glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
+    glCopyNamedBufferSubData(Handle(), read_buffer.handle, byte_offset, byte_offset, byte_count);
+    glGetNamedBufferSubData(read_buffer.handle, byte_offset, byte_count, data);
 }
 
-CachedBufferBlock::~CachedBufferBlock() = default;
+void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
+                      std::size_t size) { // copies size bytes from src into this buffer
+    glCopyNamedBufferSubData(src.Handle(), Handle(), static_cast<GLintptr>(src_offset),
+                             static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size));
+}
 
 OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
-                               const Device& device, std::size_t stream_size)
-    : GenericBufferCache{rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} {
+                               const Device& device_, std::size_t stream_size)
+    : GenericBufferCache{rasterizer, system,
+                         std::make_unique<OGLStreamBuffer>(device_, stream_size, true)},
+      device{device_} {
     if (!device.HasFastBufferSubData()) {
         return;
     }
 
-    static constexpr auto size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize);
+    static constexpr GLsizeiptr size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize);
     glCreateBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs));
     for (const GLuint cbuf : cbufs) {
         glNamedBufferData(cbuf, size, nullptr, GL_STREAM_DRAW);
@@ -48,44 +79,21 @@ OGLBufferCache::~OGLBufferCache() {
     glDeleteBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs));
 }
 
-Buffer OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
-    return std::make_shared<CachedBufferBlock>(cpu_addr, size);
+std::shared_ptr<Buffer> OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
+    return std::make_shared<Buffer>(device, cpu_addr, size); // Buffer queries device features
 }
 
-GLuint OGLBufferCache::ToHandle(const Buffer& buffer) {
-    return buffer->GetHandle();
-}
-
-GLuint OGLBufferCache::GetEmptyBuffer(std::size_t) {
-    return 0;
-}
-
-void OGLBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                                     const u8* data) {
-    glNamedBufferSubData(buffer->GetHandle(), static_cast<GLintptr>(offset),
-                         static_cast<GLsizeiptr>(size), data);
-}
-
-void OGLBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                                       u8* data) {
-    MICROPROFILE_SCOPE(OpenGL_Buffer_Download);
-    glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
-    glGetNamedBufferSubData(buffer->GetHandle(), static_cast<GLintptr>(offset),
-                            static_cast<GLsizeiptr>(size), data);
-}
-
-void OGLBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
-                               std::size_t dst_offset, std::size_t size) {
-    glCopyNamedBufferSubData(src->GetHandle(), dst->GetHandle(), static_cast<GLintptr>(src_offset),
-                             static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size));
+OGLBufferCache::BufferInfo OGLBufferCache::GetEmptyBuffer(std::size_t) {
+    return {0, 0, 0}; // presumably {handle, offset, address}, all null - confirm field order
 }
 
 OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_pointer,
                                                              std::size_t size) {
     DEBUG_ASSERT(cbuf_cursor < std::size(cbufs));
-    const GLuint& cbuf = cbufs[cbuf_cursor++];
+    const GLuint cbuf = cbufs[cbuf_cursor++];
+
     glNamedBufferSubData(cbuf, 0, static_cast<GLsizeiptr>(size), raw_pointer);
-    return {cbuf, 0};
+    return {cbuf, 0, 0};
 }
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h
index a9e86cfc7..88fdc0536 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -10,7 +10,6 @@
 #include "common/common_types.h"
 #include "video_core/buffer_cache/buffer_cache.h"
 #include "video_core/engines/maxwell_3d.h"
-#include "video_core/rasterizer_cache.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_stream_buffer.h"
 
@@ -24,57 +23,58 @@ class Device;
 class OGLStreamBuffer;
 class RasterizerOpenGL;
 
-class CachedBufferBlock;
+class Buffer : public VideoCommon::BufferBlock {
+public:
+    explicit Buffer(const Device& device, VAddr cpu_addr, std::size_t size);
+    ~Buffer();
 
-using Buffer = std::shared_ptr<CachedBufferBlock>;
-using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>;
+    void Upload(std::size_t offset, std::size_t size, const u8* data);
 
-class CachedBufferBlock : public VideoCommon::BufferBlock {
-public:
-    explicit CachedBufferBlock(VAddr cpu_addr, const std::size_t size);
-    ~CachedBufferBlock();
+    void Download(std::size_t offset, std::size_t size, u8* data);
+
+    void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
+                  std::size_t size);
 
-    GLuint GetHandle() const {
+    GLuint Handle() const noexcept {
         return gl_buffer.handle;
     }
 
+    u64 Address() const noexcept {
+        return gpu_address;
+    }
+
 private:
     OGLBuffer gl_buffer;
+    OGLBuffer read_buffer;
+    u64 gpu_address = 0;
 };
 
+using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>;
 class OGLBufferCache final : public GenericBufferCache {
 public:
     explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
                             const Device& device, std::size_t stream_size);
     ~OGLBufferCache();
 
-    GLuint GetEmptyBuffer(std::size_t) override;
+    BufferInfo GetEmptyBuffer(std::size_t) override;
 
     void Acquire() noexcept {
         cbuf_cursor = 0;
     }
 
 protected:
-    Buffer CreateBlock(VAddr cpu_addr, std::size_t size) override;
-
-    GLuint ToHandle(const Buffer& buffer) override;
-
-    void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                         const u8* data) override;
-
-    void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                           u8* data) override;
-
-    void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
-                   std::size_t dst_offset, std::size_t size) override;
+    std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override;
 
     BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) override;
 
 private:
+    static constexpr std::size_t NUM_CBUFS = Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
+                                             Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram;
+
+    const Device& device;
+
     std::size_t cbuf_cursor = 0;
-    std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
-                           Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram>
-        cbufs;
+    std::array<GLuint, NUM_CBUFS> cbufs{};
 };
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
index 466a911db..c1f20f0ab 100644
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -6,6 +6,7 @@
 #include <array>
 #include <cstddef>
 #include <cstring>
+#include <limits>
 #include <optional>
 #include <vector>
 
@@ -26,24 +27,27 @@ constexpr u32 ReservedUniformBlocks = 1;
 
 constexpr u32 NumStages = 5;
 
-constexpr std::array LimitUBOs = {GL_MAX_VERTEX_UNIFORM_BLOCKS, GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS,
-                                  GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS,
-                                  GL_MAX_GEOMETRY_UNIFORM_BLOCKS, GL_MAX_FRAGMENT_UNIFORM_BLOCKS};
+constexpr std::array LimitUBOs = {
+    GL_MAX_VERTEX_UNIFORM_BLOCKS,          GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS,
+    GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS, GL_MAX_GEOMETRY_UNIFORM_BLOCKS,
+    GL_MAX_FRAGMENT_UNIFORM_BLOCKS,        GL_MAX_COMPUTE_UNIFORM_BLOCKS};
 
 constexpr std::array LimitSSBOs = {
-    GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS,
+    GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS,          GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS,
     GL_MAX_TESS_EVALUATION_SHADER_STORAGE_BLOCKS, GL_MAX_GEOMETRY_SHADER_STORAGE_BLOCKS,
-    GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS};
+    GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS,        GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS};
 
-constexpr std::array LimitSamplers = {
-    GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS, GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS,
-    GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS, GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS,
-    GL_MAX_TEXTURE_IMAGE_UNITS};
+constexpr std::array LimitSamplers = {GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS,
+                                      GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS,
+                                      GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS,
+                                      GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS,
+                                      GL_MAX_TEXTURE_IMAGE_UNITS,
+                                      GL_MAX_COMPUTE_TEXTURE_IMAGE_UNITS};
 
-constexpr std::array LimitImages = {GL_MAX_VERTEX_IMAGE_UNIFORMS,
-                                    GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS,
-                                    GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS,
-                                    GL_MAX_GEOMETRY_IMAGE_UNIFORMS, GL_MAX_FRAGMENT_IMAGE_UNIFORMS};
+constexpr std::array LimitImages = {
+    GL_MAX_VERTEX_IMAGE_UNIFORMS,          GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS,
+    GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS, GL_MAX_GEOMETRY_IMAGE_UNIFORMS,
+    GL_MAX_FRAGMENT_IMAGE_UNIFORMS,        GL_MAX_COMPUTE_IMAGE_UNIFORMS};
 
 template <typename T>
 T GetInteger(GLenum pname) {
@@ -85,6 +89,13 @@ u32 Extract(u32& base, u32& num, u32 amount, std::optional<GLenum> limit = {}) {
     return std::exchange(base, base + amount);
 }
 
+std::array<u32, Tegra::Engines::MaxShaderTypes> BuildMaxUniformBuffers() noexcept {
+    static_assert(LimitUBOs.size() <= Tegra::Engines::MaxShaderTypes, "LimitUBOs overflows");
+    std::array<u32, Tegra::Engines::MaxShaderTypes> max{}; // stages without a limit stay at 0
+    std::transform(LimitUBOs.begin(), LimitUBOs.end(), max.begin(), &GetInteger<u32>);
+    return max;
+}
+
 std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindings() noexcept {
     std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> bindings;
 
@@ -112,16 +123,24 @@ std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindin
     u32 num_images = GetInteger<u32>(GL_MAX_IMAGE_UNITS);
     u32 base_images = 0;
 
-    // Reserve more image bindings on fragment and vertex stages.
+    // GL_MAX_IMAGE_UNITS is guaranteed by the spec to have a minimum value of 8.
+    // Due to the limitation of GL_MAX_IMAGE_UNITS, reserve at least 4 image bindings on the
+    // fragment stage, and at least 1 for the rest of the stages.
+    // So far games are observed to use 1 image binding on vertex and 4 on fragment stages.
+
+    // Reserve at least 4 image bindings on the fragment stage.
     bindings[4].image =
-        Extract(base_images, num_images, num_images / NumStages + 2, LimitImages[4]);
-    bindings[0].image =
-        Extract(base_images, num_images, num_images / NumStages + 1, LimitImages[0]);
+        Extract(base_images, num_images, std::max(4U, num_images / NumStages), LimitImages[4]);
+
+    // This is guaranteed to be at least 1.
+    const u32 total_extracted_images = num_images / (NumStages - 1);
 
     // Reserve the other image bindings.
-    const u32 total_extracted_images = num_images / (NumStages - 2);
-    for (std::size_t i = 2; i < NumStages; ++i) {
+    for (std::size_t i = 0; i < NumStages; ++i) {
         const std::size_t stage = stage_swizzle[i];
+        if (stage == 4) {
+            continue;
+        }
         bindings[stage].image =
             Extract(base_images, num_images, total_extracted_images, LimitImages[stage]);
     }
@@ -133,6 +152,7 @@ std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindin
 }
 
 bool IsASTCSupported() {
+    static constexpr std::array targets = {GL_TEXTURE_2D, GL_TEXTURE_2D_ARRAY};
     static constexpr std::array formats = {
         GL_COMPRESSED_RGBA_ASTC_4x4_KHR,           GL_COMPRESSED_RGBA_ASTC_5x4_KHR,
         GL_COMPRESSED_RGBA_ASTC_5x5_KHR,           GL_COMPRESSED_RGBA_ASTC_6x5_KHR,
@@ -149,25 +169,44 @@ bool IsASTCSupported() {
         GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x8_KHR,  GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR,
         GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x10_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR,
     };
-    return std::find_if_not(formats.begin(), formats.end(), [](GLenum format) {
-               GLint supported;
-               glGetInternalformativ(GL_TEXTURE_2D, format, GL_INTERNALFORMAT_SUPPORTED, 1,
-                                     &supported);
-               return supported == GL_TRUE;
-           }) == formats.end();
+    static constexpr std::array required_support = {
+        GL_VERTEX_TEXTURE,   GL_TESS_CONTROL_TEXTURE, GL_TESS_EVALUATION_TEXTURE,
+        GL_GEOMETRY_TEXTURE, GL_FRAGMENT_TEXTURE,     GL_COMPUTE_TEXTURE,
+    };
+
+    for (const GLenum target : targets) {
+        for (const GLenum format : formats) {
+            for (const GLenum support : required_support) {
+                GLint value;
+                glGetInternalformativ(target, format, support, 1, &value);
+                if (value != GL_FULL_SUPPORT) {
+                    return false;
+                }
+            }
+        }
+    }
+    return true;
 }
 
 } // Anonymous namespace
 
-Device::Device() : base_bindings{BuildBaseBindings()} {
+Device::Device()
+    : max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} {
     const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
-    const auto renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER));
+    const std::string_view renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER));
+    const std::string_view version = reinterpret_cast<const char*>(glGetString(GL_VERSION));
     const std::vector extensions = GetExtensions();
 
     const bool is_nvidia = vendor == "NVIDIA Corporation";
     const bool is_amd = vendor == "ATI Technologies Inc.";
-    const bool is_intel = vendor == "Intel";
-    const bool is_intel_proprietary = is_intel && std::strstr(renderer, "Mesa") == nullptr;
+
+    bool disable_fast_buffer_sub_data = false;
+    if (is_nvidia && version == "4.6.0 NVIDIA 443.24") {
+        LOG_WARNING(
+            Render_OpenGL,
+            "Beta driver 443.24 is known to have issues. There might be performance issues.");
+        disable_fast_buffer_sub_data = true;
+    }
 
     uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);
     shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
@@ -178,36 +217,43 @@ Device::Device() : base_bindings{BuildBaseBindings()} {
     has_shader_ballot = GLAD_GL_ARB_shader_ballot;
     has_vertex_viewport_layer = GLAD_GL_ARB_shader_viewport_layer_array;
     has_image_load_formatted = HasExtension(extensions, "GL_EXT_shader_image_load_formatted");
+    has_texture_shadow_lod = HasExtension(extensions, "GL_EXT_texture_shadow_lod");
     has_astc = IsASTCSupported();
     has_variable_aoffi = TestVariableAoffi();
     has_component_indexing_bug = is_amd;
     has_precise_bug = TestPreciseBug();
-    has_broken_compute = is_intel_proprietary;
-    has_fast_buffer_sub_data = is_nvidia;
-    use_assembly_shaders = Settings::values.use_assembly_shaders && GLAD_GL_NV_gpu_program5 &&
-                           GLAD_GL_NV_compute_program5;
+    has_nv_viewport_array2 = GLAD_GL_NV_viewport_array2;
+    has_vertex_buffer_unified_memory = GLAD_GL_NV_vertex_buffer_unified_memory;
+
+    // At the moment of writing this, only Nvidia's driver optimizes BufferSubData on exclusive
+    // uniform buffers as "push constants"
+    has_fast_buffer_sub_data = is_nvidia && !disable_fast_buffer_sub_data;
+
+    use_assembly_shaders = Settings::values.use_assembly_shaders.GetValue() &&
+                           GLAD_GL_NV_gpu_program5 && GLAD_GL_NV_compute_program5 &&
+                           GLAD_GL_NV_transform_feedback && GLAD_GL_NV_transform_feedback2;
 
     LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi);
     LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug);
     LOG_INFO(Render_OpenGL, "Renderer_PreciseBug: {}", has_precise_bug);
 
-    if (Settings::values.use_assembly_shaders && !use_assembly_shaders) {
+    if (Settings::values.use_assembly_shaders.GetValue() && !use_assembly_shaders) {
         LOG_ERROR(Render_OpenGL, "Assembly shaders enabled but not supported");
     }
 }
 
 Device::Device(std::nullptr_t) {
-    uniform_buffer_alignment = 0;
+    max_uniform_buffers.fill(std::numeric_limits<u32>::max());
+    uniform_buffer_alignment = 4;
+    shader_storage_alignment = 4;
     max_vertex_attributes = 16;
     max_varyings = 15;
     has_warp_intrinsics = true;
     has_shader_ballot = true;
     has_vertex_viewport_layer = true;
     has_image_load_formatted = true;
+    has_texture_shadow_lod = true;
     has_variable_aoffi = true;
-    has_component_indexing_bug = false;
-    has_broken_compute = false;
-    has_precise_bug = false;
 }
 
 bool Device::TestVariableAoffi() {
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h
index e915dbd86..e1d811966 100644
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -24,6 +24,10 @@ public:
     explicit Device();
     explicit Device(std::nullptr_t);
 
+    u32 GetMaxUniformBuffers(Tegra::Engines::ShaderType shader_type) const noexcept {
+        return max_uniform_buffers[static_cast<std::size_t>(shader_type)]; // no bounds check
+    }
+
     const BaseBindings& GetBaseBindings(std::size_t stage_index) const noexcept {
         return base_bindings[stage_index];
     }
@@ -64,6 +68,14 @@ public:
         return has_image_load_formatted;
     }
 
+    bool HasTextureShadowLod() const {
+        return has_texture_shadow_lod; // set from GL_EXT_texture_shadow_lod in gl_device.cpp
+    }
+
+    bool HasVertexBufferUnifiedMemory() const {
+        return has_vertex_buffer_unified_memory; // GLAD_GL_NV_vertex_buffer_unified_memory
+    }
+
     bool HasASTC() const {
         return has_astc;
     }
@@ -80,14 +92,14 @@ public:
         return has_precise_bug;
     }
 
-    bool HasBrokenCompute() const {
-        return has_broken_compute;
-    }
-
     bool HasFastBufferSubData() const {
         return has_fast_buffer_sub_data;
     }
 
+    bool HasNvViewportArray2() const {
+        return has_nv_viewport_array2; // set from GLAD_GL_NV_viewport_array2
+    }
+
     bool UseAssemblyShaders() const {
         return use_assembly_shaders;
     }
@@ -96,7 +108,8 @@ private:
     static bool TestVariableAoffi();
     static bool TestPreciseBug();
 
-    std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings;
+    std::array<u32, Tegra::Engines::MaxShaderTypes> max_uniform_buffers{};
+    std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings{};
     std::size_t uniform_buffer_alignment{};
     std::size_t shader_storage_alignment{};
     u32 max_vertex_attributes{};
@@ -105,12 +118,14 @@ private:
     bool has_shader_ballot{};
     bool has_vertex_viewport_layer{};
     bool has_image_load_formatted{};
+    bool has_texture_shadow_lod{};
+    bool has_vertex_buffer_unified_memory{};
     bool has_astc{};
     bool has_variable_aoffi{};
     bool has_component_indexing_bug{};
     bool has_precise_bug{};
-    bool has_broken_compute{};
     bool has_fast_buffer_sub_data{};
+    bool has_nv_viewport_array2{};
     bool use_assembly_shaders{};
 };
 
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 716d43e65..e960a0ef1 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -30,6 +30,7 @@
 #include "video_core/renderer_opengl/gl_shader_cache.h"
 #include "video_core/renderer_opengl/maxwell_to_gl.h"
 #include "video_core/renderer_opengl/renderer_opengl.h"
+#include "video_core/shader_cache.h"
 
 namespace OpenGL {
 
@@ -54,15 +55,34 @@ MICROPROFILE_DEFINE(OpenGL_PrimitiveAssembly, "OpenGL", "Prim Asmbl", MP_RGB(255
 
 namespace {
 
-constexpr std::size_t NumSupportedVertexAttributes = 16;
+constexpr std::size_t NUM_CONST_BUFFERS_PER_STAGE = 18;
+constexpr std::size_t NUM_CONST_BUFFERS_BYTES_PER_STAGE =
+    NUM_CONST_BUFFERS_PER_STAGE * Maxwell::MaxConstBufferSize;
+constexpr std::size_t TOTAL_CONST_BUFFER_BYTES =
+    NUM_CONST_BUFFERS_BYTES_PER_STAGE * Maxwell::MaxShaderStage;
+
+constexpr std::size_t NUM_SUPPORTED_VERTEX_ATTRIBUTES = 16;
+constexpr std::size_t NUM_SUPPORTED_VERTEX_BINDINGS = 16;
 
 template <typename Engine, typename Entry>
 Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry,
                                                ShaderType shader_type, std::size_t index = 0) {
+    if constexpr (std::is_same_v<Entry, SamplerEntry>) {
+        if (entry.is_separated) {
+            const u32 buffer_1 = entry.buffer;
+            const u32 buffer_2 = entry.secondary_buffer;
+            const u32 offset_1 = entry.offset;
+            const u32 offset_2 = entry.secondary_offset;
+            const u32 handle_1 = engine.AccessConstBuffer32(shader_type, buffer_1, offset_1);
+            const u32 handle_2 = engine.AccessConstBuffer32(shader_type, buffer_2, offset_2);
+            return engine.GetTextureInfo(handle_1 | handle_2);
+        }
+    }
     if (entry.is_bindless) {
-        const auto tex_handle = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset);
-        return engine.GetTextureInfo(tex_handle);
+        const u32 handle = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset);
+        return engine.GetTextureInfo(handle);
     }
+
     const auto& gpu_profile = engine.AccessGuestDriverProfile();
     const u32 offset = entry.offset + static_cast<u32>(index * gpu_profile.GetTextureHandlerSize());
     if constexpr (std::is_same_v<Engine, Tegra::Engines::Maxwell3D>) {
@@ -87,6 +107,34 @@ std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer,
     return buffer.size;
 }
 
+/// Converts a hardware transform feedback location into a token stream entry
+/// @param location Hardware location
+/// @return Pair of ARB_transform_feedback3 token stream first and third arguments
+/// @note Read https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_transform_feedback3.txt
+std::pair<GLint, GLint> TransformFeedbackEnum(u8 location) {
+    const u8 slot = location / 4;
+    switch (slot) {
+    case 7:
+        return {GL_POSITION, 0};
+    case 40:
+        return {GL_PRIMARY_COLOR_NV, 0};
+    case 41:
+        return {GL_SECONDARY_COLOR_NV, 0};
+    case 42:
+        return {GL_BACK_PRIMARY_COLOR_NV, 0};
+    case 43:
+        return {GL_BACK_SECONDARY_COLOR_NV, 0};
+    }
+    if (slot >= 8 && slot <= 39) {
+        return {GL_GENERIC_ATTRIB_NV, slot - 8};
+    }
+    if (slot >= 48 && slot <= 55) {
+        return {GL_TEXTURE_COORD_NV, slot - 48};
+    }
+    UNIMPLEMENTED_MSG("index={}", static_cast<int>(slot));
+    return {GL_POSITION, 0};
+}
+
 void oglEnable(GLenum cap, bool state) {
     (state ? glEnable : glDisable)(cap);
 }
@@ -104,6 +152,9 @@ RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWind
       screen_info{info}, program_manager{program_manager}, state_tracker{state_tracker} {
     CheckExtensions();
 
+    unified_uniform_buffer.Create();
+    glNamedBufferStorage(unified_uniform_buffer.handle, TOTAL_CONST_BUFFER_BYTES, nullptr, 0);
+
     if (device.UseAssemblyShaders()) {
         glCreateBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data());
         for (const GLuint cbuf : staging_cbufs) {
@@ -143,7 +194,7 @@ void RasterizerOpenGL::SetupVertexFormat() {
     // avoid OpenGL errors.
     // TODO(Subv): Analyze the shader to identify which attributes are actually used and don't
     // assume every shader uses them all.
-    for (std::size_t index = 0; index < NumSupportedVertexAttributes; ++index) {
+    for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_ATTRIBUTES; ++index) {
         if (!flags[Dirty::VertexFormat0 + index]) {
             continue;
         }
@@ -162,9 +213,10 @@ void RasterizerOpenGL::SetupVertexFormat() {
         if (attrib.type == Maxwell::VertexAttribute::Type::SignedInt ||
             attrib.type == Maxwell::VertexAttribute::Type::UnsignedInt) {
             glVertexAttribIFormat(gl_index, attrib.ComponentCount(),
-                                  MaxwellToGL::VertexType(attrib), attrib.offset);
+                                  MaxwellToGL::VertexFormat(attrib), attrib.offset);
         } else {
-            glVertexAttribFormat(gl_index, attrib.ComponentCount(), MaxwellToGL::VertexType(attrib),
+            glVertexAttribFormat(gl_index, attrib.ComponentCount(),
+                                 MaxwellToGL::VertexFormat(attrib),
                                  attrib.IsNormalized() ? GL_TRUE : GL_FALSE, attrib.offset);
         }
         glVertexAttribBinding(gl_index, attrib.buffer);
@@ -181,9 +233,11 @@ void RasterizerOpenGL::SetupVertexBuffer() {
 
     MICROPROFILE_SCOPE(OpenGL_VB);
 
+    const bool use_unified_memory = device.HasVertexBufferUnifiedMemory();
+
     // Upload all guest vertex arrays sequentially to our buffer
     const auto& regs = gpu.regs;
-    for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) {
+    for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_BINDINGS; ++index) {
         if (!flags[Dirty::VertexBuffer0 + index]) {
             continue;
         }
@@ -196,16 +250,25 @@ void RasterizerOpenGL::SetupVertexBuffer() {
 
         const GPUVAddr start = vertex_array.StartAddress();
         const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress();
-
         ASSERT(end >= start);
+
+        const GLuint gl_index = static_cast<GLuint>(index);
         const u64 size = end - start;
         if (size == 0) {
-            glBindVertexBuffer(static_cast<GLuint>(index), 0, 0, vertex_array.stride);
+            glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride);
+            if (use_unified_memory) {
+                glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index, 0, 0);
+            }
             continue;
         }
-        const auto [vertex_buffer, vertex_buffer_offset] = buffer_cache.UploadMemory(start, size);
-        glBindVertexBuffer(static_cast<GLuint>(index), vertex_buffer, vertex_buffer_offset,
-                           vertex_array.stride);
+        const auto info = buffer_cache.UploadMemory(start, size);
+        if (use_unified_memory) {
+            glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride);
+            glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index,
+                                   info.address + info.offset, size);
+        } else {
+            glBindVertexBuffer(gl_index, info.handle, info.offset, vertex_array.stride);
+        }
     }
 }
 
@@ -218,7 +281,7 @@ void RasterizerOpenGL::SetupVertexInstances() {
     flags[Dirty::VertexInstances] = false;
 
     const auto& regs = gpu.regs;
-    for (std::size_t index = 0; index < NumSupportedVertexAttributes; ++index) {
+    for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_ATTRIBUTES; ++index) {
         if (!flags[Dirty::VertexInstance0 + index]) {
             continue;
         }
@@ -235,9 +298,9 @@ GLintptr RasterizerOpenGL::SetupIndexBuffer() {
     MICROPROFILE_SCOPE(OpenGL_Index);
     const auto& regs = system.GPU().Maxwell3D().regs;
     const std::size_t size = CalculateIndexBufferSize();
-    const auto [buffer, offset] = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size);
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, buffer);
-    return offset;
+    const auto info = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size);
+    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, info.handle);
+    return info.offset;
 }
 
 void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
@@ -273,7 +336,7 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
             continue;
         }
 
-        Shader shader{shader_cache.GetStageProgram(program)};
+        Shader* const shader = shader_cache.GetStageProgram(program);
 
         if (device.UseAssemblyShaders()) {
             // Check for ARB limitation. We only have 16 SSBOs per context state. To workaround this
@@ -567,7 +630,16 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
                    (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());
 
     // Prepare the vertex array.
-    buffer_cache.Map(buffer_size);
+    const bool invalidated = buffer_cache.Map(buffer_size);
+
+    if (invalidated) {
+        // When the stream buffer has been invalidated, we have to consider vertex buffers as dirty
+        auto& dirty = gpu.dirty.flags;
+        dirty[Dirty::VertexBuffers] = true;
+        for (int index = Dirty::VertexBuffer0; index <= Dirty::VertexBuffer31; ++index) {
+            dirty[index] = true;
+        }
+    }
 
     // Prepare vertex array format.
     SetupVertexFormat();
@@ -584,9 +656,9 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
     if (!device.UseAssemblyShaders()) {
         MaxwellUniformData ubo;
         ubo.SetFromRegs(gpu);
-        const auto [buffer, offset] =
+        const auto info =
             buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment());
-        glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, buffer, offset,
+        glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, info.handle, info.offset,
                           static_cast<GLsizeiptr>(sizeof(ubo)));
     }
 
@@ -655,10 +727,6 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
 }
 
 void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
-    if (device.HasBrokenCompute()) {
-        return;
-    }
-
     buffer_cache.Acquire();
     current_cbuf = 0;
 
@@ -837,7 +905,7 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
     return true;
 }
 
-void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader) {
+void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, Shader* shader) {
     static constexpr std::array PARAMETER_LUT = {
         GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV,
         GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV,
@@ -846,41 +914,62 @@ void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shad
     MICROPROFILE_SCOPE(OpenGL_UBO);
     const auto& stages = system.GPU().Maxwell3D().state.shader_stages;
     const auto& shader_stage = stages[stage_index];
+    const auto& entries = shader->GetEntries();
+    const bool use_unified = entries.use_unified_uniforms;
+    const std::size_t base_unified_offset = stage_index * NUM_CONST_BUFFERS_BYTES_PER_STAGE;
 
-    u32 binding =
-        device.UseAssemblyShaders() ? 0 : device.GetBaseBindings(stage_index).uniform_buffer;
-    for (const auto& entry : shader->GetEntries().const_buffers) {
-        const auto& buffer = shader_stage.const_buffers[entry.GetIndex()];
-        SetupConstBuffer(PARAMETER_LUT[stage_index], binding++, buffer, entry);
+    const auto base_bindings = device.GetBaseBindings(stage_index);
+    u32 binding = device.UseAssemblyShaders() ? 0 : base_bindings.uniform_buffer;
+    for (const auto& entry : entries.const_buffers) {
+        const u32 index = entry.GetIndex();
+        const auto& buffer = shader_stage.const_buffers[index];
+        SetupConstBuffer(PARAMETER_LUT[stage_index], binding, buffer, entry, use_unified,
+                         base_unified_offset + index * Maxwell::MaxConstBufferSize);
+        ++binding;
+    }
+    if (use_unified) {
+        const u32 index = static_cast<u32>(base_bindings.shader_storage_buffer +
+                                           entries.global_memory_entries.size());
+        glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle,
+                          base_unified_offset, NUM_CONST_BUFFERS_BYTES_PER_STAGE);
     }
 }
 
-void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) {
+void RasterizerOpenGL::SetupComputeConstBuffers(Shader* kernel) {
     MICROPROFILE_SCOPE(OpenGL_UBO);
     const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
+    const auto& entries = kernel->GetEntries();
+    const bool use_unified = entries.use_unified_uniforms;
 
     u32 binding = 0;
-    for (const auto& entry : kernel->GetEntries().const_buffers) {
+    for (const auto& entry : entries.const_buffers) {
         const auto& config = launch_desc.const_buffer_config[entry.GetIndex()];
         const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value();
         Tegra::Engines::ConstBufferInfo buffer;
         buffer.address = config.Address();
         buffer.size = config.size;
         buffer.enabled = mask[entry.GetIndex()];
-        SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding++, buffer, entry);
+        SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding, buffer, entry,
+                         use_unified, entry.GetIndex() * Maxwell::MaxConstBufferSize);
+        ++binding;
+    }
+    if (use_unified) {
+        const GLuint index = static_cast<GLuint>(entries.global_memory_entries.size());
+        glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle, 0,
+                          NUM_CONST_BUFFERS_BYTES_PER_STAGE);
     }
 }
 
 void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
                                         const Tegra::Engines::ConstBufferInfo& buffer,
-                                        const ConstBufferEntry& entry) {
+                                        const ConstBufferEntry& entry, bool use_unified,
+                                        std::size_t unified_offset) {
     if (!buffer.enabled) {
         // Set values to zero to unbind buffers
         if (device.UseAssemblyShaders()) {
             glBindBufferRangeNV(stage, entry.GetIndex(), 0, 0, 0);
         } else {
-            glBindBufferRange(GL_UNIFORM_BUFFER, binding,
-                              buffer_cache.GetEmptyBuffer(sizeof(float)), 0, sizeof(float));
+            glBindBufferRange(GL_UNIFORM_BUFFER, binding, 0, 0, sizeof(float));
         }
         return;
     }
@@ -889,23 +978,33 @@ void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
     // UBO alignment requirements.
     const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4));
 
-    const auto alignment = device.GetUniformBufferAlignment();
-    auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment, false,
-                                                    device.HasFastBufferSubData());
-    if (!device.UseAssemblyShaders()) {
-        glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size);
+    const bool fast_upload = !use_unified && device.HasFastBufferSubData();
+
+    const std::size_t alignment = use_unified ? 4 : device.GetUniformBufferAlignment();
+    const GPUVAddr gpu_addr = buffer.address;
+    auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload);
+
+    if (device.UseAssemblyShaders()) {
+        UNIMPLEMENTED_IF(use_unified);
+        if (info.offset != 0) {
+            const GLuint staging_cbuf = staging_cbufs[current_cbuf++];
+            glCopyNamedBufferSubData(info.handle, staging_cbuf, info.offset, 0, size);
+            info.handle = staging_cbuf;
+            info.offset = 0;
+        }
+        glBindBufferRangeNV(stage, binding, info.handle, info.offset, size);
         return;
     }
-    if (offset != 0) {
-        const GLuint staging_cbuf = staging_cbufs[current_cbuf++];
-        glCopyNamedBufferSubData(cbuf, staging_cbuf, offset, 0, size);
-        cbuf = staging_cbuf;
-        offset = 0;
+
+    if (use_unified) {
+        glCopyNamedBufferSubData(info.handle, unified_uniform_buffer.handle, info.offset,
+                                 unified_offset, size);
+    } else {
+        glBindBufferRange(GL_UNIFORM_BUFFER, binding, info.handle, info.offset, size);
     }
-    glBindBufferRangeNV(stage, binding, cbuf, offset, size);
 }
 
-void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader) {
+void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader) {
     auto& gpu{system.GPU()};
     auto& memory_manager{gpu.MemoryManager()};
     const auto cbufs{gpu.Maxwell3D().state.shader_stages[stage_index]};
@@ -920,7 +1019,7 @@ void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shad
     }
 }
 
-void RasterizerOpenGL::SetupComputeGlobalMemory(const Shader& kernel) {
+void RasterizerOpenGL::SetupComputeGlobalMemory(Shader* kernel) {
     auto& gpu{system.GPU()};
     auto& memory_manager{gpu.MemoryManager()};
     const auto cbufs{gpu.KeplerCompute().launch_description.const_buffer_config};
@@ -937,13 +1036,12 @@ void RasterizerOpenGL::SetupComputeGlobalMemory(const Shader& kernel) {
 void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry,
                                          GPUVAddr gpu_addr, std::size_t size) {
     const auto alignment{device.GetShaderStorageBufferAlignment()};
-    const auto [ssbo, buffer_offset] =
-        buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written);
-    glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, ssbo, buffer_offset,
+    const auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written);
+    glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, info.handle, info.offset,
                       static_cast<GLsizeiptr>(size));
 }
 
-void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, const Shader& shader) {
+void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, Shader* shader) {
     MICROPROFILE_SCOPE(OpenGL_Texture);
     const auto& maxwell3d = system.GPU().Maxwell3D();
     u32 binding = device.GetBaseBindings(stage_index).sampler;
@@ -956,7 +1054,7 @@ void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, const Shader&
     }
 }
 
-void RasterizerOpenGL::SetupComputeTextures(const Shader& kernel) {
+void RasterizerOpenGL::SetupComputeTextures(Shader* kernel) {
     MICROPROFILE_SCOPE(OpenGL_Texture);
     const auto& compute = system.GPU().KeplerCompute();
     u32 binding = 0;
@@ -985,7 +1083,7 @@ void RasterizerOpenGL::SetupTexture(u32 binding, const Tegra::Texture::FullTextu
     }
 }
 
-void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& shader) {
+void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, Shader* shader) {
     const auto& maxwell3d = system.GPU().Maxwell3D();
     u32 binding = device.GetBaseBindings(stage_index).image;
     for (const auto& entry : shader->GetEntries().images) {
@@ -995,7 +1093,7 @@ void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& sh
     }
 }
 
-void RasterizerOpenGL::SetupComputeImages(const Shader& shader) {
+void RasterizerOpenGL::SetupComputeImages(Shader* shader) {
     const auto& compute = system.GPU().KeplerCompute();
     u32 binding = 0;
     for (const auto& entry : shader->GetEntries().images) {
@@ -1024,6 +1122,26 @@ void RasterizerOpenGL::SyncViewport() {
     const auto& regs = gpu.regs;
 
     const bool dirty_viewport = flags[Dirty::Viewports];
+    const bool dirty_clip_control = flags[Dirty::ClipControl];
+
+    if (dirty_clip_control || flags[Dirty::FrontFace]) {
+        flags[Dirty::FrontFace] = false;
+
+        GLenum mode = MaxwellToGL::FrontFace(regs.front_face);
+        if (regs.screen_y_control.triangle_rast_flip != 0 &&
+            regs.viewport_transform[0].scale_y < 0.0f) {
+            switch (mode) {
+            case GL_CW:
+                mode = GL_CCW;
+                break;
+            case GL_CCW:
+                mode = GL_CW;
+                break;
+            }
+        }
+        glFrontFace(mode);
+    }
+
     if (dirty_viewport || flags[Dirty::ClipControl]) {
         flags[Dirty::ClipControl] = false;
 
@@ -1121,11 +1239,6 @@ void RasterizerOpenGL::SyncCullMode() {
             glDisable(GL_CULL_FACE);
         }
     }
-
-    if (flags[Dirty::FrontFace]) {
-        flags[Dirty::FrontFace] = false;
-        glFrontFace(MaxwellToGL::FrontFace(regs.front_face));
-    }
 }
 
 void RasterizerOpenGL::SyncPrimitiveRestart() {
@@ -1496,12 +1609,70 @@ void RasterizerOpenGL::SyncFramebufferSRGB() {
     oglEnable(GL_FRAMEBUFFER_SRGB, gpu.regs.framebuffer_srgb);
 }
 
+void RasterizerOpenGL::SyncTransformFeedback() {
+    // Converts the guest transform feedback layouts into NV attrib/stream lists and applies them.
+    // TODO(Rodrigo): Inject SKIP_COMPONENTS*_NV when required; "Stride padding" below signals it.
+    const auto& regs = system.GPU().Maxwell3D().regs;
+
+    static constexpr std::size_t STRIDE = 3;
+    std::array<GLint, 128 * STRIDE * Maxwell::NumTransformFeedbackBuffers> attribs;
+    std::array<GLint, Maxwell::NumTransformFeedbackBuffers> streams;
+
+    GLint* cursor = attribs.data();
+    GLint* current_stream = streams.data();
+
+    for (std::size_t feedback = 0; feedback < Maxwell::NumTransformFeedbackBuffers; ++feedback) {
+        const auto& layout = regs.tfb_layouts[feedback];
+        UNIMPLEMENTED_IF_MSG(layout.stride != layout.varying_count * 4, "Stride padding");
+        if (layout.varying_count == 0) {
+            continue;
+        }
+
+        *current_stream = static_cast<GLint>(feedback);
+        if (current_stream != streams.data()) {
+            // Every stream after the first is preceded by a GL_NEXT_BUFFER_NV separator token
+            cursor[0] = GL_NEXT_BUFFER_NV;
+            cursor[1] = 0;
+            cursor[2] = 0;
+            cursor += STRIDE;
+        }
+        ++current_stream;
+
+        const auto& locations = regs.tfb_varying_locs[feedback];
+        std::optional<u8> current_index;
+        for (u32 offset = 0; offset < layout.varying_count; ++offset) {
+            const u8 location = locations[offset];
+            const u8 index = location / 4;
+
+            if (current_index == index) {
+                // Same attribute index as the previous entry: bump its component count instead
+                ++cursor[-2];
+                continue;
+            }
+            current_index = index;
+
+            std::tie(cursor[0], cursor[2]) = TransformFeedbackEnum(location);
+            cursor[1] = 1;
+            cursor += STRIDE;
+        }
+    }
+
+    const GLsizei num_attribs = static_cast<GLsizei>((cursor - attribs.data()) / STRIDE);
+    const GLsizei num_strides = static_cast<GLsizei>(current_stream - streams.data());
+    glTransformFeedbackStreamAttribsNV(num_attribs, attribs.data(), num_strides, streams.data(),
+                                       GL_INTERLEAVED_ATTRIBS);
+}
+
 void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) {
     const auto& regs = system.GPU().Maxwell3D().regs;
     if (regs.tfb_enabled == 0) {
         return;
     }
 
+    if (device.UseAssemblyShaders()) {
+        SyncTransformFeedback();
+    }
+
     UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) ||
                      regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) ||
                      regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry));
@@ -1528,6 +1699,10 @@ void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) {
                           static_cast<GLsizeiptr>(size));
     }
 
+    // We may have to call glBeginTransformFeedbackNV here instead: on Nvidia's driver the two
+    // entry points seem to be different implementations (their function pointers differ). We use
+    // ARB_transform_feedback3 features together with NV_transform_feedback, and the ARB extension
+    // defines no interactions for the non-NV BeginTransformFeedback. In practice, it just works.
     glBeginTransformFeedback(GL_POINTS);
 }
 
@@ -1549,8 +1724,9 @@ void RasterizerOpenGL::EndTransformFeedback() {
         const GLuint handle = transform_feedback_buffers[index].handle;
         const GPUVAddr gpu_addr = binding.Address();
         const std::size_t size = binding.buffer_size;
-        const auto [dest_buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
-        glCopyNamedBufferSubData(handle, dest_buffer, 0, offset, static_cast<GLsizeiptr>(size));
+        const auto info = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
+        glCopyNamedBufferSubData(handle, info.handle, 0, info.offset,
+                                 static_cast<GLsizeiptr>(size));
     }
 }
 
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 87f7fe159..4f082592f 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -19,7 +19,6 @@
 #include "video_core/engines/const_buffer_info.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/rasterizer_accelerated.h"
-#include "video_core/rasterizer_cache.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_opengl/gl_buffer_cache.h"
 #include "video_core/renderer_opengl/gl_device.h"
@@ -100,40 +99,41 @@ private:
     void ConfigureClearFramebuffer(bool using_color, bool using_depth_stencil);
 
     /// Configures the current constbuffers to use for the draw command.
-    void SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader);
+    void SetupDrawConstBuffers(std::size_t stage_index, Shader* shader);
 
     /// Configures the current constbuffers to use for the kernel invocation.
-    void SetupComputeConstBuffers(const Shader& kernel);
+    void SetupComputeConstBuffers(Shader* kernel);
 
     /// Configures a constant buffer.
     void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
-                          const ConstBufferEntry& entry);
+                          const ConstBufferEntry& entry, bool use_unified,
+                          std::size_t unified_offset);
 
     /// Configures the current global memory entries to use for the draw command.
-    void SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader);
+    void SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader);
 
     /// Configures the current global memory entries to use for the kernel invocation.
-    void SetupComputeGlobalMemory(const Shader& kernel);
+    void SetupComputeGlobalMemory(Shader* kernel);
 
     /// Configures a constant buffer.
     void SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, GPUVAddr gpu_addr,
                            std::size_t size);
 
     /// Configures the current textures to use for the draw command.
-    void SetupDrawTextures(std::size_t stage_index, const Shader& shader);
+    void SetupDrawTextures(std::size_t stage_index, Shader* shader);
 
     /// Configures the textures used in a compute shader.
-    void SetupComputeTextures(const Shader& kernel);
+    void SetupComputeTextures(Shader* kernel);
 
     /// Configures a texture.
     void SetupTexture(u32 binding, const Tegra::Texture::FullTextureInfo& texture,
                       const SamplerEntry& entry);
 
     /// Configures images in a graphics shader.
-    void SetupDrawImages(std::size_t stage_index, const Shader& shader);
+    void SetupDrawImages(std::size_t stage_index, Shader* shader);
 
     /// Configures images in a compute shader.
-    void SetupComputeImages(const Shader& shader);
+    void SetupComputeImages(Shader* shader);
 
     /// Configures an image.
     void SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic, const ImageEntry& entry);
@@ -201,6 +201,10 @@ private:
     /// Syncs the framebuffer sRGB state to match the guest state
     void SyncFramebufferSRGB();
 
+    /// Syncs transform feedback state to match guest state
+    /// @note Only valid on assembly shaders
+    void SyncTransformFeedback();
+
     /// Begin a transform feedback
     void BeginTransformFeedback(GLenum primitive_mode);
 
@@ -253,6 +257,7 @@ private:
         Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram;
     std::array<GLuint, NUM_CONSTANT_BUFFERS> staging_cbufs{};
     std::size_t current_cbuf = 0;
+    OGLBuffer unified_uniform_buffer;
 
     /// Number of commands queued to the OpenGL driver. Reseted on flush.
     std::size_t num_queued_commands = 0;
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 4cd0f36cf..c6a3bf3a1 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -20,6 +20,7 @@
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/shader_type.h"
 #include "video_core/memory_manager.h"
+#include "video_core/renderer_opengl/gl_arb_decompiler.h"
 #include "video_core/renderer_opengl/gl_rasterizer.h"
 #include "video_core/renderer_opengl/gl_shader_cache.h"
 #include "video_core/renderer_opengl/gl_shader_decompiler.h"
@@ -29,6 +30,7 @@
 #include "video_core/shader/memory_util.h"
 #include "video_core/shader/registry.h"
 #include "video_core/shader/shader_ir.h"
+#include "video_core/shader_cache.h"
 
 namespace OpenGL {
 
@@ -147,7 +149,8 @@ ProgramSharedPtr BuildShader(const Device& device, ShaderType shader_type, u64 u
     auto program = std::make_shared<ProgramHandle>();
 
     if (device.UseAssemblyShaders()) {
-        const std::string arb = "Not implemented";
+        const std::string arb =
+            DecompileAssemblyShader(device, ir, registry, shader_type, shader_id);
 
         GLuint& arb_prog = program->assembly_program.handle;
 
@@ -194,12 +197,9 @@ std::unordered_set<GLenum> GetSupportedFormats() {
 
 } // Anonymous namespace
 
-CachedShader::CachedShader(VAddr cpu_addr, std::size_t size_in_bytes,
-                           std::shared_ptr<VideoCommon::Shader::Registry> registry,
-                           ShaderEntries entries, ProgramSharedPtr program_)
-    : RasterizerCacheObject{cpu_addr}, registry{std::move(registry)}, entries{std::move(entries)},
-      size_in_bytes{size_in_bytes}, program{std::move(program_)} {
-    // Assign either the assembly program or source program. We can't have both.
+Shader::Shader(std::shared_ptr<VideoCommon::Shader::Registry> registry_, ShaderEntries entries_,
+               ProgramSharedPtr program_)
+    : registry{std::move(registry_)}, entries{std::move(entries_)}, program{std::move(program_)} {
     handle = program->assembly_program.handle;
     if (handle == 0) {
         handle = program->source_program.handle;
@@ -207,16 +207,16 @@ CachedShader::CachedShader(VAddr cpu_addr, std::size_t size_in_bytes,
     ASSERT(handle != 0);
 }
 
-CachedShader::~CachedShader() = default;
+Shader::~Shader() = default;
 
-GLuint CachedShader::GetHandle() const {
+GLuint Shader::GetHandle() const {
     DEBUG_ASSERT(registry->IsConsistent());
     return handle;
 }
 
-Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params,
-                                           Maxwell::ShaderProgram program_type, ProgramCode code,
-                                           ProgramCode code_b) {
+std::unique_ptr<Shader> Shader::CreateStageFromMemory(const ShaderParameters& params,
+                                                      Maxwell::ShaderProgram program_type,
+                                                      ProgramCode code, ProgramCode code_b) {
     const auto shader_type = GetShaderType(program_type);
     const std::size_t size_in_bytes = code.size() * sizeof(u64);
 
@@ -241,11 +241,12 @@ Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params,
     entry.bindless_samplers = registry->GetBindlessSamplers();
     params.disk_cache.SaveEntry(std::move(entry));
 
-    return std::shared_ptr<CachedShader>(new CachedShader(
-        params.cpu_addr, size_in_bytes, std::move(registry), MakeEntries(ir), std::move(program)));
+    return std::unique_ptr<Shader>(new Shader(
+        std::move(registry), MakeEntries(params.device, ir, shader_type), std::move(program)));
 }
 
-Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code) {
+std::unique_ptr<Shader> Shader::CreateKernelFromMemory(const ShaderParameters& params,
+                                                       ProgramCode code) {
     const std::size_t size_in_bytes = code.size() * sizeof(u64);
 
     auto& engine = params.system.GPU().KeplerCompute();
@@ -265,22 +266,23 @@ Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, Prog
     entry.bindless_samplers = registry->GetBindlessSamplers();
     params.disk_cache.SaveEntry(std::move(entry));
 
-    return std::shared_ptr<CachedShader>(new CachedShader(
-        params.cpu_addr, size_in_bytes, std::move(registry), MakeEntries(ir), std::move(program)));
+    return std::unique_ptr<Shader>(new Shader(std::move(registry),
+                                              MakeEntries(params.device, ir, ShaderType::Compute),
+                                              std::move(program)));
 }
 
-Shader CachedShader::CreateFromCache(const ShaderParameters& params,
-                                     const PrecompiledShader& precompiled_shader,
-                                     std::size_t size_in_bytes) {
-    return std::shared_ptr<CachedShader>(
-        new CachedShader(params.cpu_addr, size_in_bytes, precompiled_shader.registry,
-                         precompiled_shader.entries, precompiled_shader.program));
+std::unique_ptr<Shader> Shader::CreateFromCache(const ShaderParameters& params,
+                                                const PrecompiledShader& precompiled_shader) {
+    return std::unique_ptr<Shader>(new Shader(
+        precompiled_shader.registry, precompiled_shader.entries, precompiled_shader.program));
 }
 
 ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system,
                                      Core::Frontend::EmuWindow& emu_window, const Device& device)
-    : RasterizerCache{rasterizer}, system{system}, emu_window{emu_window}, device{device},
-      disk_cache{system} {}
+    : VideoCommon::ShaderCache<Shader>{rasterizer}, system{system},
+      emu_window{emu_window}, device{device}, disk_cache{system} {}
+
+ShaderCacheOpenGL::~ShaderCacheOpenGL() = default;
 
 void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
                                       const VideoCore::DiskResourceLoadCallback& callback) {
@@ -348,7 +350,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
             PrecompiledShader shader;
             shader.program = std::move(program);
             shader.registry = std::move(registry);
-            shader.entries = MakeEntries(ir);
+            shader.entries = MakeEntries(device, ir, entry.type);
 
             std::scoped_lock lock{mutex};
             if (callback) {
@@ -434,7 +436,7 @@ ProgramSharedPtr ShaderCacheOpenGL::GeneratePrecompiledProgram(
     return program;
 }
 
-Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
+Shader* ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
     if (!system.GPU().Maxwell3D().dirty.flags[Dirty::Shaders]) {
         return last_shaders[static_cast<std::size_t>(program)];
     }
@@ -444,8 +446,7 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
 
     // Look up shader in the cache based on address
     const auto cpu_addr{memory_manager.GpuToCpuAddress(address)};
-    Shader shader{cpu_addr ? TryGet(*cpu_addr) : null_shader};
-    if (shader) {
+    if (Shader* const shader{cpu_addr ? TryGet(*cpu_addr) : null_shader.get()}) {
         return last_shaders[static_cast<std::size_t>(program)] = shader;
     }
 
@@ -459,62 +460,66 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
         const u8* host_ptr_b = memory_manager.GetPointer(address_b);
         code_b = GetShaderCode(memory_manager, address_b, host_ptr_b, false);
     }
+    const std::size_t code_size = code.size() * sizeof(u64);
 
-    const auto unique_identifier = GetUniqueIdentifier(
+    const u64 unique_identifier = GetUniqueIdentifier(
         GetShaderType(program), program == Maxwell::ShaderProgram::VertexA, code, code_b);
 
-    const ShaderParameters params{system,    disk_cache, device,
-                                  *cpu_addr, host_ptr,   unique_identifier};
+    // cpu_addr may be empty (it is tested again below), so never dereference it unchecked.
+    const ShaderParameters params{
+        system, disk_cache, device, cpu_addr.value_or(0), host_ptr, unique_identifier};
 
+    std::unique_ptr<Shader> shader;
     const auto found = runtime_cache.find(unique_identifier);
     if (found == runtime_cache.end()) {
-        shader = CachedShader::CreateStageFromMemory(params, program, std::move(code),
-                                                     std::move(code_b));
+        shader = Shader::CreateStageFromMemory(params, program, std::move(code), std::move(code_b));
     } else {
-        const std::size_t size_in_bytes = code.size() * sizeof(u64);
-        shader = CachedShader::CreateFromCache(params, found->second, size_in_bytes);
+        shader = Shader::CreateFromCache(params, found->second);
     }
 
+    Shader* const result = shader.get();
     if (cpu_addr) {
-        Register(shader);
+        Register(std::move(shader), *cpu_addr, code_size);
     } else {
-        null_shader = shader;
+        null_shader = std::move(shader);
     }
 
-    return last_shaders[static_cast<std::size_t>(program)] = shader;
+    return last_shaders[static_cast<std::size_t>(program)] = result;
 }
 
-Shader ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) {
+Shader* ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) {
     auto& memory_manager{system.GPU().MemoryManager()};
     const auto cpu_addr{memory_manager.GpuToCpuAddress(code_addr)};
 
-    auto kernel = cpu_addr ? TryGet(*cpu_addr) : null_kernel;
-    if (kernel) {
+    if (Shader* const kernel = cpu_addr ? TryGet(*cpu_addr) : null_kernel.get()) {
         return kernel;
     }
 
     const auto host_ptr{memory_manager.GetPointer(code_addr)};
     // No kernel found, create a new one
-    auto code{GetShaderCode(memory_manager, code_addr, host_ptr, true)};
-    const auto unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code)};
+    ProgramCode code{GetShaderCode(memory_manager, code_addr, host_ptr, true)};
+    const std::size_t code_size{code.size() * sizeof(u64)};
+    const u64 unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code)};
 
-    const ShaderParameters params{system,    disk_cache, device,
-                                  *cpu_addr, host_ptr,   unique_identifier};
+    // cpu_addr may be empty (it is tested again below), so never dereference it unchecked.
+    const ShaderParameters params{
+        system, disk_cache, device, cpu_addr.value_or(0), host_ptr, unique_identifier};
 
+    std::unique_ptr<Shader> kernel;
     const auto found = runtime_cache.find(unique_identifier);
     if (found == runtime_cache.end()) {
-        kernel = CachedShader::CreateKernelFromMemory(params, std::move(code));
+        kernel = Shader::CreateKernelFromMemory(params, std::move(code));
     } else {
-        const std::size_t size_in_bytes = code.size() * sizeof(u64);
-        kernel = CachedShader::CreateFromCache(params, found->second, size_in_bytes);
+        kernel = Shader::CreateFromCache(params, found->second);
     }
 
+    Shader* const result = kernel.get();
     if (cpu_addr) {
-        Register(kernel);
+        Register(std::move(kernel), *cpu_addr, code_size);
     } else {
-        null_kernel = kernel;
+        null_kernel = std::move(kernel);
     }
-    return kernel;
+    return result;
 }
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h
index b2ae8d7f9..994aaeaf2 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -18,12 +18,12 @@
 
 #include "common/common_types.h"
 #include "video_core/engines/shader_type.h"
-#include "video_core/rasterizer_cache.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_shader_decompiler.h"
 #include "video_core/renderer_opengl/gl_shader_disk_cache.h"
 #include "video_core/shader/registry.h"
 #include "video_core/shader/shader_ir.h"
+#include "video_core/shader_cache.h"
 
 namespace Core {
 class System;
@@ -35,12 +35,9 @@ class EmuWindow;
 
 namespace OpenGL {
 
-class CachedShader;
 class Device;
 class RasterizerOpenGL;
-struct UnspecializedShader;
 
-using Shader = std::shared_ptr<CachedShader>;
 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 
 struct ProgramHandle {
@@ -64,62 +61,53 @@ struct ShaderParameters {
     u64 unique_identifier;
 };
 
-class CachedShader final : public RasterizerCacheObject {
+class Shader final {
 public:
-    ~CachedShader();
+    ~Shader();
 
     /// Gets the GL program handle for the shader
     GLuint GetHandle() const;
 
-    /// Returns the size in bytes of the shader
-    std::size_t GetSizeInBytes() const override {
-        return size_in_bytes;
-    }
-
     /// Gets the shader entries for the shader
     const ShaderEntries& GetEntries() const {
         return entries;
     }
 
-    static Shader CreateStageFromMemory(const ShaderParameters& params,
-                                        Maxwell::ShaderProgram program_type,
-                                        ProgramCode program_code, ProgramCode program_code_b);
-    static Shader CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code);
+    static std::unique_ptr<Shader> CreateStageFromMemory(const ShaderParameters& params,
+                                                         Maxwell::ShaderProgram program_type,
+                                                         ProgramCode program_code,
+                                                         ProgramCode program_code_b);
+    static std::unique_ptr<Shader> CreateKernelFromMemory(const ShaderParameters& params,
+                                                          ProgramCode code);
 
-    static Shader CreateFromCache(const ShaderParameters& params,
-                                  const PrecompiledShader& precompiled_shader,
-                                  std::size_t size_in_bytes);
+    static std::unique_ptr<Shader> CreateFromCache(const ShaderParameters& params,
+                                                   const PrecompiledShader& precompiled_shader);
 
 private:
-    explicit CachedShader(VAddr cpu_addr, std::size_t size_in_bytes,
-                          std::shared_ptr<VideoCommon::Shader::Registry> registry,
-                          ShaderEntries entries, ProgramSharedPtr program);
+    explicit Shader(std::shared_ptr<VideoCommon::Shader::Registry> registry, ShaderEntries entries,
+                    ProgramSharedPtr program);
 
     std::shared_ptr<VideoCommon::Shader::Registry> registry;
     ShaderEntries entries;
-    std::size_t size_in_bytes = 0;
     ProgramSharedPtr program;
     GLuint handle = 0;
 };
 
-class ShaderCacheOpenGL final : public RasterizerCache<Shader> {
+class ShaderCacheOpenGL final : public VideoCommon::ShaderCache<Shader> {
 public:
     explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system,
                                Core::Frontend::EmuWindow& emu_window, const Device& device);
+    ~ShaderCacheOpenGL() override;
 
     /// Loads disk cache for the current game
     void LoadDiskCache(const std::atomic_bool& stop_loading,
                        const VideoCore::DiskResourceLoadCallback& callback);
 
     /// Gets the current specified shader stage program
-    Shader GetStageProgram(Maxwell::ShaderProgram program);
+    Shader* GetStageProgram(Maxwell::ShaderProgram program);
 
     /// Gets a compute kernel in the passed address
-    Shader GetComputeKernel(GPUVAddr code_addr);
-
-protected:
-    // We do not have to flush this cache as things in it are never modified by us.
-    void FlushObjectInner(const Shader& object) override {}
+    Shader* GetComputeKernel(GPUVAddr code_addr);
 
 private:
     ProgramSharedPtr GeneratePrecompiledProgram(
@@ -132,10 +120,10 @@ private:
     ShaderDiskCacheOpenGL disk_cache;
     std::unordered_map<u64, PrecompiledShader> runtime_cache;
 
-    Shader null_shader{};
-    Shader null_kernel{};
+    std::unique_ptr<Shader> null_shader;
+    std::unique_ptr<Shader> null_kernel;
 
-    std::array<Shader, Maxwell::MaxShaderProgram> last_shaders;
+    std::array<Shader*, Maxwell::MaxShaderProgram> last_shaders{};
 };
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 9cb115959..2c49aeaac 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -37,6 +37,7 @@ using Tegra::Shader::IpaMode;
 using Tegra::Shader::IpaSampleMode;
 using Tegra::Shader::PixelImap;
 using Tegra::Shader::Register;
+using Tegra::Shader::TextureType;
 using VideoCommon::Shader::BuildTransformFeedback;
 using VideoCommon::Shader::Registry;
 
@@ -61,8 +62,8 @@ struct TextureDerivates {};
 using TextureArgument = std::pair<Type, Node>;
 using TextureIR = std::variant<TextureOffset, TextureDerivates, TextureArgument>;
 
-constexpr u32 MAX_CONSTBUFFER_ELEMENTS =
-    static_cast<u32>(Maxwell::MaxConstBufferSize) / (4 * sizeof(float));
+constexpr u32 MAX_CONSTBUFFER_SCALARS = static_cast<u32>(Maxwell::MaxConstBufferSize) / sizeof(u32);
+constexpr u32 MAX_CONSTBUFFER_ELEMENTS = MAX_CONSTBUFFER_SCALARS / 4; // 4 scalars per uvec4
 
 constexpr std::string_view CommonDeclarations = R"(#define ftoi floatBitsToInt
 #define ftou floatBitsToUint
@@ -402,6 +403,13 @@ std::string FlowStackTopName(MetaStackClass stack) {
     return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack));
 }
 
+bool UseUnifiedUniforms(const Device& device, const ShaderIR& ir, ShaderType stage) {
+    const u32 num_ubos = static_cast<u32>(ir.GetConstantBuffers().size());
+    // We waste one UBO for emulation
+    const u32 num_available_ubos = device.GetMaxUniformBuffers(stage) - 1;
+    return num_ubos > num_available_ubos;
+}
+
 struct GenericVaryingDescription {
     std::string name;
     u8 first_element = 0;
@@ -412,8 +420,9 @@ class GLSLDecompiler final {
 public:
     explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
                             ShaderType stage, std::string_view identifier, std::string_view suffix)
-        : device{device}, ir{ir}, registry{registry}, stage{stage},
-          identifier{identifier}, suffix{suffix}, header{ir.GetHeader()} {
+        : device{device}, ir{ir}, registry{registry}, stage{stage}, identifier{identifier},
+          suffix{suffix}, header{ir.GetHeader()}, use_unified_uniforms{
+                                                      UseUnifiedUniforms(device, ir, stage)} {
         if (stage != ShaderType::Compute) {
             transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo());
         }
@@ -518,6 +527,9 @@ private:
         if (device.HasImageLoadFormatted()) {
             code.AddLine("#extension GL_EXT_shader_image_load_formatted : require");
         }
+        if (device.HasTextureShadowLod()) {
+            code.AddLine("#extension GL_EXT_texture_shadow_lod : require");
+        }
         if (device.HasWarpIntrinsics()) {
             code.AddLine("#extension GL_NV_gpu_shader5 : require");
             code.AddLine("#extension GL_NV_shader_thread_group : require");
@@ -618,7 +630,9 @@ private:
                 break;
             }
         }
-        if (stage != ShaderType::Vertex || device.HasVertexViewportLayer()) {
+
+        if (stage != ShaderType::Geometry &&
+            (stage != ShaderType::Vertex || device.HasVertexViewportLayer())) {
             if (ir.UsesLayer()) {
                 code.AddLine("int gl_Layer;");
             }
@@ -647,6 +661,16 @@ private:
         --code.scope;
         code.AddLine("}};");
         code.AddNewLine();
+
+        if (stage == ShaderType::Geometry) {
+            if (ir.UsesLayer()) {
+                code.AddLine("out int gl_Layer;");
+            }
+            if (ir.UsesViewportIndex()) {
+                code.AddLine("out int gl_ViewportIndex;");
+            }
+        }
+        code.AddNewLine();
     }
 
     void DeclareRegisters() {
@@ -834,12 +858,24 @@ private:
     }
 
     void DeclareConstantBuffers() {
+        if (use_unified_uniforms) {
+            const u32 binding = device.GetBaseBindings(stage).shader_storage_buffer +
+                                static_cast<u32>(ir.GetGlobalMemory().size());
+            code.AddLine("layout (std430, binding = {}) readonly buffer UnifiedUniforms {{",
+                         binding);
+            code.AddLine("    uint cbufs[];");
+            code.AddLine("}};");
+            code.AddNewLine();
+            return;
+        }
+
         u32 binding = device.GetBaseBindings(stage).uniform_buffer;
-        for (const auto& buffers : ir.GetConstantBuffers()) {
-            const auto index = buffers.first;
+        for (const auto& [index, info] : ir.GetConstantBuffers()) {
+            const u32 num_elements = Common::AlignUp(info.GetSize(), 4) / 4;
+            const u32 size = info.IsIndirect() ? MAX_CONSTBUFFER_ELEMENTS : num_elements;
             code.AddLine("layout (std140, binding = {}) uniform {} {{", binding++,
                          GetConstBufferBlock(index));
-            code.AddLine("    uvec4 {}[{}];", GetConstBuffer(index), MAX_CONSTBUFFER_ELEMENTS);
+            code.AddLine("    uvec4 {}[{}];", GetConstBuffer(index), size);
             code.AddLine("}};");
             code.AddNewLine();
         }
@@ -877,13 +913,13 @@ private:
                     return "samplerBuffer";
                 }
                 switch (sampler.type) {
-                case Tegra::Shader::TextureType::Texture1D:
+                case TextureType::Texture1D:
                     return "sampler1D";
-                case Tegra::Shader::TextureType::Texture2D:
+                case TextureType::Texture2D:
                     return "sampler2D";
-                case Tegra::Shader::TextureType::Texture3D:
+                case TextureType::Texture3D:
                     return "sampler3D";
-                case Tegra::Shader::TextureType::TextureCube:
+                case TextureType::TextureCube:
                     return "samplerCube";
                 default:
                     UNREACHABLE();
@@ -1038,42 +1074,51 @@ private:
 
         if (const auto cbuf = std::get_if<CbufNode>(&*node)) {
             const Node offset = cbuf->GetOffset();
+            const u32 base_unified_offset = cbuf->GetIndex() * MAX_CONSTBUFFER_SCALARS;
+
             if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) {
                 // Direct access
                 const u32 offset_imm = immediate->GetValue();
                 ASSERT_MSG(offset_imm % 4 == 0, "Unaligned cbuf direct access");
-                return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()),
-                                    offset_imm / (4 * 4), (offset_imm / 4) % 4),
-                        Type::Uint};
+                if (use_unified_uniforms) {
+                    return {fmt::format("cbufs[{}]", base_unified_offset + offset_imm / 4),
+                            Type::Uint};
+                } else {
+                    return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()),
+                                        offset_imm / (4 * 4), (offset_imm / 4) % 4),
+                            Type::Uint};
+                }
             }
 
-            if (std::holds_alternative<OperationNode>(*offset)) {
-                // Indirect access
-                const std::string final_offset = code.GenerateTemporary();
-                code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint());
+            // Indirect access
+            if (use_unified_uniforms) {
+                return {fmt::format("cbufs[{} + ({} >> 2)]", base_unified_offset,
+                                    Visit(offset).AsUint()),
+                        Type::Uint};
+            }
 
-                if (!device.HasComponentIndexingBug()) {
-                    return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()),
-                                        final_offset, final_offset),
-                            Type::Uint};
-                }
+            const std::string final_offset = code.GenerateTemporary();
+            code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint());
 
-                // AMD's proprietary GLSL compiler emits ill code for variable component access.
-                // To bypass this driver bug generate 4 ifs, one per each component.
-                const std::string pack = code.GenerateTemporary();
-                code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()),
-                             final_offset);
-
-                const std::string result = code.GenerateTemporary();
-                code.AddLine("uint {};", result);
-                for (u32 swizzle = 0; swizzle < 4; ++swizzle) {
-                    code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result,
-                                 pack, GetSwizzle(swizzle));
-                }
-                return {result, Type::Uint};
+            if (!device.HasComponentIndexingBug()) {
+                return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()),
+                                    final_offset, final_offset),
+                        Type::Uint};
             }
 
-            UNREACHABLE_MSG("Unmanaged offset node type");
+            // AMD's proprietary GLSL compiler emits ill code for variable component access.
+            // To bypass this driver bug generate 4 ifs, one per each component.
+            const std::string pack = code.GenerateTemporary();
+            code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()),
+                         final_offset);
+
+            const std::string result = code.GenerateTemporary();
+            code.AddLine("uint {};", result);
+            for (u32 swizzle = 0; swizzle < 4; ++swizzle) {
+                code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result, pack,
+                             GetSwizzle(swizzle));
+            }
+            return {result, Type::Uint};
         }
 
         if (const auto gmem = std::get_if<GmemNode>(&*node)) {
@@ -1339,8 +1384,19 @@ private:
         const std::size_t count = operation.GetOperandsCount();
         const bool has_array = meta->sampler.is_array;
         const bool has_shadow = meta->sampler.is_shadow;
+        const bool workaround_lod_array_shadow_as_grad =
+            !device.HasTextureShadowLod() && function_suffix == "Lod" && meta->sampler.is_shadow &&
+            ((meta->sampler.type == TextureType::Texture2D && meta->sampler.is_array) ||
+             meta->sampler.type == TextureType::TextureCube);
+
+        std::string expr = "texture";
+
+        if (workaround_lod_array_shadow_as_grad) {
+            expr += "Grad";
+        } else {
+            expr += function_suffix;
+        }
 
-        std::string expr = "texture" + function_suffix;
         if (!meta->aoffi.empty()) {
             expr += "Offset";
         } else if (!meta->ptp.empty()) {
@@ -1374,6 +1430,16 @@ private:
             expr += ')';
         }
 
+        if (workaround_lod_array_shadow_as_grad) {
+            switch (meta->sampler.type) {
+            case TextureType::Texture2D:
+                return expr + ", vec2(0.0), vec2(0.0))";
+            case TextureType::TextureCube:
+                return expr + ", vec3(0.0), vec3(0.0))";
+            }
+            UNREACHABLE();
+        }
+
         for (const auto& variant : extras) {
             if (const auto argument = std::get_if<TextureArgument>(&variant)) {
                 expr += GenerateTextureArgument(*argument);
@@ -2000,8 +2066,19 @@ private:
         const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
         ASSERT(meta);
 
-        std::string expr = GenerateTexture(
-            operation, "Lod", {TextureArgument{Type::Float, meta->lod}, TextureOffset{}});
+        std::string expr{};
+
+        if (!device.HasTextureShadowLod() && meta->sampler.is_shadow &&
+            ((meta->sampler.type == TextureType::Texture2D && meta->sampler.is_array) ||
+             meta->sampler.type == TextureType::TextureCube)) {
+            LOG_ERROR(Render_OpenGL,
+                      "Device lacks GL_EXT_texture_shadow_lod, using textureGrad as a workaround");
+            expr = GenerateTexture(operation, "Lod", {});
+        } else {
+            expr = GenerateTexture(operation, "Lod",
+                                   {TextureArgument{Type::Float, meta->lod}, TextureOffset{}});
+        }
+
         if (meta->sampler.is_shadow) {
             expr = "vec4(" + expr + ')';
         }
@@ -2710,6 +2787,7 @@ private:
     const std::string_view identifier;
     const std::string_view suffix;
     const Header header;
+    const bool use_unified_uniforms;
     std::unordered_map<u8, VaryingTFB> transform_feedback;
 
     ShaderWriter code;
@@ -2905,7 +2983,7 @@ void GLSLDecompiler::DecompileAST() {
 
 } // Anonymous namespace
 
-ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) {
+ShaderEntries MakeEntries(const Device& device, const ShaderIR& ir, ShaderType stage) {
     ShaderEntries entries;
     for (const auto& cbuf : ir.GetConstantBuffers()) {
         entries.const_buffers.emplace_back(cbuf.second.GetMaxOffset(), cbuf.second.IsIndirect(),
@@ -2926,6 +3004,8 @@ ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) {
-        entries.clip_distances = (clip_distances[i] ? 1U : 0U) << i;
+        // OR each bit into the mask; plain assignment would drop bits set for other values of i.
+        entries.clip_distances |= (clip_distances[i] ? 1U : 0U) << i;
     }
     entries.shader_length = ir.GetLength();
+    entries.use_unified_uniforms = UseUnifiedUniforms(device, ir, stage);
     return entries;
 }
 
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h
index e8a178764..451c9689a 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.h
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h
@@ -53,11 +53,13 @@ struct ShaderEntries {
     std::vector<GlobalMemoryEntry> global_memory_entries;
     std::vector<SamplerEntry> samplers;
     std::vector<ImageEntry> images;
-    u32 clip_distances{};
     std::size_t shader_length{};
+    u32 clip_distances{};
+    bool use_unified_uniforms{};
 };
 
-ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir);
+ShaderEntries MakeEntries(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
+                          Tegra::Engines::ShaderType stage);
 
 std::string DecompileShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
                             const VideoCommon::Shader::Registry& registry,
diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
index 9e95a122b..2dcc2b0eb 100644
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
@@ -29,6 +29,8 @@ using VideoCommon::Shader::KeyMap;
 
 namespace {
 
+using VideoCommon::Shader::SeparateSamplerKey;
+
 using ShaderCacheVersionHash = std::array<u8, 64>;
 
 struct ConstBufferKey {
@@ -37,18 +39,26 @@ struct ConstBufferKey {
     u32 value = 0;
 };
 
-struct BoundSamplerKey {
+struct BoundSamplerEntry {
     u32 offset = 0;
     Tegra::Engines::SamplerDescriptor sampler;
 };
 
-struct BindlessSamplerKey {
+struct SeparateSamplerEntry {
+    u32 cbuf1 = 0;
+    u32 cbuf2 = 0;
+    u32 offset1 = 0;
+    u32 offset2 = 0;
+    Tegra::Engines::SamplerDescriptor sampler;
+};
+
+struct BindlessSamplerEntry {
     u32 cbuf = 0;
     u32 offset = 0;
     Tegra::Engines::SamplerDescriptor sampler;
 };
 
-constexpr u32 NativeVersion = 20;
+constexpr u32 NativeVersion = 21;
 
 ShaderCacheVersionHash GetShaderCacheVersionHash() {
     ShaderCacheVersionHash hash{};
@@ -87,12 +97,14 @@ bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) {
     u32 texture_handler_size_value;
     u32 num_keys;
     u32 num_bound_samplers;
+    u32 num_separate_samplers;
     u32 num_bindless_samplers;
     if (file.ReadArray(&unique_identifier, 1) != 1 || file.ReadArray(&bound_buffer, 1) != 1 ||
         file.ReadArray(&is_texture_handler_size_known, 1) != 1 ||
         file.ReadArray(&texture_handler_size_value, 1) != 1 ||
         file.ReadArray(&graphics_info, 1) != 1 || file.ReadArray(&compute_info, 1) != 1 ||
         file.ReadArray(&num_keys, 1) != 1 || file.ReadArray(&num_bound_samplers, 1) != 1 ||
+        file.ReadArray(&num_separate_samplers, 1) != 1 ||
         file.ReadArray(&num_bindless_samplers, 1) != 1) {
         return false;
     }
@@ -101,23 +113,32 @@ bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) {
     }
 
     std::vector<ConstBufferKey> flat_keys(num_keys);
-    std::vector<BoundSamplerKey> flat_bound_samplers(num_bound_samplers);
-    std::vector<BindlessSamplerKey> flat_bindless_samplers(num_bindless_samplers);
+    std::vector<BoundSamplerEntry> flat_bound_samplers(num_bound_samplers);
+    std::vector<SeparateSamplerEntry> flat_separate_samplers(num_separate_samplers);
+    std::vector<BindlessSamplerEntry> flat_bindless_samplers(num_bindless_samplers);
     if (file.ReadArray(flat_keys.data(), flat_keys.size()) != flat_keys.size() ||
         file.ReadArray(flat_bound_samplers.data(), flat_bound_samplers.size()) !=
             flat_bound_samplers.size() ||
+        file.ReadArray(flat_separate_samplers.data(), flat_separate_samplers.size()) !=
+            flat_separate_samplers.size() ||
         file.ReadArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) !=
             flat_bindless_samplers.size()) {
         return false;
     }
-    for (const auto& key : flat_keys) {
-        keys.insert({{key.cbuf, key.offset}, key.value});
+    for (const auto& entry : flat_keys) {
+        keys.insert({{entry.cbuf, entry.offset}, entry.value});
     }
-    for (const auto& key : flat_bound_samplers) {
-        bound_samplers.emplace(key.offset, key.sampler);
+    for (const auto& entry : flat_bound_samplers) {
+        bound_samplers.emplace(entry.offset, entry.sampler);
     }
-    for (const auto& key : flat_bindless_samplers) {
-        bindless_samplers.insert({{key.cbuf, key.offset}, key.sampler});
+    for (const auto& entry : flat_separate_samplers) {
+        SeparateSamplerKey key;
+        key.buffers = {entry.cbuf1, entry.cbuf2};
+        key.offsets = {entry.offset1, entry.offset2};
+        separate_samplers.emplace(key, entry.sampler);
+    }
+    for (const auto& entry : flat_bindless_samplers) {
+        bindless_samplers.insert({{entry.cbuf, entry.offset}, entry.sampler});
     }
 
     return true;
@@ -142,6 +163,7 @@ bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const {
         file.WriteObject(graphics_info) != 1 || file.WriteObject(compute_info) != 1 ||
         file.WriteObject(static_cast<u32>(keys.size())) != 1 ||
         file.WriteObject(static_cast<u32>(bound_samplers.size())) != 1 ||
+        file.WriteObject(static_cast<u32>(separate_samplers.size())) != 1 ||
         file.WriteObject(static_cast<u32>(bindless_samplers.size())) != 1) {
         return false;
     }
@@ -152,22 +174,34 @@ bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const {
         flat_keys.push_back(ConstBufferKey{address.first, address.second, value});
     }
 
-    std::vector<BoundSamplerKey> flat_bound_samplers;
+    std::vector<BoundSamplerEntry> flat_bound_samplers;
     flat_bound_samplers.reserve(bound_samplers.size());
     for (const auto& [address, sampler] : bound_samplers) {
-        flat_bound_samplers.push_back(BoundSamplerKey{address, sampler});
+        flat_bound_samplers.push_back(BoundSamplerEntry{address, sampler});
+    }
+
+    std::vector<SeparateSamplerEntry> flat_separate_samplers;
+    flat_separate_samplers.reserve(separate_samplers.size());
+    for (const auto& [key, sampler] : separate_samplers) {
+        SeparateSamplerEntry entry;
+        std::tie(entry.cbuf1, entry.cbuf2) = key.buffers;
+        std::tie(entry.offset1, entry.offset2) = key.offsets;
+        entry.sampler = sampler;
+        flat_separate_samplers.push_back(entry);
     }
 
-    std::vector<BindlessSamplerKey> flat_bindless_samplers;
+    std::vector<BindlessSamplerEntry> flat_bindless_samplers;
     flat_bindless_samplers.reserve(bindless_samplers.size());
     for (const auto& [address, sampler] : bindless_samplers) {
         flat_bindless_samplers.push_back(
-            BindlessSamplerKey{address.first, address.second, sampler});
+            BindlessSamplerEntry{address.first, address.second, sampler});
     }
 
     return file.WriteArray(flat_keys.data(), flat_keys.size()) == flat_keys.size() &&
            file.WriteArray(flat_bound_samplers.data(), flat_bound_samplers.size()) ==
                flat_bound_samplers.size() &&
+           file.WriteArray(flat_separate_samplers.data(), flat_separate_samplers.size()) ==
+               flat_separate_samplers.size() &&
            file.WriteArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) ==
                flat_bindless_samplers.size();
 }
@@ -179,7 +213,7 @@ ShaderDiskCacheOpenGL::~ShaderDiskCacheOpenGL() = default;
 std::optional<std::vector<ShaderDiskCacheEntry>> ShaderDiskCacheOpenGL::LoadTransferable() {
     // Skip games without title id
     const bool has_title_id = system.CurrentProcess()->GetTitleID() != 0;
-    if (!Settings::values.use_disk_shader_cache || !has_title_id) {
+    if (!Settings::values.use_disk_shader_cache.GetValue() || !has_title_id) {
         return {};
     }
 
diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.h b/src/video_core/renderer_opengl/gl_shader_disk_cache.h
index d5be52e40..a79cef0e9 100644
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h
@@ -57,6 +57,7 @@ struct ShaderDiskCacheEntry {
     VideoCommon::Shader::ComputeInfo compute_info;
     VideoCommon::Shader::KeyMap keys;
     VideoCommon::Shader::BoundSamplerMap bound_samplers;
+    VideoCommon::Shader::SeparateSamplerMap separate_samplers;
     VideoCommon::Shader::BindlessSamplerMap bindless_samplers;
 };
 
diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
index 6ec328c53..3655ff629 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
@@ -2,11 +2,13 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
-#include <deque>
+#include <tuple>
 #include <vector>
+
 #include "common/alignment.h"
 #include "common/assert.h"
 #include "common/microprofile.h"
+#include "video_core/renderer_opengl/gl_device.h"
 #include "video_core/renderer_opengl/gl_stream_buffer.h"
 
 MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning",
@@ -14,8 +16,7 @@ MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning",
 
 namespace OpenGL {
 
-OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent,
-                                 bool use_persistent)
+OGLStreamBuffer::OGLStreamBuffer(const Device& device, GLsizeiptr size, bool vertex_data_usage)
     : buffer_size(size) {
     gl_buffer.Create();
 
@@ -29,34 +30,22 @@ OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool p
         allocate_size *= 2;
     }
 
-    if (use_persistent) {
-        persistent = true;
-        coherent = prefer_coherent;
-        const GLbitfield flags =
-            GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : 0);
-        glNamedBufferStorage(gl_buffer.handle, allocate_size, nullptr, flags);
-        mapped_ptr = static_cast<u8*>(glMapNamedBufferRange(
-            gl_buffer.handle, 0, buffer_size, flags | (coherent ? 0 : GL_MAP_FLUSH_EXPLICIT_BIT)));
-    } else {
-        glNamedBufferData(gl_buffer.handle, allocate_size, nullptr, GL_STREAM_DRAW);
+    static constexpr GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT;
+    glNamedBufferStorage(gl_buffer.handle, allocate_size, nullptr, flags);
+    mapped_ptr = static_cast<u8*>(
+        glMapNamedBufferRange(gl_buffer.handle, 0, buffer_size, flags | GL_MAP_FLUSH_EXPLICIT_BIT));
+
+    if (device.HasVertexBufferUnifiedMemory()) {
+        glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_ONLY);
+        glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address);
     }
 }
 
 OGLStreamBuffer::~OGLStreamBuffer() {
-    if (persistent) {
-        glUnmapNamedBuffer(gl_buffer.handle);
-    }
+    glUnmapNamedBuffer(gl_buffer.handle); // The constructor always maps the buffer now
     gl_buffer.Release();
 }
 
-GLuint OGLStreamBuffer::GetHandle() const {
-    return gl_buffer.handle;
-}
-
-GLsizeiptr OGLStreamBuffer::GetSize() const {
-    return buffer_size;
-}
-
 std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr alignment) {
     ASSERT(size <= buffer_size);
     ASSERT(alignment <= buffer_size);
@@ -68,36 +57,21 @@ std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr a
 
     bool invalidate = false;
     if (buffer_pos + size > buffer_size) {
+        MICROPROFILE_SCOPE(OpenGL_StreamBuffer);
+        glInvalidateBufferData(gl_buffer.handle);
+
         buffer_pos = 0;
         invalidate = true;
-
-        if (persistent) {
-            glUnmapNamedBuffer(gl_buffer.handle);
-        }
     }
 
-    if (invalidate || !persistent) {
-        MICROPROFILE_SCOPE(OpenGL_StreamBuffer);
-        GLbitfield flags = GL_MAP_WRITE_BIT | (persistent ? GL_MAP_PERSISTENT_BIT : 0) |
-                           (coherent ? GL_MAP_COHERENT_BIT : GL_MAP_FLUSH_EXPLICIT_BIT) |
-                           (invalidate ? GL_MAP_INVALIDATE_BUFFER_BIT : GL_MAP_UNSYNCHRONIZED_BIT);
-        mapped_ptr = static_cast<u8*>(
-            glMapNamedBufferRange(gl_buffer.handle, buffer_pos, buffer_size - buffer_pos, flags));
-        mapped_offset = buffer_pos;
-    }
-
-    return std::make_tuple(mapped_ptr + buffer_pos - mapped_offset, buffer_pos, invalidate);
+    return std::make_tuple(mapped_ptr + buffer_pos, buffer_pos, invalidate);
 }
 
 void OGLStreamBuffer::Unmap(GLsizeiptr size) {
     ASSERT(size <= mapped_size);
 
-    if (!coherent && size > 0) {
-        glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos - mapped_offset, size);
-    }
-
-    if (!persistent) {
-        glUnmapNamedBuffer(gl_buffer.handle);
+    if (size > 0) {
+        glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos, size);
     }
 
     buffer_pos += size;
diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h
index f8383cbd4..307a67113 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.h
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.h
@@ -11,15 +11,13 @@
 
 namespace OpenGL {
 
+class Device;
+
 class OGLStreamBuffer : private NonCopyable {
 public:
-    explicit OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent = false,
-                             bool use_persistent = true);
+    explicit OGLStreamBuffer(const Device& device, GLsizeiptr size, bool vertex_data_usage);
     ~OGLStreamBuffer();
 
-    GLuint GetHandle() const;
-    GLsizeiptr GetSize() const;
-
     /*
      * Allocates a linear chunk of memory in the GPU buffer with at least "size" bytes
      * and the optional alignment requirement.
@@ -32,15 +30,24 @@ public:
 
     void Unmap(GLsizeiptr size);
 
+    GLuint Handle() const noexcept {
+        return gl_buffer.handle; // OpenGL name of the backing buffer object
+    }
+
+    u64 Address() const noexcept {
+        return gpu_address; // Zero unless the constructor queried it from the driver
+    }
+
+    GLsizeiptr Size() const noexcept {
+        return buffer_size; // Size the buffer was constructed with
+    }
+
 private:
     OGLBuffer gl_buffer;
 
-    bool coherent = false;
-    bool persistent = false;
-
+    GLuint64EXT gpu_address = 0;
     GLintptr buffer_pos = 0;
     GLsizeiptr buffer_size = 0;
-    GLintptr mapped_offset = 0;
     GLsizeiptr mapped_size = 0;
     u8* mapped_ptr = nullptr;
 };
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index 4faa8b90c..61505879b 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -263,9 +263,14 @@ CachedSurface::CachedSurface(const GPUVAddr gpu_addr, const SurfaceParams& param
     target = GetTextureTarget(params.target);
     texture = CreateTexture(params, target, internal_format, texture_buffer);
     DecorateSurfaceName();
-    main_view = CreateViewInner(
-        ViewParams(params.target, 0, params.is_layered ? params.depth : 1, 0, params.num_levels),
-        true);
+
+    u32 num_layers = 1;
+    if (params.is_layered || params.target == SurfaceTarget::Texture3D) {
+        num_layers = params.depth;
+    }
+
+    main_view =
+        CreateViewInner(ViewParams(params.target, 0, num_layers, 0, params.num_levels), true);
 }
 
 CachedSurface::~CachedSurface() = default;
@@ -404,8 +409,7 @@ View CachedSurface::CreateViewInner(const ViewParams& view_key, const bool is_pr
 
 CachedSurfaceView::CachedSurfaceView(CachedSurface& surface, const ViewParams& params,
                                      bool is_proxy)
-    : VideoCommon::ViewBase(params), surface{surface},
-      format{GetFormatTuple(surface.GetSurfaceParams().pixel_format).internal_format},
+    : VideoCommon::ViewBase(params), surface{surface}, format{surface.internal_format},
       target{GetTextureTarget(params.target)}, is_proxy{is_proxy} {
     if (!is_proxy) {
         main_view = CreateTextureView();
@@ -414,20 +418,23 @@ CachedSurfaceView::CachedSurfaceView(CachedSurface& surface, const ViewParams& p
 
 CachedSurfaceView::~CachedSurfaceView() = default;
 
-void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const {
+void CachedSurfaceView::Attach(GLenum attachment, GLenum fb_target) const {
     ASSERT(params.num_levels == 1);
 
+    if (params.target == SurfaceTarget::Texture3D) {
+        if (params.num_layers > 1) {
+            ASSERT(params.base_layer == 0);
+            glFramebufferTexture(fb_target, attachment, surface.texture.handle, params.base_level);
+        } else {
+            glFramebufferTexture3D(fb_target, attachment, target, surface.texture.handle,
+                                   params.base_level, params.base_layer);
+        }
+        return;
+    }
+
     if (params.num_layers > 1) {
-        // Layered framebuffer attachments
         UNIMPLEMENTED_IF(params.base_layer != 0);
-
-        switch (params.target) {
-        case SurfaceTarget::Texture2DArray:
-            glFramebufferTexture(target, attachment, GetTexture(), 0);
-            break;
-        default:
-            UNIMPLEMENTED();
-        }
+        glFramebufferTexture(fb_target, attachment, GetTexture(), 0);
         return;
     }
 
@@ -435,16 +442,16 @@ void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const {
     const GLuint texture = surface.GetTexture();
     switch (surface.GetSurfaceParams().target) {
     case SurfaceTarget::Texture1D:
-        glFramebufferTexture1D(target, attachment, view_target, texture, params.base_level);
+        glFramebufferTexture1D(fb_target, attachment, view_target, texture, params.base_level);
         break;
     case SurfaceTarget::Texture2D:
-        glFramebufferTexture2D(target, attachment, view_target, texture, params.base_level);
+        glFramebufferTexture2D(fb_target, attachment, view_target, texture, params.base_level);
         break;
     case SurfaceTarget::Texture1DArray:
     case SurfaceTarget::Texture2DArray:
     case SurfaceTarget::TextureCubemap:
     case SurfaceTarget::TextureCubeArray:
-        glFramebufferTextureLayer(target, attachment, texture, params.base_level,
+        glFramebufferTextureLayer(fb_target, attachment, texture, params.base_level,
                                   params.base_layer);
         break;
     default:
@@ -501,8 +508,13 @@ OGLTextureView CachedSurfaceView::CreateTextureView() const {
     OGLTextureView texture_view;
     texture_view.Create();
 
-    glTextureView(texture_view.handle, target, surface.texture.handle, format, params.base_level,
-                  params.num_levels, params.base_layer, params.num_layers);
+    if (target == GL_TEXTURE_3D) {
+        glTextureView(texture_view.handle, target, surface.texture.handle, format,
+                      params.base_level, params.num_levels, 0, 1);
+    } else {
+        glTextureView(texture_view.handle, target, surface.texture.handle, format,
+                      params.base_level, params.num_levels, params.base_layer, params.num_layers);
+    }
     ApplyTextureDefaults(surface.GetSurfaceParams(), texture_view.handle);
 
     return texture_view;
@@ -545,8 +557,8 @@ void TextureCacheOpenGL::ImageBlit(View& src_view, View& dst_view,
                                    const Tegra::Engines::Fermi2D::Config& copy_config) {
     const auto& src_params{src_view->GetSurfaceParams()};
     const auto& dst_params{dst_view->GetSurfaceParams()};
-    UNIMPLEMENTED_IF(src_params.target == SurfaceTarget::Texture3D);
-    UNIMPLEMENTED_IF(dst_params.target == SurfaceTarget::Texture3D);
+    UNIMPLEMENTED_IF(src_params.depth != 1);
+    UNIMPLEMENTED_IF(dst_params.depth != 1);
 
     state_tracker.NotifyScissor0();
     state_tracker.NotifyFramebuffer();
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h
index 8a2ac8603..bfc4ddf5d 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.h
+++ b/src/video_core/renderer_opengl/gl_texture_cache.h
@@ -80,8 +80,10 @@ public:
     explicit CachedSurfaceView(CachedSurface& surface, const ViewParams& params, bool is_proxy);
     ~CachedSurfaceView();
 
-    /// Attaches this texture view to the current bound GL_DRAW_FRAMEBUFFER
-    void Attach(GLenum attachment, GLenum target) const;
+    /// @brief Attaches this texture view to the currently bound fb_target framebuffer
+    /// @param attachment   Attachment to bind textures to
+    /// @param fb_target    Framebuffer target to attach to (e.g. GL_DRAW_FRAMEBUFFER)
+    void Attach(GLenum attachment, GLenum fb_target) const;
 
     GLuint GetTexture(Tegra::Texture::SwizzleSource x_source,
                       Tegra::Texture::SwizzleSource y_source,
diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index 994ae98eb..fe9bd4b5a 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -24,10 +24,11 @@ namespace MaxwellToGL {
 
 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 
-inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
+inline GLenum VertexFormat(Maxwell::VertexAttribute attrib) {
     switch (attrib.type) {
-    case Maxwell::VertexAttribute::Type::UnsignedInt:
     case Maxwell::VertexAttribute::Type::UnsignedNorm:
+    case Maxwell::VertexAttribute::Type::UnsignedScaled:
+    case Maxwell::VertexAttribute::Type::UnsignedInt:
         switch (attrib.size) {
         case Maxwell::VertexAttribute::Size::Size_8:
         case Maxwell::VertexAttribute::Size::Size_8_8:
@@ -46,12 +47,11 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
             return GL_UNSIGNED_INT;
         case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
             return GL_UNSIGNED_INT_2_10_10_10_REV;
-        default:
-            LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
-            return {};
         }
-    case Maxwell::VertexAttribute::Type::SignedInt:
+        break;
     case Maxwell::VertexAttribute::Type::SignedNorm:
+    case Maxwell::VertexAttribute::Type::SignedScaled:
+    case Maxwell::VertexAttribute::Type::SignedInt:
         switch (attrib.size) {
         case Maxwell::VertexAttribute::Size::Size_8:
         case Maxwell::VertexAttribute::Size::Size_8_8:
@@ -70,10 +70,8 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
             return GL_INT;
         case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
             return GL_INT_2_10_10_10_REV;
-        default:
-            LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
-            return {};
         }
+        break;
     case Maxwell::VertexAttribute::Type::Float:
         switch (attrib.size) {
         case Maxwell::VertexAttribute::Size::Size_16:
@@ -86,46 +84,12 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
         case Maxwell::VertexAttribute::Size::Size_32_32_32:
         case Maxwell::VertexAttribute::Size::Size_32_32_32_32:
             return GL_FLOAT;
-        default:
-            LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
-            return {};
-        }
-    case Maxwell::VertexAttribute::Type::UnsignedScaled:
-        switch (attrib.size) {
-        case Maxwell::VertexAttribute::Size::Size_8:
-        case Maxwell::VertexAttribute::Size::Size_8_8:
-        case Maxwell::VertexAttribute::Size::Size_8_8_8:
-        case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
-            return GL_UNSIGNED_BYTE;
-        case Maxwell::VertexAttribute::Size::Size_16:
-        case Maxwell::VertexAttribute::Size::Size_16_16:
-        case Maxwell::VertexAttribute::Size::Size_16_16_16:
-        case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
-            return GL_UNSIGNED_SHORT;
-        default:
-            LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
-            return {};
         }
-    case Maxwell::VertexAttribute::Type::SignedScaled:
-        switch (attrib.size) {
-        case Maxwell::VertexAttribute::Size::Size_8:
-        case Maxwell::VertexAttribute::Size::Size_8_8:
-        case Maxwell::VertexAttribute::Size::Size_8_8_8:
-        case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
-            return GL_BYTE;
-        case Maxwell::VertexAttribute::Size::Size_16:
-        case Maxwell::VertexAttribute::Size::Size_16_16:
-        case Maxwell::VertexAttribute::Size::Size_16_16_16:
-        case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
-            return GL_SHORT;
-        default:
-            LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
-            return {};
-        }
-    default:
-        LOG_ERROR(Render_OpenGL, "Unimplemented vertex type={}", attrib.TypeString());
-        return {};
+        break;
     }
+    UNIMPLEMENTED_MSG("Unimplemented vertex format of type={} and size={}", attrib.TypeString(),
+                      attrib.SizeString());
+    return {};
 }
 
 inline GLenum IndexFormat(Maxwell::IndexFormat index_format) {
@@ -137,8 +101,7 @@ inline GLenum IndexFormat(Maxwell::IndexFormat index_format) {
     case Maxwell::IndexFormat::UnsignedInt:
         return GL_UNSIGNED_INT;
     }
-    LOG_CRITICAL(Render_OpenGL, "Unimplemented index_format={}", static_cast<u32>(index_format));
-    UNREACHABLE();
+    UNREACHABLE_MSG("Invalid index_format={}", static_cast<u32>(index_format));
     return {};
 }
 
@@ -180,33 +143,32 @@ inline GLenum PrimitiveTopology(Maxwell::PrimitiveTopology topology) {
 }
 
 inline GLenum TextureFilterMode(Tegra::Texture::TextureFilter filter_mode,
-                                Tegra::Texture::TextureMipmapFilter mip_filter_mode) {
+                                Tegra::Texture::TextureMipmapFilter mipmap_filter_mode) {
     switch (filter_mode) {
-    case Tegra::Texture::TextureFilter::Linear: {
-        switch (mip_filter_mode) {
+    case Tegra::Texture::TextureFilter::Nearest: // GL_NEAREST* variant, picked by mipmap filter
+        switch (mipmap_filter_mode) {
         case Tegra::Texture::TextureMipmapFilter::None:
-            return GL_LINEAR;
+            return GL_NEAREST;
         case Tegra::Texture::TextureMipmapFilter::Nearest:
-            return GL_LINEAR_MIPMAP_NEAREST;
+            return GL_NEAREST_MIPMAP_NEAREST;
         case Tegra::Texture::TextureMipmapFilter::Linear:
-            return GL_LINEAR_MIPMAP_LINEAR;
+            return GL_NEAREST_MIPMAP_LINEAR;
         }
         break;
-    }
-    case Tegra::Texture::TextureFilter::Nearest: {
-        switch (mip_filter_mode) {
+    case Tegra::Texture::TextureFilter::Linear: // GL_LINEAR* variant, picked by mipmap filter
+        switch (mipmap_filter_mode) {
         case Tegra::Texture::TextureMipmapFilter::None:
-            return GL_NEAREST;
+            return GL_LINEAR;
         case Tegra::Texture::TextureMipmapFilter::Nearest:
-            return GL_NEAREST_MIPMAP_NEAREST;
+            return GL_LINEAR_MIPMAP_NEAREST;
         case Tegra::Texture::TextureMipmapFilter::Linear:
-            return GL_NEAREST_MIPMAP_LINEAR;
+            return GL_LINEAR_MIPMAP_LINEAR;
         }
         break;
     }
-    }
-    LOG_ERROR(Render_OpenGL, "Unimplemented texture filter mode={}", static_cast<u32>(filter_mode));
-    return GL_LINEAR;
+    UNREACHABLE_MSG("Invalid texture filter mode={} and mipmap filter mode={}",
+                    static_cast<u32>(filter_mode), static_cast<u32>(mipmap_filter_mode));
+    return GL_NEAREST; // Only reached when no case above returned a filter
 }
 
 inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) {
@@ -229,10 +191,15 @@ inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) {
         } else {
             return GL_MIRROR_CLAMP_TO_EDGE;
         }
-    default:
-        LOG_ERROR(Render_OpenGL, "Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode));
-        return GL_REPEAT;
+    case Tegra::Texture::WrapMode::MirrorOnceClampOGL:
+        // Test glad's runtime flag (GLAD_*), not the bare GL_EXT_texture_mirror_clamp macro:
+        // the macro is a compile-time constant from the GL header, so branching on it can
+        // never select the fallback on drivers that do not expose the extension.
+        return GLAD_GL_EXT_texture_mirror_clamp ? GL_MIRROR_CLAMP_EXT
+                                                : GL_MIRROR_CLAMP_TO_EDGE;
     }
+    UNIMPLEMENTED_MSG("Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode));
+    return GL_REPEAT;
 }
 
 inline GLenum DepthCompareFunc(Tegra::Texture::DepthCompareFunc func) {
@@ -254,8 +221,7 @@ inline GLenum DepthCompareFunc(Tegra::Texture::DepthCompareFunc func) {
     case Tegra::Texture::DepthCompareFunc::Always:
         return GL_ALWAYS;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented texture depth compare function ={}",
-              static_cast<u32>(func));
+    UNIMPLEMENTED_MSG("Unimplemented texture depth compare function={}", static_cast<u32>(func));
     return GL_GREATER;
 }
 
@@ -277,7 +243,7 @@ inline GLenum BlendEquation(Maxwell::Blend::Equation equation) {
     case Maxwell::Blend::Equation::MaxGL:
         return GL_MAX;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented blend equation={}", static_cast<u32>(equation));
+    UNIMPLEMENTED_MSG("Unimplemented blend equation={}", static_cast<u32>(equation));
     return GL_FUNC_ADD;
 }
 
@@ -341,7 +307,7 @@ inline GLenum BlendFunc(Maxwell::Blend::Factor factor) {
     case Maxwell::Blend::Factor::OneMinusConstantAlphaGL:
         return GL_ONE_MINUS_CONSTANT_ALPHA;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented blend factor={}", static_cast<u32>(factor));
+    UNIMPLEMENTED_MSG("Unimplemented blend factor={}", static_cast<u32>(factor));
     return GL_ZERO;
 }
 
@@ -361,7 +327,7 @@ inline GLenum SwizzleSource(Tegra::Texture::SwizzleSource source) {
     case Tegra::Texture::SwizzleSource::OneFloat:
         return GL_ONE;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented swizzle source={}", static_cast<u32>(source));
+    UNIMPLEMENTED_MSG("Unimplemented swizzle source={}", static_cast<u32>(source));
     return GL_ZERO;
 }
 
@@ -392,7 +358,7 @@ inline GLenum ComparisonOp(Maxwell::ComparisonOp comparison) {
     case Maxwell::ComparisonOp::AlwaysOld:
         return GL_ALWAYS;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented comparison op={}", static_cast<u32>(comparison));
+    UNIMPLEMENTED_MSG("Unimplemented comparison op={}", static_cast<u32>(comparison));
     return GL_ALWAYS;
 }
 
@@ -423,7 +389,7 @@ inline GLenum StencilOp(Maxwell::StencilOp stencil) {
     case Maxwell::StencilOp::DecrWrapOGL:
         return GL_DECR_WRAP;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented stencil op={}", static_cast<u32>(stencil));
+    UNIMPLEMENTED_MSG("Unimplemented stencil op={}", static_cast<u32>(stencil));
     return GL_KEEP;
 }
 
@@ -434,7 +400,7 @@ inline GLenum FrontFace(Maxwell::FrontFace front_face) {
     case Maxwell::FrontFace::CounterClockWise:
         return GL_CCW;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented front face cull={}", static_cast<u32>(front_face));
+    UNIMPLEMENTED_MSG("Unimplemented front face cull={}", static_cast<u32>(front_face));
     return GL_CCW;
 }
 
@@ -447,7 +413,7 @@ inline GLenum CullFace(Maxwell::CullFace cull_face) {
     case Maxwell::CullFace::FrontAndBack:
         return GL_FRONT_AND_BACK;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented cull face={}", static_cast<u32>(cull_face));
+    UNIMPLEMENTED_MSG("Unimplemented cull face={}", static_cast<u32>(cull_face));
     return GL_BACK;
 }
 
@@ -486,7 +452,7 @@ inline GLenum LogicOp(Maxwell::LogicOperation operation) {
     case Maxwell::LogicOperation::Set:
         return GL_SET;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented logic operation={}", static_cast<u32>(operation));
+    UNIMPLEMENTED_MSG("Unimplemented logic operation={}", static_cast<u32>(operation));
     return GL_COPY;
 }
 
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index 6b489e6db..e66cdc083 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -455,8 +455,8 @@ void RendererOpenGL::LoadColorToActiveGLTexture(u8 color_r, u8 color_g, u8 color
 void RendererOpenGL::InitOpenGLObjects() {
     frame_mailbox = std::make_unique<FrameMailbox>();
 
-    glClearColor(Settings::values.bg_red, Settings::values.bg_green, Settings::values.bg_blue,
-                 0.0f);
+    glClearColor(Settings::values.bg_red.GetValue(), Settings::values.bg_green.GetValue(),
+                 Settings::values.bg_blue.GetValue(), 0.0f);
 
     // Create shader programs
     OGLShader vertex_shader;
@@ -488,6 +488,15 @@ void RendererOpenGL::InitOpenGLObjects() {
 
     // Clear screen to black
     LoadColorToActiveGLTexture(0, 0, 0, 0, screen_info.texture);
+
+    // Enable unified vertex attributes and query vertex buffer address when the driver supports it
+    if (device.HasVertexBufferUnifiedMemory()) {
+        glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV);
+
+        glMakeNamedBufferResidentNV(vertex_buffer.handle, GL_READ_ONLY);
+        glGetNamedBufferParameterui64vNV(vertex_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV,
+                                         &vertex_buffer_address);
+    }
 }
 
 void RendererOpenGL::AddTelemetryFields() {
@@ -552,8 +561,8 @@ void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture,
 void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
     if (renderer_settings.set_background_color) {
         // Update background color before drawing
-        glClearColor(Settings::values.bg_red, Settings::values.bg_green, Settings::values.bg_blue,
-                     0.0f);
+        glClearColor(Settings::values.bg_red.GetValue(), Settings::values.bg_green.GetValue(),
+                     Settings::values.bg_blue.GetValue(), 0.0f);
     }
 
     // Set projection matrix
@@ -656,7 +665,13 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
                          offsetof(ScreenRectVertex, tex_coord));
     glVertexAttribBinding(PositionLocation, 0);
     glVertexAttribBinding(TexCoordLocation, 0);
-    glBindVertexBuffer(0, vertex_buffer.handle, 0, sizeof(ScreenRectVertex));
+    if (device.HasVertexBufferUnifiedMemory()) {
+        glBindVertexBuffer(0, 0, 0, sizeof(ScreenRectVertex));
+        glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, 0, vertex_buffer_address,
+                               sizeof(vertices));
+    } else {
+        glBindVertexBuffer(0, vertex_buffer.handle, 0, sizeof(ScreenRectVertex));
+    }
 
     glBindTextureUnit(0, screen_info.display_texture);
     glBindSampler(0, 0);
@@ -751,8 +766,9 @@ void RendererOpenGL::RenderScreenshot() {
 }
 
 bool RendererOpenGL::Init() {
-    if (GLAD_GL_KHR_debug) {
+    if (Settings::values.renderer_debug && GLAD_GL_KHR_debug) {
         glEnable(GL_DEBUG_OUTPUT);
+        glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS);
         glDebugMessageCallback(DebugHandler, nullptr);
     }
 
diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h
index 61bf507f4..8b18d32e6 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.h
+++ b/src/video_core/renderer_opengl/renderer_opengl.h
@@ -107,6 +107,9 @@ private:
     OGLPipeline pipeline;
     OGLFramebuffer screenshot_framebuffer;
 
+    /// GPU address of vertex_buffer; only queried when unified vertex memory is supported
+    GLuint64EXT vertex_buffer_address = 0;
+
     /// Display information for Switch screen
     ScreenInfo screen_info;
 
diff --git a/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp b/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp
index 568744e3c..d1f0ea932 100644
--- a/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp
+++ b/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp
@@ -39,53 +39,18 @@ constexpr std::array POLYGON_OFFSET_ENABLE_LUT = {
 
 } // Anonymous namespace
 
-void FixedPipelineState::DepthStencil::Fill(const Maxwell& regs) noexcept {
-    raw = 0;
-    front.action_stencil_fail.Assign(PackStencilOp(regs.stencil_front_op_fail));
-    front.action_depth_fail.Assign(PackStencilOp(regs.stencil_front_op_zfail));
-    front.action_depth_pass.Assign(PackStencilOp(regs.stencil_front_op_zpass));
-    front.test_func.Assign(PackComparisonOp(regs.stencil_front_func_func));
-    if (regs.stencil_two_side_enable) {
-        back.action_stencil_fail.Assign(PackStencilOp(regs.stencil_back_op_fail));
-        back.action_depth_fail.Assign(PackStencilOp(regs.stencil_back_op_zfail));
-        back.action_depth_pass.Assign(PackStencilOp(regs.stencil_back_op_zpass));
-        back.test_func.Assign(PackComparisonOp(regs.stencil_back_func_func));
-    } else {
-        back.action_stencil_fail.Assign(front.action_stencil_fail);
-        back.action_depth_fail.Assign(front.action_depth_fail);
-        back.action_depth_pass.Assign(front.action_depth_pass);
-        back.test_func.Assign(front.test_func);
-    }
-    depth_test_enable.Assign(regs.depth_test_enable);
-    depth_write_enable.Assign(regs.depth_write_enabled);
-    depth_bounds_enable.Assign(regs.depth_bounds_enable);
-    stencil_enable.Assign(regs.stencil_enable);
-    depth_test_func.Assign(PackComparisonOp(regs.depth_test_func));
-}
-
-void FixedPipelineState::Rasterizer::Fill(const Maxwell& regs) noexcept {
+void FixedPipelineState::Fill(const Maxwell& regs, bool has_extended_dynamic_state) {
     const auto& clip = regs.view_volume_clip_control;
     const std::array enabled_lut = {regs.polygon_offset_point_enable,
                                     regs.polygon_offset_line_enable,
                                     regs.polygon_offset_fill_enable};
     const u32 topology_index = static_cast<u32>(regs.draw.topology.Value());
 
-    u32 packed_front_face = PackFrontFace(regs.front_face);
-    if (regs.screen_y_control.triangle_rast_flip != 0 &&
-        regs.viewport_transform[0].scale_y > 0.0f) {
-        // Flip front face
-        packed_front_face = 1 - packed_front_face;
-    }
-
     raw = 0;
-    topology.Assign(topology_index);
     primitive_restart_enable.Assign(regs.primitive_restart.enabled != 0 ? 1 : 0);
-    cull_enable.Assign(regs.cull_test_enabled != 0 ? 1 : 0);
     depth_bias_enable.Assign(enabled_lut[POLYGON_OFFSET_ENABLE_LUT[topology_index]] != 0 ? 1 : 0);
     depth_clamp_disabled.Assign(regs.view_volume_clip_control.depth_clamp_disabled.Value());
     ndc_minus_one_to_one.Assign(regs.depth_mode == Maxwell::DepthMode::MinusOneToOne ? 1 : 0);
-    cull_face.Assign(PackCullFace(regs.cull_face));
-    front_face.Assign(packed_front_face);
     polygon_mode.Assign(PackPolygonMode(regs.polygon_mode_front));
     patch_control_points_minus_one.Assign(regs.patch_vertices - 1);
     tessellation_primitive.Assign(static_cast<u32>(regs.tess_mode.prim.Value()));
@@ -94,19 +59,37 @@ void FixedPipelineState::Rasterizer::Fill(const Maxwell& regs) noexcept {
     logic_op_enable.Assign(regs.logic_op.enable != 0 ? 1 : 0);
     logic_op.Assign(PackLogicOp(regs.logic_op.operation));
     rasterize_enable.Assign(regs.rasterize_enable != 0 ? 1 : 0);
+
     std::memcpy(&point_size, &regs.point_size, sizeof(point_size)); // TODO: C++20 std::bit_cast
-}
 
-void FixedPipelineState::ColorBlending::Fill(const Maxwell& regs) noexcept {
+    for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) {
+        binding_divisors[index] =
+            regs.instanced_arrays.IsInstancingEnabled(index) ? regs.vertex_array[index].divisor : 0;
+    }
+
+    for (std::size_t index = 0; index < Maxwell::NumVertexAttributes; ++index) {
+        const auto& input = regs.vertex_attrib_format[index];
+        auto& attribute = attributes[index];
+        attribute.raw = 0;
+        attribute.enabled.Assign(input.IsConstant() ? 0 : 1);
+        attribute.buffer.Assign(input.buffer);
+        attribute.offset.Assign(input.offset);
+        attribute.type.Assign(static_cast<u32>(input.type.Value()));
+        attribute.size.Assign(static_cast<u32>(input.size.Value()));
+    }
+
     for (std::size_t index = 0; index < std::size(attachments); ++index) {
         attachments[index].Fill(regs, index);
     }
-}
 
-void FixedPipelineState::ViewportSwizzles::Fill(const Maxwell& regs) noexcept {
     const auto& transform = regs.viewport_transform;
-    std::transform(transform.begin(), transform.end(), swizzles.begin(),
+    std::transform(transform.begin(), transform.end(), viewport_swizzles.begin(),
                    [](const auto& viewport) { return static_cast<u16>(viewport.swizzle.raw); });
+
+    if (!has_extended_dynamic_state) {
+        no_extended_dynamic_state.Assign(1);
+        dynamic_state.Fill(regs);
+    }
 }
 
 void FixedPipelineState::BlendingAttachment::Fill(const Maxwell& regs, std::size_t index) {
@@ -148,20 +131,57 @@ void FixedPipelineState::BlendingAttachment::Fill(const Maxwell& regs, std::size
     enable.Assign(1);
 }
 
-void FixedPipelineState::Fill(const Maxwell& regs) {
-    rasterizer.Fill(regs);
-    depth_stencil.Fill(regs);
-    color_blending.Fill(regs);
-    viewport_swizzles.Fill(regs);
+void FixedPipelineState::DynamicState::Fill(const Maxwell& regs) { // Pack dynamic state from regs
+    const u32 topology_index = static_cast<u32>(regs.draw.topology.Value());
+    u32 packed_front_face = PackFrontFace(regs.front_face);
+    if (regs.screen_y_control.triangle_rast_flip != 0) {
+        // Flip front face (1 - x swaps 0 and 1; assumes PackFrontFace returns only 0 or 1)
+        packed_front_face = 1 - packed_front_face;
+    }
+
+    raw1 = 0; // Clear all bits first: Hash() and operator== read this object's raw bytes
+    raw2 = 0;
+    front.action_stencil_fail.Assign(PackStencilOp(regs.stencil_front_op_fail));
+    front.action_depth_fail.Assign(PackStencilOp(regs.stencil_front_op_zfail));
+    front.action_depth_pass.Assign(PackStencilOp(regs.stencil_front_op_zpass));
+    front.test_func.Assign(PackComparisonOp(regs.stencil_front_func_func));
+    if (regs.stencil_two_side_enable) {
+        back.action_stencil_fail.Assign(PackStencilOp(regs.stencil_back_op_fail));
+        back.action_depth_fail.Assign(PackStencilOp(regs.stencil_back_op_zfail));
+        back.action_depth_pass.Assign(PackStencilOp(regs.stencil_back_op_zpass));
+        back.test_func.Assign(PackComparisonOp(regs.stencil_back_func_func));
+    } else { // Single-sided stencil: the back face reuses the packed front face state
+        back.action_stencil_fail.Assign(front.action_stencil_fail);
+        back.action_depth_fail.Assign(front.action_depth_fail);
+        back.action_depth_pass.Assign(front.action_depth_pass);
+        back.test_func.Assign(front.test_func);
+    }
+    stencil_enable.Assign(regs.stencil_enable);
+    depth_write_enable.Assign(regs.depth_write_enabled);
+    depth_bounds_enable.Assign(regs.depth_bounds_enable);
+    depth_test_enable.Assign(regs.depth_test_enable);
+    front_face.Assign(packed_front_face);
+    depth_test_func.Assign(PackComparisonOp(regs.depth_test_func));
+    topology.Assign(topology_index);
+    cull_face.Assign(PackCullFace(regs.cull_face));
+    cull_enable.Assign(regs.cull_test_enabled != 0 ? 1 : 0);
+
+    for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) { // Per vertex array
+        const auto& input = regs.vertex_array[index];
+        VertexBinding& binding = vertex_bindings[index];
+        binding.raw = 0;
+        binding.enabled.Assign(input.IsEnabled() ? 1 : 0);
+        binding.stride.Assign(static_cast<u16>(input.stride.Value())); // Stored in a 12-bit field
+    }
 }
 
 std::size_t FixedPipelineState::Hash() const noexcept {
-    const u64 hash = Common::CityHash64(reinterpret_cast<const char*>(this), sizeof *this);
+    const u64 hash = Common::CityHash64(reinterpret_cast<const char*>(this), Size());
     return static_cast<std::size_t>(hash);
 }
 
 bool FixedPipelineState::operator==(const FixedPipelineState& rhs) const noexcept {
-    return std::memcmp(this, &rhs, sizeof *this) == 0;
+    return std::memcmp(this, &rhs, Size()) == 0;
 }
 
 u32 FixedPipelineState::PackComparisonOp(Maxwell::ComparisonOp op) noexcept {
diff --git a/src/video_core/renderer_vulkan/fixed_pipeline_state.h b/src/video_core/renderer_vulkan/fixed_pipeline_state.h
index 31a6398f2..cdcbb65f5 100644
--- a/src/video_core/renderer_vulkan/fixed_pipeline_state.h
+++ b/src/video_core/renderer_vulkan/fixed_pipeline_state.h
@@ -60,14 +60,6 @@ struct FixedPipelineState {
 
         void Fill(const Maxwell& regs, std::size_t index);
 
-        std::size_t Hash() const noexcept;
-
-        bool operator==(const BlendingAttachment& rhs) const noexcept;
-
-        bool operator!=(const BlendingAttachment& rhs) const noexcept {
-            return !operator==(rhs);
-        }
-
         constexpr std::array<bool, 4> Mask() const noexcept {
             return {mask_r != 0, mask_g != 0, mask_b != 0, mask_a != 0};
         }
@@ -97,156 +89,116 @@ struct FixedPipelineState {
         }
     };
 
-    struct VertexInput {
-        union Binding {
-            u16 raw;
-            BitField<0, 1, u16> enabled;
-            BitField<1, 12, u16> stride;
-        };
+    union VertexAttribute {
+        u32 raw;
+        BitField<0, 1, u32> enabled;
+        BitField<1, 5, u32> buffer;
+        BitField<6, 14, u32> offset;
+        BitField<20, 3, u32> type;
+        BitField<23, 6, u32> size;
 
-        union Attribute {
-            u32 raw;
-            BitField<0, 1, u32> enabled;
-            BitField<1, 5, u32> buffer;
-            BitField<6, 14, u32> offset;
-            BitField<20, 3, u32> type;
-            BitField<23, 6, u32> size;
-
-            constexpr Maxwell::VertexAttribute::Type Type() const noexcept {
-                return static_cast<Maxwell::VertexAttribute::Type>(type.Value());
-            }
-
-            constexpr Maxwell::VertexAttribute::Size Size() const noexcept {
-                return static_cast<Maxwell::VertexAttribute::Size>(size.Value());
-            }
-        };
-
-        std::array<Binding, Maxwell::NumVertexArrays> bindings;
-        std::array<u32, Maxwell::NumVertexArrays> binding_divisors;
-        std::array<Attribute, Maxwell::NumVertexAttributes> attributes;
-
-        void SetBinding(std::size_t index, bool enabled, u32 stride, u32 divisor) noexcept {
-            auto& binding = bindings[index];
-            binding.raw = 0;
-            binding.enabled.Assign(enabled ? 1 : 0);
-            binding.stride.Assign(static_cast<u16>(stride));
-            binding_divisors[index] = divisor;
+        constexpr Maxwell::VertexAttribute::Type Type() const noexcept {
+            return static_cast<Maxwell::VertexAttribute::Type>(type.Value());
         }
 
-        void SetAttribute(std::size_t index, bool enabled, u32 buffer, u32 offset,
-                          Maxwell::VertexAttribute::Type type,
-                          Maxwell::VertexAttribute::Size size) noexcept {
-            auto& attribute = attributes[index];
-            attribute.raw = 0;
-            attribute.enabled.Assign(enabled ? 1 : 0);
-            attribute.buffer.Assign(buffer);
-            attribute.offset.Assign(offset);
-            attribute.type.Assign(static_cast<u32>(type));
-            attribute.size.Assign(static_cast<u32>(size));
+        constexpr Maxwell::VertexAttribute::Size Size() const noexcept {
+            return static_cast<Maxwell::VertexAttribute::Size>(size.Value());
         }
     };
 
-    struct Rasterizer {
-        union {
-            u32 raw;
-            BitField<0, 4, u32> topology;
-            BitField<4, 1, u32> primitive_restart_enable;
-            BitField<5, 1, u32> cull_enable;
-            BitField<6, 1, u32> depth_bias_enable;
-            BitField<7, 1, u32> depth_clamp_disabled;
-            BitField<8, 1, u32> ndc_minus_one_to_one;
-            BitField<9, 2, u32> cull_face;
-            BitField<11, 1, u32> front_face;
-            BitField<12, 2, u32> polygon_mode;
-            BitField<14, 5, u32> patch_control_points_minus_one;
-            BitField<19, 2, u32> tessellation_primitive;
-            BitField<21, 2, u32> tessellation_spacing;
-            BitField<23, 1, u32> tessellation_clockwise;
-            BitField<24, 1, u32> logic_op_enable;
-            BitField<25, 4, u32> logic_op;
-            BitField<29, 1, u32> rasterize_enable;
-        };
-
-        // TODO(Rodrigo): Move this to push constants
-        u32 point_size;
+    template <std::size_t Position>
+    union StencilFace {
+        BitField<Position + 0, 3, u32> action_stencil_fail;
+        BitField<Position + 3, 3, u32> action_depth_fail;
+        BitField<Position + 6, 3, u32> action_depth_pass;
+        BitField<Position + 9, 3, u32> test_func;
 
-        void Fill(const Maxwell& regs) noexcept;
+        Maxwell::StencilOp ActionStencilFail() const noexcept {
+            return UnpackStencilOp(action_stencil_fail);
+        }
 
-        constexpr Maxwell::PrimitiveTopology Topology() const noexcept {
-            return static_cast<Maxwell::PrimitiveTopology>(topology.Value());
+        Maxwell::StencilOp ActionDepthFail() const noexcept {
+            return UnpackStencilOp(action_depth_fail);
         }
 
-        Maxwell::CullFace CullFace() const noexcept {
-            return UnpackCullFace(cull_face.Value());
+        Maxwell::StencilOp ActionDepthPass() const noexcept {
+            return UnpackStencilOp(action_depth_pass);
         }
 
-        Maxwell::FrontFace FrontFace() const noexcept {
-            return UnpackFrontFace(front_face.Value());
+        Maxwell::ComparisonOp TestFunc() const noexcept {
+            return UnpackComparisonOp(test_func);
         }
     };
 
-    struct DepthStencil {
-        template <std::size_t Position>
-        union StencilFace {
-            BitField<Position + 0, 3, u32> action_stencil_fail;
-            BitField<Position + 3, 3, u32> action_depth_fail;
-            BitField<Position + 6, 3, u32> action_depth_pass;
-            BitField<Position + 9, 3, u32> test_func;
-
-            Maxwell::StencilOp ActionStencilFail() const noexcept {
-                return UnpackStencilOp(action_stencil_fail);
-            }
-
-            Maxwell::StencilOp ActionDepthFail() const noexcept {
-                return UnpackStencilOp(action_depth_fail);
-            }
-
-            Maxwell::StencilOp ActionDepthPass() const noexcept {
-                return UnpackStencilOp(action_depth_pass);
-            }
-
-            Maxwell::ComparisonOp TestFunc() const noexcept {
-                return UnpackComparisonOp(test_func);
-            }
-        };
+    union VertexBinding { // Packed per-vertex-array binding state
+        u16 raw; // Whole packed value; zeroed before the fields below are assigned
+        BitField<0, 12, u16> stride; // Bits 0-11
+        BitField<12, 1, u16> enabled; // Bit 12
+    };
 
+    struct DynamicState {
         union {
-            u32 raw;
+            u32 raw1;
             StencilFace<0> front;
             StencilFace<12> back;
-            BitField<24, 1, u32> depth_test_enable;
+            BitField<24, 1, u32> stencil_enable;
             BitField<25, 1, u32> depth_write_enable;
             BitField<26, 1, u32> depth_bounds_enable;
-            BitField<27, 1, u32> stencil_enable;
-            BitField<28, 3, u32> depth_test_func;
+            BitField<27, 1, u32> depth_test_enable;
+            BitField<28, 1, u32> front_face;
+            BitField<29, 3, u32> depth_test_func;
+        };
+        union {
+            u32 raw2;
+            BitField<0, 4, u32> topology;
+            BitField<4, 2, u32> cull_face;
+            BitField<6, 1, u32> cull_enable;
         };
+        std::array<VertexBinding, Maxwell::NumVertexArrays> vertex_bindings;
 
-        void Fill(const Maxwell& regs) noexcept;
+        void Fill(const Maxwell& regs);
 
         Maxwell::ComparisonOp DepthTestFunc() const noexcept {
             return UnpackComparisonOp(depth_test_func);
         }
-    };
-
-    struct ColorBlending {
-        std::array<BlendingAttachment, Maxwell::NumRenderTargets> attachments;
 
-        void Fill(const Maxwell& regs) noexcept;
-    };
+        Maxwell::CullFace CullFace() const noexcept {
+            return UnpackCullFace(cull_face.Value());
+        }
 
-    struct ViewportSwizzles {
-        std::array<u16, Maxwell::NumViewports> swizzles;
+        Maxwell::FrontFace FrontFace() const noexcept {
+            return UnpackFrontFace(front_face.Value());
+        }
 
-        void Fill(const Maxwell& regs) noexcept;
+        constexpr Maxwell::PrimitiveTopology Topology() const noexcept {
+            return static_cast<Maxwell::PrimitiveTopology>(topology.Value());
+        }
     };
 
-    VertexInput vertex_input;
-    Rasterizer rasterizer;
-    DepthStencil depth_stencil;
-    ColorBlending color_blending;
-    ViewportSwizzles viewport_swizzles;
+    union {
+        u32 raw;
+        BitField<0, 1, u32> no_extended_dynamic_state;
+        BitField<2, 1, u32> primitive_restart_enable;
+        BitField<3, 1, u32> depth_bias_enable;
+        BitField<4, 1, u32> depth_clamp_disabled;
+        BitField<5, 1, u32> ndc_minus_one_to_one;
+        BitField<6, 2, u32> polygon_mode;
+        BitField<8, 5, u32> patch_control_points_minus_one;
+        BitField<13, 2, u32> tessellation_primitive;
+        BitField<15, 2, u32> tessellation_spacing;
+        BitField<17, 1, u32> tessellation_clockwise;
+        BitField<18, 1, u32> logic_op_enable;
+        BitField<19, 4, u32> logic_op;
+        BitField<23, 1, u32> rasterize_enable;
+    };
+    u32 point_size;
+    std::array<u32, Maxwell::NumVertexArrays> binding_divisors;
+    std::array<VertexAttribute, Maxwell::NumVertexAttributes> attributes;
+    std::array<BlendingAttachment, Maxwell::NumRenderTargets> attachments;
+    std::array<u16, Maxwell::NumViewports> viewport_swizzles;
+    DynamicState dynamic_state;
 
-    void Fill(const Maxwell& regs);
+    void Fill(const Maxwell& regs, bool has_extended_dynamic_state);
 
     std::size_t Hash() const noexcept;
 
@@ -255,6 +207,11 @@ struct FixedPipelineState {
     bool operator!=(const FixedPipelineState& rhs) const noexcept {
         return !operator==(rhs);
     }
+
+    std::size_t Size() const noexcept { // Bytes hashed/compared; omits unused dynamic_state
+        const std::size_t total_size = sizeof *this; // dynamic_state is the last data member
+        return total_size - (no_extended_dynamic_state != 0 ? 0 : sizeof(DynamicState));
+    }
 };
 static_assert(std::has_unique_object_representations_v<FixedPipelineState>);
 static_assert(std::is_trivially_copyable_v<FixedPipelineState>);
diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
index 2871035f5..d7f1ae89f 100644
--- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
+++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
@@ -21,29 +21,29 @@ namespace Sampler {
 
 VkFilter Filter(Tegra::Texture::TextureFilter filter) {
     switch (filter) {
-    case Tegra::Texture::TextureFilter::Linear:
-        return VK_FILTER_LINEAR;
     case Tegra::Texture::TextureFilter::Nearest:
         return VK_FILTER_NEAREST;
+    case Tegra::Texture::TextureFilter::Linear:
+        return VK_FILTER_LINEAR;
     }
-    UNIMPLEMENTED_MSG("Unimplemented sampler filter={}", static_cast<u32>(filter));
+    UNREACHABLE_MSG("Invalid sampler filter={}", static_cast<u32>(filter));
     return {};
 }
 
 VkSamplerMipmapMode MipmapMode(Tegra::Texture::TextureMipmapFilter mipmap_filter) {
     switch (mipmap_filter) {
     case Tegra::Texture::TextureMipmapFilter::None:
-        // TODO(Rodrigo): None seems to be mapped to OpenGL's mag and min filters without mipmapping
-        // (e.g. GL_NEAREST and GL_LINEAR). Vulkan doesn't have such a thing, find out if we have to
-        // use an image view with a single mipmap level to emulate this.
-        return VK_SAMPLER_MIPMAP_MODE_LINEAR;
-        ;
-    case Tegra::Texture::TextureMipmapFilter::Linear:
-        return VK_SAMPLER_MIPMAP_MODE_LINEAR;
+        // There are no Vulkan filter modes that directly correspond to OpenGL minification filters
+        // of GL_LINEAR or GL_NEAREST, but they can be emulated using
+        // VK_SAMPLER_MIPMAP_MODE_NEAREST, minLod = 0, and maxLod = 0.25, and using minFilter =
+        // VK_FILTER_LINEAR or minFilter = VK_FILTER_NEAREST, respectively.
+        return VK_SAMPLER_MIPMAP_MODE_NEAREST;
     case Tegra::Texture::TextureMipmapFilter::Nearest:
         return VK_SAMPLER_MIPMAP_MODE_NEAREST;
+    case Tegra::Texture::TextureMipmapFilter::Linear:
+        return VK_SAMPLER_MIPMAP_MODE_LINEAR;
     }
-    UNIMPLEMENTED_MSG("Unimplemented sampler mipmap mode={}", static_cast<u32>(mipmap_filter));
+    UNREACHABLE_MSG("Invalid sampler mipmap mode={}", static_cast<u32>(mipmap_filter));
     return {};
 }
 
@@ -78,10 +78,9 @@ VkSamplerAddressMode WrapMode(const VKDevice& device, Tegra::Texture::WrapMode w
     case Tegra::Texture::WrapMode::MirrorOnceBorder:
         UNIMPLEMENTED();
         return VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE;
-    default:
-        UNIMPLEMENTED_MSG("Unimplemented wrap mode={}", static_cast<u32>(wrap_mode));
-        return {};
     }
+    UNIMPLEMENTED_MSG("Unimplemented wrap mode={}", static_cast<u32>(wrap_mode));
+    return {};
 }
 
 VkCompareOp DepthCompareFunction(Tegra::Texture::DepthCompareFunc depth_compare_func) {
@@ -149,7 +148,7 @@ struct FormatTuple {
     {VK_FORMAT_R16_SFLOAT, Attachable | Storage},               // R16F
     {VK_FORMAT_R16_UNORM, Attachable | Storage},                // R16U
     {VK_FORMAT_UNDEFINED},                                      // R16S
-    {VK_FORMAT_UNDEFINED},                                      // R16UI
+    {VK_FORMAT_R16_UINT, Attachable | Storage},                 // R16UI
     {VK_FORMAT_UNDEFINED},                                      // R16I
     {VK_FORMAT_R16G16_UNORM, Attachable | Storage},             // RG16
     {VK_FORMAT_R16G16_SFLOAT, Attachable | Storage},            // RG16F
@@ -288,14 +287,35 @@ VkPrimitiveTopology PrimitiveTopology([[maybe_unused]] const VKDevice& device,
         return VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST;
     case Maxwell::PrimitiveTopology::Patches:
         return VK_PRIMITIVE_TOPOLOGY_PATCH_LIST;
-    default:
-        UNIMPLEMENTED_MSG("Unimplemented topology={}", static_cast<u32>(topology));
-        return {};
     }
+    UNIMPLEMENTED_MSG("Unimplemented topology={}", static_cast<u32>(topology));
+    return {};
 }
 
 VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttribute::Size size) {
     switch (type) {
+    case Maxwell::VertexAttribute::Type::UnsignedNorm:
+        switch (size) {
+        case Maxwell::VertexAttribute::Size::Size_8:
+            return VK_FORMAT_R8_UNORM;
+        case Maxwell::VertexAttribute::Size::Size_8_8:
+            return VK_FORMAT_R8G8_UNORM;
+        case Maxwell::VertexAttribute::Size::Size_8_8_8:
+            return VK_FORMAT_R8G8B8_UNORM;
+        case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
+            return VK_FORMAT_R8G8B8A8_UNORM;
+        case Maxwell::VertexAttribute::Size::Size_16:
+            return VK_FORMAT_R16_UNORM;
+        case Maxwell::VertexAttribute::Size::Size_16_16:
+            return VK_FORMAT_R16G16_UNORM;
+        case Maxwell::VertexAttribute::Size::Size_16_16_16:
+            return VK_FORMAT_R16G16B16_UNORM;
+        case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
+            return VK_FORMAT_R16G16B16A16_UNORM;
+        case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
+            return VK_FORMAT_A2B10G10R10_UNORM_PACK32;
+        }
+        break;
     case Maxwell::VertexAttribute::Type::SignedNorm:
         switch (size) {
         case Maxwell::VertexAttribute::Size::Size_8:
@@ -316,62 +336,50 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib
             return VK_FORMAT_R16G16B16A16_SNORM;
         case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
             return VK_FORMAT_A2B10G10R10_SNORM_PACK32;
-        default:
-            break;
         }
         break;
-    case Maxwell::VertexAttribute::Type::UnsignedNorm:
+    case Maxwell::VertexAttribute::Type::UnsignedScaled:
         switch (size) {
         case Maxwell::VertexAttribute::Size::Size_8:
-            return VK_FORMAT_R8_UNORM;
+            return VK_FORMAT_R8_USCALED;
         case Maxwell::VertexAttribute::Size::Size_8_8:
-            return VK_FORMAT_R8G8_UNORM;
+            return VK_FORMAT_R8G8_USCALED;
         case Maxwell::VertexAttribute::Size::Size_8_8_8:
-            return VK_FORMAT_R8G8B8_UNORM;
+            return VK_FORMAT_R8G8B8_USCALED;
         case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
-            return VK_FORMAT_R8G8B8A8_UNORM;
+            return VK_FORMAT_R8G8B8A8_USCALED;
         case Maxwell::VertexAttribute::Size::Size_16:
-            return VK_FORMAT_R16_UNORM;
+            return VK_FORMAT_R16_USCALED;
         case Maxwell::VertexAttribute::Size::Size_16_16:
-            return VK_FORMAT_R16G16_UNORM;
+            return VK_FORMAT_R16G16_USCALED;
         case Maxwell::VertexAttribute::Size::Size_16_16_16:
-            return VK_FORMAT_R16G16B16_UNORM;
+            return VK_FORMAT_R16G16B16_USCALED;
         case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
-            return VK_FORMAT_R16G16B16A16_UNORM;
+            return VK_FORMAT_R16G16B16A16_USCALED;
         case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
-            return VK_FORMAT_A2B10G10R10_UNORM_PACK32;
-        default:
-            break;
+            return VK_FORMAT_A2B10G10R10_USCALED_PACK32;
         }
         break;
-    case Maxwell::VertexAttribute::Type::SignedInt:
+    case Maxwell::VertexAttribute::Type::SignedScaled:
         switch (size) {
         case Maxwell::VertexAttribute::Size::Size_8:
-            return VK_FORMAT_R8_SINT;
+            return VK_FORMAT_R8_SSCALED;
         case Maxwell::VertexAttribute::Size::Size_8_8:
-            return VK_FORMAT_R8G8_SINT;
+            return VK_FORMAT_R8G8_SSCALED;
         case Maxwell::VertexAttribute::Size::Size_8_8_8:
-            return VK_FORMAT_R8G8B8_SINT;
+            return VK_FORMAT_R8G8B8_SSCALED;
         case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
-            return VK_FORMAT_R8G8B8A8_SINT;
+            return VK_FORMAT_R8G8B8A8_SSCALED;
         case Maxwell::VertexAttribute::Size::Size_16:
-            return VK_FORMAT_R16_SINT;
+            return VK_FORMAT_R16_SSCALED;
         case Maxwell::VertexAttribute::Size::Size_16_16:
-            return VK_FORMAT_R16G16_SINT;
+            return VK_FORMAT_R16G16_SSCALED;
         case Maxwell::VertexAttribute::Size::Size_16_16_16:
-            return VK_FORMAT_R16G16B16_SINT;
+            return VK_FORMAT_R16G16B16_SSCALED;
         case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
-            return VK_FORMAT_R16G16B16A16_SINT;
-        case Maxwell::VertexAttribute::Size::Size_32:
-            return VK_FORMAT_R32_SINT;
-        case Maxwell::VertexAttribute::Size::Size_32_32:
-            return VK_FORMAT_R32G32_SINT;
-        case Maxwell::VertexAttribute::Size::Size_32_32_32:
-            return VK_FORMAT_R32G32B32_SINT;
-        case Maxwell::VertexAttribute::Size::Size_32_32_32_32:
-            return VK_FORMAT_R32G32B32A32_SINT;
-        default:
-            break;
+            return VK_FORMAT_R16G16B16A16_SSCALED;
+        case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
+            return VK_FORMAT_A2B10G10R10_SSCALED_PACK32;
         }
         break;
     case Maxwell::VertexAttribute::Type::UnsignedInt:
@@ -400,56 +408,50 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib
             return VK_FORMAT_R32G32B32_UINT;
         case Maxwell::VertexAttribute::Size::Size_32_32_32_32:
             return VK_FORMAT_R32G32B32A32_UINT;
-        default:
-            break;
+        case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
+            return VK_FORMAT_A2B10G10R10_UINT_PACK32;
         }
         break;
-    case Maxwell::VertexAttribute::Type::UnsignedScaled:
+    case Maxwell::VertexAttribute::Type::SignedInt:
         switch (size) {
         case Maxwell::VertexAttribute::Size::Size_8:
-            return VK_FORMAT_R8_USCALED;
+            return VK_FORMAT_R8_SINT;
         case Maxwell::VertexAttribute::Size::Size_8_8:
-            return VK_FORMAT_R8G8_USCALED;
+            return VK_FORMAT_R8G8_SINT;
         case Maxwell::VertexAttribute::Size::Size_8_8_8:
-            return VK_FORMAT_R8G8B8_USCALED;
+            return VK_FORMAT_R8G8B8_SINT;
         case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
-            return VK_FORMAT_R8G8B8A8_USCALED;
+            return VK_FORMAT_R8G8B8A8_SINT;
         case Maxwell::VertexAttribute::Size::Size_16:
-            return VK_FORMAT_R16_USCALED;
+            return VK_FORMAT_R16_SINT;
         case Maxwell::VertexAttribute::Size::Size_16_16:
-            return VK_FORMAT_R16G16_USCALED;
+            return VK_FORMAT_R16G16_SINT;
         case Maxwell::VertexAttribute::Size::Size_16_16_16:
-            return VK_FORMAT_R16G16B16_USCALED;
+            return VK_FORMAT_R16G16B16_SINT;
         case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
-            return VK_FORMAT_R16G16B16A16_USCALED;
-        default:
-            break;
+            return VK_FORMAT_R16G16B16A16_SINT;
+        case Maxwell::VertexAttribute::Size::Size_32:
+            return VK_FORMAT_R32_SINT;
+        case Maxwell::VertexAttribute::Size::Size_32_32:
+            return VK_FORMAT_R32G32_SINT;
+        case Maxwell::VertexAttribute::Size::Size_32_32_32:
+            return VK_FORMAT_R32G32B32_SINT;
+        case Maxwell::VertexAttribute::Size::Size_32_32_32_32:
+            return VK_FORMAT_R32G32B32A32_SINT;
+        case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
+            return VK_FORMAT_A2B10G10R10_SINT_PACK32;
         }
         break;
-    case Maxwell::VertexAttribute::Type::SignedScaled:
+    case Maxwell::VertexAttribute::Type::Float:
         switch (size) {
-        case Maxwell::VertexAttribute::Size::Size_8:
-            return VK_FORMAT_R8_SSCALED;
-        case Maxwell::VertexAttribute::Size::Size_8_8:
-            return VK_FORMAT_R8G8_SSCALED;
-        case Maxwell::VertexAttribute::Size::Size_8_8_8:
-            return VK_FORMAT_R8G8B8_SSCALED;
-        case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
-            return VK_FORMAT_R8G8B8A8_SSCALED;
         case Maxwell::VertexAttribute::Size::Size_16:
-            return VK_FORMAT_R16_SSCALED;
+            return VK_FORMAT_R16_SFLOAT;
         case Maxwell::VertexAttribute::Size::Size_16_16:
-            return VK_FORMAT_R16G16_SSCALED;
+            return VK_FORMAT_R16G16_SFLOAT;
         case Maxwell::VertexAttribute::Size::Size_16_16_16:
-            return VK_FORMAT_R16G16B16_SSCALED;
+            return VK_FORMAT_R16G16B16_SFLOAT;
         case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
-            return VK_FORMAT_R16G16B16A16_SSCALED;
-        default:
-            break;
-        }
-        break;
-    case Maxwell::VertexAttribute::Type::Float:
-        switch (size) {
+            return VK_FORMAT_R16G16B16A16_SFLOAT;
         case Maxwell::VertexAttribute::Size::Size_32:
             return VK_FORMAT_R32_SFLOAT;
         case Maxwell::VertexAttribute::Size::Size_32_32:
@@ -458,16 +460,6 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib
             return VK_FORMAT_R32G32B32_SFLOAT;
         case Maxwell::VertexAttribute::Size::Size_32_32_32_32:
             return VK_FORMAT_R32G32B32A32_SFLOAT;
-        case Maxwell::VertexAttribute::Size::Size_16:
-            return VK_FORMAT_R16_SFLOAT;
-        case Maxwell::VertexAttribute::Size::Size_16_16:
-            return VK_FORMAT_R16G16_SFLOAT;
-        case Maxwell::VertexAttribute::Size::Size_16_16_16:
-            return VK_FORMAT_R16G16B16_SFLOAT;
-        case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
-            return VK_FORMAT_R16G16B16A16_SFLOAT;
-        default:
-            break;
         }
         break;
     }
diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp
index 59b441943..2258479f5 100644
--- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp
+++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp
@@ -13,6 +13,7 @@
 #include <fmt/format.h>
 
 #include "common/dynamic_library.h"
+#include "common/file_util.h"
 #include "common/logging/log.h"
 #include "common/telemetry.h"
 #include "core/core.h"
@@ -76,7 +77,8 @@ Common::DynamicLibrary OpenVulkanLibrary() {
     char* libvulkan_env = getenv("LIBVULKAN_PATH");
     if (!libvulkan_env || !library.Open(libvulkan_env)) {
         // Use the libvulkan.dylib from the application bundle.
-        std::string filename = File::GetBundleDirectory() + "/Contents/Frameworks/libvulkan.dylib";
+        const std::string filename =
+            FileUtil::GetBundleDirectory() + "/Contents/Frameworks/libvulkan.dylib";
         library.Open(filename.c_str());
     }
 #else
@@ -153,11 +155,31 @@ vk::Instance CreateInstance(Common::DynamicLibrary& library, vk::InstanceDispatc
         }
     }
 
-    static constexpr std::array layers_data{"VK_LAYER_LUNARG_standard_validation"};
-    vk::Span<const char*> layers = layers_data;
-    if (!enable_layers) {
-        layers = {};
+    std::vector<const char*> layers; // Layer names handed to vk::Instance::Create
+    layers.reserve(1);
+    if (enable_layers) {
+        layers.push_back("VK_LAYER_KHRONOS_validation");
+    }
+
+    const std::optional layer_properties = vk::EnumerateInstanceLayerProperties(dld);
+    if (!layer_properties) {
+        LOG_ERROR(Render_Vulkan, "Failed to query layer properties, disabling layers");
+        layers.clear(); // Empty list also keeps the loop below off the empty optional
+    }
+
+    for (auto layer_it = layers.begin(); layer_it != layers.end();) { // Filter unavailable layers
+        const char* const layer = *layer_it;
+        const auto it = std::find_if(
+            layer_properties->begin(), layer_properties->end(),
+            [layer](const VkLayerProperties& prop) { return !std::strcmp(layer, prop.layerName); });
+        if (it == layer_properties->end()) {
+            LOG_ERROR(Render_Vulkan, "Layer {} not available, removing it", layer);
+            layer_it = layers.erase(layer_it); // erase() yields the iterator to the next element
+        } else {
+            ++layer_it;
+        }
     }
+
     vk::Instance instance = vk::Instance::Create(layers, extensions, dld);
     if (!instance) {
         LOG_ERROR(Render_Vulkan, "Failed to create Vulkan instance");
@@ -387,7 +409,7 @@ bool RendererVulkan::PickDevices() {
         return false;
     }
 
-    const s32 device_index = Settings::values.vulkan_device;
+    const s32 device_index = Settings::values.vulkan_device.GetValue();
     if (device_index < 0 || device_index >= static_cast<s32>(devices->size())) {
         LOG_ERROR(Render_Vulkan, "Invalid device index {}!", device_index);
         return false;
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
index 5f33d9e40..2be38d419 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
@@ -37,9 +37,9 @@ std::unique_ptr<VKStreamBuffer> CreateStreamBuffer(const VKDevice& device, VKSch
 
 } // Anonymous namespace
 
-CachedBufferBlock::CachedBufferBlock(const VKDevice& device, VKMemoryManager& memory_manager,
-                                     VAddr cpu_addr, std::size_t size)
-    : VideoCommon::BufferBlock{cpu_addr, size} {
+Buffer::Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VKScheduler& scheduler_,
+               VKStagingBufferPool& staging_pool_, VAddr cpu_addr, std::size_t size)
+    : VideoCommon::BufferBlock{cpu_addr, size}, scheduler{scheduler_}, staging_pool{staging_pool_} {
     VkBufferCreateInfo ci;
     ci.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
     ci.pNext = nullptr;
@@ -54,46 +54,17 @@ CachedBufferBlock::CachedBufferBlock(const VKDevice& device, VKMemoryManager& me
     buffer.commit = memory_manager.Commit(buffer.handle, false);
 }
 
-CachedBufferBlock::~CachedBufferBlock() = default;
+Buffer::~Buffer() = default;
 
-VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
-                             const VKDevice& device, VKMemoryManager& memory_manager,
-                             VKScheduler& scheduler, VKStagingBufferPool& staging_pool)
-    : VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer>{rasterizer, system,
-                                                                 CreateStreamBuffer(device,
-                                                                                    scheduler)},
-      device{device}, memory_manager{memory_manager}, scheduler{scheduler}, staging_pool{
-                                                                                staging_pool} {}
-
-VKBufferCache::~VKBufferCache() = default;
-
-Buffer VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
-    return std::make_shared<CachedBufferBlock>(device, memory_manager, cpu_addr, size);
-}
-
-VkBuffer VKBufferCache::ToHandle(const Buffer& buffer) {
-    return buffer->GetHandle();
-}
-
-VkBuffer VKBufferCache::GetEmptyBuffer(std::size_t size) {
-    size = std::max(size, std::size_t(4));
-    const auto& empty = staging_pool.GetUnusedBuffer(size, false);
-    scheduler.RequestOutsideRenderPassOperationContext();
-    scheduler.Record([size, buffer = *empty.handle](vk::CommandBuffer cmdbuf) {
-        cmdbuf.FillBuffer(buffer, 0, size, 0);
-    });
-    return *empty.handle;
-}
-
-void VKBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                                    const u8* data) {
+void Buffer::Upload(std::size_t offset, std::size_t size, const u8* data) {
     const auto& staging = staging_pool.GetUnusedBuffer(size, true);
     std::memcpy(staging.commit->Map(size), data, size);
 
     scheduler.RequestOutsideRenderPassOperationContext();
-    scheduler.Record([staging = *staging.handle, buffer = buffer->GetHandle(), offset,
-                      size](vk::CommandBuffer cmdbuf) {
-        cmdbuf.CopyBuffer(staging, buffer, VkBufferCopy{0, offset, size});
+
+    const VkBuffer handle = Handle();
+    scheduler.Record([staging = *staging.handle, handle, offset, size](vk::CommandBuffer cmdbuf) {
+        cmdbuf.CopyBuffer(staging, handle, VkBufferCopy{0, offset, size});
 
         VkBufferMemoryBarrier barrier;
         barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
@@ -102,7 +73,7 @@ void VKBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, st
         barrier.dstAccessMask = UPLOAD_ACCESS_BARRIERS;
         barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
         barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-        barrier.buffer = buffer;
+        barrier.buffer = handle;
         barrier.offset = offset;
         barrier.size = size;
         cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, UPLOAD_PIPELINE_STAGE, 0, {},
@@ -110,12 +81,12 @@ void VKBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, st
     });
 }
 
-void VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                                      u8* data) {
+void Buffer::Download(std::size_t offset, std::size_t size, u8* data) {
     const auto& staging = staging_pool.GetUnusedBuffer(size, true);
     scheduler.RequestOutsideRenderPassOperationContext();
-    scheduler.Record([staging = *staging.handle, buffer = buffer->GetHandle(), offset,
-                      size](vk::CommandBuffer cmdbuf) {
+
+    const VkBuffer handle = Handle();
+    scheduler.Record([staging = *staging.handle, handle, offset, size](vk::CommandBuffer cmdbuf) {
         VkBufferMemoryBarrier barrier;
         barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
         barrier.pNext = nullptr;
@@ -123,7 +94,7 @@ void VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset,
         barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
         barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
         barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-        barrier.buffer = buffer;
+        barrier.buffer = handle;
         barrier.offset = offset;
         barrier.size = size;
 
@@ -131,18 +102,20 @@ void VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset,
                                    VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
                                    VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                                VK_PIPELINE_STAGE_TRANSFER_BIT, 0, {}, barrier, {});
-        cmdbuf.CopyBuffer(buffer, staging, VkBufferCopy{offset, 0, size});
+        cmdbuf.CopyBuffer(handle, staging, VkBufferCopy{offset, 0, size});
     });
     scheduler.Finish();
 
     std::memcpy(data, staging.commit->Map(size), size);
 }
 
-void VKBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
-                              std::size_t dst_offset, std::size_t size) {
+void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
+                      std::size_t size) {
     scheduler.RequestOutsideRenderPassOperationContext();
-    scheduler.Record([src_buffer = src->GetHandle(), dst_buffer = dst->GetHandle(), src_offset,
-                      dst_offset, size](vk::CommandBuffer cmdbuf) {
+
+    const VkBuffer dst_buffer = Handle();
+    scheduler.Record([src_buffer = src.Handle(), dst_buffer, src_offset, dst_offset,
+                      size](vk::CommandBuffer cmdbuf) {
         cmdbuf.CopyBuffer(src_buffer, dst_buffer, VkBufferCopy{src_offset, dst_offset, size});
 
         std::array<VkBufferMemoryBarrier, 2> barriers;
@@ -169,4 +142,30 @@ void VKBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t
     });
 }
 
+VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
+                             const VKDevice& device, VKMemoryManager& memory_manager,
+                             VKScheduler& scheduler, VKStagingBufferPool& staging_pool)
+    : VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer>{rasterizer, system,
+                                                                 CreateStreamBuffer(device,
+                                                                                    scheduler)},
+      device{device}, memory_manager{memory_manager}, scheduler{scheduler}, staging_pool{
+                                                                                staging_pool} {}
+
+VKBufferCache::~VKBufferCache() = default;
+
+std::shared_ptr<Buffer> VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
+    return std::make_shared<Buffer>(device, memory_manager, scheduler, staging_pool, cpu_addr,
+                                    size); // Buffer stores scheduler/staging_pool for transfers
+}
+
+VKBufferCache::BufferInfo VKBufferCache::GetEmptyBuffer(std::size_t size) {
+    size = (std::max(size, std::size_t(4)) + 3) & ~std::size_t(3); // FillBuffer: multiple of 4
+    const auto& empty = staging_pool.GetUnusedBuffer(size, false);
+    scheduler.RequestOutsideRenderPassOperationContext();
+    scheduler.Record([size, buffer = *empty.handle](vk::CommandBuffer cmdbuf) {
+        cmdbuf.FillBuffer(buffer, 0, size, 0); // size must be non-zero and a multiple of 4
+    });
+    return {*empty.handle, 0, 0};
+}
+
 } // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h
index a54583e7d..991ee451c 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.h
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h
@@ -8,7 +8,6 @@
 
 #include "common/common_types.h"
 #include "video_core/buffer_cache/buffer_cache.h"
-#include "video_core/rasterizer_cache.h"
 #include "video_core/renderer_vulkan/vk_memory_manager.h"
 #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
 #include "video_core/renderer_vulkan/vk_stream_buffer.h"
@@ -24,22 +23,34 @@ class VKDevice;
 class VKMemoryManager;
 class VKScheduler;
 
-class CachedBufferBlock final : public VideoCommon::BufferBlock {
+class Buffer final : public VideoCommon::BufferBlock {
 public:
-    explicit CachedBufferBlock(const VKDevice& device, VKMemoryManager& memory_manager,
-                               VAddr cpu_addr, std::size_t size);
-    ~CachedBufferBlock();
+    explicit Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VKScheduler& scheduler,
+                    VKStagingBufferPool& staging_pool, VAddr cpu_addr, std::size_t size);
+    ~Buffer();
 
-    VkBuffer GetHandle() const {
+    void Upload(std::size_t offset, std::size_t size, const u8* data); // staged host -> GPU copy
+
+    void Download(std::size_t offset, std::size_t size, u8* data); // staged GPU -> host copy
+
+    void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
+                  std::size_t size); // GPU-side copy from src into this buffer
+
+    VkBuffer Handle() const {
         return *buffer.handle;
     }
 
+    u64 Address() const {
+        return 0; // NOTE(review): constant; presumably unused by the Vulkan backend - confirm
+    }
+
 private:
+    VKScheduler& scheduler;            // Records the Upload/Download/CopyFrom commands
+    VKStagingBufferPool& staging_pool; // Supplies staging buffers for Upload/Download
+
     VKBuffer buffer;
 };
 
-using Buffer = std::shared_ptr<CachedBufferBlock>;
-
 class VKBufferCache final : public VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer> {
 public:
     explicit VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
@@ -47,21 +58,10 @@ public:
                            VKScheduler& scheduler, VKStagingBufferPool& staging_pool);
     ~VKBufferCache();
 
-    VkBuffer GetEmptyBuffer(std::size_t size) override;
+    BufferInfo GetEmptyBuffer(std::size_t size) override;
 
 protected:
-    VkBuffer ToHandle(const Buffer& buffer) override;
-
-    Buffer CreateBlock(VAddr cpu_addr, std::size_t size) override;
-
-    void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                         const u8* data) override;
-
-    void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                           u8* data) override;
-
-    void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
-                   std::size_t dst_offset, std::size_t size) override;
+    std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override;
 
 private:
     const VKDevice& device;
diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
index 8e1b46277..281bf9ac3 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
@@ -53,8 +53,9 @@ vk::DescriptorSetLayout VKComputePipeline::CreateDescriptorSetLayout() const {
     };
     add_bindings(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, entries.const_buffers.size());
     add_bindings(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, entries.global_buffers.size());
-    add_bindings(VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, entries.texel_buffers.size());
+    add_bindings(VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, entries.uniform_texels.size());
     add_bindings(VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, entries.samplers.size());
+    add_bindings(VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, entries.storage_texels.size());
     add_bindings(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, entries.images.size());
 
     VkDescriptorSetLayoutCreateInfo ci;
diff --git a/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp b/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp
index 890fd52cf..9259b618d 100644
--- a/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp
+++ b/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp
@@ -42,6 +42,7 @@ vk::DescriptorPool* VKDescriptorPool::AllocateNewPool() {
         {VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, num_sets * 60},
         {VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, num_sets * 64},
         {VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, num_sets * 64},
+        {VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, num_sets * 64},
         {VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, num_sets * 40}};
 
     VkDescriptorPoolCreateInfo ci;
diff --git a/src/video_core/renderer_vulkan/vk_device.cpp b/src/video_core/renderer_vulkan/vk_device.cpp
index 750e5a0ca..fdaea4210 100644
--- a/src/video_core/renderer_vulkan/vk_device.cpp
+++ b/src/video_core/renderer_vulkan/vk_device.cpp
@@ -73,76 +73,79 @@ VkFormatFeatureFlags GetFormatFeatures(VkFormatProperties properties, FormatType
 
 std::unordered_map<VkFormat, VkFormatProperties> GetFormatProperties(
     vk::PhysicalDevice physical, const vk::InstanceDispatch& dld) {
-    static constexpr std::array formats{VK_FORMAT_A8B8G8R8_UNORM_PACK32,
-                                        VK_FORMAT_A8B8G8R8_UINT_PACK32,
-                                        VK_FORMAT_A8B8G8R8_SNORM_PACK32,
-                                        VK_FORMAT_A8B8G8R8_SRGB_PACK32,
-                                        VK_FORMAT_B5G6R5_UNORM_PACK16,
-                                        VK_FORMAT_A2B10G10R10_UNORM_PACK32,
-                                        VK_FORMAT_A1R5G5B5_UNORM_PACK16,
-                                        VK_FORMAT_R32G32B32A32_SFLOAT,
-                                        VK_FORMAT_R32G32B32A32_UINT,
-                                        VK_FORMAT_R32G32_SFLOAT,
-                                        VK_FORMAT_R32G32_UINT,
-                                        VK_FORMAT_R16G16B16A16_UINT,
-                                        VK_FORMAT_R16G16B16A16_SNORM,
-                                        VK_FORMAT_R16G16B16A16_UNORM,
-                                        VK_FORMAT_R16G16_UNORM,
-                                        VK_FORMAT_R16G16_SNORM,
-                                        VK_FORMAT_R16G16_SFLOAT,
-                                        VK_FORMAT_R16_UNORM,
-                                        VK_FORMAT_R8G8B8A8_SRGB,
-                                        VK_FORMAT_R8G8_UNORM,
-                                        VK_FORMAT_R8G8_SNORM,
-                                        VK_FORMAT_R8G8_UINT,
-                                        VK_FORMAT_R8_UNORM,
-                                        VK_FORMAT_R8_UINT,
-                                        VK_FORMAT_B10G11R11_UFLOAT_PACK32,
-                                        VK_FORMAT_R32_SFLOAT,
-                                        VK_FORMAT_R32_UINT,
-                                        VK_FORMAT_R32_SINT,
-                                        VK_FORMAT_R16_SFLOAT,
-                                        VK_FORMAT_R16G16B16A16_SFLOAT,
-                                        VK_FORMAT_B8G8R8A8_UNORM,
-                                        VK_FORMAT_B8G8R8A8_SRGB,
-                                        VK_FORMAT_R4G4B4A4_UNORM_PACK16,
-                                        VK_FORMAT_D32_SFLOAT,
-                                        VK_FORMAT_D16_UNORM,
-                                        VK_FORMAT_D16_UNORM_S8_UINT,
-                                        VK_FORMAT_D24_UNORM_S8_UINT,
-                                        VK_FORMAT_D32_SFLOAT_S8_UINT,
-                                        VK_FORMAT_BC1_RGBA_UNORM_BLOCK,
-                                        VK_FORMAT_BC2_UNORM_BLOCK,
-                                        VK_FORMAT_BC3_UNORM_BLOCK,
-                                        VK_FORMAT_BC4_UNORM_BLOCK,
-                                        VK_FORMAT_BC5_UNORM_BLOCK,
-                                        VK_FORMAT_BC5_SNORM_BLOCK,
-                                        VK_FORMAT_BC7_UNORM_BLOCK,
-                                        VK_FORMAT_BC6H_UFLOAT_BLOCK,
-                                        VK_FORMAT_BC6H_SFLOAT_BLOCK,
-                                        VK_FORMAT_BC1_RGBA_SRGB_BLOCK,
-                                        VK_FORMAT_BC2_SRGB_BLOCK,
-                                        VK_FORMAT_BC3_SRGB_BLOCK,
-                                        VK_FORMAT_BC7_SRGB_BLOCK,
-                                        VK_FORMAT_ASTC_4x4_SRGB_BLOCK,
-                                        VK_FORMAT_ASTC_8x8_SRGB_BLOCK,
-                                        VK_FORMAT_ASTC_8x5_SRGB_BLOCK,
-                                        VK_FORMAT_ASTC_5x4_SRGB_BLOCK,
-                                        VK_FORMAT_ASTC_5x5_UNORM_BLOCK,
-                                        VK_FORMAT_ASTC_5x5_SRGB_BLOCK,
-                                        VK_FORMAT_ASTC_10x8_UNORM_BLOCK,
-                                        VK_FORMAT_ASTC_10x8_SRGB_BLOCK,
-                                        VK_FORMAT_ASTC_6x6_UNORM_BLOCK,
-                                        VK_FORMAT_ASTC_6x6_SRGB_BLOCK,
-                                        VK_FORMAT_ASTC_10x10_UNORM_BLOCK,
-                                        VK_FORMAT_ASTC_10x10_SRGB_BLOCK,
-                                        VK_FORMAT_ASTC_12x12_UNORM_BLOCK,
-                                        VK_FORMAT_ASTC_12x12_SRGB_BLOCK,
-                                        VK_FORMAT_ASTC_8x6_UNORM_BLOCK,
-                                        VK_FORMAT_ASTC_8x6_SRGB_BLOCK,
-                                        VK_FORMAT_ASTC_6x5_UNORM_BLOCK,
-                                        VK_FORMAT_ASTC_6x5_SRGB_BLOCK,
-                                        VK_FORMAT_E5B9G9R9_UFLOAT_PACK32};
+    static constexpr std::array formats{
+        VK_FORMAT_A8B8G8R8_UNORM_PACK32,
+        VK_FORMAT_A8B8G8R8_UINT_PACK32,
+        VK_FORMAT_A8B8G8R8_SNORM_PACK32,
+        VK_FORMAT_A8B8G8R8_SRGB_PACK32,
+        VK_FORMAT_B5G6R5_UNORM_PACK16,
+        VK_FORMAT_A2B10G10R10_UNORM_PACK32,
+        VK_FORMAT_A1R5G5B5_UNORM_PACK16,
+        VK_FORMAT_R32G32B32A32_SFLOAT,
+        VK_FORMAT_R32G32B32A32_UINT,
+        VK_FORMAT_R32G32_SFLOAT,
+        VK_FORMAT_R32G32_UINT,
+        VK_FORMAT_R16G16B16A16_UINT,
+        VK_FORMAT_R16G16B16A16_SNORM,
+        VK_FORMAT_R16G16B16A16_UNORM,
+        VK_FORMAT_R16G16_UNORM,
+        VK_FORMAT_R16G16_SNORM,
+        VK_FORMAT_R16G16_SFLOAT,
+        VK_FORMAT_R16_UNORM,
+        VK_FORMAT_R16_UINT,
+        VK_FORMAT_R8G8B8A8_SRGB,
+        VK_FORMAT_R8G8_UNORM,
+        VK_FORMAT_R8G8_SNORM,
+        VK_FORMAT_R8G8_UINT,
+        VK_FORMAT_R8_UNORM,
+        VK_FORMAT_R8_UINT,
+        VK_FORMAT_B10G11R11_UFLOAT_PACK32,
+        VK_FORMAT_R32_SFLOAT,
+        VK_FORMAT_R32_UINT,
+        VK_FORMAT_R32_SINT,
+        VK_FORMAT_R16_SFLOAT,
+        VK_FORMAT_R16G16B16A16_SFLOAT,
+        VK_FORMAT_B8G8R8A8_UNORM,
+        VK_FORMAT_B8G8R8A8_SRGB,
+        VK_FORMAT_R4G4B4A4_UNORM_PACK16,
+        VK_FORMAT_D32_SFLOAT,
+        VK_FORMAT_D16_UNORM,
+        VK_FORMAT_D16_UNORM_S8_UINT,
+        VK_FORMAT_D24_UNORM_S8_UINT,
+        VK_FORMAT_D32_SFLOAT_S8_UINT,
+        VK_FORMAT_BC1_RGBA_UNORM_BLOCK,
+        VK_FORMAT_BC2_UNORM_BLOCK,
+        VK_FORMAT_BC3_UNORM_BLOCK,
+        VK_FORMAT_BC4_UNORM_BLOCK,
+        VK_FORMAT_BC5_UNORM_BLOCK,
+        VK_FORMAT_BC5_SNORM_BLOCK,
+        VK_FORMAT_BC7_UNORM_BLOCK,
+        VK_FORMAT_BC6H_UFLOAT_BLOCK,
+        VK_FORMAT_BC6H_SFLOAT_BLOCK,
+        VK_FORMAT_BC1_RGBA_SRGB_BLOCK,
+        VK_FORMAT_BC2_SRGB_BLOCK,
+        VK_FORMAT_BC3_SRGB_BLOCK,
+        VK_FORMAT_BC7_SRGB_BLOCK,
+        VK_FORMAT_ASTC_4x4_SRGB_BLOCK,
+        VK_FORMAT_ASTC_8x8_SRGB_BLOCK,
+        VK_FORMAT_ASTC_8x5_SRGB_BLOCK,
+        VK_FORMAT_ASTC_5x4_SRGB_BLOCK,
+        VK_FORMAT_ASTC_5x5_UNORM_BLOCK,
+        VK_FORMAT_ASTC_5x5_SRGB_BLOCK,
+        VK_FORMAT_ASTC_10x8_UNORM_BLOCK,
+        VK_FORMAT_ASTC_10x8_SRGB_BLOCK,
+        VK_FORMAT_ASTC_6x6_UNORM_BLOCK,
+        VK_FORMAT_ASTC_6x6_SRGB_BLOCK,
+        VK_FORMAT_ASTC_10x10_UNORM_BLOCK,
+        VK_FORMAT_ASTC_10x10_SRGB_BLOCK,
+        VK_FORMAT_ASTC_12x12_UNORM_BLOCK,
+        VK_FORMAT_ASTC_12x12_SRGB_BLOCK,
+        VK_FORMAT_ASTC_8x6_UNORM_BLOCK,
+        VK_FORMAT_ASTC_8x6_SRGB_BLOCK,
+        VK_FORMAT_ASTC_6x5_UNORM_BLOCK,
+        VK_FORMAT_ASTC_6x5_SRGB_BLOCK,
+        VK_FORMAT_E5B9G9R9_UFLOAT_PACK32,
+    };
     std::unordered_map<VkFormat, VkFormatProperties> format_properties;
     for (const auto format : formats) {
         format_properties.emplace(format, physical.GetFormatProperties(format));
@@ -310,6 +313,16 @@ bool VKDevice::Create() {
         LOG_INFO(Render_Vulkan, "Device doesn't support custom border colors");
     }
 
+    VkPhysicalDeviceExtendedDynamicStateFeaturesEXT dynamic_state;
+    if (ext_extended_dynamic_state) {
+        dynamic_state.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTENDED_DYNAMIC_STATE_FEATURES_EXT;
+        dynamic_state.pNext = nullptr;
+        dynamic_state.extendedDynamicState = VK_TRUE;
+        SetNext(next, dynamic_state);
+    } else {
+        LOG_INFO(Render_Vulkan, "Device doesn't support extended dynamic state");
+    }
+
     if (!ext_depth_range_unrestricted) {
         LOG_INFO(Render_Vulkan, "Device doesn't support depth range unrestricted");
     }
@@ -538,6 +551,7 @@ std::vector<const char*> VKDevice::LoadExtensions() {
     bool has_ext_subgroup_size_control{};
     bool has_ext_transform_feedback{};
     bool has_ext_custom_border_color{};
+    bool has_ext_extended_dynamic_state{};
     for (const auto& extension : physical.EnumerateDeviceExtensionProperties()) {
         Test(extension, nv_viewport_swizzle, VK_NV_VIEWPORT_SWIZZLE_EXTENSION_NAME, true);
         Test(extension, khr_uniform_buffer_standard_layout,
@@ -555,6 +569,8 @@ std::vector<const char*> VKDevice::LoadExtensions() {
              false);
         Test(extension, has_ext_custom_border_color, VK_EXT_CUSTOM_BORDER_COLOR_EXTENSION_NAME,
              false);
+        Test(extension, has_ext_extended_dynamic_state,
+             VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME, false);
         if (Settings::values.renderer_debug) {
             Test(extension, nv_device_diagnostics_config,
                  VK_NV_DEVICE_DIAGNOSTICS_CONFIG_EXTENSION_NAME, true);
@@ -640,6 +656,19 @@ std::vector<const char*> VKDevice::LoadExtensions() {
         }
     }
 
+    if (has_ext_extended_dynamic_state) {
+        VkPhysicalDeviceExtendedDynamicStateFeaturesEXT dynamic_state;
+        dynamic_state.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTENDED_DYNAMIC_STATE_FEATURES_EXT;
+        dynamic_state.pNext = nullptr;
+        features.pNext = &dynamic_state;
+        physical.GetFeatures2KHR(features);
+
+        if (dynamic_state.extendedDynamicState) {
+            extensions.push_back(VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME);
+            ext_extended_dynamic_state = true;
+        }
+    }
+
     return extensions;
 }
 
diff --git a/src/video_core/renderer_vulkan/vk_device.h b/src/video_core/renderer_vulkan/vk_device.h
index 6b9227b09..ae5c21baa 100644
--- a/src/video_core/renderer_vulkan/vk_device.h
+++ b/src/video_core/renderer_vulkan/vk_device.h
@@ -182,6 +182,11 @@ public:
         return ext_custom_border_color;
     }
 
+    /// Returns true if VK_EXT_extended_dynamic_state is present and its feature is supported.
+    bool IsExtExtendedDynamicStateSupported() const {
+        return ext_extended_dynamic_state;
+    }
+
     /// Returns the vendor name reported from Vulkan.
     std::string_view GetVendorName() const {
         return vendor_name;
@@ -239,6 +244,7 @@ private:
     bool ext_shader_viewport_index_layer{};    ///< Support for VK_EXT_shader_viewport_index_layer.
     bool ext_transform_feedback{};             ///< Support for VK_EXT_transform_feedback.
     bool ext_custom_border_color{};            ///< Support for VK_EXT_custom_border_color.
+    bool ext_extended_dynamic_state{};         ///< Support for VK_EXT_extended_dynamic_state.
     bool nv_device_diagnostics_config{};       ///< Support for VK_NV_device_diagnostics_config.
 
     // Telemetry parameters
diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
index 69b6bba00..844445105 100644
--- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
@@ -176,20 +176,32 @@ std::vector<vk::ShaderModule> VKGraphicsPipeline::CreateShaderModules(
 
 vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpass_params,
                                                 const SPIRVProgram& program) const {
-    const auto& vi = fixed_state.vertex_input;
-    const auto& ds = fixed_state.depth_stencil;
-    const auto& cd = fixed_state.color_blending;
-    const auto& rs = fixed_state.rasterizer;
-    const auto& viewport_swizzles = fixed_state.viewport_swizzles.swizzles;
+    const auto& state = fixed_state;
+    const auto& viewport_swizzles = state.viewport_swizzles;
+
+    FixedPipelineState::DynamicState dynamic;
+    if (device.IsExtExtendedDynamicStateSupported()) {
+        // Insert dummy values, as long as they are valid they don't matter as extended dynamic
+        // state is ignored
+        dynamic.raw1 = 0;
+        dynamic.raw2 = 0;
+        for (FixedPipelineState::VertexBinding& binding : dynamic.vertex_bindings) {
+            // Enable all vertex bindings
+            binding.raw = 0;
+            binding.enabled.Assign(1);
+        }
+    } else {
+        dynamic = state.dynamic_state;
+    }
 
     std::vector<VkVertexInputBindingDescription> vertex_bindings;
     std::vector<VkVertexInputBindingDivisorDescriptionEXT> vertex_binding_divisors;
-    for (std::size_t index = 0; index < std::size(vi.bindings); ++index) {
-        const auto& binding = vi.bindings[index];
+    for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) {
+        const auto& binding = dynamic.vertex_bindings[index];
         if (!binding.enabled) {
             continue;
         }
-        const bool instanced = vi.binding_divisors[index] != 0;
+        const bool instanced = state.binding_divisors[index] != 0;
         const auto rate = instanced ? VK_VERTEX_INPUT_RATE_INSTANCE : VK_VERTEX_INPUT_RATE_VERTEX;
 
         auto& vertex_binding = vertex_bindings.emplace_back();
@@ -200,14 +212,14 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa
         if (instanced) {
             auto& binding_divisor = vertex_binding_divisors.emplace_back();
             binding_divisor.binding = static_cast<u32>(index);
-            binding_divisor.divisor = vi.binding_divisors[index];
+            binding_divisor.divisor = state.binding_divisors[index];
         }
     }
 
     std::vector<VkVertexInputAttributeDescription> vertex_attributes;
     const auto& input_attributes = program[0]->entries.attributes;
-    for (std::size_t index = 0; index < std::size(vi.attributes); ++index) {
-        const auto& attribute = vi.attributes[index];
+    for (std::size_t index = 0; index < state.attributes.size(); ++index) {
+        const auto& attribute = state.attributes[index];
         if (!attribute.enabled) {
             continue;
         }
@@ -244,15 +256,15 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa
     input_assembly_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO;
     input_assembly_ci.pNext = nullptr;
     input_assembly_ci.flags = 0;
-    input_assembly_ci.topology = MaxwellToVK::PrimitiveTopology(device, rs.Topology());
+    input_assembly_ci.topology = MaxwellToVK::PrimitiveTopology(device, dynamic.Topology());
     input_assembly_ci.primitiveRestartEnable =
-        rs.primitive_restart_enable != 0 && SupportsPrimitiveRestart(input_assembly_ci.topology);
+        state.primitive_restart_enable != 0 && SupportsPrimitiveRestart(input_assembly_ci.topology);
 
     VkPipelineTessellationStateCreateInfo tessellation_ci;
     tessellation_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_TESSELLATION_STATE_CREATE_INFO;
     tessellation_ci.pNext = nullptr;
     tessellation_ci.flags = 0;
-    tessellation_ci.patchControlPoints = rs.patch_control_points_minus_one.Value() + 1;
+    tessellation_ci.patchControlPoints = state.patch_control_points_minus_one.Value() + 1;
 
     VkPipelineViewportStateCreateInfo viewport_ci;
     viewport_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO;
@@ -280,13 +292,13 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa
     rasterization_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO;
     rasterization_ci.pNext = nullptr;
     rasterization_ci.flags = 0;
-    rasterization_ci.depthClampEnable = rs.depth_clamp_disabled == 0 ? VK_TRUE : VK_FALSE;
-    rasterization_ci.rasterizerDiscardEnable = rs.rasterize_enable == 0 ? VK_TRUE : VK_FALSE;
+    rasterization_ci.depthClampEnable = state.depth_clamp_disabled == 0 ? VK_TRUE : VK_FALSE;
+    rasterization_ci.rasterizerDiscardEnable = state.rasterize_enable == 0 ? VK_TRUE : VK_FALSE;
     rasterization_ci.polygonMode = VK_POLYGON_MODE_FILL;
     rasterization_ci.cullMode =
-        rs.cull_enable ? MaxwellToVK::CullFace(rs.CullFace()) : VK_CULL_MODE_NONE;
-    rasterization_ci.frontFace = MaxwellToVK::FrontFace(rs.FrontFace());
-    rasterization_ci.depthBiasEnable = rs.depth_bias_enable;
+        dynamic.cull_enable ? MaxwellToVK::CullFace(dynamic.CullFace()) : VK_CULL_MODE_NONE;
+    rasterization_ci.frontFace = MaxwellToVK::FrontFace(dynamic.FrontFace());
+    rasterization_ci.depthBiasEnable = state.depth_bias_enable;
     rasterization_ci.depthBiasConstantFactor = 0.0f;
     rasterization_ci.depthBiasClamp = 0.0f;
     rasterization_ci.depthBiasSlopeFactor = 0.0f;
@@ -307,14 +319,15 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa
     depth_stencil_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO;
     depth_stencil_ci.pNext = nullptr;
     depth_stencil_ci.flags = 0;
-    depth_stencil_ci.depthTestEnable = ds.depth_test_enable;
-    depth_stencil_ci.depthWriteEnable = ds.depth_write_enable;
-    depth_stencil_ci.depthCompareOp =
-        ds.depth_test_enable ? MaxwellToVK::ComparisonOp(ds.DepthTestFunc()) : VK_COMPARE_OP_ALWAYS;
-    depth_stencil_ci.depthBoundsTestEnable = ds.depth_bounds_enable;
-    depth_stencil_ci.stencilTestEnable = ds.stencil_enable;
-    depth_stencil_ci.front = GetStencilFaceState(ds.front);
-    depth_stencil_ci.back = GetStencilFaceState(ds.back);
+    depth_stencil_ci.depthTestEnable = dynamic.depth_test_enable;
+    depth_stencil_ci.depthWriteEnable = dynamic.depth_write_enable;
+    depth_stencil_ci.depthCompareOp = dynamic.depth_test_enable
+                                          ? MaxwellToVK::ComparisonOp(dynamic.DepthTestFunc())
+                                          : VK_COMPARE_OP_ALWAYS;
+    depth_stencil_ci.depthBoundsTestEnable = dynamic.depth_bounds_enable;
+    depth_stencil_ci.stencilTestEnable = dynamic.stencil_enable;
+    depth_stencil_ci.front = GetStencilFaceState(dynamic.front);
+    depth_stencil_ci.back = GetStencilFaceState(dynamic.back);
     depth_stencil_ci.minDepthBounds = 0.0f;
     depth_stencil_ci.maxDepthBounds = 0.0f;
 
@@ -324,7 +337,7 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa
         static constexpr std::array COMPONENT_TABLE = {
             VK_COLOR_COMPONENT_R_BIT, VK_COLOR_COMPONENT_G_BIT, VK_COLOR_COMPONENT_B_BIT,
             VK_COLOR_COMPONENT_A_BIT};
-        const auto& blend = cd.attachments[index];
+        const auto& blend = state.attachments[index];
 
         VkColorComponentFlags color_components = 0;
         for (std::size_t i = 0; i < COMPONENT_TABLE.size(); ++i) {
@@ -354,11 +367,27 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa
     color_blend_ci.pAttachments = cb_attachments.data();
     std::memset(color_blend_ci.blendConstants, 0, sizeof(color_blend_ci.blendConstants));
 
-    static constexpr std::array dynamic_states = {
+    std::vector dynamic_states = {
         VK_DYNAMIC_STATE_VIEWPORT,           VK_DYNAMIC_STATE_SCISSOR,
         VK_DYNAMIC_STATE_DEPTH_BIAS,         VK_DYNAMIC_STATE_BLEND_CONSTANTS,
         VK_DYNAMIC_STATE_DEPTH_BOUNDS,       VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK,
-        VK_DYNAMIC_STATE_STENCIL_WRITE_MASK, VK_DYNAMIC_STATE_STENCIL_REFERENCE};
+        VK_DYNAMIC_STATE_STENCIL_WRITE_MASK, VK_DYNAMIC_STATE_STENCIL_REFERENCE,
+    };
+    if (device.IsExtExtendedDynamicStateSupported()) {
+        static constexpr std::array extended = {
+            VK_DYNAMIC_STATE_CULL_MODE_EXT,
+            VK_DYNAMIC_STATE_FRONT_FACE_EXT,
+            VK_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY_EXT,
+            VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE_EXT,
+            VK_DYNAMIC_STATE_DEPTH_TEST_ENABLE_EXT,
+            VK_DYNAMIC_STATE_DEPTH_WRITE_ENABLE_EXT,
+            VK_DYNAMIC_STATE_DEPTH_COMPARE_OP_EXT,
+            VK_DYNAMIC_STATE_DEPTH_BOUNDS_TEST_ENABLE_EXT,
+            VK_DYNAMIC_STATE_STENCIL_TEST_ENABLE_EXT,
+            VK_DYNAMIC_STATE_STENCIL_OP_EXT,
+        };
+        dynamic_states.insert(dynamic_states.end(), extended.begin(), extended.end());
+    }
 
     VkPipelineDynamicStateCreateInfo dynamic_state_ci;
     dynamic_state_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO;
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
index a5c7b7945..3da835324 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -27,6 +27,7 @@
 #include "video_core/renderer_vulkan/wrapper.h"
 #include "video_core/shader/compiler_settings.h"
 #include "video_core/shader/memory_util.h"
+#include "video_core/shader_cache.h"
 
 namespace Vulkan {
 
@@ -45,6 +46,7 @@ constexpr VkDescriptorType UNIFORM_BUFFER = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
 constexpr VkDescriptorType STORAGE_BUFFER = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
 constexpr VkDescriptorType UNIFORM_TEXEL_BUFFER = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER;
 constexpr VkDescriptorType COMBINED_IMAGE_SAMPLER = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
+constexpr VkDescriptorType STORAGE_TEXEL_BUFFER = VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER;
 constexpr VkDescriptorType STORAGE_IMAGE = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;
 
 constexpr VideoCommon::Shader::CompilerSettings compiler_settings{
@@ -104,8 +106,9 @@ u32 FillDescriptorLayout(const ShaderEntries& entries,
     u32 binding = base_binding;
     AddBindings<UNIFORM_BUFFER>(bindings, binding, flags, entries.const_buffers);
     AddBindings<STORAGE_BUFFER>(bindings, binding, flags, entries.global_buffers);
-    AddBindings<UNIFORM_TEXEL_BUFFER>(bindings, binding, flags, entries.texel_buffers);
+    AddBindings<UNIFORM_TEXEL_BUFFER>(bindings, binding, flags, entries.uniform_texels);
     AddBindings<COMBINED_IMAGE_SAMPLER>(bindings, binding, flags, entries.samplers);
+    AddBindings<STORAGE_TEXEL_BUFFER>(bindings, binding, flags, entries.storage_texels);
     AddBindings<STORAGE_IMAGE>(bindings, binding, flags, entries.images);
     return binding;
 }
@@ -113,12 +116,12 @@ u32 FillDescriptorLayout(const ShaderEntries& entries,
 } // Anonymous namespace
 
 std::size_t GraphicsPipelineCacheKey::Hash() const noexcept {
-    const u64 hash = Common::CityHash64(reinterpret_cast<const char*>(this), sizeof *this);
+    const u64 hash = Common::CityHash64(reinterpret_cast<const char*>(this), Size());
     return static_cast<std::size_t>(hash);
 }
 
 bool GraphicsPipelineCacheKey::operator==(const GraphicsPipelineCacheKey& rhs) const noexcept {
-    return std::memcmp(&rhs, this, sizeof *this) == 0;
+    return std::memcmp(&rhs, this, Size()) == 0;
 }
 
 std::size_t ComputePipelineCacheKey::Hash() const noexcept {
@@ -130,19 +133,18 @@ bool ComputePipelineCacheKey::operator==(const ComputePipelineCacheKey& rhs) con
     return std::memcmp(&rhs, this, sizeof *this) == 0;
 }
 
-CachedShader::CachedShader(Core::System& system, Tegra::Engines::ShaderType stage,
-                           GPUVAddr gpu_addr, VAddr cpu_addr, ProgramCode program_code,
-                           u32 main_offset)
-    : RasterizerCacheObject{cpu_addr}, gpu_addr{gpu_addr}, program_code{std::move(program_code)},
+Shader::Shader(Core::System& system, Tegra::Engines::ShaderType stage, GPUVAddr gpu_addr,
+               VideoCommon::Shader::ProgramCode program_code, u32 main_offset)
+    : gpu_addr{gpu_addr}, program_code{std::move(program_code)},
       registry{stage, GetEngine(system, stage)}, shader_ir{this->program_code, main_offset,
                                                            compiler_settings, registry},
       entries{GenerateShaderEntries(shader_ir)} {}
 
-CachedShader::~CachedShader() = default;
+Shader::~Shader() = default;
 
-Tegra::Engines::ConstBufferEngineInterface& CachedShader::GetEngine(
-    Core::System& system, Tegra::Engines::ShaderType stage) {
-    if (stage == Tegra::Engines::ShaderType::Compute) {
+Tegra::Engines::ConstBufferEngineInterface& Shader::GetEngine(Core::System& system,
+                                                              Tegra::Engines::ShaderType stage) {
+    if (stage == ShaderType::Compute) {
         return system.GPU().KeplerCompute();
     } else {
         return system.GPU().Maxwell3D();
@@ -154,16 +156,16 @@ VKPipelineCache::VKPipelineCache(Core::System& system, RasterizerVulkan& rasteri
                                  VKDescriptorPool& descriptor_pool,
                                  VKUpdateDescriptorQueue& update_descriptor_queue,
                                  VKRenderPassCache& renderpass_cache)
-    : RasterizerCache{rasterizer}, system{system}, device{device}, scheduler{scheduler},
-      descriptor_pool{descriptor_pool}, update_descriptor_queue{update_descriptor_queue},
-      renderpass_cache{renderpass_cache} {}
+    : VideoCommon::ShaderCache<Shader>{rasterizer}, system{system}, device{device},
+      scheduler{scheduler}, descriptor_pool{descriptor_pool},
+      update_descriptor_queue{update_descriptor_queue}, renderpass_cache{renderpass_cache} {}
 
 VKPipelineCache::~VKPipelineCache() = default;
 
-std::array<Shader, Maxwell::MaxShaderProgram> VKPipelineCache::GetShaders() {
+std::array<Shader*, Maxwell::MaxShaderProgram> VKPipelineCache::GetShaders() {
     const auto& gpu = system.GPU().Maxwell3D();
 
-    std::array<Shader, Maxwell::MaxShaderProgram> shaders;
+    std::array<Shader*, Maxwell::MaxShaderProgram> shaders{};
     for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) {
         const auto program{static_cast<Maxwell::ShaderProgram>(index)};
 
@@ -176,24 +178,28 @@ std::array<Shader, Maxwell::MaxShaderProgram> VKPipelineCache::GetShaders() {
         const GPUVAddr program_addr{GetShaderAddress(system, program)};
         const std::optional cpu_addr = memory_manager.GpuToCpuAddress(program_addr);
         ASSERT(cpu_addr);
-        auto shader = cpu_addr ? TryGet(*cpu_addr) : null_shader;
-        if (!shader) {
+
+        Shader* result = cpu_addr ? TryGet(*cpu_addr) : null_shader.get();
+        if (!result) {
             const auto host_ptr{memory_manager.GetPointer(program_addr)};
 
             // No shader found - create a new one
             constexpr u32 stage_offset = STAGE_MAIN_OFFSET;
-            const auto stage = static_cast<Tegra::Engines::ShaderType>(index == 0 ? 0 : index - 1);
+            const auto stage = static_cast<ShaderType>(index == 0 ? 0 : index - 1);
             ProgramCode code = GetShaderCode(memory_manager, program_addr, host_ptr, false);
+            const std::size_t size_in_bytes = code.size() * sizeof(u64);
+
+            auto shader = std::make_unique<Shader>(system, stage, program_addr, std::move(code),
+                                                   stage_offset);
+            result = shader.get();
 
-            shader = std::make_shared<CachedShader>(system, stage, program_addr, *cpu_addr,
-                                                    std::move(code), stage_offset);
             if (cpu_addr) {
-                Register(shader);
+                Register(std::move(shader), *cpu_addr, size_in_bytes);
             } else {
-                null_shader = shader;
+                null_shader = std::move(shader);
             }
         }
-        shaders[index] = std::move(shader);
+        shaders[index] = result;
     }
     return last_shaders = shaders;
 }
@@ -234,19 +240,22 @@ VKComputePipeline& VKPipelineCache::GetComputePipeline(const ComputePipelineCach
     const auto cpu_addr = memory_manager.GpuToCpuAddress(program_addr);
     ASSERT(cpu_addr);
 
-    auto shader = cpu_addr ? TryGet(*cpu_addr) : null_kernel;
+    Shader* shader = cpu_addr ? TryGet(*cpu_addr) : null_kernel.get();
     if (!shader) {
         // No shader found - create a new one
         const auto host_ptr = memory_manager.GetPointer(program_addr);
 
         ProgramCode code = GetShaderCode(memory_manager, program_addr, host_ptr, true);
-        shader = std::make_shared<CachedShader>(system, Tegra::Engines::ShaderType::Compute,
-                                                program_addr, *cpu_addr, std::move(code),
-                                                KERNEL_MAIN_OFFSET);
+        const std::size_t size_in_bytes = code.size() * sizeof(u64);
+
+        auto shader_info = std::make_unique<Shader>(system, ShaderType::Compute, program_addr,
+                                                    std::move(code), KERNEL_MAIN_OFFSET);
+        shader = shader_info.get();
+
         if (cpu_addr) {
-            Register(shader);
+            Register(std::move(shader_info), *cpu_addr, size_in_bytes);
         } else {
-            null_kernel = shader;
+            null_kernel = std::move(shader_info);
         }
     }
 
@@ -262,7 +271,7 @@ VKComputePipeline& VKPipelineCache::GetComputePipeline(const ComputePipelineCach
     return *entry;
 }
 
-void VKPipelineCache::Unregister(const Shader& shader) {
+void VKPipelineCache::OnShaderRemoval(Shader* shader) {
     bool finished = false;
     const auto Finish = [&] {
         // TODO(Rodrigo): Instead of finishing here, wait for the fences that use this pipeline and
@@ -294,8 +303,6 @@ void VKPipelineCache::Unregister(const Shader& shader) {
         Finish();
         it = compute_cache.erase(it);
     }
-
-    RasterizerCache::Unregister(shader);
 }
 
 std::pair<SPIRVProgram, std::vector<VkDescriptorSetLayoutBinding>>
@@ -305,16 +312,19 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) {
     const auto& gpu = system.GPU().Maxwell3D();
 
     Specialization specialization;
-    if (fixed_state.rasterizer.Topology() == Maxwell::PrimitiveTopology::Points) {
+    if (fixed_state.dynamic_state.Topology() == Maxwell::PrimitiveTopology::Points ||
+        device.IsExtExtendedDynamicStateSupported()) {
         float point_size;
-        std::memcpy(&point_size, &fixed_state.rasterizer.point_size, sizeof(float));
+        std::memcpy(&point_size, &fixed_state.point_size, sizeof(float));
         specialization.point_size = point_size;
         ASSERT(point_size != 0.0f);
     }
     for (std::size_t i = 0; i < Maxwell::NumVertexAttributes; ++i) {
-        specialization.attribute_types[i] = fixed_state.vertex_input.attributes[i].Type();
+        const auto& attribute = fixed_state.attributes[i];
+        specialization.enabled_attributes[i] = attribute.enabled.Value() != 0;
+        specialization.attribute_types[i] = attribute.Type();
     }
-    specialization.ndc_minus_one_to_one = fixed_state.rasterizer.ndc_minus_one_to_one;
+    specialization.ndc_minus_one_to_one = fixed_state.ndc_minus_one_to_one;
 
     SPIRVProgram program;
     std::vector<VkDescriptorSetLayoutBinding> bindings;
@@ -328,12 +338,11 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) {
         }
 
         const GPUVAddr gpu_addr = GetShaderAddress(system, program_enum);
-        const auto cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr);
-        const auto shader = cpu_addr ? TryGet(*cpu_addr) : null_shader;
-        ASSERT(shader);
+        const std::optional<VAddr> cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr);
+        Shader* const shader = cpu_addr ? TryGet(*cpu_addr) : null_shader.get();
 
         const std::size_t stage = index == 0 ? 0 : index - 1; // Stage indices are 0 - 5
-        const auto program_type = GetShaderType(program_enum);
+        const ShaderType program_type = GetShaderType(program_enum);
         const auto& entries = shader->GetEntries();
         program[stage] = {
             Decompile(device, shader->GetIR(), program_type, shader->GetRegistry(), specialization),
@@ -375,16 +384,17 @@ void AddEntry(std::vector<VkDescriptorUpdateTemplateEntry>& template_entries, u3
         return;
     }
 
-    if constexpr (descriptor_type == UNIFORM_TEXEL_BUFFER) {
-        // Nvidia has a bug where updating multiple uniform texels at once causes the driver to
-        // crash.
+    if constexpr (descriptor_type == UNIFORM_TEXEL_BUFFER ||
+                  descriptor_type == STORAGE_TEXEL_BUFFER) {
+        // Nvidia has a bug where updating multiple texels at once causes the driver to crash.
+        // Note: Fixed in driver Windows 443.24, Linux 440.66.15
         for (u32 i = 0; i < count; ++i) {
             VkDescriptorUpdateTemplateEntry& entry = template_entries.emplace_back();
             entry.dstBinding = binding + i;
             entry.dstArrayElement = 0;
             entry.descriptorCount = 1;
             entry.descriptorType = descriptor_type;
-            entry.offset = offset + i * entry_size;
+            entry.offset = static_cast<std::size_t>(offset + i * entry_size);
             entry.stride = entry_size;
         }
     } else if (count > 0) {
@@ -405,8 +415,9 @@ void FillDescriptorUpdateTemplateEntries(
     std::vector<VkDescriptorUpdateTemplateEntryKHR>& template_entries) {
     AddEntry<UNIFORM_BUFFER>(template_entries, offset, binding, entries.const_buffers);
     AddEntry<STORAGE_BUFFER>(template_entries, offset, binding, entries.global_buffers);
-    AddEntry<UNIFORM_TEXEL_BUFFER>(template_entries, offset, binding, entries.texel_buffers);
+    AddEntry<UNIFORM_TEXEL_BUFFER>(template_entries, offset, binding, entries.uniform_texels);
     AddEntry<COMBINED_IMAGE_SAMPLER>(template_entries, offset, binding, entries.samplers);
+    AddEntry<STORAGE_TEXEL_BUFFER>(template_entries, offset, binding, entries.storage_texels);
     AddEntry<STORAGE_IMAGE>(template_entries, offset, binding, entries.images);
 }
 
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.h b/src/video_core/renderer_vulkan/vk_pipeline_cache.h
index 0b5796fef..0a3fe65fb 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.h
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.h
@@ -17,7 +17,6 @@
 #include "common/common_types.h"
 #include "video_core/engines/const_buffer_engine_interface.h"
 #include "video_core/engines/maxwell_3d.h"
-#include "video_core/rasterizer_cache.h"
 #include "video_core/renderer_vulkan/fixed_pipeline_state.h"
 #include "video_core/renderer_vulkan/vk_graphics_pipeline.h"
 #include "video_core/renderer_vulkan/vk_renderpass_cache.h"
@@ -26,6 +25,7 @@
 #include "video_core/shader/memory_util.h"
 #include "video_core/shader/registry.h"
 #include "video_core/shader/shader_ir.h"
+#include "video_core/shader_cache.h"
 
 namespace Core {
 class System;
@@ -41,15 +41,13 @@ class VKFence;
 class VKScheduler;
 class VKUpdateDescriptorQueue;
 
-class CachedShader;
-using Shader = std::shared_ptr<CachedShader>;
 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 
 struct GraphicsPipelineCacheKey {
-    FixedPipelineState fixed_state;
     RenderPassParams renderpass_params;
+    u32 padding;
     std::array<GPUVAddr, Maxwell::MaxShaderProgram> shaders;
-    u64 padding; // This is necessary for unique object representations
+    FixedPipelineState fixed_state;
 
     std::size_t Hash() const noexcept;
 
@@ -58,6 +56,10 @@ struct GraphicsPipelineCacheKey {
     bool operator!=(const GraphicsPipelineCacheKey& rhs) const noexcept {
         return !operator==(rhs);
     }
+
+    std::size_t Size() const noexcept {
+        return sizeof(renderpass_params) + sizeof(padding) + sizeof(shaders) + fixed_state.Size();
+    }
 };
 static_assert(std::has_unique_object_representations_v<GraphicsPipelineCacheKey>);
 static_assert(std::is_trivially_copyable_v<GraphicsPipelineCacheKey>);
@@ -102,21 +104,16 @@ struct hash<Vulkan::ComputePipelineCacheKey> {
 
 namespace Vulkan {
 
-class CachedShader final : public RasterizerCacheObject {
+class Shader {
 public:
-    explicit CachedShader(Core::System& system, Tegra::Engines::ShaderType stage, GPUVAddr gpu_addr,
-                          VAddr cpu_addr, VideoCommon::Shader::ProgramCode program_code,
-                          u32 main_offset);
-    ~CachedShader();
+    explicit Shader(Core::System& system, Tegra::Engines::ShaderType stage, GPUVAddr gpu_addr,
+                    VideoCommon::Shader::ProgramCode program_code, u32 main_offset);
+    ~Shader();
 
     GPUVAddr GetGpuAddr() const {
         return gpu_addr;
     }
 
-    std::size_t GetSizeInBytes() const override {
-        return program_code.size() * sizeof(u64);
-    }
-
     VideoCommon::Shader::ShaderIR& GetIR() {
         return shader_ir;
     }
@@ -144,25 +141,23 @@ private:
     ShaderEntries entries;
 };
 
-class VKPipelineCache final : public RasterizerCache<Shader> {
+class VKPipelineCache final : public VideoCommon::ShaderCache<Shader> {
 public:
     explicit VKPipelineCache(Core::System& system, RasterizerVulkan& rasterizer,
                              const VKDevice& device, VKScheduler& scheduler,
                              VKDescriptorPool& descriptor_pool,
                              VKUpdateDescriptorQueue& update_descriptor_queue,
                              VKRenderPassCache& renderpass_cache);
-    ~VKPipelineCache();
+    ~VKPipelineCache() override;
 
-    std::array<Shader, Maxwell::MaxShaderProgram> GetShaders();
+    std::array<Shader*, Maxwell::MaxShaderProgram> GetShaders();
 
     VKGraphicsPipeline& GetGraphicsPipeline(const GraphicsPipelineCacheKey& key);
 
     VKComputePipeline& GetComputePipeline(const ComputePipelineCacheKey& key);
 
 protected:
-    void Unregister(const Shader& shader) override;
-
-    void FlushObjectInner(const Shader& object) override {}
+    void OnShaderRemoval(Shader* shader) final;
 
 private:
     std::pair<SPIRVProgram, std::vector<VkDescriptorSetLayoutBinding>> DecompileShaders(
@@ -175,10 +170,10 @@ private:
     VKUpdateDescriptorQueue& update_descriptor_queue;
     VKRenderPassCache& renderpass_cache;
 
-    Shader null_shader{};
-    Shader null_kernel{};
+    std::unique_ptr<Shader> null_shader;
+    std::unique_ptr<Shader> null_kernel;
 
-    std::array<Shader, Maxwell::MaxShaderProgram> last_shaders;
+    std::array<Shader*, Maxwell::MaxShaderProgram> last_shaders{};
 
     GraphicsPipelineCacheKey last_graphics_key;
     VKGraphicsPipeline* last_graphics_pipeline = nullptr;
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index be5b77fae..380ed532b 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -38,6 +38,7 @@
 #include "video_core/renderer_vulkan/vk_texture_cache.h"
 #include "video_core/renderer_vulkan/vk_update_descriptor.h"
 #include "video_core/renderer_vulkan/wrapper.h"
+#include "video_core/shader_cache.h"
 
 namespace Vulkan {
 
@@ -98,7 +99,7 @@ VkRect2D GetScissorState(const Maxwell& regs, std::size_t index) {
 }
 
 std::array<GPUVAddr, Maxwell::MaxShaderProgram> GetShaderAddresses(
-    const std::array<Shader, Maxwell::MaxShaderProgram>& shaders) {
+    const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders) {
     std::array<GPUVAddr, Maxwell::MaxShaderProgram> addresses;
     for (std::size_t i = 0; i < std::size(addresses); ++i) {
         addresses[i] = shaders[i] ? shaders[i]->GetGpuAddr() : 0;
@@ -117,6 +118,17 @@ template <typename Engine, typename Entry>
 Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry,
                                                std::size_t stage, std::size_t index = 0) {
     const auto stage_type = static_cast<Tegra::Engines::ShaderType>(stage);
+    if constexpr (std::is_same_v<Entry, SamplerEntry>) {
+        if (entry.is_separated) {
+            const u32 buffer_1 = entry.buffer;
+            const u32 buffer_2 = entry.secondary_buffer;
+            const u32 offset_1 = entry.offset;
+            const u32 offset_2 = entry.secondary_offset;
+            const u32 handle_1 = engine.AccessConstBuffer32(stage_type, buffer_1, offset_1);
+            const u32 handle_2 = engine.AccessConstBuffer32(stage_type, buffer_2, offset_2);
+            return engine.GetTextureInfo(handle_1 | handle_2);
+        }
+    }
     if (entry.is_bindless) {
         const auto tex_handle = engine.AccessConstBuffer32(stage_type, entry.buffer, entry.offset);
         return engine.GetTextureInfo(tex_handle);
@@ -131,13 +143,65 @@ Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry
     }
 }
 
+/// @brief Determine if the color attachment to be updated has to preserve its contents
+/// @param is_clear True when a clear is being executed
+/// @param regs 3D registers
+/// @return True when the contents have to be preserved
+bool HasToPreserveColorContents(bool is_clear, const Maxwell& regs) {
+    if (!is_clear) {
+        return true;
+    }
+    // First we have to make sure all clear masks are enabled.
+    if (!regs.clear_buffers.R || !regs.clear_buffers.G || !regs.clear_buffers.B ||
+        !regs.clear_buffers.A) {
+        return true;
+    }
+    // If scissors are disabled, the whole screen is cleared
+    if (!regs.clear_flags.scissor) {
+        return false;
+    }
+    // Then we have to confirm scissor testing clears the whole image
+    const std::size_t index = regs.clear_buffers.RT;
+    const auto& scissor = regs.scissor_test[0];
+    return scissor.min_x > 0 || scissor.min_y > 0 || scissor.max_x < regs.rt[index].width ||
+           scissor.max_y < regs.rt[index].height;
+}
+
+/// @brief Determine if the depth attachment to be updated has to preserve its contents
+/// @param is_clear True when a clear is being executed
+/// @param regs 3D registers
+/// @return True when the contents have to be preserved
+bool HasToPreserveDepthContents(bool is_clear, const Maxwell& regs) {
+    // If we are not clearing, the contents have to be preserved
+    if (!is_clear) {
+        return true;
+    }
+    // For depth stencil clears we only have to confirm scissor test covers the whole image
+    if (!regs.clear_flags.scissor) {
+        return false;
+    }
+    // Make sure the clear covers the whole image
+    const auto& scissor = regs.scissor_test[0];
+    return scissor.min_x > 0 || scissor.min_y > 0 || scissor.max_x < regs.zeta_width ||
+           scissor.max_y < regs.zeta_height;
+}
+
+template <std::size_t N>
+std::array<VkDeviceSize, N> ExpandStrides(const std::array<u16, N>& strides) {
+    std::array<VkDeviceSize, N> expanded;
+    std::copy(strides.begin(), strides.end(), expanded.begin());
+    return expanded;
+}
+
 } // Anonymous namespace
 
 class BufferBindings final {
 public:
-    void AddVertexBinding(VkBuffer buffer, VkDeviceSize offset) {
+    void AddVertexBinding(VkBuffer buffer, VkDeviceSize offset, VkDeviceSize size, u32 stride) {
         vertex.buffers[vertex.num_buffers] = buffer;
         vertex.offsets[vertex.num_buffers] = offset;
+        vertex.sizes[vertex.num_buffers] = size;
+        vertex.strides[vertex.num_buffers] = static_cast<u16>(stride);
         ++vertex.num_buffers;
     }
 
@@ -147,76 +211,76 @@ public:
         index.type = type;
     }
 
-    void Bind(VKScheduler& scheduler) const {
+    void Bind(const VKDevice& device, VKScheduler& scheduler) const {
         // Use this large switch case to avoid dispatching more memory in the record lambda than
         // what we need. It looks horrible, but it's the best we can do on standard C++.
         switch (vertex.num_buffers) {
         case 0:
-            return BindStatic<0>(scheduler);
+            return BindStatic<0>(device, scheduler);
         case 1:
-            return BindStatic<1>(scheduler);
+            return BindStatic<1>(device, scheduler);
         case 2:
-            return BindStatic<2>(scheduler);
+            return BindStatic<2>(device, scheduler);
         case 3:
-            return BindStatic<3>(scheduler);
+            return BindStatic<3>(device, scheduler);
         case 4:
-            return BindStatic<4>(scheduler);
+            return BindStatic<4>(device, scheduler);
         case 5:
-            return BindStatic<5>(scheduler);
+            return BindStatic<5>(device, scheduler);
         case 6:
-            return BindStatic<6>(scheduler);
+            return BindStatic<6>(device, scheduler);
         case 7:
-            return BindStatic<7>(scheduler);
+            return BindStatic<7>(device, scheduler);
         case 8:
-            return BindStatic<8>(scheduler);
+            return BindStatic<8>(device, scheduler);
         case 9:
-            return BindStatic<9>(scheduler);
+            return BindStatic<9>(device, scheduler);
         case 10:
-            return BindStatic<10>(scheduler);
+            return BindStatic<10>(device, scheduler);
         case 11:
-            return BindStatic<11>(scheduler);
+            return BindStatic<11>(device, scheduler);
         case 12:
-            return BindStatic<12>(scheduler);
+            return BindStatic<12>(device, scheduler);
         case 13:
-            return BindStatic<13>(scheduler);
+            return BindStatic<13>(device, scheduler);
         case 14:
-            return BindStatic<14>(scheduler);
+            return BindStatic<14>(device, scheduler);
         case 15:
-            return BindStatic<15>(scheduler);
+            return BindStatic<15>(device, scheduler);
         case 16:
-            return BindStatic<16>(scheduler);
+            return BindStatic<16>(device, scheduler);
         case 17:
-            return BindStatic<17>(scheduler);
+            return BindStatic<17>(device, scheduler);
         case 18:
-            return BindStatic<18>(scheduler);
+            return BindStatic<18>(device, scheduler);
         case 19:
-            return BindStatic<19>(scheduler);
+            return BindStatic<19>(device, scheduler);
         case 20:
-            return BindStatic<20>(scheduler);
+            return BindStatic<20>(device, scheduler);
         case 21:
-            return BindStatic<21>(scheduler);
+            return BindStatic<21>(device, scheduler);
         case 22:
-            return BindStatic<22>(scheduler);
+            return BindStatic<22>(device, scheduler);
         case 23:
-            return BindStatic<23>(scheduler);
+            return BindStatic<23>(device, scheduler);
         case 24:
-            return BindStatic<24>(scheduler);
+            return BindStatic<24>(device, scheduler);
         case 25:
-            return BindStatic<25>(scheduler);
+            return BindStatic<25>(device, scheduler);
         case 26:
-            return BindStatic<26>(scheduler);
+            return BindStatic<26>(device, scheduler);
         case 27:
-            return BindStatic<27>(scheduler);
+            return BindStatic<27>(device, scheduler);
         case 28:
-            return BindStatic<28>(scheduler);
+            return BindStatic<28>(device, scheduler);
         case 29:
-            return BindStatic<29>(scheduler);
+            return BindStatic<29>(device, scheduler);
         case 30:
-            return BindStatic<30>(scheduler);
+            return BindStatic<30>(device, scheduler);
         case 31:
-            return BindStatic<31>(scheduler);
+            return BindStatic<31>(device, scheduler);
         case 32:
-            return BindStatic<32>(scheduler);
+            return BindStatic<32>(device, scheduler);
         }
         UNREACHABLE();
     }
@@ -227,6 +291,8 @@ private:
         std::size_t num_buffers = 0;
         std::array<VkBuffer, Maxwell::NumVertexArrays> buffers;
         std::array<VkDeviceSize, Maxwell::NumVertexArrays> offsets;
+        std::array<VkDeviceSize, Maxwell::NumVertexArrays> sizes;
+        std::array<u16, Maxwell::NumVertexArrays> strides;
     } vertex;
 
     struct {
@@ -236,15 +302,23 @@ private:
     } index;
 
     template <std::size_t N>
-    void BindStatic(VKScheduler& scheduler) const {
-        if (index.buffer) {
-            BindStatic<N, true>(scheduler);
+    void BindStatic(const VKDevice& device, VKScheduler& scheduler) const {
+        if (device.IsExtExtendedDynamicStateSupported()) {
+            if (index.buffer) {
+                BindStatic<N, true, true>(scheduler);
+            } else {
+                BindStatic<N, false, true>(scheduler);
+            }
         } else {
-            BindStatic<N, false>(scheduler);
+            if (index.buffer) {
+                BindStatic<N, true, false>(scheduler);
+            } else {
+                BindStatic<N, false, false>(scheduler);
+            }
         }
     }
 
-    template <std::size_t N, bool is_indexed>
+    template <std::size_t N, bool is_indexed, bool has_extended_dynamic_state>
     void BindStatic(VKScheduler& scheduler) const {
         static_assert(N <= Maxwell::NumVertexArrays);
         if constexpr (N == 0) {
@@ -256,6 +330,31 @@ private:
         std::copy(vertex.buffers.begin(), vertex.buffers.begin() + N, buffers.begin());
         std::copy(vertex.offsets.begin(), vertex.offsets.begin() + N, offsets.begin());
 
+        if constexpr (has_extended_dynamic_state) {
+            // With extended dynamic states we can specify the length and stride of a vertex buffer
+            // std::array<VkDeviceSize, N> sizes;
+            std::array<u16, N> strides;
+            // std::copy(vertex.sizes.begin(), vertex.sizes.begin() + N, sizes.begin());
+            std::copy(vertex.strides.begin(), vertex.strides.begin() + N, strides.begin());
+
+            if constexpr (is_indexed) {
+                scheduler.Record(
+                    [buffers, offsets, strides, index = index](vk::CommandBuffer cmdbuf) {
+                        cmdbuf.BindIndexBuffer(index.buffer, index.offset, index.type);
+                        cmdbuf.BindVertexBuffers2EXT(0, static_cast<u32>(N), buffers.data(),
+                                                     offsets.data(), nullptr,
+                                                     ExpandStrides(strides).data());
+                    });
+            } else {
+                scheduler.Record([buffers, offsets, strides](vk::CommandBuffer cmdbuf) {
+                    cmdbuf.BindVertexBuffers2EXT(0, static_cast<u32>(N), buffers.data(),
+                                                 offsets.data(), nullptr,
+                                                 ExpandStrides(strides).data());
+                });
+            }
+            return;
+        }
+
         if constexpr (is_indexed) {
             // Indexed draw
             scheduler.Record([buffers, offsets, index = index](vk::CommandBuffer cmdbuf) {
@@ -314,7 +413,7 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) {
 
     const auto& gpu = system.GPU().Maxwell3D();
     GraphicsPipelineCacheKey key;
-    key.fixed_state.Fill(gpu.regs);
+    key.fixed_state.Fill(gpu.regs, device.IsExtExtendedDynamicStateSupported());
 
     buffer_cache.Map(CalculateGraphicsStreamBufferSize(is_indexed));
 
@@ -332,7 +431,7 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) {
 
     buffer_cache.Unmap();
 
-    const Texceptions texceptions = UpdateAttachments();
+    const Texceptions texceptions = UpdateAttachments(false);
     SetupImageTransitions(texceptions, color_attachments, zeta_attachment);
 
     key.renderpass_params = GetRenderPassParams(texceptions);
@@ -347,7 +446,7 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) {
 
     UpdateDynamicStates();
 
-    buffer_bindings.Bind(scheduler);
+    buffer_bindings.Bind(device, scheduler);
 
     BeginTransformFeedback();
 
@@ -388,7 +487,7 @@ void RasterizerVulkan::Clear() {
         return;
     }
 
-    [[maybe_unused]] const auto texceptions = UpdateAttachments();
+    [[maybe_unused]] const auto texceptions = UpdateAttachments(true);
     DEBUG_ASSERT(texceptions.none());
     SetupImageTransitions(0, color_attachments, zeta_attachment);
 
@@ -468,8 +567,9 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) {
     const auto& entries = pipeline.GetEntries();
     SetupComputeConstBuffers(entries);
     SetupComputeGlobalBuffers(entries);
-    SetupComputeTexelBuffers(entries);
+    SetupComputeUniformTexels(entries);
     SetupComputeTextures(entries);
+    SetupComputeStorageTexels(entries);
     SetupComputeImages(entries);
 
     buffer_cache.Unmap();
@@ -664,9 +764,12 @@ void RasterizerVulkan::FlushWork() {
     draw_counter = 0;
 }
 
-RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() {
+RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments(bool is_clear) {
     MICROPROFILE_SCOPE(Vulkan_RenderTargets);
-    auto& dirty = system.GPU().Maxwell3D().dirty.flags;
+    auto& maxwell3d = system.GPU().Maxwell3D();
+    auto& dirty = maxwell3d.dirty.flags;
+    auto& regs = maxwell3d.regs;
+
     const bool update_rendertargets = dirty[VideoCommon::Dirty::RenderTargets];
     dirty[VideoCommon::Dirty::RenderTargets] = false;
 
@@ -675,7 +778,8 @@ RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() {
     Texceptions texceptions;
     for (std::size_t rt = 0; rt < Maxwell::NumRenderTargets; ++rt) {
         if (update_rendertargets) {
-            color_attachments[rt] = texture_cache.GetColorBufferSurface(rt, true);
+            const bool preserve_contents = HasToPreserveColorContents(is_clear, regs);
+            color_attachments[rt] = texture_cache.GetColorBufferSurface(rt, preserve_contents);
         }
         if (color_attachments[rt] && WalkAttachmentOverlaps(*color_attachments[rt])) {
             texceptions[rt] = true;
@@ -683,7 +787,8 @@ RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() {
     }
 
     if (update_rendertargets) {
-        zeta_attachment = texture_cache.GetDepthBufferSurface(true);
+        const bool preserve_contents = HasToPreserveDepthContents(is_clear, regs);
+        zeta_attachment = texture_cache.GetDepthBufferSurface(preserve_contents);
     }
     if (zeta_attachment && WalkAttachmentOverlaps(*zeta_attachment)) {
         texceptions[ZETA_TEXCEPTION_INDEX] = true;
@@ -715,7 +820,7 @@ std::tuple<VkFramebuffer, VkExtent2D> RasterizerVulkan::ConfigureFramebuffers(
         if (!view) {
             return false;
         }
-        key.views.push_back(view->GetHandle());
+        key.views.push_back(view->GetAttachment());
         key.width = std::min(key.width, view->GetWidth());
         key.height = std::min(key.height, view->GetHeight());
         key.layers = std::min(key.layers, view->GetNumLayers());
@@ -761,7 +866,7 @@ RasterizerVulkan::DrawParameters RasterizerVulkan::SetupGeometry(FixedPipelineSt
     const auto& gpu = system.GPU().Maxwell3D();
     const auto& regs = gpu.regs;
 
-    SetupVertexArrays(fixed_state.vertex_input, buffer_bindings);
+    SetupVertexArrays(buffer_bindings);
 
     const u32 base_instance = regs.vb_base_instance;
     const u32 num_instances = is_instanced ? gpu.mme_draw.instance_count : 1;
@@ -775,20 +880,21 @@ RasterizerVulkan::DrawParameters RasterizerVulkan::SetupGeometry(FixedPipelineSt
 }
 
 void RasterizerVulkan::SetupShaderDescriptors(
-    const std::array<Shader, Maxwell::MaxShaderProgram>& shaders) {
+    const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders) {
     texture_cache.GuardSamplers(true);
 
     for (std::size_t stage = 0; stage < Maxwell::MaxShaderStage; ++stage) {
         // Skip VertexA stage
-        const auto& shader = shaders[stage + 1];
+        Shader* const shader = shaders[stage + 1];
         if (!shader) {
             continue;
         }
         const auto& entries = shader->GetEntries();
         SetupGraphicsConstBuffers(entries, stage);
         SetupGraphicsGlobalBuffers(entries, stage);
-        SetupGraphicsTexelBuffers(entries, stage);
+        SetupGraphicsUniformTexels(entries, stage);
         SetupGraphicsTextures(entries, stage);
+        SetupGraphicsStorageTexels(entries, stage);
         SetupGraphicsImages(entries, stage);
     }
     texture_cache.GuardSamplers(false);
@@ -831,6 +937,17 @@ void RasterizerVulkan::UpdateDynamicStates() {
     UpdateBlendConstants(regs);
     UpdateDepthBounds(regs);
     UpdateStencilFaces(regs);
+    if (device.IsExtExtendedDynamicStateSupported()) {
+        UpdateCullMode(regs);
+        UpdateDepthBoundsTestEnable(regs);
+        UpdateDepthTestEnable(regs);
+        UpdateDepthWriteEnable(regs);
+        UpdateDepthCompareOp(regs);
+        UpdateFrontFace(regs);
+        UpdatePrimitiveTopology(regs);
+        UpdateStencilOp(regs);
+        UpdateStencilTestEnable(regs);
+    }
 }
 
 void RasterizerVulkan::BeginTransformFeedback() {
@@ -838,6 +955,10 @@ void RasterizerVulkan::BeginTransformFeedback() {
     if (regs.tfb_enabled == 0) {
         return;
     }
+    if (!device.IsExtTransformFeedbackSupported()) {
+        LOG_ERROR(Render_Vulkan, "Transform feedbacks used but not supported");
+        return;
+    }
 
     UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) ||
                      regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) ||
@@ -852,10 +973,10 @@ void RasterizerVulkan::BeginTransformFeedback() {
     UNIMPLEMENTED_IF(binding.buffer_offset != 0);
 
     const GPUVAddr gpu_addr = binding.Address();
-    const std::size_t size = binding.buffer_size;
-    const auto [buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
+    const VkDeviceSize size = static_cast<VkDeviceSize>(binding.buffer_size);
+    const auto info = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
 
-    scheduler.Record([buffer = buffer, offset = offset, size](vk::CommandBuffer cmdbuf) {
+    scheduler.Record([buffer = info.handle, offset = info.offset, size](vk::CommandBuffer cmdbuf) {
         cmdbuf.BindTransformFeedbackBuffersEXT(0, 1, &buffer, &offset, &size);
         cmdbuf.BeginTransformFeedbackEXT(0, 0, nullptr, nullptr);
     });
@@ -866,50 +987,33 @@ void RasterizerVulkan::EndTransformFeedback() {
     if (regs.tfb_enabled == 0) {
         return;
     }
+    if (!device.IsExtTransformFeedbackSupported()) {
+        return;
+    }
 
     scheduler.Record(
         [](vk::CommandBuffer cmdbuf) { cmdbuf.EndTransformFeedbackEXT(0, 0, nullptr, nullptr); });
 }
 
-void RasterizerVulkan::SetupVertexArrays(FixedPipelineState::VertexInput& vertex_input,
-                                         BufferBindings& buffer_bindings) {
+void RasterizerVulkan::SetupVertexArrays(BufferBindings& buffer_bindings) {
     const auto& regs = system.GPU().Maxwell3D().regs;
 
-    for (std::size_t index = 0; index < Maxwell::NumVertexAttributes; ++index) {
-        const auto& attrib = regs.vertex_attrib_format[index];
-        if (!attrib.IsValid()) {
-            vertex_input.SetAttribute(index, false, 0, 0, {}, {});
-            continue;
-        }
-
-        [[maybe_unused]] const auto& buffer = regs.vertex_array[attrib.buffer];
-        ASSERT(buffer.IsEnabled());
-
-        vertex_input.SetAttribute(index, true, attrib.buffer, attrib.offset, attrib.type.Value(),
-                                  attrib.size.Value());
-    }
-
     for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) {
         const auto& vertex_array = regs.vertex_array[index];
         if (!vertex_array.IsEnabled()) {
-            vertex_input.SetBinding(index, false, 0, 0);
             continue;
         }
-        vertex_input.SetBinding(
-            index, true, vertex_array.stride,
-            regs.instanced_arrays.IsInstancingEnabled(index) ? vertex_array.divisor : 0);
-
         const GPUVAddr start{vertex_array.StartAddress()};
         const GPUVAddr end{regs.vertex_array_limit[index].LimitAddress()};
 
         ASSERT(end >= start);
-        const std::size_t size{end - start};
+        const std::size_t size = end - start;
         if (size == 0) {
-            buffer_bindings.AddVertexBinding(DefaultBuffer(), 0);
+            buffer_bindings.AddVertexBinding(DefaultBuffer(), 0, DEFAULT_BUFFER_SIZE, 0);
             continue;
         }
-        const auto [buffer, offset] = buffer_cache.UploadMemory(start, size);
-        buffer_bindings.AddVertexBinding(buffer, offset);
+        const auto info = buffer_cache.UploadMemory(start, size);
+        buffer_bindings.AddVertexBinding(info.handle, info.offset, size, vertex_array.stride);
     }
 }
 
@@ -931,7 +1035,9 @@ void RasterizerVulkan::SetupIndexBuffer(BufferBindings& buffer_bindings, DrawPar
             break;
         }
         const GPUVAddr gpu_addr = regs.index_array.IndexStart();
-        auto [buffer, offset] = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize());
+        const auto info = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize());
+        VkBuffer buffer = info.handle;
+        u64 offset = info.offset;
         std::tie(buffer, offset) = quad_indexed_pass.Assemble(
             regs.index_array.format, params.num_vertices, params.base_vertex, buffer, offset);
 
@@ -945,7 +1051,9 @@ void RasterizerVulkan::SetupIndexBuffer(BufferBindings& buffer_bindings, DrawPar
             break;
         }
         const GPUVAddr gpu_addr = regs.index_array.IndexStart();
-        auto [buffer, offset] = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize());
+        const auto info = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize());
+        VkBuffer buffer = info.handle;
+        u64 offset = info.offset;
 
         auto format = regs.index_array.format;
         const bool is_uint8 = format == Maxwell::IndexFormat::UnsignedByte;
@@ -980,12 +1088,12 @@ void RasterizerVulkan::SetupGraphicsGlobalBuffers(const ShaderEntries& entries,
     }
 }
 
-void RasterizerVulkan::SetupGraphicsTexelBuffers(const ShaderEntries& entries, std::size_t stage) {
+void RasterizerVulkan::SetupGraphicsUniformTexels(const ShaderEntries& entries, std::size_t stage) {
     MICROPROFILE_SCOPE(Vulkan_Textures);
     const auto& gpu = system.GPU().Maxwell3D();
-    for (const auto& entry : entries.texel_buffers) {
+    for (const auto& entry : entries.uniform_texels) {
         const auto image = GetTextureInfo(gpu, entry, stage).tic;
-        SetupTexelBuffer(image, entry);
+        SetupUniformTexels(image, entry);
     }
 }
 
@@ -1000,6 +1108,15 @@ void RasterizerVulkan::SetupGraphicsTextures(const ShaderEntries& entries, std::
     }
 }
 
+void RasterizerVulkan::SetupGraphicsStorageTexels(const ShaderEntries& entries, std::size_t stage) {
+    MICROPROFILE_SCOPE(Vulkan_Textures);
+    const auto& gpu = system.GPU().Maxwell3D();
+    for (const auto& entry : entries.storage_texels) {
+        const auto image = GetTextureInfo(gpu, entry, stage).tic;
+        SetupStorageTexel(image, entry);
+    }
+}
+
 void RasterizerVulkan::SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage) {
     MICROPROFILE_SCOPE(Vulkan_Images);
     const auto& gpu = system.GPU().Maxwell3D();
@@ -1032,12 +1149,12 @@ void RasterizerVulkan::SetupComputeGlobalBuffers(const ShaderEntries& entries) {
     }
 }
 
-void RasterizerVulkan::SetupComputeTexelBuffers(const ShaderEntries& entries) {
+void RasterizerVulkan::SetupComputeUniformTexels(const ShaderEntries& entries) {
     MICROPROFILE_SCOPE(Vulkan_Textures);
     const auto& gpu = system.GPU().KeplerCompute();
-    for (const auto& entry : entries.texel_buffers) {
+    for (const auto& entry : entries.uniform_texels) {
         const auto image = GetTextureInfo(gpu, entry, ComputeShaderIndex).tic;
-        SetupTexelBuffer(image, entry);
+        SetupUniformTexels(image, entry);
     }
 }
 
@@ -1052,6 +1169,15 @@ void RasterizerVulkan::SetupComputeTextures(const ShaderEntries& entries) {
     }
 }
 
+void RasterizerVulkan::SetupComputeStorageTexels(const ShaderEntries& entries) {
+    MICROPROFILE_SCOPE(Vulkan_Textures);
+    const auto& gpu = system.GPU().KeplerCompute();
+    for (const auto& entry : entries.storage_texels) {
+        const auto image = GetTextureInfo(gpu, entry, ComputeShaderIndex).tic;
+        SetupStorageTexel(image, entry);
+    }
+}
+
 void RasterizerVulkan::SetupComputeImages(const ShaderEntries& entries) {
     MICROPROFILE_SCOPE(Vulkan_Images);
     const auto& gpu = system.GPU().KeplerCompute();
@@ -1074,10 +1200,9 @@ void RasterizerVulkan::SetupConstBuffer(const ConstBufferEntry& entry,
         Common::AlignUp(CalculateConstBufferSize(entry, buffer), 4 * sizeof(float));
     ASSERT(size <= MaxConstbufferSize);
 
-    const auto [buffer_handle, offset] =
+    const auto info =
         buffer_cache.UploadMemory(buffer.address, size, device.GetUniformBufferAlignment());
-
-    update_descriptor_queue.AddBuffer(buffer_handle, offset, size);
+    update_descriptor_queue.AddBuffer(info.handle, info.offset, size);
 }
 
 void RasterizerVulkan::SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAddr address) {
@@ -1091,18 +1216,18 @@ void RasterizerVulkan::SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAdd
         // Note: Do *not* use DefaultBuffer() here, storage buffers can be written breaking the
         // default buffer.
         static constexpr std::size_t dummy_size = 4;
-        const auto buffer = buffer_cache.GetEmptyBuffer(dummy_size);
-        update_descriptor_queue.AddBuffer(buffer, 0, dummy_size);
+        const auto info = buffer_cache.GetEmptyBuffer(dummy_size);
+        update_descriptor_queue.AddBuffer(info.handle, info.offset, dummy_size);
         return;
     }
 
-    const auto [buffer, offset] = buffer_cache.UploadMemory(
+    const auto info = buffer_cache.UploadMemory(
         actual_addr, size, device.GetStorageBufferAlignment(), entry.IsWritten());
-    update_descriptor_queue.AddBuffer(buffer, offset, size);
+    update_descriptor_queue.AddBuffer(info.handle, info.offset, size);
 }
 
-void RasterizerVulkan::SetupTexelBuffer(const Tegra::Texture::TICEntry& tic,
-                                        const TexelBufferEntry& entry) {
+void RasterizerVulkan::SetupUniformTexels(const Tegra::Texture::TICEntry& tic,
+                                          const UniformTexelEntry& entry) {
     const auto view = texture_cache.GetTextureSurface(tic, entry);
     ASSERT(view->IsBufferView());
 
@@ -1114,16 +1239,24 @@ void RasterizerVulkan::SetupTexture(const Tegra::Texture::FullTextureInfo& textu
     auto view = texture_cache.GetTextureSurface(texture.tic, entry);
     ASSERT(!view->IsBufferView());
 
-    const auto image_view = view->GetHandle(texture.tic.x_source, texture.tic.y_source,
-                                            texture.tic.z_source, texture.tic.w_source);
+    const VkImageView image_view = view->GetImageView(texture.tic.x_source, texture.tic.y_source,
+                                                      texture.tic.z_source, texture.tic.w_source);
     const auto sampler = sampler_cache.GetSampler(texture.tsc);
     update_descriptor_queue.AddSampledImage(sampler, image_view);
 
-    const auto image_layout = update_descriptor_queue.GetLastImageLayout();
+    VkImageLayout* const image_layout = update_descriptor_queue.LastImageLayout();
     *image_layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
     sampled_views.push_back(ImageView{std::move(view), image_layout});
 }
 
+void RasterizerVulkan::SetupStorageTexel(const Tegra::Texture::TICEntry& tic,
+                                         const StorageTexelEntry& entry) {
+    const auto view = texture_cache.GetImageSurface(tic, entry);
+    ASSERT(view->IsBufferView());
+
+    update_descriptor_queue.AddTexelBuffer(view->GetBufferView());
+}
+
 void RasterizerVulkan::SetupImage(const Tegra::Texture::TICEntry& tic, const ImageEntry& entry) {
     auto view = texture_cache.GetImageSurface(tic, entry);
 
@@ -1133,10 +1266,11 @@ void RasterizerVulkan::SetupImage(const Tegra::Texture::TICEntry& tic, const Ima
 
     UNIMPLEMENTED_IF(tic.IsBuffer());
 
-    const auto image_view = view->GetHandle(tic.x_source, tic.y_source, tic.z_source, tic.w_source);
+    const VkImageView image_view =
+        view->GetImageView(tic.x_source, tic.y_source, tic.z_source, tic.w_source);
     update_descriptor_queue.AddImage(image_view);
 
-    const auto image_layout = update_descriptor_queue.GetLastImageLayout();
+    VkImageLayout* const image_layout = update_descriptor_queue.LastImageLayout();
     *image_layout = VK_IMAGE_LAYOUT_GENERAL;
     image_views.push_back(ImageView{std::move(view), image_layout});
 }
@@ -1231,6 +1365,117 @@ void RasterizerVulkan::UpdateStencilFaces(Tegra::Engines::Maxwell3D::Regs& regs)
     }
 }
 
+void RasterizerVulkan::UpdateCullMode(Tegra::Engines::Maxwell3D::Regs& regs) {
+    if (!state_tracker.TouchCullMode()) { // NOTE(review): presumably a dirty-flag check - confirm
+        return;
+    }
+    scheduler.Record( // The register values are copied into the closure by the init-captures
+        [enabled = regs.cull_test_enabled, cull_face = regs.cull_face](vk::CommandBuffer cmdbuf) {
+            cmdbuf.SetCullModeEXT(enabled ? MaxwellToVK::CullFace(cull_face) : VK_CULL_MODE_NONE);
+        }); // With culling disabled the cull face register is ignored and NONE is set
+}
+
+void RasterizerVulkan::UpdateDepthBoundsTestEnable(Tegra::Engines::Maxwell3D::Regs& regs) {
+    if (!state_tracker.TouchDepthBoundsTestEnable()) {
+        return;
+    }
+    scheduler.Record([enable = regs.depth_bounds_enable](vk::CommandBuffer cmdbuf) {
+        cmdbuf.SetDepthBoundsTestEnableEXT(enable);
+    });
+}
+
+void RasterizerVulkan::UpdateDepthTestEnable(Tegra::Engines::Maxwell3D::Regs& regs) {
+    if (!state_tracker.TouchDepthTestEnable()) {
+        return;
+    }
+    scheduler.Record([enable = regs.depth_test_enable](vk::CommandBuffer cmdbuf) {
+        cmdbuf.SetDepthTestEnableEXT(enable);
+    });
+}
+
+void RasterizerVulkan::UpdateDepthWriteEnable(Tegra::Engines::Maxwell3D::Regs& regs) {
+    if (!state_tracker.TouchDepthWriteEnable()) {
+        return;
+    }
+    scheduler.Record([enable = regs.depth_write_enabled](vk::CommandBuffer cmdbuf) {
+        cmdbuf.SetDepthWriteEnableEXT(enable);
+    });
+}
+
+void RasterizerVulkan::UpdateDepthCompareOp(Tegra::Engines::Maxwell3D::Regs& regs) {
+    if (!state_tracker.TouchDepthCompareOp()) {
+        return;
+    }
+    scheduler.Record([func = regs.depth_test_func](vk::CommandBuffer cmdbuf) {
+        cmdbuf.SetDepthCompareOpEXT(MaxwellToVK::ComparisonOp(func));
+    });
+}
+
+void RasterizerVulkan::UpdateFrontFace(Tegra::Engines::Maxwell3D::Regs& regs) {
+    if (!state_tracker.TouchFrontFace()) {
+        return;
+    }
+    // A non-zero triangle_rast_flip inverts the winding order translated from regs.front_face.
+    VkFrontFace front_face = MaxwellToVK::FrontFace(regs.front_face);
+    if (regs.screen_y_control.triangle_rast_flip != 0) {
+        front_face = front_face == VK_FRONT_FACE_CLOCKWISE ? VK_FRONT_FACE_COUNTER_CLOCKWISE
+                                                           : VK_FRONT_FACE_CLOCKWISE;
+    }
+    scheduler.Record(
+        [front_face](vk::CommandBuffer cmdbuf) { cmdbuf.SetFrontFaceEXT(front_face); });
+}
+
+void RasterizerVulkan::UpdatePrimitiveTopology(Tegra::Engines::Maxwell3D::Regs& regs) {
+    if (!state_tracker.TouchPrimitiveTopology()) { // NOTE(review): presumably a dirty check
+        return;
+    }
+    const Maxwell::PrimitiveTopology primitive_topology = regs.draw.topology.Value();
+    scheduler.Record([this, primitive_topology](vk::CommandBuffer cmdbuf) { // this: for device
+        cmdbuf.SetPrimitiveTopologyEXT(MaxwellToVK::PrimitiveTopology(device, primitive_topology));
+    }); // The Maxwell topology is translated to its Vulkan value inside the recorded callback
+}
+
+// Records dynamic stencil ops. SetStencilOpEXT takes (faces, fail, pass, depth fail, compare).
+void RasterizerVulkan::UpdateStencilOp(Tegra::Engines::Maxwell3D::Regs& regs) {
+    if (!state_tracker.TouchStencilOp()) {
+        return;
+    }
+    const Maxwell::StencilOp fail = regs.stencil_front_op_fail;
+    const Maxwell::StencilOp zfail = regs.stencil_front_op_zfail;
+    const Maxwell::StencilOp zpass = regs.stencil_front_op_zpass;
+    const Maxwell::ComparisonOp compare = regs.stencil_front_func_func;
+    if (regs.stencil_two_side_enable) { // Two-sided: each face uses its own registers
+        const Maxwell::StencilOp back_fail = regs.stencil_back_op_fail;
+        const Maxwell::StencilOp back_zfail = regs.stencil_back_op_zfail;
+        const Maxwell::StencilOp back_zpass = regs.stencil_back_op_zpass;
+        const Maxwell::ComparisonOp back_compare = regs.stencil_back_func_func;
+        scheduler.Record([=](vk::CommandBuffer cmdbuf) {
+            cmdbuf.SetStencilOpEXT(VK_STENCIL_FACE_FRONT_BIT, MaxwellToVK::StencilOp(fail),
+                                   MaxwellToVK::StencilOp(zpass), MaxwellToVK::StencilOp(zfail),
+                                   MaxwellToVK::ComparisonOp(compare));
+            cmdbuf.SetStencilOpEXT(VK_STENCIL_FACE_BACK_BIT, MaxwellToVK::StencilOp(back_fail),
+                                   MaxwellToVK::StencilOp(back_zpass),
+                                   MaxwellToVK::StencilOp(back_zfail),
+                                   MaxwellToVK::ComparisonOp(back_compare));
+        });
+    } else { // Single-sided: the front face registers define the stencil op of both faces
+        scheduler.Record([fail, zfail, zpass, compare](vk::CommandBuffer cmdbuf) {
+            cmdbuf.SetStencilOpEXT(VK_STENCIL_FACE_FRONT_AND_BACK, MaxwellToVK::StencilOp(fail),
+                                   MaxwellToVK::StencilOp(zpass), MaxwellToVK::StencilOp(zfail),
+                                   MaxwellToVK::ComparisonOp(compare));
+        });
+    }
+}
+
+void RasterizerVulkan::UpdateStencilTestEnable(Tegra::Engines::Maxwell3D::Regs& regs) {
+    if (!state_tracker.TouchStencilTestEnable()) {
+        return;
+    }
+    scheduler.Record([enable = regs.stencil_enable](vk::CommandBuffer cmdbuf) {
+        cmdbuf.SetStencilTestEnableEXT(enable);
+    });
+}
+
 std::size_t RasterizerVulkan::CalculateGraphicsStreamBufferSize(bool is_indexed) const {
     std::size_t size = CalculateVertexArraysSize();
     if (is_indexed) {
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index 0ed0e48c6..923178b0b 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -159,7 +159,10 @@ private:
 
     void FlushWork();
 
-    Texceptions UpdateAttachments();
+    /// @brief Updates the currently bound attachments
+    /// @param is_clear True when the framebuffer is updated as a clear
+    /// @return Bitfield of attachments being used as sampled textures
+    Texceptions UpdateAttachments(bool is_clear);
 
     std::tuple<VkFramebuffer, VkExtent2D> ConfigureFramebuffers(VkRenderPass renderpass);
 
@@ -168,7 +171,7 @@ private:
                                  bool is_indexed, bool is_instanced);
 
     /// Setup descriptors in the graphics pipeline.
-    void SetupShaderDescriptors(const std::array<Shader, Maxwell::MaxShaderProgram>& shaders);
+    void SetupShaderDescriptors(const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders);
 
     void SetupImageTransitions(Texceptions texceptions,
                                const std::array<View, Maxwell::NumRenderTargets>& color_attachments,
@@ -182,8 +185,7 @@ private:
 
     bool WalkAttachmentOverlaps(const CachedSurfaceView& attachment);
 
-    void SetupVertexArrays(FixedPipelineState::VertexInput& vertex_input,
-                           BufferBindings& buffer_bindings);
+    void SetupVertexArrays(BufferBindings& buffer_bindings);
 
     void SetupIndexBuffer(BufferBindings& buffer_bindings, DrawParameters& params, bool is_indexed);
 
@@ -193,12 +195,15 @@ private:
     /// Setup global buffers in the graphics pipeline.
     void SetupGraphicsGlobalBuffers(const ShaderEntries& entries, std::size_t stage);
 
-    /// Setup texel buffers in the graphics pipeline.
-    void SetupGraphicsTexelBuffers(const ShaderEntries& entries, std::size_t stage);
+    /// Setup uniform texels in the graphics pipeline.
+    void SetupGraphicsUniformTexels(const ShaderEntries& entries, std::size_t stage);
 
     /// Setup textures in the graphics pipeline.
     void SetupGraphicsTextures(const ShaderEntries& entries, std::size_t stage);
 
+    /// Setup storage texels in the graphics pipeline.
+    void SetupGraphicsStorageTexels(const ShaderEntries& entries, std::size_t stage);
+
     /// Setup images in the graphics pipeline.
     void SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage);
 
@@ -209,11 +214,14 @@ private:
     void SetupComputeGlobalBuffers(const ShaderEntries& entries);
 
-    /// Setup texel buffers in the compute pipeline.
+    /// Setup uniform texels in the compute pipeline.
-    void SetupComputeTexelBuffers(const ShaderEntries& entries);
+    void SetupComputeUniformTexels(const ShaderEntries& entries);
 
     /// Setup textures in the compute pipeline.
     void SetupComputeTextures(const ShaderEntries& entries);
 
+    /// Setup storage texels in the compute pipeline.
+    void SetupComputeStorageTexels(const ShaderEntries& entries);
+
     /// Setup images in the compute pipeline.
     void SetupComputeImages(const ShaderEntries& entries);
 
@@ -222,10 +230,12 @@ private:
 
     void SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAddr address);
 
-    void SetupTexelBuffer(const Tegra::Texture::TICEntry& image, const TexelBufferEntry& entry);
+    void SetupUniformTexels(const Tegra::Texture::TICEntry& image, const UniformTexelEntry& entry);
 
     void SetupTexture(const Tegra::Texture::FullTextureInfo& texture, const SamplerEntry& entry);
 
+    void SetupStorageTexel(const Tegra::Texture::TICEntry& tic, const StorageTexelEntry& entry);
+
     void SetupImage(const Tegra::Texture::TICEntry& tic, const ImageEntry& entry);
 
     void UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs);
@@ -235,6 +245,16 @@ private:
     void UpdateDepthBounds(Tegra::Engines::Maxwell3D::Regs& regs);
     void UpdateStencilFaces(Tegra::Engines::Maxwell3D::Regs& regs);
 
+    void UpdateCullMode(Tegra::Engines::Maxwell3D::Regs& regs);
+    void UpdateDepthBoundsTestEnable(Tegra::Engines::Maxwell3D::Regs& regs);
+    void UpdateDepthTestEnable(Tegra::Engines::Maxwell3D::Regs& regs);
+    void UpdateDepthWriteEnable(Tegra::Engines::Maxwell3D::Regs& regs);
+    void UpdateDepthCompareOp(Tegra::Engines::Maxwell3D::Regs& regs);
+    void UpdateFrontFace(Tegra::Engines::Maxwell3D::Regs& regs);
+    void UpdatePrimitiveTopology(Tegra::Engines::Maxwell3D::Regs& regs);
+    void UpdateStencilOp(Tegra::Engines::Maxwell3D::Regs& regs);
+    void UpdateStencilTestEnable(Tegra::Engines::Maxwell3D::Regs& regs);
+
     std::size_t CalculateGraphicsStreamBufferSize(bool is_indexed) const;
 
     std::size_t CalculateComputeStreamBufferSize() const;
diff --git a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp
index e6f2fa553..616eacc36 100644
--- a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp
@@ -9,6 +9,8 @@
 #include "video_core/renderer_vulkan/wrapper.h"
 #include "video_core/textures/texture.h"
 
+using Tegra::Texture::TextureMipmapFilter;
+
 namespace Vulkan {
 
 namespace {
@@ -63,8 +65,8 @@ vk::Sampler VKSamplerCache::CreateSampler(const Tegra::Texture::TSCEntry& tsc) c
     ci.maxAnisotropy = tsc.GetMaxAnisotropy();
     ci.compareEnable = tsc.depth_compare_enabled;
     ci.compareOp = MaxwellToVK::Sampler::DepthCompareFunction(tsc.depth_compare_func);
-    ci.minLod = tsc.GetMinLod();
-    ci.maxLod = tsc.GetMaxLod();
+    ci.minLod = tsc.mipmap_filter == TextureMipmapFilter::None ? 0.0f : tsc.GetMinLod();
+    ci.maxLod = tsc.mipmap_filter == TextureMipmapFilter::None ? 0.25f : tsc.GetMaxLod();
     ci.borderColor = arbitrary_borders ? VK_BORDER_COLOR_INT_CUSTOM_EXT : ConvertBorderColor(color);
     ci.unnormalizedCoordinates = VK_FALSE;
     return device.GetLogical().CreateSampler(ci);
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp
index 82ec9180e..56524e6f3 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.cpp
+++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp
@@ -9,6 +9,7 @@
 #include <utility>
 
 #include "common/microprofile.h"
+#include "common/thread.h"
 #include "video_core/renderer_vulkan/vk_device.h"
 #include "video_core/renderer_vulkan/vk_query_cache.h"
 #include "video_core/renderer_vulkan/vk_resource_manager.h"
@@ -133,6 +134,7 @@ void VKScheduler::BindGraphicsPipeline(VkPipeline pipeline) {
 }
 
 void VKScheduler::WorkerThread() {
+    Common::SetCurrentThreadPriority(Common::ThreadPriority::High);
     std::unique_lock lock{mutex};
     do {
         cv.wait(lock, [this] { return !chunk_queue.Empty() || quit; });
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index 6f6dedd82..97429cc59 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -400,8 +400,9 @@ private:
         u32 binding = specialization.base_binding;
         binding = DeclareConstantBuffers(binding);
         binding = DeclareGlobalBuffers(binding);
-        binding = DeclareTexelBuffers(binding);
+        binding = DeclareUniformTexels(binding);
         binding = DeclareSamplers(binding);
+        binding = DeclareStorageTexels(binding);
         binding = DeclareImages(binding);
 
         const Id main = OpFunction(t_void, {}, TypeFunction(t_void));
@@ -741,8 +742,10 @@ private:
             if (!IsGenericAttribute(index)) {
                 continue;
             }
-
             const u32 location = GetGenericAttributeLocation(index);
+            if (!IsAttributeEnabled(location)) {
+                continue;
+            }
             const auto type_descriptor = GetAttributeType(location);
             Id type;
             if (IsInputAttributeArray()) {
@@ -887,7 +890,7 @@ private:
         return binding;
     }
 
-    u32 DeclareTexelBuffers(u32 binding) {
+    u32 DeclareUniformTexels(u32 binding) {
         for (const auto& sampler : ir.GetSamplers()) {
             if (!sampler.is_buffer) {
                 continue;
@@ -908,7 +911,7 @@ private:
             Decorate(id, spv::Decoration::Binding, binding++);
             Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET);
 
-            texel_buffers.emplace(sampler.index, TexelBuffer{image_type, id});
+            uniform_texels.emplace(sampler.index, TexelBuffer{image_type, id});
         }
         return binding;
     }
@@ -943,31 +946,48 @@ private:
         return binding;
     }
 
-    u32 DeclareImages(u32 binding) {
+    u32 DeclareStorageTexels(u32 binding) {
         for (const auto& image : ir.GetImages()) {
-            const auto [dim, arrayed] = GetImageDim(image);
-            constexpr int depth = 0;
-            constexpr bool ms = false;
-            constexpr int sampled = 2; // This won't be accessed with a sampler
-            constexpr auto format = spv::ImageFormat::Unknown;
-            const Id image_type = TypeImage(t_uint, dim, depth, arrayed, ms, sampled, format, {});
-            const Id pointer_type = TypePointer(spv::StorageClass::UniformConstant, image_type);
-            const Id id = OpVariable(pointer_type, spv::StorageClass::UniformConstant);
-            AddGlobalVariable(Name(id, fmt::format("image_{}", image.index)));
-
-            Decorate(id, spv::Decoration::Binding, binding++);
-            Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET);
-            if (image.is_read && !image.is_written) {
-                Decorate(id, spv::Decoration::NonWritable);
-            } else if (image.is_written && !image.is_read) {
-                Decorate(id, spv::Decoration::NonReadable);
+            if (image.type != Tegra::Shader::ImageType::TextureBuffer) {
+                continue;
             }
+            DeclareImage(image, binding);
+        }
+        return binding;
+    }
 
-            images.emplace(image.index, StorageImage{image_type, id});
+    u32 DeclareImages(u32 binding) {
+        for (const auto& image : ir.GetImages()) {
+            if (image.type == Tegra::Shader::ImageType::TextureBuffer) {
+                continue;
+            }
+            DeclareImage(image, binding);
         }
         return binding;
     }
 
+    void DeclareImage(const Image& image, u32& binding) {
+        const auto [dim, arrayed] = GetImageDim(image);
+        constexpr int depth = 0;
+        constexpr bool ms = false;
+        constexpr int sampled = 2; // This won't be accessed with a sampler
+        const auto format = image.is_atomic ? spv::ImageFormat::R32ui : spv::ImageFormat::Unknown;
+        const Id image_type = TypeImage(t_uint, dim, depth, arrayed, ms, sampled, format, {});
+        const Id pointer_type = TypePointer(spv::StorageClass::UniformConstant, image_type);
+        const Id id = OpVariable(pointer_type, spv::StorageClass::UniformConstant);
+        AddGlobalVariable(Name(id, fmt::format("image_{}", image.index)));
+
+        Decorate(id, spv::Decoration::Binding, binding++);
+        Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET);
+        if (image.is_read && !image.is_written) {
+            Decorate(id, spv::Decoration::NonWritable);
+        } else if (image.is_written && !image.is_read) {
+            Decorate(id, spv::Decoration::NonReadable);
+        }
+
+        images.emplace(image.index, StorageImage{image_type, id});
+    }
+
     bool IsRenderTargetEnabled(u32 rt) const {
         for (u32 component = 0; component < 4; ++component) {
             if (header.ps.IsColorComponentOutputEnabled(rt, component)) {
@@ -986,6 +1006,10 @@ private:
         return stage == ShaderType::TesselationControl;
     }
 
+    bool IsAttributeEnabled(u32 location) const {
+        return stage != ShaderType::Vertex || specialization.enabled_attributes[location];
+    }
+
     u32 GetNumInputVertices() const {
         switch (stage) {
         case ShaderType::Geometry:
@@ -1201,16 +1225,20 @@ private:
                 UNIMPLEMENTED_MSG("Unmanaged FrontFacing element={}", element);
                 return {v_float_zero, Type::Float};
             default:
-                if (IsGenericAttribute(attribute)) {
-                    const u32 location = GetGenericAttributeLocation(attribute);
-                    const auto type_descriptor = GetAttributeType(location);
-                    const Type type = type_descriptor.type;
-                    const Id attribute_id = input_attributes.at(attribute);
-                    const std::vector elements = {element};
-                    const Id pointer = ArrayPass(type_descriptor.scalar, attribute_id, elements);
-                    return {OpLoad(GetTypeDefinition(type), pointer), type};
+                if (!IsGenericAttribute(attribute)) {
+                    break;
                 }
-                break;
+                const u32 location = GetGenericAttributeLocation(attribute);
+                if (!IsAttributeEnabled(location)) {
+                    // Disabled attributes (also known as constant attributes) always return zero.
+                    return {v_float_zero, Type::Float};
+                }
+                const auto type_descriptor = GetAttributeType(location);
+                const Type type = type_descriptor.type;
+                const Id attribute_id = input_attributes.at(attribute);
+                const std::vector elements = {element};
+                const Id pointer = ArrayPass(type_descriptor.scalar, attribute_id, elements);
+                return {OpLoad(GetTypeDefinition(type), pointer), type};
             }
             UNIMPLEMENTED_MSG("Unhandled input attribute: {}", static_cast<u32>(attribute));
             return {v_float_zero, Type::Float};
@@ -1246,7 +1274,7 @@ private:
                 } else {
                     UNREACHABLE_MSG("Unmanaged offset node type");
                 }
-                pointer = OpAccessChain(t_cbuf_float, buffer_id, Constant(t_uint, 0), buffer_index,
+                pointer = OpAccessChain(t_cbuf_float, buffer_id, v_uint_zero, buffer_index,
                                         buffer_element);
             }
             return {OpLoad(t_float, pointer), Type::Float};
@@ -1601,7 +1629,7 @@ private:
 
         const Id result = OpIAddCarry(TypeStruct({t_uint, t_uint}), op_a, op_b);
         const Id carry = OpCompositeExtract(t_uint, result, 1);
-        return {OpINotEqual(t_bool, carry, Constant(t_uint, 0)), Type::Bool};
+        return {OpINotEqual(t_bool, carry, v_uint_zero), Type::Bool};
     }
 
     Expression LogicalAssign(Operation operation) {
@@ -1664,7 +1692,7 @@ private:
         const auto& meta = std::get<MetaTexture>(operation.GetMeta());
         const u32 index = meta.sampler.index;
         if (meta.sampler.is_buffer) {
-            const auto& entry = texel_buffers.at(index);
+            const auto& entry = uniform_texels.at(index);
             return OpLoad(entry.image_type, entry.image);
         } else {
             const auto& entry = sampled_images.at(index);
@@ -1941,39 +1969,20 @@ private:
         return {};
     }
 
-    Expression AtomicImageAdd(Operation operation) {
-        UNIMPLEMENTED();
-        return {};
-    }
-
-    Expression AtomicImageMin(Operation operation) {
-        UNIMPLEMENTED();
-        return {};
-    }
-
-    Expression AtomicImageMax(Operation operation) {
-        UNIMPLEMENTED();
-        return {};
-    }
-
-    Expression AtomicImageAnd(Operation operation) {
-        UNIMPLEMENTED();
-        return {};
-    }
-
-    Expression AtomicImageOr(Operation operation) {
-        UNIMPLEMENTED();
-        return {};
-    }
+    template <Id (Module::*func)(Id, Id, Id, Id, Id)>
+    Expression AtomicImage(Operation operation) {
+        const auto& meta{std::get<MetaImage>(operation.GetMeta())};
+        ASSERT(meta.values.size() == 1);
 
-    Expression AtomicImageXor(Operation operation) {
-        UNIMPLEMENTED();
-        return {};
-    }
+        const Id coordinate = GetCoordinates(operation, Type::Int);
+        const Id image = images.at(meta.image.index).image;
+        const Id sample = v_uint_zero;
+        const Id pointer = OpImageTexelPointer(t_image_uint, image, coordinate, sample);
 
-    Expression AtomicImageExchange(Operation operation) {
-        UNIMPLEMENTED();
-        return {};
+        const Id scope = Constant(t_uint, static_cast<u32>(spv::Scope::Device));
+        const Id semantics = v_uint_zero;
+        const Id value = AsUint(Visit(meta.values[0]));
+        return {(this->*func)(t_uint, pointer, scope, semantics, value), Type::Uint};
     }
 
     template <Id (Module::*func)(Id, Id, Id, Id, Id)>
@@ -1988,7 +1997,7 @@ private:
             return {v_float_zero, Type::Float};
         }
         const Id scope = Constant(t_uint, static_cast<u32>(spv::Scope::Device));
-        const Id semantics = Constant(t_uint, 0);
+        const Id semantics = v_uint_zero;
         const Id value = AsUint(Visit(operation[1]));
 
         return {(this->*func)(t_uint, pointer, scope, semantics, value), Type::Uint};
@@ -2612,11 +2621,11 @@ private:
 
         &SPIRVDecompiler::ImageLoad,
         &SPIRVDecompiler::ImageStore,
-        &SPIRVDecompiler::AtomicImageAdd,
-        &SPIRVDecompiler::AtomicImageAnd,
-        &SPIRVDecompiler::AtomicImageOr,
-        &SPIRVDecompiler::AtomicImageXor,
-        &SPIRVDecompiler::AtomicImageExchange,
+        &SPIRVDecompiler::AtomicImage<&Module::OpAtomicIAdd>,
+        &SPIRVDecompiler::AtomicImage<&Module::OpAtomicAnd>,
+        &SPIRVDecompiler::AtomicImage<&Module::OpAtomicOr>,
+        &SPIRVDecompiler::AtomicImage<&Module::OpAtomicXor>,
+        &SPIRVDecompiler::AtomicImage<&Module::OpAtomicExchange>,
 
         &SPIRVDecompiler::Atomic<&Module::OpAtomicExchange>,
         &SPIRVDecompiler::Atomic<&Module::OpAtomicIAdd>,
@@ -2758,8 +2767,11 @@ private:
         Decorate(TypeStruct(t_gmem_array), spv::Decoration::Block), 0, spv::Decoration::Offset, 0);
     const Id t_gmem_ssbo = TypePointer(spv::StorageClass::StorageBuffer, t_gmem_struct);
 
+    const Id t_image_uint = TypePointer(spv::StorageClass::Image, t_uint);
+
     const Id v_float_zero = Constant(t_float, 0.0f);
     const Id v_float_one = Constant(t_float, 1.0f);
+    const Id v_uint_zero = Constant(t_uint, 0);
 
     // Nvidia uses these defaults for varyings (e.g. position and generic attributes)
     const Id v_varying_default =
@@ -2784,15 +2796,16 @@ private:
     std::unordered_map<u8, GenericVaryingDescription> output_attributes;
     std::map<u32, Id> constant_buffers;
     std::map<GlobalMemoryBase, Id> global_buffers;
-    std::map<u32, TexelBuffer> texel_buffers;
+    std::map<u32, TexelBuffer> uniform_texels;
     std::map<u32, SampledImage> sampled_images;
+    std::map<u32, TexelBuffer> storage_texels;
     std::map<u32, StorageImage> images;
 
+    std::array<Id, Maxwell::NumRenderTargets> frag_colors{};
     Id instance_index{};
     Id vertex_index{};
     Id base_instance{};
     Id base_vertex{};
-    std::array<Id, Maxwell::NumRenderTargets> frag_colors{};
     Id frag_depth{};
     Id frag_coord{};
     Id front_facing{};
@@ -3048,13 +3061,17 @@ ShaderEntries GenerateShaderEntries(const VideoCommon::Shader::ShaderIR& ir) {
     }
     for (const auto& sampler : ir.GetSamplers()) {
         if (sampler.is_buffer) {
-            entries.texel_buffers.emplace_back(sampler);
+            entries.uniform_texels.emplace_back(sampler);
         } else {
             entries.samplers.emplace_back(sampler);
         }
     }
     for (const auto& image : ir.GetImages()) {
-        entries.images.emplace_back(image);
+        if (image.type == Tegra::Shader::ImageType::TextureBuffer) {
+            entries.storage_texels.emplace_back(image);
+        } else {
+            entries.images.emplace_back(image);
+        }
     }
     for (const auto& attribute : ir.GetInputAttributes()) {
         if (IsGenericAttribute(attribute)) {
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.h b/src/video_core/renderer_vulkan/vk_shader_decompiler.h
index f4c05ac3c..2b0e90396 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.h
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.h
@@ -21,8 +21,9 @@ class VKDevice;
 namespace Vulkan {
 
 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
-using TexelBufferEntry = VideoCommon::Shader::Sampler;
+using UniformTexelEntry = VideoCommon::Shader::Sampler;
 using SamplerEntry = VideoCommon::Shader::Sampler;
+using StorageTexelEntry = VideoCommon::Shader::Image;
 using ImageEntry = VideoCommon::Shader::Image;
 
 constexpr u32 DESCRIPTOR_SET = 0;
@@ -66,13 +67,15 @@ private:
 struct ShaderEntries {
     u32 NumBindings() const {
         return static_cast<u32>(const_buffers.size() + global_buffers.size() +
-                                texel_buffers.size() + samplers.size() + images.size());
+                                uniform_texels.size() + samplers.size() + storage_texels.size() +
+                                images.size());
     }
 
     std::vector<ConstBufferEntry> const_buffers;
     std::vector<GlobalBufferEntry> global_buffers;
-    std::vector<TexelBufferEntry> texel_buffers;
+    std::vector<UniformTexelEntry> uniform_texels;
     std::vector<SamplerEntry> samplers;
+    std::vector<StorageTexelEntry> storage_texels;
     std::vector<ImageEntry> images;
     std::set<u32> attributes;
     std::array<bool, Maxwell::NumClipDistances> clip_distances{};
@@ -88,7 +91,8 @@ struct Specialization final {
     u32 shared_memory_size{};
 
     // Graphics specific
-    std::optional<float> point_size{};
+    std::optional<float> point_size;
+    std::bitset<Maxwell::NumVertexAttributes> enabled_attributes;
     std::array<Maxwell::VertexAttribute::Type, Maxwell::NumVertexAttributes> attribute_types{};
     bool ndc_minus_one_to_one{};
 };
diff --git a/src/video_core/renderer_vulkan/vk_state_tracker.cpp b/src/video_core/renderer_vulkan/vk_state_tracker.cpp
index 94a89e388..e5a583dd5 100644
--- a/src/video_core/renderer_vulkan/vk_state_tracker.cpp
+++ b/src/video_core/renderer_vulkan/vk_state_tracker.cpp
@@ -36,6 +36,15 @@ Flags MakeInvalidationFlags() {
     flags[BlendConstants] = true;
     flags[DepthBounds] = true;
     flags[StencilProperties] = true;
+    flags[CullMode] = true;
+    flags[DepthBoundsEnable] = true;
+    flags[DepthTestEnable] = true;
+    flags[DepthWriteEnable] = true;
+    flags[DepthCompareOp] = true;
+    flags[FrontFace] = true;
+    flags[PrimitiveTopology] = true;
+    flags[StencilOp] = true;
+    flags[StencilTestEnable] = true;
     return flags;
 }
 
@@ -75,6 +84,57 @@ void SetupDirtyStencilProperties(Tables& tables) {
     table[OFF(stencil_back_func_mask)] = StencilProperties;
 }
 
+void SetupDirtyCullMode(Tables& tables) {
+    auto& table = tables[0];
+    table[OFF(cull_face)] = CullMode;
+    table[OFF(cull_test_enabled)] = CullMode;
+}
+
+void SetupDirtyDepthBoundsEnable(Tables& tables) {
+    tables[0][OFF(depth_bounds_enable)] = DepthBoundsEnable;
+}
+
+void SetupDirtyDepthTestEnable(Tables& tables) {
+    tables[0][OFF(depth_test_enable)] = DepthTestEnable;
+}
+
+void SetupDirtyDepthWriteEnable(Tables& tables) {
+    tables[0][OFF(depth_write_enabled)] = DepthWriteEnable;
+}
+
+void SetupDirtyDepthCompareOp(Tables& tables) {
+    tables[0][OFF(depth_test_func)] = DepthCompareOp;
+}
+
+void SetupDirtyFrontFace(Tables& tables) {
+    auto& table = tables[0];
+    table[OFF(front_face)] = FrontFace;
+    table[OFF(screen_y_control)] = FrontFace;
+}
+
+void SetupDirtyPrimitiveTopology(Tables& tables) {
+    tables[0][OFF(draw.topology)] = PrimitiveTopology;
+}
+
+void SetupDirtyStencilOp(Tables& tables) {
+    auto& table = tables[0];
+    table[OFF(stencil_front_op_fail)] = StencilOp;
+    table[OFF(stencil_front_op_zfail)] = StencilOp;
+    table[OFF(stencil_front_op_zpass)] = StencilOp;
+    table[OFF(stencil_front_func_func)] = StencilOp;
+    table[OFF(stencil_back_op_fail)] = StencilOp;
+    table[OFF(stencil_back_op_zfail)] = StencilOp;
+    table[OFF(stencil_back_op_zpass)] = StencilOp;
+    table[OFF(stencil_back_func_func)] = StencilOp;
+
+    // Table 0 is used by StencilProperties
+    tables[1][OFF(stencil_two_side_enable)] = StencilOp;
+}
+
+void SetupDirtyStencilTestEnable(Tables& tables) {
+    tables[0][OFF(stencil_enable)] = StencilTestEnable;
+}
+
 } // Anonymous namespace
 
 StateTracker::StateTracker(Core::System& system)
@@ -90,6 +150,15 @@ void StateTracker::Initialize() {
     SetupDirtyBlendConstants(tables);
     SetupDirtyDepthBounds(tables);
     SetupDirtyStencilProperties(tables);
+    SetupDirtyCullMode(tables);
+    SetupDirtyDepthBoundsEnable(tables);
+    SetupDirtyDepthTestEnable(tables);
+    SetupDirtyDepthWriteEnable(tables);
+    SetupDirtyDepthCompareOp(tables);
+    SetupDirtyFrontFace(tables);
+    SetupDirtyPrimitiveTopology(tables);
+    SetupDirtyStencilOp(tables);
+    SetupDirtyStencilTestEnable(tables);
 }
 
 void StateTracker::InvalidateCommandBufferState() {
diff --git a/src/video_core/renderer_vulkan/vk_state_tracker.h b/src/video_core/renderer_vulkan/vk_state_tracker.h
index 03bc415b2..54ca0d6c6 100644
--- a/src/video_core/renderer_vulkan/vk_state_tracker.h
+++ b/src/video_core/renderer_vulkan/vk_state_tracker.h
@@ -26,6 +26,16 @@ enum : u8 {
     DepthBounds,
     StencilProperties,
 
+    CullMode,
+    DepthBoundsEnable,
+    DepthTestEnable,
+    DepthWriteEnable,
+    DepthCompareOp,
+    FrontFace,
+    PrimitiveTopology,
+    StencilOp,
+    StencilTestEnable,
+
     Last
 };
 static_assert(Last <= std::numeric_limits<u8>::max());
@@ -64,6 +74,46 @@ public:
         return Exchange(Dirty::StencilProperties, false);
     }
 
+    bool TouchCullMode() {
+        return Exchange(Dirty::CullMode, false);
+    }
+
+    bool TouchDepthBoundsTestEnable() {
+        return Exchange(Dirty::DepthBoundsEnable, false);
+    }
+
+    bool TouchDepthTestEnable() {
+        return Exchange(Dirty::DepthTestEnable, false);
+    }
+
+    bool TouchDepthBoundsEnable() {
+        return Exchange(Dirty::DepthBoundsEnable, false);
+    }
+
+    bool TouchDepthWriteEnable() {
+        return Exchange(Dirty::DepthWriteEnable, false);
+    }
+
+    bool TouchDepthCompareOp() {
+        return Exchange(Dirty::DepthCompareOp, false);
+    }
+
+    bool TouchFrontFace() {
+        return Exchange(Dirty::FrontFace, false);
+    }
+
+    bool TouchPrimitiveTopology() {
+        return Exchange(Dirty::PrimitiveTopology, false);
+    }
+
+    bool TouchStencilOp() {
+        return Exchange(Dirty::StencilOp, false);
+    }
+
+    bool TouchStencilTestEnable() {
+        return Exchange(Dirty::StencilTestEnable, false);
+    }
+
 private:
     bool Exchange(std::size_t id, bool new_value) const noexcept {
         auto& flags = system.GPU().Maxwell3D().dirty.flags;
diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.cpp b/src/video_core/renderer_vulkan/vk_stream_buffer.cpp
index 868447af2..2d28a6c47 100644
--- a/src/video_core/renderer_vulkan/vk_stream_buffer.cpp
+++ b/src/video_core/renderer_vulkan/vk_stream_buffer.cpp
@@ -121,7 +121,7 @@ void VKStreamBuffer::CreateBuffers(VkBufferUsageFlags usage) {
 
     // Substract from the preferred heap size some bytes to avoid getting out of memory.
     const VkDeviceSize heap_size = memory_properties.memoryHeaps[preferred_heap].size;
-    const VkDeviceSize allocable_size = heap_size - 4 * 1024 * 1024;
+    const VkDeviceSize allocable_size = heap_size - 9 * 1024 * 1024;
 
     VkBufferCreateInfo buffer_ci;
     buffer_ci.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.h b/src/video_core/renderer_vulkan/vk_stream_buffer.h
index dfddf7ad6..689f0d276 100644
--- a/src/video_core/renderer_vulkan/vk_stream_buffer.h
+++ b/src/video_core/renderer_vulkan/vk_stream_buffer.h
@@ -35,10 +35,14 @@ public:
     /// Ensures that "size" bytes of memory are available to the GPU, potentially recording a copy.
     void Unmap(u64 size);
 
-    VkBuffer GetHandle() const {
+    VkBuffer Handle() const noexcept {
         return *buffer;
     }
 
+    u64 Address() const noexcept {
+        return 0;
+    }
+
 private:
     struct Watch final {
         VKFenceWatch fence;
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
index 2f1d5021d..430031665 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
@@ -100,8 +100,8 @@ vk::Buffer CreateBuffer(const VKDevice& device, const SurfaceParams& params,
     ci.pNext = nullptr;
     ci.flags = 0;
     ci.size = static_cast<VkDeviceSize>(host_memory_size);
-    ci.usage = VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
-               VK_BUFFER_USAGE_TRANSFER_DST_BIT;
+    ci.usage = VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT |
+               VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT;
     ci.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
     ci.queueFamilyIndexCount = 0;
     ci.pQueueFamilyIndices = nullptr;
@@ -167,6 +167,7 @@ VkImageCreateInfo GenerateImageCreateInfo(const VKDevice& device, const SurfaceP
         ci.extent = {params.width, params.height, 1};
         break;
     case SurfaceTarget::Texture3D:
+        ci.flags |= VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT;
         ci.extent = {params.width, params.height, params.depth};
         break;
     case SurfaceTarget::TextureBuffer:
@@ -176,6 +177,12 @@ VkImageCreateInfo GenerateImageCreateInfo(const VKDevice& device, const SurfaceP
     return ci;
 }
 
+u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source, Tegra::Texture::SwizzleSource y_source,
+                  Tegra::Texture::SwizzleSource z_source, Tegra::Texture::SwizzleSource w_source) {
+    return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) |
+           (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source);
+}
+
 } // Anonymous namespace
 
 CachedSurface::CachedSurface(Core::System& system, const VKDevice& device,
@@ -203,9 +210,11 @@ CachedSurface::CachedSurface(Core::System& system, const VKDevice& device,
     }
 
     // TODO(Rodrigo): Move this to a virtual function.
-    main_view = CreateViewInner(
-        ViewParams(params.target, 0, static_cast<u32>(params.GetNumLayers()), 0, params.num_levels),
-        true);
+    u32 num_layers = 1;
+    if (params.is_layered || params.target == SurfaceTarget::Texture3D) {
+        num_layers = params.depth;
+    }
+    main_view = CreateView(ViewParams(params.target, 0, num_layers, 0, params.num_levels));
 }
 
 CachedSurface::~CachedSurface() = default;
@@ -253,12 +262,8 @@ void CachedSurface::DecorateSurfaceName() {
 }
 
 View CachedSurface::CreateView(const ViewParams& params) {
-    return CreateViewInner(params, false);
-}
-
-View CachedSurface::CreateViewInner(const ViewParams& params, bool is_proxy) {
     // TODO(Rodrigo): Add name decorations
-    return views[params] = std::make_shared<CachedSurfaceView>(device, *this, params, is_proxy);
+    return views[params] = std::make_shared<CachedSurfaceView>(device, *this, params);
 }
 
 void CachedSurface::UploadBuffer(const std::vector<u8>& staging_buffer) {
@@ -342,18 +347,27 @@ VkImageSubresourceRange CachedSurface::GetImageSubresourceRange() const {
 }
 
 CachedSurfaceView::CachedSurfaceView(const VKDevice& device, CachedSurface& surface,
-                                     const ViewParams& params, bool is_proxy)
+                                     const ViewParams& params)
     : VideoCommon::ViewBase{params}, params{surface.GetSurfaceParams()},
       image{surface.GetImageHandle()}, buffer_view{surface.GetBufferViewHandle()},
       aspect_mask{surface.GetAspectMask()}, device{device}, surface{surface},
-      base_layer{params.base_layer}, num_layers{params.num_layers}, base_level{params.base_level},
-      num_levels{params.num_levels}, image_view_type{image ? GetImageViewType(params.target)
-                                                           : VK_IMAGE_VIEW_TYPE_1D} {}
+      base_level{params.base_level}, num_levels{params.num_levels},
+      image_view_type{image ? GetImageViewType(params.target) : VK_IMAGE_VIEW_TYPE_1D} {
+    if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) {
+        base_layer = 0;
+        num_layers = 1;
+        base_slice = params.base_layer;
+        num_slices = params.num_layers;
+    } else {
+        base_layer = params.base_layer;
+        num_layers = params.num_layers;
+    }
+}
 
 CachedSurfaceView::~CachedSurfaceView() = default;
 
-VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y_source,
-                                         SwizzleSource z_source, SwizzleSource w_source) {
+VkImageView CachedSurfaceView::GetImageView(SwizzleSource x_source, SwizzleSource y_source,
+                                            SwizzleSource z_source, SwizzleSource w_source) {
     const u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source);
     if (last_image_view && last_swizzle == new_swizzle) {
         return last_image_view;
@@ -399,6 +413,11 @@ VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y
             });
     }
 
+    if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) {
+        ASSERT(base_slice == 0);
+        ASSERT(num_slices == params.depth);
+    }
+
     VkImageViewCreateInfo ci;
     ci.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO;
     ci.pNext = nullptr;
@@ -417,6 +436,35 @@ VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y
     return last_image_view = *image_view;
 }
 
+VkImageView CachedSurfaceView::GetAttachment() {
+    if (render_target) {
+        return *render_target;
+    }
+
+    VkImageViewCreateInfo ci;
+    ci.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO;
+    ci.pNext = nullptr;
+    ci.flags = 0;
+    ci.image = surface.GetImageHandle();
+    ci.format = surface.GetImage().GetFormat();
+    ci.components = {VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY,
+                     VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY};
+    ci.subresourceRange.aspectMask = aspect_mask;
+    ci.subresourceRange.baseMipLevel = base_level;
+    ci.subresourceRange.levelCount = num_levels;
+    if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) {
+        ci.viewType = num_slices > 1 ? VK_IMAGE_VIEW_TYPE_2D_ARRAY : VK_IMAGE_VIEW_TYPE_2D;
+        ci.subresourceRange.baseArrayLayer = base_slice;
+        ci.subresourceRange.layerCount = num_slices;
+    } else {
+        ci.viewType = image_view_type;
+        ci.subresourceRange.baseArrayLayer = base_layer;
+        ci.subresourceRange.layerCount = num_layers;
+    }
+    render_target = device.GetLogical().CreateImageView(ci);
+    return *render_target;
+}
+
 VKTextureCache::VKTextureCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
                                const VKDevice& device, VKResourceManager& resource_manager,
                                VKMemoryManager& memory_manager, VKScheduler& scheduler,
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h
index f211ccb1e..807e26c8a 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.h
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.h
@@ -91,7 +91,6 @@ protected:
     void DecorateSurfaceName();
 
     View CreateView(const ViewParams& params) override;
-    View CreateViewInner(const ViewParams& params, bool is_proxy);
 
 private:
     void UploadBuffer(const std::vector<u8>& staging_buffer);
@@ -120,23 +119,20 @@ private:
 class CachedSurfaceView final : public VideoCommon::ViewBase {
 public:
     explicit CachedSurfaceView(const VKDevice& device, CachedSurface& surface,
-                               const ViewParams& params, bool is_proxy);
+                               const ViewParams& params);
     ~CachedSurfaceView();
 
-    VkImageView GetHandle(Tegra::Texture::SwizzleSource x_source,
-                          Tegra::Texture::SwizzleSource y_source,
-                          Tegra::Texture::SwizzleSource z_source,
-                          Tegra::Texture::SwizzleSource w_source);
+    VkImageView GetImageView(Tegra::Texture::SwizzleSource x_source,
+                             Tegra::Texture::SwizzleSource y_source,
+                             Tegra::Texture::SwizzleSource z_source,
+                             Tegra::Texture::SwizzleSource w_source);
+
+    VkImageView GetAttachment();
 
     bool IsSameSurface(const CachedSurfaceView& rhs) const {
         return &surface == &rhs.surface;
     }
 
-    VkImageView GetHandle() {
-        return GetHandle(Tegra::Texture::SwizzleSource::R, Tegra::Texture::SwizzleSource::G,
-                         Tegra::Texture::SwizzleSource::B, Tegra::Texture::SwizzleSource::A);
-    }
-
     u32 GetWidth() const {
         return params.GetMipWidth(base_level);
     }
@@ -180,14 +176,6 @@ public:
     }
 
 private:
-    static u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source,
-                             Tegra::Texture::SwizzleSource y_source,
-                             Tegra::Texture::SwizzleSource z_source,
-                             Tegra::Texture::SwizzleSource w_source) {
-        return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) |
-               (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source);
-    }
-
     // Store a copy of these values to avoid double dereference when reading them
     const SurfaceParams params;
     const VkImage image;
@@ -196,15 +184,18 @@ private:
 
     const VKDevice& device;
     CachedSurface& surface;
-    const u32 base_layer;
-    const u32 num_layers;
     const u32 base_level;
     const u32 num_levels;
     const VkImageViewType image_view_type;
+    u32 base_layer = 0;
+    u32 num_layers = 0;
+    u32 base_slice = 0;
+    u32 num_slices = 0;
 
     VkImageView last_image_view = nullptr;
     u32 last_swizzle = 0;
 
+    vk::ImageView render_target;
     std::unordered_map<u32, vk::ImageView> view_cache;
 };
 
diff --git a/src/video_core/renderer_vulkan/vk_update_descriptor.cpp b/src/video_core/renderer_vulkan/vk_update_descriptor.cpp
index 681ecde98..351c048d2 100644
--- a/src/video_core/renderer_vulkan/vk_update_descriptor.cpp
+++ b/src/video_core/renderer_vulkan/vk_update_descriptor.cpp
@@ -24,35 +24,25 @@ void VKUpdateDescriptorQueue::TickFrame() {
 }
 
 void VKUpdateDescriptorQueue::Acquire() {
-    entries.clear();
-}
+    // Minimum number of entries required.
+    // This is the maximum number of entries a single draw call might use.
+    static constexpr std::size_t MIN_ENTRIES = 0x400;
 
-void VKUpdateDescriptorQueue::Send(VkDescriptorUpdateTemplateKHR update_template,
-                                   VkDescriptorSet set) {
-    if (payload.size() + entries.size() >= payload.max_size()) {
+    if (payload.size() + MIN_ENTRIES >= payload.max_size()) {
         LOG_WARNING(Render_Vulkan, "Payload overflow, waiting for worker thread");
         scheduler.WaitWorker();
         payload.clear();
     }
+    upload_start = payload.data() + payload.size();
+}
 
-    // TODO(Rodrigo): Rework to write the payload directly
-    const auto payload_start = payload.data() + payload.size();
-    for (const auto& entry : entries) {
-        if (const auto image = std::get_if<VkDescriptorImageInfo>(&entry)) {
-            payload.push_back(*image);
-        } else if (const auto buffer = std::get_if<VkDescriptorBufferInfo>(&entry)) {
-            payload.push_back(*buffer);
-        } else if (const auto texel = std::get_if<VkBufferView>(&entry)) {
-            payload.push_back(*texel);
-        } else {
-            UNREACHABLE();
-        }
-    }
-
-    scheduler.Record(
-        [payload_start, set, update_template, logical = &device.GetLogical()](vk::CommandBuffer) {
-            logical->UpdateDescriptorSet(set, update_template, payload_start);
-        });
+void VKUpdateDescriptorQueue::Send(VkDescriptorUpdateTemplateKHR update_template,
+                                   VkDescriptorSet set) {
+    const void* const data = upload_start;
+    const vk::Device* const logical = &device.GetLogical();
+    scheduler.Record([data, logical, set, update_template](vk::CommandBuffer) {
+        logical->UpdateDescriptorSet(set, update_template, data);
+    });
 }
 
 } // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_update_descriptor.h b/src/video_core/renderer_vulkan/vk_update_descriptor.h
index cc7e3dff4..945320c72 100644
--- a/src/video_core/renderer_vulkan/vk_update_descriptor.h
+++ b/src/video_core/renderer_vulkan/vk_update_descriptor.h
@@ -15,17 +15,13 @@ namespace Vulkan {
 class VKDevice;
 class VKScheduler;
 
-class DescriptorUpdateEntry {
-public:
-    explicit DescriptorUpdateEntry() {}
-
-    DescriptorUpdateEntry(VkDescriptorImageInfo image) : image{image} {}
+struct DescriptorUpdateEntry {
+    DescriptorUpdateEntry(VkDescriptorImageInfo image_) : image{image_} {}
 
-    DescriptorUpdateEntry(VkDescriptorBufferInfo buffer) : buffer{buffer} {}
+    DescriptorUpdateEntry(VkDescriptorBufferInfo buffer_) : buffer{buffer_} {}
 
-    DescriptorUpdateEntry(VkBufferView texel_buffer) : texel_buffer{texel_buffer} {}
+    DescriptorUpdateEntry(VkBufferView texel_buffer_) : texel_buffer{texel_buffer_} {}
 
-private:
     union {
         VkDescriptorImageInfo image;
         VkDescriptorBufferInfo buffer;
@@ -45,32 +41,34 @@ public:
     void Send(VkDescriptorUpdateTemplateKHR update_template, VkDescriptorSet set);
 
     void AddSampledImage(VkSampler sampler, VkImageView image_view) {
-        entries.emplace_back(VkDescriptorImageInfo{sampler, image_view, {}});
+        payload.emplace_back(VkDescriptorImageInfo{sampler, image_view, {}});
     }
 
     void AddImage(VkImageView image_view) {
-        entries.emplace_back(VkDescriptorImageInfo{{}, image_view, {}});
+        payload.emplace_back(VkDescriptorImageInfo{{}, image_view, {}});
     }
 
     void AddBuffer(VkBuffer buffer, u64 offset, std::size_t size) {
-        entries.emplace_back(VkDescriptorBufferInfo{buffer, offset, size});
+        payload.emplace_back(VkDescriptorBufferInfo{buffer, offset, size});
     }
 
     void AddTexelBuffer(VkBufferView texel_buffer) {
-        entries.emplace_back(texel_buffer);
+        payload.emplace_back(texel_buffer);
     }
 
-    VkImageLayout* GetLastImageLayout() {
-        return &std::get<VkDescriptorImageInfo>(entries.back()).imageLayout;
+    VkImageLayout* LastImageLayout() {
+        return &payload.back().image.imageLayout;
     }
 
-private:
-    using Variant = std::variant<VkDescriptorImageInfo, VkDescriptorBufferInfo, VkBufferView>;
+    const VkImageLayout* LastImageLayout() const {
+        return &payload.back().image.imageLayout; // Newest entry, read via its image member.
+    }
 
+private:
     const VKDevice& device;
     VKScheduler& scheduler;
 
-    boost::container::static_vector<Variant, 0x400> entries;
+    const DescriptorUpdateEntry* upload_start = nullptr;
     boost::container::static_vector<DescriptorUpdateEntry, 0x10000> payload;
 };
 
diff --git a/src/video_core/renderer_vulkan/wrapper.cpp b/src/video_core/renderer_vulkan/wrapper.cpp
index 2ce9b0626..051298cc8 100644
--- a/src/video_core/renderer_vulkan/wrapper.cpp
+++ b/src/video_core/renderer_vulkan/wrapper.cpp
@@ -88,6 +88,16 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept {
     X(vkCmdSetStencilWriteMask);
     X(vkCmdSetViewport);
     X(vkCmdWaitEvents);
+    X(vkCmdBindVertexBuffers2EXT);
+    X(vkCmdSetCullModeEXT);
+    X(vkCmdSetDepthBoundsTestEnableEXT);
+    X(vkCmdSetDepthCompareOpEXT);
+    X(vkCmdSetDepthTestEnableEXT);
+    X(vkCmdSetDepthWriteEnableEXT);
+    X(vkCmdSetFrontFaceEXT);
+    X(vkCmdSetPrimitiveTopologyEXT);
+    X(vkCmdSetStencilOpEXT);
+    X(vkCmdSetStencilTestEnableEXT);
     X(vkCreateBuffer);
     X(vkCreateBufferView);
     X(vkCreateCommandPool);
@@ -153,7 +163,8 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept {
 
 bool Load(InstanceDispatch& dld) noexcept {
 #define X(name) Proc(dld.name, dld, #name)
-    return X(vkCreateInstance) && X(vkEnumerateInstanceExtensionProperties);
+    return X(vkCreateInstance) && X(vkEnumerateInstanceExtensionProperties) &&
+           X(vkEnumerateInstanceLayerProperties);
 #undef X
 }
 
@@ -725,8 +736,7 @@ bool PhysicalDevice::GetSurfaceSupportKHR(u32 queue_family_index, VkSurfaceKHR s
     return supported == VK_TRUE;
 }
 
-VkSurfaceCapabilitiesKHR PhysicalDevice::GetSurfaceCapabilitiesKHR(VkSurfaceKHR surface) const
-    noexcept {
+VkSurfaceCapabilitiesKHR PhysicalDevice::GetSurfaceCapabilitiesKHR(VkSurfaceKHR surface) const {
     VkSurfaceCapabilitiesKHR capabilities;
     Check(dld->vkGetPhysicalDeviceSurfaceCapabilitiesKHR(physical_device, surface, &capabilities));
     return capabilities;
@@ -771,4 +781,17 @@ std::optional<std::vector<VkExtensionProperties>> EnumerateInstanceExtensionProp
     return properties;
 }
 
+std::optional<std::vector<VkLayerProperties>> EnumerateInstanceLayerProperties(
+    const InstanceDispatch& dld) {
+    u32 count; // Written by the size query below before it is read.
+    if (dld.vkEnumerateInstanceLayerProperties(&count, nullptr) != VK_SUCCESS) {
+        return std::nullopt;
+    }
+    std::vector<VkLayerProperties> layers(count);
+    if (dld.vkEnumerateInstanceLayerProperties(&count, layers.data()) == VK_SUCCESS) {
+        return layers;
+    }
+    return std::nullopt; // Anything but VK_SUCCESS on the fill call is reported as failure.
+}
+
 } // namespace Vulkan::vk
diff --git a/src/video_core/renderer_vulkan/wrapper.h b/src/video_core/renderer_vulkan/wrapper.h
index 98937a77a..71daac9d7 100644
--- a/src/video_core/renderer_vulkan/wrapper.h
+++ b/src/video_core/renderer_vulkan/wrapper.h
@@ -141,6 +141,7 @@ struct InstanceDispatch {
     PFN_vkCreateInstance vkCreateInstance;
     PFN_vkDestroyInstance vkDestroyInstance;
     PFN_vkEnumerateInstanceExtensionProperties vkEnumerateInstanceExtensionProperties;
+    PFN_vkEnumerateInstanceLayerProperties vkEnumerateInstanceLayerProperties;
 
     PFN_vkCreateDebugUtilsMessengerEXT vkCreateDebugUtilsMessengerEXT;
     PFN_vkCreateDevice vkCreateDevice;
@@ -206,6 +207,16 @@ struct DeviceDispatch : public InstanceDispatch {
     PFN_vkCmdSetStencilWriteMask vkCmdSetStencilWriteMask;
     PFN_vkCmdSetViewport vkCmdSetViewport;
     PFN_vkCmdWaitEvents vkCmdWaitEvents;
+    PFN_vkCmdBindVertexBuffers2EXT vkCmdBindVertexBuffers2EXT;
+    PFN_vkCmdSetCullModeEXT vkCmdSetCullModeEXT;
+    PFN_vkCmdSetDepthBoundsTestEnableEXT vkCmdSetDepthBoundsTestEnableEXT;
+    PFN_vkCmdSetDepthCompareOpEXT vkCmdSetDepthCompareOpEXT;
+    PFN_vkCmdSetDepthTestEnableEXT vkCmdSetDepthTestEnableEXT;
+    PFN_vkCmdSetDepthWriteEnableEXT vkCmdSetDepthWriteEnableEXT;
+    PFN_vkCmdSetFrontFaceEXT vkCmdSetFrontFaceEXT;
+    PFN_vkCmdSetPrimitiveTopologyEXT vkCmdSetPrimitiveTopologyEXT;
+    PFN_vkCmdSetStencilOpEXT vkCmdSetStencilOpEXT;
+    PFN_vkCmdSetStencilTestEnableEXT vkCmdSetStencilTestEnableEXT;
     PFN_vkCreateBuffer vkCreateBuffer;
     PFN_vkCreateBufferView vkCreateBufferView;
     PFN_vkCreateCommandPool vkCreateCommandPool;
@@ -779,7 +790,7 @@ public:
 
     bool GetSurfaceSupportKHR(u32 queue_family_index, VkSurfaceKHR) const;
 
-    VkSurfaceCapabilitiesKHR GetSurfaceCapabilitiesKHR(VkSurfaceKHR) const noexcept;
+    VkSurfaceCapabilitiesKHR GetSurfaceCapabilitiesKHR(VkSurfaceKHR) const;
 
     std::vector<VkSurfaceFormatKHR> GetSurfaceFormatsKHR(VkSurfaceKHR) const;
 
@@ -968,6 +979,50 @@ public:
                              buffer_barriers.data(), image_barriers.size(), image_barriers.data());
     }
 
+    void BindVertexBuffers2EXT(u32 first_binding, u32 binding_count, const VkBuffer* buffers,
+                               const VkDeviceSize* offsets, const VkDeviceSize* sizes,
+                               const VkDeviceSize* strides) const noexcept {
+        dld->vkCmdBindVertexBuffers2EXT(handle, first_binding, binding_count, buffers, offsets,
+                                        sizes, strides); // Every argument is passed through as-is.
+    }
+
+    void SetCullModeEXT(VkCullModeFlags cull_mode) const noexcept {
+        dld->vkCmdSetCullModeEXT(handle, cull_mode); // Pass-through to the loaded entry point.
+    }
+
+    void SetDepthBoundsTestEnableEXT(bool enable) const noexcept {
+        dld->vkCmdSetDepthBoundsTestEnableEXT(handle, static_cast<VkBool32>(enable)); // 1 or 0
+    }
+
+    void SetDepthCompareOpEXT(VkCompareOp compare_op) const noexcept {
+        dld->vkCmdSetDepthCompareOpEXT(handle, compare_op); // compare_op is forwarded unchanged.
+    }
+
+    void SetDepthTestEnableEXT(bool enable) const noexcept {
+        dld->vkCmdSetDepthTestEnableEXT(handle, static_cast<VkBool32>(enable)); // 1 or 0
+    }
+
+    void SetDepthWriteEnableEXT(bool enable) const noexcept {
+        dld->vkCmdSetDepthWriteEnableEXT(handle, static_cast<VkBool32>(enable)); // 1 or 0
+    }
+
+    void SetFrontFaceEXT(VkFrontFace front_face) const noexcept {
+        dld->vkCmdSetFrontFaceEXT(handle, front_face); // front_face is forwarded unchanged.
+    }
+
+    void SetPrimitiveTopologyEXT(VkPrimitiveTopology primitive_topology) const noexcept {
+        dld->vkCmdSetPrimitiveTopologyEXT(handle, primitive_topology); // Plain pass-through.
+    }
+
+    void SetStencilOpEXT(VkStencilFaceFlags face_mask, VkStencilOp fail_op, VkStencilOp pass_op,
+                         VkStencilOp depth_fail_op, VkCompareOp compare_op) const noexcept {
+        dld->vkCmdSetStencilOpEXT(handle, face_mask, fail_op, pass_op, depth_fail_op, compare_op);
+    } // Arguments are handed to the entry point in the order they are declared here.
+
+    void SetStencilTestEnableEXT(bool enable) const noexcept {
+        dld->vkCmdSetStencilTestEnableEXT(handle, static_cast<VkBool32>(enable)); // 1 or 0
+    }
+
     void BindTransformFeedbackBuffersEXT(u32 first, u32 count, const VkBuffer* buffers,
                                          const VkDeviceSize* offsets,
                                          const VkDeviceSize* sizes) const noexcept {
@@ -996,4 +1051,7 @@ private:
 std::optional<std::vector<VkExtensionProperties>> EnumerateInstanceExtensionProperties(
     const InstanceDispatch& dld);
 
+std::optional<std::vector<VkLayerProperties>> EnumerateInstanceLayerProperties(
+    const InstanceDispatch& dld);
+
 } // namespace Vulkan::vk
diff --git a/src/video_core/shader/decode/half_set.cpp b/src/video_core/shader/decode/half_set.cpp
index 848e46874..b2e88fa20 100644
--- a/src/video_core/shader/decode/half_set.cpp
+++ b/src/video_core/shader/decode/half_set.cpp
@@ -13,55 +13,101 @@
 
 namespace VideoCommon::Shader {
 
+using std::move;
 using Tegra::Shader::Instruction;
 using Tegra::Shader::OpCode;
+using Tegra::Shader::PredCondition;
 
 u32 ShaderIR::DecodeHalfSet(NodeBlock& bb, u32 pc) {
     const Instruction instr = {program_code[pc]};
     const auto opcode = OpCode::Decode(instr);
 
-    if (instr.hset2.ftz == 0) {
-        LOG_DEBUG(HW_GPU, "{} without FTZ is not implemented", opcode->get().GetName());
+    PredCondition cond; // cond and the flags below are read from encoding-specific bits.
+    bool bf;
+    bool ftz;
+    bool neg_a;
+    bool abs_a;
+    bool neg_b;
+    bool abs_b;
+    switch (opcode->get().GetId()) {
+    case OpCode::Id::HSET2_C:
+    case OpCode::Id::HSET2_IMM:
+        cond = instr.hsetp2.cbuf_and_imm.cond;
+        bf = instr.Bit(53);
+        ftz = instr.Bit(54);
+        neg_a = instr.Bit(43);
+        abs_a = instr.Bit(44);
+        neg_b = instr.Bit(56);
+        abs_b = instr.Bit(54); // NOTE(review): same bit as ftz; unused below for C/IMM - confirm.
+        break;
+    case OpCode::Id::HSET2_R:
+        cond = instr.hsetp2.reg.cond;
+        bf = instr.Bit(49);
+        ftz = instr.Bit(50);
+        neg_a = instr.Bit(43);
+        abs_a = instr.Bit(44);
+        neg_b = instr.Bit(31);
+        abs_b = instr.Bit(30);
+        break;
+    default:
+        UNREACHABLE(); // cond and the flags are not assigned on this path.
     }
 
-    Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hset2.type_a);
-    op_a = GetOperandAbsNegHalf(op_a, instr.hset2.abs_a, instr.hset2.negate_a);
-
-    Node op_b = [&]() {
+    Node op_b = [this, instr, opcode] {
         switch (opcode->get().GetId()) {
+        case OpCode::Id::HSET2_C:
+            // Inform as unimplemented as this is not tested.
+            UNIMPLEMENTED_MSG("HSET2_C is not implemented");
+            return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset());
         case OpCode::Id::HSET2_R:
             return GetRegister(instr.gpr20);
+        case OpCode::Id::HSET2_IMM:
+            return UnpackHalfImmediate(instr, true);
         default:
             UNREACHABLE();
-            return Immediate(0);
+            return Node{};
         }
     }();
-    op_b = UnpackHalfFloat(op_b, instr.hset2.type_b);
-    op_b = GetOperandAbsNegHalf(op_b, instr.hset2.abs_b, instr.hset2.negate_b);
 
-    const Node second_pred = GetPredicate(instr.hset2.pred39, instr.hset2.neg_pred);
+    if (!ftz) {
+        LOG_DEBUG(HW_GPU, "{} without FTZ is not implemented", opcode->get().GetName());
+    }
+
+    Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hset2.type_a);
+    op_a = GetOperandAbsNegHalf(op_a, abs_a, neg_a);
+
+    switch (opcode->get().GetId()) {
+    case OpCode::Id::HSET2_R:
+        op_b = GetOperandAbsNegHalf(move(op_b), abs_b, neg_b);
+        [[fallthrough]]; // Register operands are unpacked as well.
+    case OpCode::Id::HSET2_C:
+        op_b = UnpackHalfFloat(move(op_b), instr.hset2.type_b);
+        break;
+    default:
+        break; // e.g. HSET2_IMM: op_b is used exactly as the lambda returned it.
+    }
 
-    const Node comparison_pair = GetPredicateComparisonHalf(instr.hset2.cond, op_a, op_b);
+    Node second_pred = GetPredicate(instr.hset2.pred39, instr.hset2.neg_pred);
+
+    Node comparison_pair = GetPredicateComparisonHalf(cond, op_a, op_b);
 
     const OperationCode combiner = GetPredicateCombiner(instr.hset2.op);
 
     // HSET2 operates on each half float in the pack.
     std::array<Node, 2> values;
     for (u32 i = 0; i < 2; ++i) {
-        const u32 raw_value = instr.hset2.bf ? 0x3c00 : 0xffff;
-        const Node true_value = Immediate(raw_value << (i * 16));
-        const Node false_value = Immediate(0);
-
-        const Node comparison =
-            Operation(OperationCode::LogicalPick2, comparison_pair, Immediate(i));
-        const Node predicate = Operation(combiner, comparison, second_pred);
+        const u32 raw_value = bf ? 0x3c00 : 0xffff; // 0x3c00 is 1.0 as a half float.
+        Node true_value = Immediate(raw_value << (i * 16)); // Placed in the i-th 16-bit lane.
+        Node false_value = Immediate(0);
 
+        Node comparison = Operation(OperationCode::LogicalPick2, comparison_pair, Immediate(i));
+        Node predicate = Operation(combiner, comparison, second_pred);
         values[i] =
-            Operation(OperationCode::Select, NO_PRECISE, predicate, true_value, false_value);
+            Operation(OperationCode::Select, predicate, move(true_value), move(false_value));
     }
 
-    const Node value = Operation(OperationCode::UBitwiseOr, NO_PRECISE, values[0], values[1]);
-    SetRegister(bb, instr.gpr0, value);
+    Node value = Operation(OperationCode::UBitwiseOr, values[0], values[1]);
+    SetRegister(bb, instr.gpr0, move(value));
 
     return pc;
 }
diff --git a/src/video_core/shader/decode/image.cpp b/src/video_core/shader/decode/image.cpp
index 60b6ad72a..07778dc3e 100644
--- a/src/video_core/shader/decode/image.cpp
+++ b/src/video_core/shader/decode/image.cpp
@@ -97,6 +97,7 @@ ComponentType GetComponentType(Tegra::Engines::SamplerDescriptor descriptor,
         break;
     case TextureFormat::B5G6R5:
     case TextureFormat::B6G5R5:
+    case TextureFormat::BF10GF11RF11:
         if (component == 0) {
             return descriptor.b_type;
         }
@@ -119,7 +120,7 @@ ComponentType GetComponentType(Tegra::Engines::SamplerDescriptor descriptor,
         }
         break;
     }
-    UNIMPLEMENTED_MSG("texture format not implement={}", format);
+    UNIMPLEMENTED_MSG("Texture format not implemented={}", format);
     return ComponentType::FLOAT;
 }
 
@@ -191,6 +192,14 @@ u32 GetComponentSize(TextureFormat format, std::size_t component) {
             return 6;
         }
         return 0;
+    case TextureFormat::BF10GF11RF11:
+        if (component == 1 || component == 2) {
+            return 11;
+        }
+        if (component == 0) {
+            return 10;
+        }
+        return 0;
     case TextureFormat::G8R24:
         if (component == 0) {
             return 8;
@@ -211,10 +220,9 @@ u32 GetComponentSize(TextureFormat format, std::size_t component) {
         return (component == 0 || component == 1) ? 8 : 0;
     case TextureFormat::G4R4:
         return (component == 0 || component == 1) ? 4 : 0;
-    default:
-        UNIMPLEMENTED_MSG("texture format not implement={}", format);
-        return 0;
     }
+    UNIMPLEMENTED_MSG("Texture format not implemented={}", format);
+    return 0;
 }
 
 std::size_t GetImageComponentMask(TextureFormat format) {
@@ -235,6 +243,7 @@ std::size_t GetImageComponentMask(TextureFormat format) {
     case TextureFormat::R32_B24G8:
     case TextureFormat::B5G6R5:
     case TextureFormat::B6G5R5:
+    case TextureFormat::BF10GF11RF11:
         return std::size_t{R | G | B};
     case TextureFormat::R32_G32:
     case TextureFormat::R16_G16:
@@ -248,10 +257,9 @@ std::size_t GetImageComponentMask(TextureFormat format) {
     case TextureFormat::R8:
     case TextureFormat::R1:
         return std::size_t{R};
-    default:
-        UNIMPLEMENTED_MSG("texture format not implement={}", format);
-        return std::size_t{R | G | B | A};
     }
+    UNIMPLEMENTED_MSG("Texture format not implemented={}", format);
+    return std::size_t{R | G | B | A};
 }
 
 std::size_t GetImageTypeNumCoordinates(Tegra::Shader::ImageType image_type) {
@@ -299,7 +307,7 @@ std::pair<Node, bool> ShaderIR::GetComponentValue(ComponentType component_type,
             return {std::move(original_value), true};
         }
     default:
-        UNIMPLEMENTED_MSG("Unimplement component type={}", component_type);
+        UNIMPLEMENTED_MSG("Unimplemented component type={}", component_type);
         return {std::move(original_value), true};
     }
 }
@@ -459,7 +467,7 @@ u32 ShaderIR::DecodeImage(NodeBlock& bb, u32 pc) {
             default:
                 break;
             }
-            UNIMPLEMENTED_MSG("Unimplemented operation={} type={}",
+            UNIMPLEMENTED_MSG("Unimplemented operation={}, type={}",
                               static_cast<u64>(instr.suatom_d.operation.Value()),
                               static_cast<u64>(instr.suatom_d.operation_type.Value()));
             return OperationCode::AtomicImageAdd;
diff --git a/src/video_core/shader/decode/other.cpp b/src/video_core/shader/decode/other.cpp
index d00e10913..c0a8f233f 100644
--- a/src/video_core/shader/decode/other.cpp
+++ b/src/video_core/shader/decode/other.cpp
@@ -83,7 +83,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
                 return Operation(OperationCode::YNegate);
             case SystemVariable::InvocationInfo:
                 LOG_WARNING(HW_GPU, "S2R instruction with InvocationInfo is incomplete");
-                return Immediate(0U);
+                return Immediate(0x00ff'0000U);
             case SystemVariable::WscaleFactorXY:
                 UNIMPLEMENTED_MSG("S2R WscaleFactorXY is not implemented");
                 return Immediate(0U);
diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp
index 8f0bb996e..29ebf65ba 100644
--- a/src/video_core/shader/decode/texture.cpp
+++ b/src/video_core/shader/decode/texture.cpp
@@ -357,13 +357,11 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
     return pc;
 }
 
-ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo(SamplerInfo info, u32 offset,
-                                               std::optional<u32> buffer) {
+ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo(
+    SamplerInfo info, std::optional<Tegra::Engines::SamplerDescriptor> sampler) {
     if (info.IsComplete()) {
         return info;
     }
-    const auto sampler = buffer ? registry.ObtainBindlessSampler(*buffer, offset)
-                                : registry.ObtainBoundSampler(offset);
     if (!sampler) {
         LOG_WARNING(HW_GPU, "Unknown sampler info");
         info.type = info.type.value_or(Tegra::Shader::TextureType::Texture2D);
@@ -381,8 +379,8 @@ ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo(SamplerInfo info, u32 offset,
 
 std::optional<Sampler> ShaderIR::GetSampler(Tegra::Shader::Sampler sampler,
                                             SamplerInfo sampler_info) {
-    const auto offset = static_cast<u32>(sampler.index.Value());
-    const auto info = GetSamplerInfo(sampler_info, offset);
+    const u32 offset = static_cast<u32>(sampler.index.Value());
+    const auto info = GetSamplerInfo(sampler_info, registry.ObtainBoundSampler(offset));
 
     // If this sampler has already been used, return the existing mapping.
     const auto it = std::find_if(used_samplers.begin(), used_samplers.end(),
@@ -404,20 +402,19 @@ std::optional<Sampler> ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg,
     const Node sampler_register = GetRegister(reg);
     const auto [base_node, tracked_sampler_info] =
         TrackBindlessSampler(sampler_register, global_code, static_cast<s64>(global_code.size()));
-    ASSERT(base_node != nullptr);
-    if (base_node == nullptr) {
+    if (!base_node) {
+        UNREACHABLE();
         return std::nullopt;
     }
 
-    if (const auto bindless_sampler_info =
-            std::get_if<BindlessSamplerNode>(&*tracked_sampler_info)) {
-        const u32 buffer = bindless_sampler_info->GetIndex();
-        const u32 offset = bindless_sampler_info->GetOffset();
-        info = GetSamplerInfo(info, offset, buffer);
+    if (const auto sampler_info = std::get_if<BindlessSamplerNode>(&*tracked_sampler_info)) {
+        const u32 buffer = sampler_info->index;
+        const u32 offset = sampler_info->offset;
+        info = GetSamplerInfo(info, registry.ObtainBindlessSampler(buffer, offset));
 
         // If this sampler has already been used, return the existing mapping.
         const auto it = std::find_if(used_samplers.begin(), used_samplers.end(),
-                                     [buffer = buffer, offset = offset](const Sampler& entry) {
+                                     [buffer, offset](const Sampler& entry) {
                                          return entry.buffer == buffer && entry.offset == offset;
                                      });
         if (it != used_samplers.end()) {
@@ -431,10 +428,32 @@ std::optional<Sampler> ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg,
         return used_samplers.emplace_back(next_index, offset, buffer, *info.type, *info.is_array,
                                           *info.is_shadow, *info.is_buffer, false);
     }
-    if (const auto array_sampler_info = std::get_if<ArraySamplerNode>(&*tracked_sampler_info)) {
-        const u32 base_offset = array_sampler_info->GetBaseOffset() / 4;
-        index_var = GetCustomVariable(array_sampler_info->GetIndexVar());
-        info = GetSamplerInfo(info, base_offset);
+    if (const auto sampler_info = std::get_if<SeparateSamplerNode>(&*tracked_sampler_info)) {
+        const std::pair indices = sampler_info->indices;
+        const std::pair offsets = sampler_info->offsets;
+        info = GetSamplerInfo(info, registry.ObtainSeparateSampler(indices, offsets));
+
+        // Try to use an already created sampler if it exists
+        const auto it = std::find_if(
+            used_samplers.begin(), used_samplers.end(), [indices, offsets](const Sampler& entry) {
+                return offsets == std::pair{entry.offset, entry.secondary_offset} &&
+                       indices == std::pair{entry.buffer, entry.secondary_buffer};
+            });
+        if (it != used_samplers.end()) {
+            ASSERT(it->is_separated && it->type == info.type && it->is_array == info.is_array &&
+                   it->is_shadow == info.is_shadow && it->is_buffer == info.is_buffer);
+            return *it;
+        }
+
+        // Otherwise create a new mapping for this sampler
+        const u32 next_index = static_cast<u32>(used_samplers.size());
+        return used_samplers.emplace_back(next_index, offsets, indices, *info.type, *info.is_array,
+                                          *info.is_shadow, *info.is_buffer);
+    }
+    if (const auto sampler_info = std::get_if<ArraySamplerNode>(&*tracked_sampler_info)) {
+        const u32 base_offset = sampler_info->base_offset / 4;
+        index_var = GetCustomVariable(sampler_info->bindless_var);
+        info = GetSamplerInfo(info, registry.ObtainBoundSampler(base_offset));
 
         // If this sampler has already been used, return the existing mapping.
         const auto it = std::find_if(
diff --git a/src/video_core/shader/memory_util.cpp b/src/video_core/shader/memory_util.cpp
index 074f21691..5071c83ca 100644
--- a/src/video_core/shader/memory_util.cpp
+++ b/src/video_core/shader/memory_util.cpp
@@ -66,12 +66,12 @@ ProgramCode GetShaderCode(Tegra::MemoryManager& memory_manager, GPUVAddr gpu_add
 
 u64 GetUniqueIdentifier(Tegra::Engines::ShaderType shader_type, bool is_a, const ProgramCode& code,
                         const ProgramCode& code_b) {
-    u64 unique_identifier = boost::hash_value(code);
+    size_t unique_identifier = boost::hash_value(code);
     if (is_a) {
         // VertexA programs include two programs
         boost::hash_combine(unique_identifier, boost::hash_value(code_b));
     }
-    return unique_identifier;
+    return static_cast<u64>(unique_identifier);
 }
 
 } // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h
index c5e5165ff..8f230d57a 100644
--- a/src/video_core/shader/node.h
+++ b/src/video_core/shader/node.h
@@ -275,10 +275,11 @@ using Node = std::shared_ptr<NodeData>;
 using Node4 = std::array<Node, 4>;
 using NodeBlock = std::vector<Node>;
 
-class BindlessSamplerNode;
-class ArraySamplerNode;
+struct ArraySamplerNode;
+struct BindlessSamplerNode;
+struct SeparateSamplerNode;
 
-using TrackSamplerData = std::variant<BindlessSamplerNode, ArraySamplerNode>;
+using TrackSamplerData = std::variant<BindlessSamplerNode, SeparateSamplerNode, ArraySamplerNode>;
 using TrackSampler = std::shared_ptr<TrackSamplerData>;
 
 struct Sampler {
@@ -288,63 +289,51 @@ struct Sampler {
         : index{index}, offset{offset}, type{type}, is_array{is_array}, is_shadow{is_shadow},
           is_buffer{is_buffer}, is_indexed{is_indexed} {}
 
+    /// Separate sampler constructor: stores both offsets and both buffers, sets is_separated
+    constexpr explicit Sampler(u32 index, std::pair<u32, u32> offsets, std::pair<u32, u32> buffers,
+                               Tegra::Shader::TextureType type, bool is_array, bool is_shadow,
+                               bool is_buffer)
+        : index{index}, offset{offsets.first}, secondary_offset{offsets.second},
+          buffer{buffers.first}, secondary_buffer{buffers.second}, type{type}, is_array{is_array},
+          is_shadow{is_shadow}, is_buffer{is_buffer}, is_separated{true} {}
+
     /// Bindless samplers constructor
     constexpr explicit Sampler(u32 index, u32 offset, u32 buffer, Tegra::Shader::TextureType type,
                                bool is_array, bool is_shadow, bool is_buffer, bool is_indexed)
         : index{index}, offset{offset}, buffer{buffer}, type{type}, is_array{is_array},
           is_shadow{is_shadow}, is_buffer{is_buffer}, is_bindless{true}, is_indexed{is_indexed} {}
 
-    u32 index = 0;  ///< Emulated index given for the this sampler.
-    u32 offset = 0; ///< Offset in the const buffer from where the sampler is being read.
-    u32 buffer = 0; ///< Buffer where the bindless sampler is being read (unused on bound samplers).
-    u32 size = 1;   ///< Size of the sampler.
+    u32 index = 0;            ///< Emulated index given for this sampler.
+    u32 offset = 0;           ///< Offset in the const buffer from where the sampler is being read.
+    u32 secondary_offset = 0; ///< Secondary offset in the const buffer.
+    u32 buffer = 0;           ///< Buffer where the bindless sampler is read.
+    u32 secondary_buffer = 0; ///< Secondary buffer where the bindless sampler is read.
+    u32 size = 1;             ///< Size of the sampler.
 
     Tegra::Shader::TextureType type{}; ///< The type used to sample this texture (Texture2D, etc)
-    bool is_array = false;    ///< Whether the texture is being sampled as an array texture or not.
-    bool is_shadow = false;   ///< Whether the texture is being sampled as a depth texture or not.
-    bool is_buffer = false;   ///< Whether the texture is a texture buffer without sampler.
-    bool is_bindless = false; ///< Whether this sampler belongs to a bindless texture or not.
-    bool is_indexed = false;  ///< Whether this sampler is an indexed array of textures.
+    bool is_array = false;     ///< Whether the texture is being sampled as an array texture or not.
+    bool is_shadow = false;    ///< Whether the texture is being sampled as a depth texture or not.
+    bool is_buffer = false;    ///< Whether the texture is a texture buffer without sampler.
+    bool is_bindless = false;  ///< Whether this sampler belongs to a bindless texture or not.
+    bool is_indexed = false;   ///< Whether this sampler is an indexed array of textures.
+    bool is_separated = false; ///< Whether the image and sampler are separated or not.
 };
 
 /// Represents a tracked bindless sampler into a direct const buffer
-class ArraySamplerNode final {
-public:
-    explicit ArraySamplerNode(u32 index, u32 base_offset, u32 bindless_var)
-        : index{index}, base_offset{base_offset}, bindless_var{bindless_var} {}
-
-    constexpr u32 GetIndex() const {
-        return index;
-    }
-
-    constexpr u32 GetBaseOffset() const {
-        return base_offset;
-    }
-
-    constexpr u32 GetIndexVar() const {
-        return bindless_var;
-    }
-
-private:
+struct ArraySamplerNode {
     u32 index;
     u32 base_offset;
     u32 bindless_var;
 };
 
-/// Represents a tracked bindless sampler into a direct const buffer
-class BindlessSamplerNode final {
-public:
-    explicit BindlessSamplerNode(u32 index, u32 offset) : index{index}, offset{offset} {}
-
-    constexpr u32 GetIndex() const {
-        return index;
-    }
-
-    constexpr u32 GetOffset() const {
-        return offset;
-    }
+/// Represents a tracked separate sampler image pair that was folded statically
+struct SeparateSamplerNode {
+    std::pair<u32, u32> indices; ///< Const buffer index for each half of the pair.
+    std::pair<u32, u32> offsets; ///< Offset of each half within its const buffer.
+};
 
-private:
+/// Represents a tracked bindless sampler into a direct const buffer
+struct BindlessSamplerNode {
     u32 index;
     u32 offset;
 };
diff --git a/src/video_core/shader/node_helper.h b/src/video_core/shader/node_helper.h
index 11231bbea..1e0886185 100644
--- a/src/video_core/shader/node_helper.h
+++ b/src/video_core/shader/node_helper.h
@@ -48,7 +48,7 @@ Node MakeNode(Args&&... args) {
 template <typename T, typename... Args>
 TrackSampler MakeTrackSampler(Args&&... args) {
     static_assert(std::is_convertible_v<T, TrackSamplerData>);
-    return std::make_shared<TrackSamplerData>(T(std::forward<Args>(args)...));
+    return std::make_shared<TrackSamplerData>(T{std::forward<Args>(args)...});
 }
 
 template <typename... Args>
diff --git a/src/video_core/shader/registry.cpp b/src/video_core/shader/registry.cpp
index af70b3f35..cdf274e54 100644
--- a/src/video_core/shader/registry.cpp
+++ b/src/video_core/shader/registry.cpp
@@ -93,6 +93,26 @@ std::optional<SamplerDescriptor> Registry::ObtainBoundSampler(u32 offset) {
     return value;
 }
 
+std::optional<Tegra::Engines::SamplerDescriptor> Registry::ObtainSeparateSampler(
+    std::pair<u32, u32> buffers, std::pair<u32, u32> offsets) {
+    // Serve the descriptor from the cache when this buffer/offset pair was resolved before.
+    const SeparateSamplerKey key{buffers, offsets};
+    if (const auto cached = separate_samplers.find(key); cached != separate_samplers.end()) {
+        return cached->second;
+    }
+    // Without an engine there is nothing to read the handles from.
+    if (engine == nullptr) {
+        return std::nullopt;
+    }
+
+    // Each half is read from its own buffer/offset and the two handles are OR-ed together.
+    const u32 first_handle = engine->AccessConstBuffer32(stage, buffers.first, offsets.first);
+    const u32 second_handle = engine->AccessConstBuffer32(stage, buffers.second, offsets.second);
+    const SamplerDescriptor descriptor = engine->AccessSampler(first_handle | second_handle);
+    separate_samplers.emplace(key, descriptor);
+    return descriptor;
+}
+
 std::optional<Tegra::Engines::SamplerDescriptor> Registry::ObtainBindlessSampler(u32 buffer,
                                                                                  u32 offset) {
     const std::pair key = {buffer, offset};
diff --git a/src/video_core/shader/registry.h b/src/video_core/shader/registry.h
index 0c80d35fd..231206765 100644
--- a/src/video_core/shader/registry.h
+++ b/src/video_core/shader/registry.h
@@ -19,8 +19,39 @@
 
 namespace VideoCommon::Shader {
 
+struct SeparateSamplerKey {
+    std::pair<u32, u32> buffers;
+    std::pair<u32, u32> offsets;
+};
+
+} // namespace VideoCommon::Shader
+
+namespace std {
+
+template <>
+struct hash<VideoCommon::Shader::SeparateSamplerKey> {
+    std::size_t operator()(const VideoCommon::Shader::SeparateSamplerKey& key) const noexcept {
+        return std::hash<u32>{}(key.buffers.first ^ key.buffers.second ^ key.offsets.first ^
+                                key.offsets.second);
+    }
+};
+
+template <>
+struct equal_to<VideoCommon::Shader::SeparateSamplerKey> {
+    bool operator()(const VideoCommon::Shader::SeparateSamplerKey& lhs,
+                    const VideoCommon::Shader::SeparateSamplerKey& rhs) const noexcept {
+        return lhs.buffers == rhs.buffers && lhs.offsets == rhs.offsets;
+    }
+};
+
+} // namespace std
+
+namespace VideoCommon::Shader {
+
 using KeyMap = std::unordered_map<std::pair<u32, u32>, u32, Common::PairHash>;
 using BoundSamplerMap = std::unordered_map<u32, Tegra::Engines::SamplerDescriptor>;
+using SeparateSamplerMap =
+    std::unordered_map<SeparateSamplerKey, Tegra::Engines::SamplerDescriptor>;
 using BindlessSamplerMap =
     std::unordered_map<std::pair<u32, u32>, Tegra::Engines::SamplerDescriptor, Common::PairHash>;
 
@@ -73,6 +104,9 @@ public:
 
     std::optional<Tegra::Engines::SamplerDescriptor> ObtainBoundSampler(u32 offset);
 
+    std::optional<Tegra::Engines::SamplerDescriptor> ObtainSeparateSampler(
+        std::pair<u32, u32> buffers, std::pair<u32, u32> offsets);
+
     std::optional<Tegra::Engines::SamplerDescriptor> ObtainBindlessSampler(u32 buffer, u32 offset);
 
     /// Inserts a key.
@@ -128,6 +162,7 @@ private:
     Tegra::Engines::ConstBufferEngineInterface* engine = nullptr;
     KeyMap keys;
     BoundSamplerMap bound_samplers;
+    SeparateSamplerMap separate_samplers;
     BindlessSamplerMap bindless_samplers;
     u32 bound_buffer;
     GraphicsInfo graphics_info;
diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h
index 15ae152f2..3a98b2104 100644
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -330,8 +330,8 @@ private:
     OperationCode GetPredicateCombiner(Tegra::Shader::PredOperation operation);
 
     /// Queries the missing sampler info from the execution context.
-    SamplerInfo GetSamplerInfo(SamplerInfo info, u32 offset,
-                               std::optional<u32> buffer = std::nullopt);
+    SamplerInfo GetSamplerInfo(SamplerInfo info,
+                               std::optional<Tegra::Engines::SamplerDescriptor> sampler);
 
     /// Accesses a texture sampler.
     std::optional<Sampler> GetSampler(Tegra::Shader::Sampler sampler, SamplerInfo info);
@@ -409,8 +409,14 @@ private:
 
     std::tuple<Node, u32, u32> TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const;
 
-    std::tuple<Node, TrackSampler> TrackBindlessSampler(Node tracked, const NodeBlock& code,
-                                                        s64 cursor);
+    std::pair<Node, TrackSampler> TrackBindlessSampler(Node tracked, const NodeBlock& code,
+                                                       s64 cursor);
+
+    std::pair<Node, TrackSampler> HandleBindlessIndirectRead(const CbufNode& cbuf,
+                                                             const OperationNode& operation,
+                                                             Node gpr, Node base_offset,
+                                                             Node tracked, const NodeBlock& code,
+                                                             s64 cursor);
 
     std::optional<u32> TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor) const;
 
diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp
index eb97bfd41..d5ed81442 100644
--- a/src/video_core/shader/track.cpp
+++ b/src/video_core/shader/track.cpp
@@ -14,6 +14,7 @@
 namespace VideoCommon::Shader {
 
 namespace {
+
 std::pair<Node, s64> FindOperation(const NodeBlock& code, s64 cursor,
                                    OperationCode operation_code) {
     for (; cursor >= 0; --cursor) {
@@ -63,7 +64,8 @@ bool AmendNodeCv(std::size_t amend_index, Node node) {
     if (const auto operation = std::get_if<OperationNode>(&*node)) {
         operation->SetAmendIndex(amend_index);
         return true;
-    } else if (const auto conditional = std::get_if<ConditionalNode>(&*node)) {
+    }
+    if (const auto conditional = std::get_if<ConditionalNode>(&*node)) {
         conditional->SetAmendIndex(amend_index);
         return true;
     }
@@ -72,40 +74,27 @@ bool AmendNodeCv(std::size_t amend_index, Node node) {
 
 } // Anonymous namespace
 
-std::tuple<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, const NodeBlock& code,
-                                                              s64 cursor) {
+std::pair<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, const NodeBlock& code,
+                                                             s64 cursor) {
     if (const auto cbuf = std::get_if<CbufNode>(&*tracked)) {
+        const u32 cbuf_index = cbuf->GetIndex();
+
         // Constant buffer found, test if it's an immediate
         const auto& offset = cbuf->GetOffset();
         if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) {
-            auto track =
-                MakeTrackSampler<BindlessSamplerNode>(cbuf->GetIndex(), immediate->GetValue());
+            auto track = MakeTrackSampler<BindlessSamplerNode>(cbuf_index, immediate->GetValue());
             return {tracked, track};
         }
         if (const auto operation = std::get_if<OperationNode>(&*offset)) {
             const u32 bound_buffer = registry.GetBoundBuffer();
-            if (bound_buffer != cbuf->GetIndex()) {
+            if (bound_buffer != cbuf_index) {
                 return {};
             }
-            const auto pair = DecoupleIndirectRead(*operation);
-            if (!pair) {
-                return {};
+            if (const std::optional pair = DecoupleIndirectRead(*operation)) {
+                auto [gpr, base_offset] = *pair;
+                return HandleBindlessIndirectRead(*cbuf, *operation, gpr, base_offset, tracked,
+                                                  code, cursor);
             }
-            auto [gpr, base_offset] = *pair;
-            const auto offset_inm = std::get_if<ImmediateNode>(&*base_offset);
-            const auto& gpu_driver = registry.AccessGuestDriverProfile();
-            const u32 bindless_cv = NewCustomVariable();
-            Node op =
-                Operation(OperationCode::UDiv, gpr, Immediate(gpu_driver.GetTextureHandlerSize()));
-
-            const Node cv_node = GetCustomVariable(bindless_cv);
-            Node amend_op = Operation(OperationCode::Assign, cv_node, std::move(op));
-            const std::size_t amend_index = DeclareAmend(std::move(amend_op));
-            AmendNodeCv(amend_index, code[cursor]);
-            // TODO Implement Bindless Index custom variable
-            auto track = MakeTrackSampler<ArraySamplerNode>(cbuf->GetIndex(),
-                                                            offset_inm->GetValue(), bindless_cv);
-            return {tracked, track};
         }
         return {};
     }
@@ -122,10 +111,23 @@ std::tuple<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, cons
         return TrackBindlessSampler(source, code, new_cursor);
     }
     if (const auto operation = std::get_if<OperationNode>(&*tracked)) {
-        for (std::size_t i = operation->GetOperandsCount(); i > 0; --i) {
-            if (auto found = TrackBindlessSampler((*operation)[i - 1], code, cursor);
-                std::get<0>(found)) {
-                // Cbuf found in operand.
+        const OperationNode& op = *operation;
+
+        const OperationCode opcode = operation->GetCode();
+        if (opcode == OperationCode::IBitwiseOr || opcode == OperationCode::UBitwiseOr) {
+            ASSERT(op.GetOperandsCount() == 2);
+            auto [node_a, index_a, offset_a] = TrackCbuf(op[0], code, cursor);
+            auto [node_b, index_b, offset_b] = TrackCbuf(op[1], code, cursor);
+            if (node_a && node_b) {
+                auto track = MakeTrackSampler<SeparateSamplerNode>(std::pair{index_a, index_b},
+                                                                   std::pair{offset_a, offset_b});
+                return {tracked, std::move(track)};
+            }
+        }
+        // Scan the operands from last to first; `i` is decremented before the body executes.
+        for (std::size_t i = op.GetOperandsCount(); i-- > 0;) {
+            if (auto found = TrackBindlessSampler(op[i], code, cursor); std::get<0>(found)) {
+                // Constant buffer found in operand.
                 return found;
             }
         }
@@ -139,6 +141,26 @@ std::tuple<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, cons
     return {};
 }
 
+std::pair<Node, TrackSampler> ShaderIR::HandleBindlessIndirectRead(
+    const CbufNode& cbuf, const OperationNode& operation, Node gpr, Node base_offset, Node tracked,
+    const NodeBlock& code, s64 cursor) {
+    const ImmediateNode& offset_imm = std::get<ImmediateNode>(*base_offset);
+    const auto& gpu_driver = registry.AccessGuestDriverProfile();
+    const u32 bindless_cv = NewCustomVariable();
+    const u32 texture_handler_size = gpu_driver.GetTextureHandlerSize();
+    // Amend the instruction at the cursor with: custom variable = gpr / texture handler size.
+    Node index = Operation(OperationCode::UDiv, gpr, Immediate(texture_handler_size));
+    Node variable = GetCustomVariable(bindless_cv);
+    Node assignment = Operation(OperationCode::Assign, std::move(variable), std::move(index));
+    const std::size_t amend_index = DeclareAmend(std::move(assignment));
+    AmendNodeCv(amend_index, code[cursor]);
+
+    // TODO: Implement bindless index custom variable
+    auto track =
+        MakeTrackSampler<ArraySamplerNode>(cbuf.GetIndex(), offset_imm.GetValue(), bindless_cv);
+    return {tracked, track};
+}
+
 std::tuple<Node, u32, u32> ShaderIR::TrackCbuf(Node tracked, const NodeBlock& code,
                                                s64 cursor) const {
     if (const auto cbuf = std::get_if<CbufNode>(&*tracked)) {
diff --git a/src/video_core/shader_cache.h b/src/video_core/shader_cache.h
new file mode 100644
index 000000000..b7608fc7b
--- /dev/null
+++ b/src/video_core/shader_cache.h
@@ -0,0 +1,240 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <algorithm>
+#include <memory>
+#include <mutex>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "video_core/rasterizer_interface.h"
+
+namespace VideoCommon {
+
+template <class T>
+class ShaderCache {
+    static constexpr u64 PAGE_BITS = 14;
+    static constexpr u64 PAGE_SIZE = u64(1) << PAGE_BITS;
+
+    struct Entry {
+        VAddr addr_start;
+        VAddr addr_end;
+        T* data;
+
+        bool is_memory_marked = true;
+
+        constexpr bool Overlaps(VAddr start, VAddr end) const noexcept {
+            return start < addr_end && addr_start < end;
+        }
+    };
+
+public:
+    virtual ~ShaderCache() = default;
+
+    /// @brief Removes shaders inside a given region
+    /// @note Checks for ranges
+    /// @param addr Start address of the invalidation
+    /// @param size Number of bytes of the invalidation
+    void InvalidateRegion(VAddr addr, std::size_t size) {
+        std::scoped_lock lock{invalidation_mutex};
+        InvalidatePagesInRegion(addr, size);
+        RemovePendingShaders();
+    }
+
+    /// @brief Unmarks a memory region as cached and marks its shaders for delayed removal
+    /// @param addr Start address of the CPU write operation
+    /// @param size Number of bytes of the CPU write operation
+    void OnCPUWrite(VAddr addr, std::size_t size) {
+        std::scoped_lock lock{invalidation_mutex};
+        InvalidatePagesInRegion(addr, size);
+    }
+
+    /// @brief Flushes delayed removal operations
+    void SyncGuestHost() {
+        std::scoped_lock lock{invalidation_mutex};
+        RemovePendingShaders();
+    }
+
+    /// @brief Tries to obtain a cached shader starting in a given address
+    /// @note Doesn't check for ranges, the given address has to be the start of the shader
+    /// @param addr Start address of the shader, this doesn't check for regions
+    /// @return Pointer to a valid shader, nullptr when nothing is found
+    T* TryGet(VAddr addr) const {
+        std::scoped_lock lock{lookup_mutex};
+
+        const auto it = lookup_cache.find(addr);
+        if (it == lookup_cache.end()) {
+            return nullptr;
+        }
+        return it->second->data;
+    }
+
+protected:
+    explicit ShaderCache(VideoCore::RasterizerInterface& rasterizer_) : rasterizer{rasterizer_} {}
+
+    /// @brief Register in the cache a given entry
+    /// @param data Shader to store in the cache
+    /// @param addr Start address of the shader that will be registered
+    /// @param size Size in bytes of the shader
+    void Register(std::unique_ptr<T> data, VAddr addr, std::size_t size) {
+        std::scoped_lock lock{invalidation_mutex, lookup_mutex};
+
+        const VAddr addr_end = addr + size;
+        Entry* const entry = NewEntry(addr, addr_end, data.get());
+
+        const u64 page_end = (addr_end + PAGE_SIZE - 1) >> PAGE_BITS;
+        for (u64 page = addr >> PAGE_BITS; page < page_end; ++page) {
+            invalidation_cache[page].push_back(entry);
+        }
+
+        storage.push_back(std::move(data));
+
+        rasterizer.UpdatePagesCachedCount(addr, size, 1);
+    }
+
+    /// @brief Called when a shader is going to be removed
+    /// @param shader Shader that will be removed
+    /// @pre invalidation_mutex is locked
+    /// @pre lookup_mutex is locked
+    virtual void OnShaderRemoval([[maybe_unused]] T* shader) {}
+
+private:
+    /// @brief Invalidate pages in a given region
+    /// @pre invalidation_mutex is locked
+    void InvalidatePagesInRegion(VAddr addr, std::size_t size) {
+        const VAddr addr_end = addr + size;
+        const u64 page_end = (addr_end + PAGE_SIZE - 1) >> PAGE_BITS;
+        for (u64 page = addr >> PAGE_BITS; page < page_end; ++page) {
+            auto it = invalidation_cache.find(page);
+            if (it == invalidation_cache.end()) {
+                continue;
+            }
+            InvalidatePageEntries(it->second, addr, addr_end);
+        }
+    }
+
+    /// @brief Remove shaders marked for deletion
+    /// @pre invalidation_mutex is locked
+    void RemovePendingShaders() {
+        if (marked_for_removal.empty()) {
+            return;
+        }
+        // Remove duplicates
+        std::sort(marked_for_removal.begin(), marked_for_removal.end());
+        marked_for_removal.erase(std::unique(marked_for_removal.begin(), marked_for_removal.end()),
+                                 marked_for_removal.end());
+
+        std::vector<T*> removed_shaders;
+        removed_shaders.reserve(marked_for_removal.size());
+
+        std::scoped_lock lock{lookup_mutex};
+
+        for (Entry* const entry : marked_for_removal) {
+            removed_shaders.push_back(entry->data);
+
+            const auto it = lookup_cache.find(entry->addr_start);
+            ASSERT(it != lookup_cache.end());
+            lookup_cache.erase(it);
+        }
+        marked_for_removal.clear();
+
+        if (!removed_shaders.empty()) {
+            RemoveShadersFromStorage(std::move(removed_shaders));
+        }
+    }
+
+    /// @brief Invalidates entries in a given range for the passed page
+    /// @param entries         Vector of entries in the page, it will be modified on overlaps
+    /// @param addr            Start address of the invalidation
+    /// @param addr_end        Non-inclusive end address of the invalidation
+    /// @pre invalidation_mutex is locked
+    void InvalidatePageEntries(std::vector<Entry*>& entries, VAddr addr, VAddr addr_end) {
+        std::size_t index = 0;
+        while (index < entries.size()) {
+            Entry* const entry = entries[index];
+            if (!entry->Overlaps(addr, addr_end)) {
+                ++index;
+                continue;
+            }
+
+            UnmarkMemory(entry);
+            RemoveEntryFromInvalidationCache(entry);
+            marked_for_removal.push_back(entry);
+        }
+    }
+
+    /// @brief Removes all references to an entry in the invalidation cache
+    /// @param entry Entry to remove from the invalidation cache
+    /// @pre invalidation_mutex is locked
+    void RemoveEntryFromInvalidationCache(const Entry* entry) {
+        const u64 page_end = (entry->addr_end + PAGE_SIZE - 1) >> PAGE_BITS;
+        for (u64 page = entry->addr_start >> PAGE_BITS; page < page_end; ++page) {
+            const auto entries_it = invalidation_cache.find(page);
+            ASSERT(entries_it != invalidation_cache.end());
+            std::vector<Entry*>& entries = entries_it->second;
+
+            const auto entry_it = std::find(entries.begin(), entries.end(), entry);
+            ASSERT(entry_it != entries.end());
+            entries.erase(entry_it);
+        }
+    }
+
+    /// @brief Unmarks an entry from the rasterizer cache
+    /// @param entry Entry to unmark from memory
+    void UnmarkMemory(Entry* entry) {
+        if (!entry->is_memory_marked) {
+            return;
+        }
+        entry->is_memory_marked = false;
+
+        const VAddr addr = entry->addr_start;
+        const std::size_t size = entry->addr_end - addr;
+        rasterizer.UpdatePagesCachedCount(addr, size, -1);
+    }
+
+    /// @brief Removes a vector of shaders from a list
+    /// @param removed_shaders Shaders to be removed from the storage
+    /// @pre invalidation_mutex is locked
+    /// @pre lookup_mutex is locked
+    void RemoveShadersFromStorage(std::vector<T*> removed_shaders) {
+        // Notify removals
+        for (T* const shader : removed_shaders) {
+            OnShaderRemoval(shader);
+        }
+
+        // Remove them from the cache
+        const auto is_removed = [&removed_shaders](std::unique_ptr<T>& shader) {
+            return std::find(removed_shaders.begin(), removed_shaders.end(), shader.get()) !=
+                   removed_shaders.end();
+        };
+        storage.erase(std::remove_if(storage.begin(), storage.end(), is_removed), storage.end());
+    }
+
+    /// @brief Creates a new entry in the lookup cache and returns its pointer
+    /// @pre lookup_mutex is locked
+    Entry* NewEntry(VAddr addr, VAddr addr_end, T* data) {
+        // A rejected emplace() must not leave the caller with a dangling pointer.
+        auto owned = std::make_unique<Entry>(Entry{addr, addr_end, data});
+        const auto result = lookup_cache.emplace(addr, std::move(owned));
+        ASSERT(result.second);
+        return result.first->second.get();
+    }
+
+    VideoCore::RasterizerInterface& rasterizer;
+
+    mutable std::mutex lookup_mutex;
+    std::mutex invalidation_mutex;
+
+    std::unordered_map<u64, std::unique_ptr<Entry>> lookup_cache;
+    std::unordered_map<u64, std::vector<Entry*>> invalidation_cache;
+    std::vector<std::unique_ptr<T>> storage;
+    std::vector<Entry*> marked_for_removal;
+};
+
+} // namespace VideoCommon
diff --git a/src/video_core/texture_cache/format_lookup_table.cpp b/src/video_core/texture_cache/format_lookup_table.cpp
index 7032e0059..f476f03b0 100644
--- a/src/video_core/texture_cache/format_lookup_table.cpp
+++ b/src/video_core/texture_cache/format_lookup_table.cpp
@@ -41,7 +41,7 @@ struct Table {
     ComponentType alpha_component;
     bool is_srgb;
 };
-constexpr std::array<Table, 77> DefinitionTable = {{
+constexpr std::array<Table, 78> DefinitionTable = {{
     {TextureFormat::A8R8G8B8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ABGR8U},
     {TextureFormat::A8R8G8B8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::ABGR8S},
     {TextureFormat::A8R8G8B8, C, UINT, UINT, UINT, UINT, PixelFormat::ABGR8UI},
@@ -98,6 +98,7 @@ constexpr std::array<Table, 77> DefinitionTable = {{
     {TextureFormat::ZF32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::Z32F},
     {TextureFormat::Z16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::Z16},
     {TextureFormat::S8Z24, C, UINT, UNORM, UNORM, UNORM, PixelFormat::S8Z24},
+    {TextureFormat::G24R8, C, UINT, UNORM, UNORM, UNORM, PixelFormat::S8Z24},
     {TextureFormat::ZF32_X24S8, C, FLOAT, UINT, UNORM, UNORM, PixelFormat::Z32FS8},
 
     {TextureFormat::DXT1, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::DXT1},
diff --git a/src/video_core/texture_cache/surface_base.cpp b/src/video_core/texture_cache/surface_base.cpp
index 715f39d0d..0caf3b4f0 100644
--- a/src/video_core/texture_cache/surface_base.cpp
+++ b/src/video_core/texture_cache/surface_base.cpp
@@ -120,6 +120,9 @@ std::optional<std::pair<u32, u32>> SurfaceBaseImpl::GetLayerMipmap(
     }
     const auto relative_address{static_cast<GPUVAddr>(candidate_gpu_addr - gpu_addr)};
     const auto layer{static_cast<u32>(relative_address / layer_size)};
+    if (layer >= params.depth) {
+        return {};
+    }
     const GPUVAddr mipmap_address = relative_address - layer_size * layer;
     const auto mipmap_it =
         Common::BinaryFind(mipmap_offsets.begin(), mipmap_offsets.end(), mipmap_address);
@@ -248,12 +251,11 @@ void SurfaceBaseImpl::FlushBuffer(Tegra::MemoryManager& memory_manager,
 
     // Use an extra temporal buffer
     auto& tmp_buffer = staging_cache.GetBuffer(1);
-    // Special case for 3D Texture Segments
-    const bool must_read_current_data =
-        params.block_depth > 0 && params.target == VideoCore::Surface::SurfaceTarget::Texture2D;
     tmp_buffer.resize(guest_memory_size);
     host_ptr = tmp_buffer.data();
-    if (must_read_current_data) {
+
+    if (params.target == SurfaceTarget::Texture3D) {
+        // Special case for 3D texture segments
         memory_manager.ReadBlockUnsafe(gpu_addr, host_ptr, guest_memory_size);
     }
 
diff --git a/src/video_core/texture_cache/surface_base.h b/src/video_core/texture_cache/surface_base.h
index 79e10ffbb..173f2edba 100644
--- a/src/video_core/texture_cache/surface_base.h
+++ b/src/video_core/texture_cache/surface_base.h
@@ -217,8 +217,8 @@ public:
     }
 
     bool IsProtected() const {
-        // Only 3D Slices are to be protected
-        return is_target && params.block_depth > 0;
+        // Only 3D slices are to be protected
+        return is_target && params.target == SurfaceTarget::Texture3D;
     }
 
     bool IsRenderTarget() const {
@@ -250,6 +250,11 @@ public:
         return GetView(ViewParams(overview_params.target, 0, num_layers, 0, params.num_levels));
     }
 
+    TView Emplace3DView(u32 slice, u32 depth, u32 base_level, u32 num_levels) {
+        return GetView(ViewParams(VideoCore::Surface::SurfaceTarget::Texture3D, slice, depth,
+                                  base_level, num_levels));
+    }
+
     std::optional<TView> EmplaceIrregularView(const SurfaceParams& view_params,
                                               const GPUVAddr view_addr,
                                               const std::size_t candidate_size, const u32 mipmap,
@@ -272,8 +277,8 @@ public:
     std::optional<TView> EmplaceView(const SurfaceParams& view_params, const GPUVAddr view_addr,
                                      const std::size_t candidate_size) {
         if (params.target == SurfaceTarget::Texture3D ||
-            (params.num_levels == 1 && !params.is_layered) ||
-            view_params.target == SurfaceTarget::Texture3D) {
+            view_params.target == SurfaceTarget::Texture3D ||
+            (params.num_levels == 1 && !params.is_layered)) {
             return {};
         }
         const auto layer_mipmap{GetLayerMipmap(view_addr)};
diff --git a/src/video_core/texture_cache/surface_params.cpp b/src/video_core/texture_cache/surface_params.cpp
index 884fabffe..0b2b2b8c4 100644
--- a/src/video_core/texture_cache/surface_params.cpp
+++ b/src/video_core/texture_cache/surface_params.cpp
@@ -215,10 +215,19 @@ SurfaceParams SurfaceParams::CreateForFramebuffer(Core::System& system, std::siz
     params.num_levels = 1;
     params.emulated_levels = 1;
 
-    const bool is_layered = config.layers > 1 && params.block_depth == 0;
-    params.is_layered = is_layered;
-    params.depth = is_layered ? config.layers.Value() : 1;
-    params.target = is_layered ? SurfaceTarget::Texture2DArray : SurfaceTarget::Texture2D;
+    if (config.memory_layout.is_3d != 0) {
+        params.depth = config.layers.Value();
+        params.is_layered = false;
+        params.target = SurfaceTarget::Texture3D;
+    } else if (config.layers > 1) {
+        params.depth = config.layers.Value();
+        params.is_layered = true;
+        params.target = SurfaceTarget::Texture2DArray;
+    } else {
+        params.depth = 1;
+        params.is_layered = false;
+        params.target = SurfaceTarget::Texture2D;
+    }
     return params;
 }
 
@@ -237,7 +246,7 @@ SurfaceParams SurfaceParams::CreateForFermiCopySurface(
     params.width = config.width;
     params.height = config.height;
     params.pitch = config.pitch;
-    // TODO(Rodrigo): Try to guess the surface target from depth and layer parameters
+    // TODO(Rodrigo): Try to guess texture arrays from parameters
     params.target = SurfaceTarget::Texture2D;
     params.depth = 1;
     params.num_levels = 1;
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index 8bfc541d4..cdcddb225 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -14,6 +14,7 @@
 #include <unordered_map>
 #include <vector>
 
+#include <boost/container/small_vector.hpp>
 #include <boost/icl/interval_map.hpp>
 #include <boost/range/iterator_range.hpp>
 
@@ -23,6 +24,7 @@
 #include "core/core.h"
 #include "core/memory.h"
 #include "core/settings.h"
+#include "video_core/compatible_formats.h"
 #include "video_core/dirty_flags.h"
 #include "video_core/engines/fermi_2d.h"
 #include "video_core/engines/maxwell_3d.h"
@@ -46,13 +48,14 @@ class RasterizerInterface;
 
 namespace VideoCommon {
 
+using VideoCore::Surface::FormatCompatibility;
 using VideoCore::Surface::PixelFormat;
-
 using VideoCore::Surface::SurfaceTarget;
 using RenderTargetConfig = Tegra::Engines::Maxwell3D::Regs::RenderTargetConfig;
 
 template <typename TSurface, typename TView>
 class TextureCache {
+    using VectorSurface = boost::container::small_vector<TSurface, 1>;
 
 public:
     void InvalidateRegion(VAddr addr, std::size_t size) {
@@ -246,7 +249,7 @@ public:
             auto& surface = render_targets[index].target;
             surface->MarkAsRenderTarget(false, NO_RT);
             const auto& cr_params = surface->GetSurfaceParams();
-            if (!cr_params.is_tiled && Settings::values.use_asynchronous_gpu_emulation) {
+            if (!cr_params.is_tiled && Settings::values.use_asynchronous_gpu_emulation.GetValue()) {
                 AsyncFlushSurface(surface);
             }
         }
@@ -296,30 +299,30 @@ public:
         const GPUVAddr src_gpu_addr = src_config.Address();
         const GPUVAddr dst_gpu_addr = dst_config.Address();
         DeduceBestBlit(src_params, dst_params, src_gpu_addr, dst_gpu_addr);
-        const std::optional<VAddr> dst_cpu_addr =
-            system.GPU().MemoryManager().GpuToCpuAddress(dst_gpu_addr);
-        const std::optional<VAddr> src_cpu_addr =
-            system.GPU().MemoryManager().GpuToCpuAddress(src_gpu_addr);
-        std::pair<TSurface, TView> dst_surface =
-            GetSurface(dst_gpu_addr, *dst_cpu_addr, dst_params, true, false);
-        std::pair<TSurface, TView> src_surface =
-            GetSurface(src_gpu_addr, *src_cpu_addr, src_params, true, false);
-        ImageBlit(src_surface.second, dst_surface.second, copy_config);
+
+        const auto& memory_manager = system.GPU().MemoryManager();
+        const std::optional<VAddr> dst_cpu_addr = memory_manager.GpuToCpuAddress(dst_gpu_addr);
+        const std::optional<VAddr> src_cpu_addr = memory_manager.GpuToCpuAddress(src_gpu_addr);
+        std::pair dst_surface = GetSurface(dst_gpu_addr, *dst_cpu_addr, dst_params, true, false);
+        TView src_surface = GetSurface(src_gpu_addr, *src_cpu_addr, src_params, true, false).second;
+        ImageBlit(src_surface, dst_surface.second, copy_config);
         dst_surface.first->MarkAsModified(true, Tick());
     }
 
-    TSurface TryFindFramebufferSurface(VAddr addr) {
+    TSurface TryFindFramebufferSurface(VAddr addr) const {
         if (!addr) {
             return nullptr;
         }
         const VAddr page = addr >> registry_page_bits;
-        std::vector<TSurface>& list = registry[page];
-        for (auto& surface : list) {
-            if (surface->GetCpuAddr() == addr) {
-                return surface;
-            }
+        const auto it = registry.find(page);
+        if (it == registry.end()) {
+            return nullptr;
         }
-        return nullptr;
+        const auto& list = it->second;
+        const auto found = std::find_if(list.begin(), list.end(), [addr](const auto& surface) {
+            return surface->GetCpuAddr() == addr;
+        });
+        return found != list.end() ? *found : nullptr;
     }
 
     u64 Tick() {
@@ -498,18 +501,18 @@ private:
      * @param untopological Indicates to the recycler that the texture has no way
      *                      to match the overlaps due to topological reasons.
      **/
-    RecycleStrategy PickStrategy(std::vector<TSurface>& overlaps, const SurfaceParams& params,
+    RecycleStrategy PickStrategy(VectorSurface& overlaps, const SurfaceParams& params,
                                  const GPUVAddr gpu_addr, const MatchTopologyResult untopological) {
         if (Settings::IsGPULevelExtreme()) {
             return RecycleStrategy::Flush;
         }
         // 3D Textures decision
-        if (params.block_depth > 1 || params.target == SurfaceTarget::Texture3D) {
+        if (params.target == SurfaceTarget::Texture3D) {
             return RecycleStrategy::Flush;
         }
         for (const auto& s : overlaps) {
             const auto& s_params = s->GetSurfaceParams();
-            if (s_params.block_depth > 1 || s_params.target == SurfaceTarget::Texture3D) {
+            if (s_params.target == SurfaceTarget::Texture3D) {
                 return RecycleStrategy::Flush;
             }
         }
@@ -538,9 +541,8 @@ private:
      * @param untopological     Indicates to the recycler that the texture has no way to match the
      *                          overlaps due to topological reasons.
      **/
-    std::pair<TSurface, TView> RecycleSurface(std::vector<TSurface>& overlaps,
-                                              const SurfaceParams& params, const GPUVAddr gpu_addr,
-                                              const bool preserve_contents,
+    std::pair<TSurface, TView> RecycleSurface(VectorSurface& overlaps, const SurfaceParams& params,
+                                              const GPUVAddr gpu_addr, const bool preserve_contents,
                                               const MatchTopologyResult untopological) {
         const bool do_load = preserve_contents && Settings::IsGPULevelExtreme();
         for (auto& surface : overlaps) {
@@ -594,7 +596,7 @@ private:
         } else {
             new_surface = GetUncachedSurface(gpu_addr, params);
         }
-        const auto& final_params = new_surface->GetSurfaceParams();
+        const SurfaceParams& final_params = new_surface->GetSurfaceParams();
         if (cr_params.type != final_params.type) {
             if (Settings::IsGPULevelExtreme()) {
                 BufferCopy(current_surface, new_surface);
@@ -602,7 +604,7 @@ private:
         } else {
             std::vector<CopyParams> bricks = current_surface->BreakDown(final_params);
             for (auto& brick : bricks) {
-                ImageCopy(current_surface, new_surface, brick);
+                TryCopyImage(current_surface, new_surface, brick);
             }
         }
         Unregister(current_surface);
@@ -650,47 +652,65 @@ private:
      * @param params   The parameters on the new surface.
      * @param gpu_addr The starting address of the new surface.
      **/
-    std::optional<std::pair<TSurface, TView>> TryReconstructSurface(std::vector<TSurface>& overlaps,
+    std::optional<std::pair<TSurface, TView>> TryReconstructSurface(VectorSurface& overlaps,
                                                                     const SurfaceParams& params,
-                                                                    const GPUVAddr gpu_addr) {
+                                                                    GPUVAddr gpu_addr) {
         if (params.target == SurfaceTarget::Texture3D) {
-            return {};
+            return std::nullopt;
         }
-        bool modified = false;
+        const auto test_modified = [](TSurface& surface) { return surface->IsModified(); };
         TSurface new_surface = GetUncachedSurface(gpu_addr, params);
-        u32 passed_tests = 0;
+
+        if (std::none_of(overlaps.begin(), overlaps.end(), test_modified)) {
+            LoadSurface(new_surface);
+            for (const auto& surface : overlaps) {
+                Unregister(surface);
+            }
+            Register(new_surface);
+            return {{new_surface, new_surface->GetMainView()}};
+        }
+
+        std::size_t passed_tests = 0;
         for (auto& surface : overlaps) {
             const SurfaceParams& src_params = surface->GetSurfaceParams();
-            if (src_params.is_layered || src_params.num_levels > 1) {
-                // We send this cases to recycle as they are more complex to handle
-                return {};
-            }
-            const std::size_t candidate_size = surface->GetSizeInBytes();
-            auto mipmap_layer{new_surface->GetLayerMipmap(surface->GetGpuAddr())};
+            const auto mipmap_layer{new_surface->GetLayerMipmap(surface->GetGpuAddr())};
             if (!mipmap_layer) {
                 continue;
             }
-            const auto [layer, mipmap] = *mipmap_layer;
-            if (new_surface->GetMipmapSize(mipmap) != candidate_size) {
+            const auto [base_layer, base_mipmap] = *mipmap_layer;
+            if (new_surface->GetMipmapSize(base_mipmap) != surface->GetMipmapSize(0)) {
                 continue;
             }
-            modified |= surface->IsModified();
-            // Now we got all the data set up
-            const u32 width = SurfaceParams::IntersectWidth(src_params, params, 0, mipmap);
-            const u32 height = SurfaceParams::IntersectHeight(src_params, params, 0, mipmap);
-            const CopyParams copy_params(0, 0, 0, 0, 0, layer, 0, mipmap, width, height, 1);
-            passed_tests++;
-            ImageCopy(surface, new_surface, copy_params);
+            ++passed_tests;
+
+            // Copy all mipmaps and layers
+            const u32 block_width = params.GetDefaultBlockWidth();
+            const u32 block_height = params.GetDefaultBlockHeight();
+            for (u32 mipmap = base_mipmap; mipmap < base_mipmap + src_params.num_levels; ++mipmap) {
+                const u32 width = SurfaceParams::IntersectWidth(src_params, params, 0, mipmap);
+                const u32 height = SurfaceParams::IntersectHeight(src_params, params, 0, mipmap);
+                if (width < block_width || height < block_height) {
+                    // Current APIs forbid copying small compressed textures, avoid errors
+                    break;
+                }
+                const CopyParams copy_params(0, 0, 0, 0, 0, base_layer, 0, mipmap, width, height,
+                                             src_params.depth);
+                TryCopyImage(surface, new_surface, copy_params);
+            }
         }
         if (passed_tests == 0) {
-            return {};
+            return std::nullopt;
+        }
+        if (Settings::IsGPULevelExtreme() && passed_tests != overlaps.size()) {
             // In Accurate GPU all tests should pass, else we recycle
-        } else if (Settings::IsGPULevelExtreme() && passed_tests != overlaps.size()) {
-            return {};
+            return std::nullopt;
         }
+
+        const bool modified = std::any_of(overlaps.begin(), overlaps.end(), test_modified);
         for (const auto& surface : overlaps) {
             Unregister(surface);
         }
+
         new_surface->MarkAsModified(modified, Tick());
         Register(new_surface);
         return {{new_surface, new_surface->GetMainView()}};
@@ -708,53 +728,11 @@ private:
      * @param preserve_contents Indicates that the new surface should be loaded from memory or
      *                          left blank.
      */
-    std::optional<std::pair<TSurface, TView>> Manage3DSurfaces(std::vector<TSurface>& overlaps,
+    std::optional<std::pair<TSurface, TView>> Manage3DSurfaces(VectorSurface& overlaps,
                                                                const SurfaceParams& params,
-                                                               const GPUVAddr gpu_addr,
-                                                               const VAddr cpu_addr,
+                                                               GPUVAddr gpu_addr, VAddr cpu_addr,
                                                                bool preserve_contents) {
-        if (params.target == SurfaceTarget::Texture3D) {
-            bool failed = false;
-            if (params.num_levels > 1) {
-                // We can't handle mipmaps in 3D textures yet, better fallback to LLE approach
-                return std::nullopt;
-            }
-            TSurface new_surface = GetUncachedSurface(gpu_addr, params);
-            bool modified = false;
-            for (auto& surface : overlaps) {
-                const SurfaceParams& src_params = surface->GetSurfaceParams();
-                if (src_params.target != SurfaceTarget::Texture2D) {
-                    failed = true;
-                    break;
-                }
-                if (src_params.height != params.height) {
-                    failed = true;
-                    break;
-                }
-                if (src_params.block_depth != params.block_depth ||
-                    src_params.block_height != params.block_height) {
-                    failed = true;
-                    break;
-                }
-                const u32 offset = static_cast<u32>(surface->GetCpuAddr() - cpu_addr);
-                const auto offsets = params.GetBlockOffsetXYZ(offset);
-                const auto z = std::get<2>(offsets);
-                modified |= surface->IsModified();
-                const CopyParams copy_params(0, 0, 0, 0, 0, z, 0, 0, params.width, params.height,
-                                             1);
-                ImageCopy(surface, new_surface, copy_params);
-            }
-            if (failed) {
-                return std::nullopt;
-            }
-            for (const auto& surface : overlaps) {
-                Unregister(surface);
-            }
-            new_surface->MarkAsModified(modified, Tick());
-            Register(new_surface);
-            auto view = new_surface->GetMainView();
-            return {{std::move(new_surface), view}};
-        } else {
+        if (params.target != SurfaceTarget::Texture3D) {
             for (const auto& surface : overlaps) {
                 if (!surface->MatchTarget(params.target)) {
                     if (overlaps.size() == 1 && surface->GetCpuAddr() == cpu_addr) {
@@ -770,11 +748,60 @@ private:
                     continue;
                 }
                 if (surface->MatchesStructure(params) == MatchStructureResult::FullMatch) {
-                    return {{surface, surface->GetMainView()}};
+                    return std::make_pair(surface, surface->GetMainView());
                 }
             }
             return InitializeSurface(gpu_addr, params, preserve_contents);
         }
+
+        if (params.num_levels > 1) {
+            // We can't handle mipmaps in 3D textures yet, better to fall back to the LLE approach
+            return std::nullopt;
+        }
+
+        if (overlaps.size() == 1) {
+            const auto& surface = overlaps[0];
+            const SurfaceParams& overlap_params = surface->GetSurfaceParams();
+            // Don't attempt to render to textures with more than one level for now
+            // The requested address can't precede the overlap's start if we want to render to it
+            if (overlap_params.num_levels == 1 && cpu_addr >= surface->GetCpuAddr()) {
+                const u32 offset = static_cast<u32>(cpu_addr - surface->GetCpuAddr());
+                const u32 slice = std::get<2>(params.GetBlockOffsetXYZ(offset));
+                if (slice < overlap_params.depth) {
+                    auto view = surface->Emplace3DView(slice, params.depth, 0, 1);
+                    return std::make_pair(std::move(surface), std::move(view));
+                }
+            }
+        }
+
+        TSurface new_surface = GetUncachedSurface(gpu_addr, params);
+        bool modified = false;
+
+        for (auto& surface : overlaps) {
+            const SurfaceParams& src_params = surface->GetSurfaceParams();
+            if (src_params.target != SurfaceTarget::Texture2D ||
+                src_params.height != params.height ||
+                src_params.block_depth != params.block_depth ||
+                src_params.block_height != params.block_height) {
+                return std::nullopt;
+            }
+            modified |= surface->IsModified();
+
+            const u32 offset = static_cast<u32>(surface->GetCpuAddr() - cpu_addr);
+            const u32 slice = std::get<2>(params.GetBlockOffsetXYZ(offset));
+            const u32 width = params.width;
+            const u32 height = params.height;
+            const CopyParams copy_params(0, 0, 0, 0, 0, slice, 0, 0, width, height, 1);
+            TryCopyImage(surface, new_surface, copy_params);
+        }
+        for (const auto& surface : overlaps) {
+            Unregister(surface);
+        }
+        new_surface->MarkAsModified(modified, Tick());
+        Register(new_surface);
+
+        TView view = new_surface->GetMainView();
+        return std::make_pair(std::move(new_surface), std::move(view));
     }
 
     /**
@@ -810,7 +837,7 @@ private:
             TSurface& current_surface = iter->second;
             const auto topological_result = current_surface->MatchesTopology(params);
             if (topological_result != MatchTopologyResult::FullMatch) {
-                std::vector<TSurface> overlaps{current_surface};
+                VectorSurface overlaps{current_surface};
                 return RecycleSurface(overlaps, params, gpu_addr, preserve_contents,
                                       topological_result);
             }
@@ -852,7 +879,7 @@ private:
             }
         }
 
-        // Check if it's a 3D texture
+        // Manage 3D textures
         if (params.block_depth > 0) {
             auto surface =
                 Manage3DSurfaces(overlaps, params, gpu_addr, cpu_addr, preserve_contents);
@@ -868,12 +895,9 @@ private:
             // two things either the candidate surface is a supertexture of the overlap
             // or they don't match in any known way.
             if (!current_surface->IsInside(gpu_addr, gpu_addr + candidate_size)) {
-                if (current_surface->GetGpuAddr() == gpu_addr) {
-                    std::optional<std::pair<TSurface, TView>> view =
-                        TryReconstructSurface(overlaps, params, gpu_addr);
-                    if (view) {
-                        return *view;
-                    }
+                const std::optional view = TryReconstructSurface(overlaps, params, gpu_addr);
+                if (view) {
+                    return *view;
                 }
                 return RecycleSurface(overlaps, params, gpu_addr, preserve_contents,
                                       MatchTopologyResult::FullMatch);
@@ -1030,7 +1054,7 @@ private:
     void DeduceBestBlit(SurfaceParams& src_params, SurfaceParams& dst_params,
                         const GPUVAddr src_gpu_addr, const GPUVAddr dst_gpu_addr) {
         auto deduced_src = DeduceSurface(src_gpu_addr, src_params);
-        auto deduced_dst = DeduceSurface(src_gpu_addr, src_params);
+        auto deduced_dst = DeduceSurface(dst_gpu_addr, dst_params);
         if (deduced_src.Failed() || deduced_dst.Failed()) {
             return;
         }
@@ -1126,23 +1150,25 @@ private:
         }
     }
 
-    std::vector<TSurface> GetSurfacesInRegion(const VAddr cpu_addr, const std::size_t size) {
+    VectorSurface GetSurfacesInRegion(const VAddr cpu_addr, const std::size_t size) {
         if (size == 0) {
             return {};
         }
         const VAddr cpu_addr_end = cpu_addr + size;
-        VAddr start = cpu_addr >> registry_page_bits;
         const VAddr end = (cpu_addr_end - 1) >> registry_page_bits;
-        std::vector<TSurface> surfaces;
-        while (start <= end) {
-            std::vector<TSurface>& list = registry[start];
-            for (auto& surface : list) {
-                if (!surface->IsPicked() && surface->Overlaps(cpu_addr, cpu_addr_end)) {
-                    surface->MarkAsPicked(true);
-                    surfaces.push_back(surface);
+        VectorSurface surfaces;
+        for (VAddr start = cpu_addr >> registry_page_bits; start <= end; ++start) {
+            const auto it = registry.find(start);
+            if (it == registry.end()) {
+                continue;
+            }
+            for (auto& surface : it->second) {
+                if (surface->IsPicked() || !surface->Overlaps(cpu_addr, cpu_addr_end)) {
+                    continue;
                 }
+                surface->MarkAsPicked(true);
+                surfaces.push_back(surface);
             }
-            start++;
         }
         for (auto& surface : surfaces) {
             surface->MarkAsPicked(false);
@@ -1167,6 +1193,19 @@ private:
         return {};
     }
 
+    /// Copy src to dst with ImageCopy, or log an error and skip if TestCopy rejects the formats.
+    void TryCopyImage(TSurface& src, TSurface& dst, const CopyParams& copy) {
+        const SurfaceParams& src_params = src->GetSurfaceParams();
+        const SurfaceParams& dst_params = dst->GetSurfaceParams();
+        if (!format_compatibility.TestCopy(src_params.pixel_format, dst_params.pixel_format)) {
+            LOG_ERROR(HW_GPU, "Illegal copy between formats={{{}, {}}}",
+                      static_cast<int>(dst_params.pixel_format),
+                      static_cast<int>(src_params.pixel_format));
+            return;
+        }
+        ImageCopy(src, dst, copy);
+    }
+
     constexpr PixelFormat GetSiblingFormat(PixelFormat format) const {
         return siblings_table[static_cast<std::size_t>(format)];
     }
@@ -1216,6 +1255,7 @@ private:
     VideoCore::RasterizerInterface& rasterizer;
 
     FormatLookupTable format_lookup_table;
+    FormatCompatibility format_compatibility;
 
     u64 ticks{};
 
diff --git a/src/video_core/textures/texture.cpp b/src/video_core/textures/texture.cpp
index d1939d744..4171e3ef2 100644
--- a/src/video_core/textures/texture.cpp
+++ b/src/video_core/textures/texture.cpp
@@ -48,7 +48,7 @@ constexpr std::array<float, 256> SRGB_CONVERSION_LUT = {
 };
 
 unsigned SettingsMinimumAnisotropy() noexcept {
-    switch (static_cast<Anisotropy>(Settings::values.max_anisotropy)) {
+    switch (static_cast<Anisotropy>(Settings::values.max_anisotropy.GetValue())) {
     default:
     case Anisotropy::Default:
         return 1U;
diff --git a/src/video_core/video_core.cpp b/src/video_core/video_core.cpp
index f60bdc60a..45f360bdd 100644
--- a/src/video_core/video_core.cpp
+++ b/src/video_core/video_core.cpp
@@ -19,7 +19,7 @@ namespace {
 std::unique_ptr<VideoCore::RendererBase> CreateRenderer(Core::Frontend::EmuWindow& emu_window,
                                                         Core::System& system,
                                                         Core::Frontend::GraphicsContext& context) {
-    switch (Settings::values.renderer_backend) {
+    switch (Settings::values.renderer_backend.GetValue()) {
     case Settings::RendererBackend::OpenGL:
         return std::make_unique<OpenGL::RendererOpenGL>(emu_window, system, context);
 #ifdef HAS_VULKAN
@@ -42,7 +42,7 @@ std::unique_ptr<Tegra::GPU> CreateGPU(Core::Frontend::EmuWindow& emu_window, Cor
         return nullptr;
     }
 
-    if (Settings::values.use_asynchronous_gpu_emulation) {
+    if (Settings::values.use_asynchronous_gpu_emulation.GetValue()) {
         return std::make_unique<VideoCommon::GPUAsynch>(system, std::move(renderer),
                                                         std::move(context));
     }
@@ -51,8 +51,8 @@ std::unique_ptr<Tegra::GPU> CreateGPU(Core::Frontend::EmuWindow& emu_window, Cor
 
 u16 GetResolutionScaleFactor(const RendererBase& renderer) {
     return static_cast<u16>(
-        Settings::values.resolution_factor != 0
-            ? Settings::values.resolution_factor
+        Settings::values.resolution_factor.GetValue() != 0
+            ? Settings::values.resolution_factor.GetValue()
             : renderer.GetRenderWindow().GetFramebufferLayout().GetScalingRatio());
 }
 
diff --git a/src/yuzu/CMakeLists.txt b/src/yuzu/CMakeLists.txt
index 8b9404718..6b25a7fa0 100644
--- a/src/yuzu/CMakeLists.txt
+++ b/src/yuzu/CMakeLists.txt
@@ -24,6 +24,8 @@ add_executable(yuzu
     compatibility_list.h
     configuration/config.cpp
     configuration/config.h
+    configuration/configuration_shared.cpp
+    configuration/configuration_shared.h
     configuration/configure.ui
     configuration/configure_audio.cpp
     configuration/configure_audio.h
@@ -60,9 +62,12 @@ add_executable(yuzu
     configuration/configure_mouse_advanced.cpp
     configuration/configure_mouse_advanced.h
     configuration/configure_mouse_advanced.ui
-    configuration/configure_per_general.cpp
-    configuration/configure_per_general.h
-    configuration/configure_per_general.ui
+    configuration/configure_per_game.cpp
+    configuration/configure_per_game.h
+    configuration/configure_per_game.ui
+    configuration/configure_per_game_addons.cpp
+    configuration/configure_per_game_addons.h
+    configuration/configure_per_game_addons.ui
     configuration/configure_profile_manager.cpp
     configuration/configure_profile_manager.h
     configuration/configure_profile_manager.ui
@@ -147,7 +152,7 @@ endif()
 create_target_directory_groups(yuzu)
 
 target_link_libraries(yuzu PRIVATE common core input_common video_core)
-target_link_libraries(yuzu PRIVATE Boost::boost glad Qt5::OpenGL Qt5::Widgets)
+target_link_libraries(yuzu PRIVATE Boost::boost glad Qt5::Widgets)
 target_link_libraries(yuzu PRIVATE ${PLATFORM_LIBRARIES} Threads::Threads)
 
 if (ENABLE_VULKAN AND NOT WIN32)
@@ -208,6 +213,10 @@ if (MSVC)
     copy_yuzu_unicorn_deps(yuzu)
 endif()
 
+if (NOT APPLE)
+    target_compile_definitions(yuzu PRIVATE HAS_OPENGL)
+endif()
+
 if (ENABLE_VULKAN)
     target_include_directories(yuzu PRIVATE ../../externals/Vulkan-Headers/include)
     target_compile_definitions(yuzu PRIVATE HAS_VULKAN)
diff --git a/src/yuzu/bootmanager.cpp b/src/yuzu/bootmanager.cpp
index 1adf8932b..5738787ac 100644
--- a/src/yuzu/bootmanager.cpp
+++ b/src/yuzu/bootmanager.cpp
@@ -8,13 +8,16 @@
 #include <QHBoxLayout>
 #include <QKeyEvent>
 #include <QMessageBox>
-#include <QOffscreenSurface>
-#include <QOpenGLContext>
 #include <QPainter>
 #include <QScreen>
 #include <QStringList>
 #include <QWindow>
 
+#ifdef HAS_OPENGL
+#include <QOffscreenSurface>
+#include <QOpenGLContext>
+#endif
+
 #if !defined(WIN32) && HAS_VULKAN
 #include <qpa/qplatformnativeinterface.h>
 #endif
@@ -41,49 +44,65 @@ EmuThread::EmuThread() = default;
 EmuThread::~EmuThread() = default;
 
 void EmuThread::run() {
-    MicroProfileOnThreadCreate("EmuThread");
+    std::string name = "yuzu:EmuControlThread";
+    MicroProfileOnThreadCreate(name.c_str());
+    Common::SetCurrentThreadName(name.c_str());
+
+    auto& system = Core::System::GetInstance();
+
+    system.RegisterHostThread();
+
+    auto& gpu = system.GPU();
 
     // Main process has been loaded. Make the context current to this thread and begin GPU and CPU
     // execution.
-    Core::System::GetInstance().GPU().Start();
+    gpu.Start();
+
+    gpu.ObtainContext();
 
     emit LoadProgress(VideoCore::LoadCallbackStage::Prepare, 0, 0);
 
-    Core::System::GetInstance().Renderer().Rasterizer().LoadDiskResources(
+    system.Renderer().Rasterizer().LoadDiskResources(
         stop_run, [this](VideoCore::LoadCallbackStage stage, std::size_t value, std::size_t total) {
             emit LoadProgress(stage, value, total);
         });
 
     emit LoadProgress(VideoCore::LoadCallbackStage::Complete, 0, 0);
 
+    gpu.ReleaseContext();
+
     // Holds whether the cpu was running during the last iteration,
     // so that the DebugModeLeft signal can be emitted before the
     // next execution step
     bool was_active = false;
     while (!stop_run) {
         if (running) {
-            if (!was_active)
+            if (was_active) {
                 emit DebugModeLeft();
+            }
 
-            Core::System::ResultStatus result = Core::System::GetInstance().RunLoop();
+            running_guard = true;
+            Core::System::ResultStatus result = system.Run();
             if (result != Core::System::ResultStatus::Success) {
+                running_guard = false;
                 this->SetRunning(false);
-                emit ErrorThrown(result, Core::System::GetInstance().GetStatusDetails());
+                emit ErrorThrown(result, system.GetStatusDetails());
             }
+            running_wait.Wait();
+            result = system.Pause();
+            if (result != Core::System::ResultStatus::Success) {
+                running_guard = false;
+                this->SetRunning(false);
+                emit ErrorThrown(result, system.GetStatusDetails());
+            }
+            running_guard = false;
 
-            was_active = running || exec_step;
-            if (!was_active && !stop_run)
+            if (!stop_run) {
+                was_active = true;
                 emit DebugModeEntered();
+            }
         } else if (exec_step) {
-            if (!was_active)
-                emit DebugModeLeft();
-
-            exec_step = false;
-            Core::System::GetInstance().SingleStep();
-            emit DebugModeEntered();
-            yieldCurrentThread();
-
-            was_active = false;
+            UNIMPLEMENTED();
         } else {
             std::unique_lock lock{running_mutex};
             running_cv.wait(lock, [this] { return IsRunning() || exec_step || stop_run; });
@@ -91,13 +110,14 @@ void EmuThread::run() {
     }
 
     // Shutdown the core emulation
-    Core::System::GetInstance().Shutdown();
+    system.Shutdown();
 
 #if MICROPROFILE_ENABLED
     MicroProfileOnThreadExit();
 #endif
 }
 
+#ifdef HAS_OPENGL
 class OpenGLSharedContext : public Core::Frontend::GraphicsContext {
 public:
     /// Create the original context that should be shared from
@@ -106,6 +126,9 @@ public:
         format.setVersion(4, 3);
         format.setProfile(QSurfaceFormat::CompatibilityProfile);
         format.setOption(QSurfaceFormat::FormatOption::DeprecatedFunctions);
+        if (Settings::values.renderer_debug) {
+            format.setOption(QSurfaceFormat::FormatOption::DebugContext);
+        }
         // TODO: expose a setting for buffer value (ie default/single/double/triple)
         format.setSwapBehavior(QSurfaceFormat::DefaultSwapBehavior);
         format.setSwapInterval(0);
@@ -122,7 +145,7 @@ public:
 
         // disable vsync for any shared contexts
         auto format = share_context->format();
-        format.setSwapInterval(main_surface ? Settings::values.use_vsync : 0);
+        format.setSwapInterval(main_surface ? Settings::values.use_vsync.GetValue() : 0);
 
         context = std::make_unique<QOpenGLContext>();
         context->setShareContext(share_context);
@@ -180,6 +203,7 @@ private:
     std::unique_ptr<QOffscreenSurface> offscreen_surface{};
     QSurface* surface;
 };
+#endif
 
 class DummyContext : public Core::Frontend::GraphicsContext {};
 
@@ -352,7 +376,7 @@ QByteArray GRenderWindow::saveGeometry() {
 }
 
 qreal GRenderWindow::windowPixelRatio() const {
-    return devicePixelRatio();
+    return devicePixelRatioF();
 }
 
 std::pair<u32, u32> GRenderWindow::ScaleTouch(const QPointF& pos) const {
@@ -470,13 +494,15 @@ void GRenderWindow::resizeEvent(QResizeEvent* event) {
 }
 
 std::unique_ptr<Core::Frontend::GraphicsContext> GRenderWindow::CreateSharedContext() const {
-    if (Settings::values.renderer_backend == Settings::RendererBackend::OpenGL) {
+#ifdef HAS_OPENGL
+    if (Settings::values.renderer_backend.GetValue() == Settings::RendererBackend::OpenGL) {
         auto c = static_cast<OpenGLSharedContext*>(main_context.get());
         // Bind the shared contexts to the main surface in case the backend wants to take over
         // presentation
         return std::make_unique<OpenGLSharedContext>(c->GetShareContext(),
                                                      child_widget->windowHandle());
     }
+#endif
     return std::make_unique<DummyContext>();
 }
 
@@ -485,7 +511,7 @@ bool GRenderWindow::InitRenderTarget() {
 
     first_frame = false;
 
-    switch (Settings::values.renderer_backend) {
+    switch (Settings::values.renderer_backend.GetValue()) {
     case Settings::RendererBackend::OpenGL:
         if (!InitializeOpenGL()) {
             return false;
@@ -512,7 +538,7 @@ bool GRenderWindow::InitRenderTarget() {
     OnFramebufferSizeChanged();
     BackupGeometry();
 
-    if (Settings::values.renderer_backend == Settings::RendererBackend::OpenGL) {
+    if (Settings::values.renderer_backend.GetValue() == Settings::RendererBackend::OpenGL) {
         if (!LoadOpenGL()) {
             return false;
         }
@@ -557,6 +583,7 @@ void GRenderWindow::OnMinimalClientAreaChangeRequest(std::pair<u32, u32> minimal
 }
 
 bool GRenderWindow::InitializeOpenGL() {
+#ifdef HAS_OPENGL
     // TODO: One of these flags might be interesting: WA_OpaquePaintEvent, WA_NoBackground,
     // WA_DontShowOnScreen, WA_DeleteOnClose
     auto child = new OpenGLRenderWidget(this);
@@ -568,6 +595,11 @@ bool GRenderWindow::InitializeOpenGL() {
         std::make_unique<OpenGLSharedContext>(context->GetShareContext(), child->windowHandle()));
 
     return true;
+#else
+    QMessageBox::warning(this, tr("OpenGL not available!"),
+                         tr("yuzu has not been compiled with OpenGL support."));
+    return false;
+#endif
 }
 
 bool GRenderWindow::InitializeVulkan() {
diff --git a/src/yuzu/bootmanager.h b/src/yuzu/bootmanager.h
index 3626604ca..6c59b4d5c 100644
--- a/src/yuzu/bootmanager.h
+++ b/src/yuzu/bootmanager.h
@@ -59,6 +59,12 @@ public:
         this->running = running;
         lock.unlock();
         running_cv.notify_all();
+        if (!running) {
+            running_wait.Set();
+            /// Wait until effectively paused
+            while (running_guard)
+                ;
+        }
     }
 
     /**
@@ -84,6 +90,8 @@ private:
     std::atomic_bool stop_run{false};
     std::mutex running_mutex;
     std::condition_variable running_cv;
+    Common::Event running_wait{};
+    std::atomic_bool running_guard{false};
 
 signals:
     /**
diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp
index b08b87426..1b2b1b2bb 100644
--- a/src/yuzu/configuration/config.cpp
+++ b/src/yuzu/configuration/config.cpp
@@ -13,17 +13,20 @@
 #include "input_common/udp/client.h"
 #include "yuzu/configuration/config.h"
 
-Config::Config() {
+Config::Config(const std::string& config_file, bool is_global) {
     // TODO: Don't hardcode the path; let the frontend decide where to put the config files.
-    qt_config_loc = FileUtil::GetUserPath(FileUtil::UserPath::ConfigDir) + "qt-config.ini";
+    qt_config_loc = FileUtil::GetUserPath(FileUtil::UserPath::ConfigDir) + config_file;
     FileUtil::CreateFullPath(qt_config_loc);
     qt_config =
         std::make_unique<QSettings>(QString::fromStdString(qt_config_loc), QSettings::IniFormat);
+    global = is_global;
     Reload();
 }
 
 Config::~Config() {
-    Save();
+    if (global) {
+        Save();
+    }
 }
 
 const std::array<int, Settings::NativeButton::NumButtons> Config::default_buttons = {
@@ -211,8 +214,8 @@ const std::array<int, Settings::NativeKeyboard::NumKeyboardMods> Config::default
 // This must be in alphabetical order according to action name as it must have the same order as
 // UISetting::values.shortcuts, which is alphabetically ordered.
 // clang-format off
-const std::array<UISettings::Shortcut, 15> Config::default_hotkeys{{
-    {QStringLiteral("Capture Screenshot"),       QStringLiteral("Main Window"), {QStringLiteral("Ctrl+P"), Qt::ApplicationShortcut}},
+const std::array<UISettings::Shortcut, 16> Config::default_hotkeys{{
+    {QStringLiteral("Capture Screenshot"),       QStringLiteral("Main Window"), {QStringLiteral("Ctrl+P"), Qt::WidgetWithChildrenShortcut}},
     {QStringLiteral("Change Docked Mode"),       QStringLiteral("Main Window"), {QStringLiteral("F10"), Qt::ApplicationShortcut}},
     {QStringLiteral("Continue/Pause Emulation"), QStringLiteral("Main Window"), {QStringLiteral("F4"), Qt::WindowShortcut}},
     {QStringLiteral("Decrease Speed Limit"),     QStringLiteral("Main Window"), {QStringLiteral("-"), Qt::ApplicationShortcut}},
@@ -220,8 +223,9 @@ const std::array<UISettings::Shortcut, 15> Config::default_hotkeys{{
     {QStringLiteral("Exit yuzu"),                QStringLiteral("Main Window"), {QStringLiteral("Ctrl+Q"), Qt::WindowShortcut}},
     {QStringLiteral("Fullscreen"),               QStringLiteral("Main Window"), {QStringLiteral("F11"), Qt::WindowShortcut}},
     {QStringLiteral("Increase Speed Limit"),     QStringLiteral("Main Window"), {QStringLiteral("+"), Qt::ApplicationShortcut}},
-    {QStringLiteral("Load Amiibo"),              QStringLiteral("Main Window"), {QStringLiteral("F2"), Qt::ApplicationShortcut}},
-    {QStringLiteral("Load File"),                QStringLiteral("Main Window"), {QStringLiteral("Ctrl+O"), Qt::WindowShortcut}},
+    {QStringLiteral("Load Amiibo"),              QStringLiteral("Main Window"), {QStringLiteral("F2"), Qt::WidgetWithChildrenShortcut}},
+    {QStringLiteral("Load File"),                QStringLiteral("Main Window"), {QStringLiteral("Ctrl+O"), Qt::WidgetWithChildrenShortcut}},
+    {QStringLiteral("Mute Audio"),               QStringLiteral("Main Window"), {QStringLiteral("Ctrl+M"), Qt::WindowShortcut}},
     {QStringLiteral("Restart Emulation"),        QStringLiteral("Main Window"), {QStringLiteral("F6"), Qt::WindowShortcut}},
     {QStringLiteral("Stop Emulation"),           QStringLiteral("Main Window"), {QStringLiteral("F5"), Qt::WindowShortcut}},
     {QStringLiteral("Toggle Filter Bar"),        QStringLiteral("Main Window"), {QStringLiteral("Ctrl+F"), Qt::WindowShortcut}},
@@ -401,16 +405,19 @@ void Config::ApplyDefaultProfileIfInputInvalid() {
 void Config::ReadAudioValues() {
     qt_config->beginGroup(QStringLiteral("Audio"));
 
-    Settings::values.sink_id = ReadSetting(QStringLiteral("output_engine"), QStringLiteral("auto"))
-                                   .toString()
-                                   .toStdString();
-    Settings::values.enable_audio_stretching =
-        ReadSetting(QStringLiteral("enable_audio_stretching"), true).toBool();
-    Settings::values.audio_device_id =
-        ReadSetting(QStringLiteral("output_device"), QStringLiteral("auto"))
-            .toString()
-            .toStdString();
-    Settings::values.volume = ReadSetting(QStringLiteral("volume"), 1).toFloat();
+    if (global) {
+        Settings::values.sink_id =
+            ReadSetting(QStringLiteral("output_engine"), QStringLiteral("auto"))
+                .toString()
+                .toStdString();
+        Settings::values.audio_device_id =
+            ReadSetting(QStringLiteral("output_device"), QStringLiteral("auto"))
+                .toString()
+                .toStdString();
+    }
+    ReadSettingGlobal(Settings::values.enable_audio_stretching,
+                      QStringLiteral("enable_audio_stretching"), true);
+    ReadSettingGlobal(Settings::values.volume, QStringLiteral("volume"), 1);
 
     qt_config->endGroup();
 }
@@ -439,6 +446,8 @@ void Config::ReadControlValues() {
             .toInt());
     Settings::values.udp_pad_index =
         static_cast<u8>(ReadSetting(QStringLiteral("udp_pad_index"), 0).toUInt());
+    Settings::values.use_docked_mode =
+        ReadSetting(QStringLiteral("use_docked_mode"), false).toBool();
 
     qt_config->endGroup();
 }
@@ -446,7 +455,7 @@ void Config::ReadControlValues() {
 void Config::ReadCoreValues() {
     qt_config->beginGroup(QStringLiteral("Core"));
 
-    Settings::values.use_multi_core = ReadSetting(QStringLiteral("use_multi_core"), false).toBool();
+    ReadSettingGlobal(Settings::values.use_multi_core, QStringLiteral("use_multi_core"), false);
 
     qt_config->endGroup();
 }
@@ -533,6 +542,8 @@ void Config::ReadDebuggingValues() {
     Settings::values.quest_flag = ReadSetting(QStringLiteral("quest_flag"), false).toBool();
     Settings::values.disable_cpu_opt =
         ReadSetting(QStringLiteral("disable_cpu_opt"), false).toBool();
+    Settings::values.disable_macro_jit =
+        ReadSetting(QStringLiteral("disable_macro_jit"), false).toBool();
 
     qt_config->endGroup();
 }
@@ -625,34 +636,28 @@ void Config::ReadPathValues() {
 void Config::ReadRendererValues() {
     qt_config->beginGroup(QStringLiteral("Renderer"));
 
-    Settings::values.renderer_backend =
-        static_cast<Settings::RendererBackend>(ReadSetting(QStringLiteral("backend"), 0).toInt());
-    Settings::values.renderer_debug = ReadSetting(QStringLiteral("debug"), false).toBool();
-    Settings::values.vulkan_device = ReadSetting(QStringLiteral("vulkan_device"), 0).toInt();
-    Settings::values.resolution_factor =
-        ReadSetting(QStringLiteral("resolution_factor"), 1.0).toFloat();
-    Settings::values.aspect_ratio = ReadSetting(QStringLiteral("aspect_ratio"), 0).toInt();
-    Settings::values.max_anisotropy = ReadSetting(QStringLiteral("max_anisotropy"), 0).toInt();
-    Settings::values.use_frame_limit =
-        ReadSetting(QStringLiteral("use_frame_limit"), true).toBool();
-    Settings::values.frame_limit = ReadSetting(QStringLiteral("frame_limit"), 100).toInt();
-    Settings::values.use_disk_shader_cache =
-        ReadSetting(QStringLiteral("use_disk_shader_cache"), true).toBool();
-    const int gpu_accuracy_level = ReadSetting(QStringLiteral("gpu_accuracy"), 0).toInt();
-    Settings::values.gpu_accuracy = static_cast<Settings::GPUAccuracy>(gpu_accuracy_level);
-    Settings::values.use_asynchronous_gpu_emulation =
-        ReadSetting(QStringLiteral("use_asynchronous_gpu_emulation"), false).toBool();
-    Settings::values.use_vsync = ReadSetting(QStringLiteral("use_vsync"), true).toBool();
-    Settings::values.use_assembly_shaders =
-        ReadSetting(QStringLiteral("use_assembly_shaders"), false).toBool();
-    Settings::values.use_fast_gpu_time =
-        ReadSetting(QStringLiteral("use_fast_gpu_time"), true).toBool();
-    Settings::values.force_30fps_mode =
-        ReadSetting(QStringLiteral("force_30fps_mode"), false).toBool();
-
-    Settings::values.bg_red = ReadSetting(QStringLiteral("bg_red"), 0.0).toFloat();
-    Settings::values.bg_green = ReadSetting(QStringLiteral("bg_green"), 0.0).toFloat();
-    Settings::values.bg_blue = ReadSetting(QStringLiteral("bg_blue"), 0.0).toFloat();
+    ReadSettingGlobal(Settings::values.renderer_backend, QStringLiteral("backend"), 0);
+    ReadSettingGlobal(Settings::values.renderer_debug, QStringLiteral("debug"), false);
+    ReadSettingGlobal(Settings::values.vulkan_device, QStringLiteral("vulkan_device"), 0);
+    ReadSettingGlobal(Settings::values.aspect_ratio, QStringLiteral("aspect_ratio"), 0);
+    ReadSettingGlobal(Settings::values.max_anisotropy, QStringLiteral("max_anisotropy"), 0);
+    ReadSettingGlobal(Settings::values.use_frame_limit, QStringLiteral("use_frame_limit"), true);
+    ReadSettingGlobal(Settings::values.frame_limit, QStringLiteral("frame_limit"), 100);
+    ReadSettingGlobal(Settings::values.use_disk_shader_cache,
+                      QStringLiteral("use_disk_shader_cache"), true);
+    ReadSettingGlobal(Settings::values.gpu_accuracy, QStringLiteral("gpu_accuracy"), 0);
+    ReadSettingGlobal(Settings::values.use_asynchronous_gpu_emulation,
+                      QStringLiteral("use_asynchronous_gpu_emulation"), false);
+    ReadSettingGlobal(Settings::values.use_vsync, QStringLiteral("use_vsync"), true);
+    ReadSettingGlobal(Settings::values.use_assembly_shaders, QStringLiteral("use_assembly_shaders"),
+                      false);
+    ReadSettingGlobal(Settings::values.use_fast_gpu_time, QStringLiteral("use_fast_gpu_time"),
+                      true);
+    ReadSettingGlobal(Settings::values.force_30fps_mode, QStringLiteral("force_30fps_mode"), false);
+
+    ReadSettingGlobal(Settings::values.bg_red, QStringLiteral("bg_red"), 0.0);
+    ReadSettingGlobal(Settings::values.bg_green, QStringLiteral("bg_green"), 0.0);
+    ReadSettingGlobal(Settings::values.bg_blue, QStringLiteral("bg_blue"), 0.0);
 
     qt_config->endGroup();
 }
@@ -664,11 +669,13 @@ void Config::ReadShortcutValues() {
         const auto& [keyseq, context] = shortcut;
         qt_config->beginGroup(group);
         qt_config->beginGroup(name);
+        // No longer using ReadSetting for shortcut.second as it inaccurately returns a value of 1
+        // for WidgetWithChildrenShortcut which is a value of 3. Needed to fix shortcuts that open
+        // a file dialog in windowed mode
         UISettings::values.shortcuts.push_back(
             {name,
              group,
-             {ReadSetting(QStringLiteral("KeySeq"), keyseq).toString(),
-              ReadSetting(QStringLiteral("Context"), context).toInt()}});
+             {ReadSetting(QStringLiteral("KeySeq"), keyseq).toString(), shortcut.second}});
         qt_config->endGroup();
         qt_config->endGroup();
     }
@@ -679,35 +686,45 @@ void Config::ReadShortcutValues() {
 void Config::ReadSystemValues() {
     qt_config->beginGroup(QStringLiteral("System"));
 
-    Settings::values.use_docked_mode =
-        ReadSetting(QStringLiteral("use_docked_mode"), false).toBool();
-
-    Settings::values.current_user = std::clamp<int>(
-        ReadSetting(QStringLiteral("current_user"), 0).toInt(), 0, Service::Account::MAX_USERS - 1);
+    ReadSettingGlobal(Settings::values.current_user, QStringLiteral("current_user"), 0);
+    Settings::values.current_user =
+        std::clamp<int>(Settings::values.current_user, 0, Service::Account::MAX_USERS - 1);
 
-    Settings::values.language_index = ReadSetting(QStringLiteral("language_index"), 1).toInt();
+    ReadSettingGlobal(Settings::values.language_index, QStringLiteral("language_index"), 1);
 
-    Settings::values.region_index = ReadSetting(QStringLiteral("region_index"), 1).toInt();
+    ReadSettingGlobal(Settings::values.region_index, QStringLiteral("region_index"), 1);
 
-    Settings::values.time_zone_index = ReadSetting(QStringLiteral("time_zone_index"), 0).toInt();
+    ReadSettingGlobal(Settings::values.time_zone_index, QStringLiteral("time_zone_index"), 0);
 
-    const auto rng_seed_enabled = ReadSetting(QStringLiteral("rng_seed_enabled"), false).toBool();
-    if (rng_seed_enabled) {
-        Settings::values.rng_seed = ReadSetting(QStringLiteral("rng_seed"), 0).toULongLong();
-    } else {
-        Settings::values.rng_seed = std::nullopt;
+    bool rng_seed_enabled = false; // stays false when ReadSettingGlobal skips the read
+    ReadSettingGlobal(rng_seed_enabled, QStringLiteral("rng_seed_enabled"), false);
+    bool rng_seed_global =
+        global || qt_config->value(QStringLiteral("rng_seed/use_global"), true).toBool();
+    Settings::values.rng_seed.SetGlobal(rng_seed_global);
+    if (global || !rng_seed_global) {
+        if (rng_seed_enabled) {
+            Settings::values.rng_seed.SetValue(
+                ReadSetting(QStringLiteral("rng_seed"), 0).toULongLong());
+        } else {
+            Settings::values.rng_seed.SetValue(std::nullopt);
+        }
     }
 
-    const auto custom_rtc_enabled =
-        ReadSetting(QStringLiteral("custom_rtc_enabled"), false).toBool();
-    if (custom_rtc_enabled) {
-        Settings::values.custom_rtc =
-            std::chrono::seconds(ReadSetting(QStringLiteral("custom_rtc"), 0).toULongLong());
-    } else {
-        Settings::values.custom_rtc = std::nullopt;
+    bool custom_rtc_enabled = false; // stays false when ReadSettingGlobal skips the read
+    ReadSettingGlobal(custom_rtc_enabled, QStringLiteral("custom_rtc_enabled"), false);
+    bool custom_rtc_global =
+        global || qt_config->value(QStringLiteral("custom_rtc/use_global"), true).toBool();
+    Settings::values.custom_rtc.SetGlobal(custom_rtc_global);
+    if (global || !custom_rtc_global) {
+        if (custom_rtc_enabled) {
+            Settings::values.custom_rtc.SetValue(
+                std::chrono::seconds(ReadSetting(QStringLiteral("custom_rtc"), 0).toULongLong()));
+        } else {
+            Settings::values.custom_rtc.SetValue(std::nullopt);
+        }
     }
 
-    Settings::values.sound_index = ReadSetting(QStringLiteral("sound_index"), 1).toInt();
+    ReadSettingGlobal(Settings::values.sound_index, QStringLiteral("sound_index"), 1);
 
     qt_config->endGroup();
 }
@@ -720,8 +737,6 @@ void Config::ReadUIValues() {
             .toString();
     UISettings::values.enable_discord_presence =
         ReadSetting(QStringLiteral("enable_discord_presence"), true).toBool();
-    UISettings::values.screenshot_resolution_factor =
-        static_cast<u16>(ReadSetting(QStringLiteral("screenshot_resolution_factor"), 0).toUInt());
     UISettings::values.select_user_on_boot =
         ReadSetting(QStringLiteral("select_user_on_boot"), false).toBool();
 
@@ -803,18 +818,20 @@ void Config::ReadWebServiceValues() {
 }
 
 void Config::ReadValues() {
-    ReadControlValues();
+    if (global) {
+        ReadControlValues();
+        ReadDataStorageValues();
+        ReadDebuggingValues();
+        ReadDisabledAddOnValues();
+        ReadServiceValues();
+        ReadUIValues();
+        ReadWebServiceValues();
+        ReadMiscellaneousValues();
+    }
     ReadCoreValues();
     ReadRendererValues();
     ReadAudioValues();
-    ReadDataStorageValues();
     ReadSystemValues();
-    ReadMiscellaneousValues();
-    ReadDebuggingValues();
-    ReadWebServiceValues();
-    ReadServiceValues();
-    ReadDisabledAddOnValues();
-    ReadUIValues();
 }
 
 void Config::SavePlayerValues() {
@@ -901,30 +918,35 @@ void Config::SaveTouchscreenValues() {
 }
 
 void Config::SaveValues() {
-    SaveControlValues();
+    if (global) {
+        SaveControlValues();
+        SaveDataStorageValues();
+        SaveDebuggingValues();
+        SaveDisabledAddOnValues();
+        SaveServiceValues();
+        SaveUIValues();
+        SaveWebServiceValues();
+        SaveMiscellaneousValues();
+    }
     SaveCoreValues();
     SaveRendererValues();
     SaveAudioValues();
-    SaveDataStorageValues();
     SaveSystemValues();
-    SaveMiscellaneousValues();
-    SaveDebuggingValues();
-    SaveWebServiceValues();
-    SaveServiceValues();
-    SaveDisabledAddOnValues();
-    SaveUIValues();
 }
 
 void Config::SaveAudioValues() {
     qt_config->beginGroup(QStringLiteral("Audio"));
 
-    WriteSetting(QStringLiteral("output_engine"), QString::fromStdString(Settings::values.sink_id),
-                 QStringLiteral("auto"));
-    WriteSetting(QStringLiteral("enable_audio_stretching"),
-                 Settings::values.enable_audio_stretching, true);
-    WriteSetting(QStringLiteral("output_device"),
-                 QString::fromStdString(Settings::values.audio_device_id), QStringLiteral("auto"));
-    WriteSetting(QStringLiteral("volume"), Settings::values.volume, 1.0f);
+    if (global) {
+        WriteSetting(QStringLiteral("output_engine"),
+                     QString::fromStdString(Settings::values.sink_id), QStringLiteral("auto"));
+        WriteSetting(QStringLiteral("output_device"),
+                     QString::fromStdString(Settings::values.audio_device_id),
+                     QStringLiteral("auto"));
+    }
+    WriteSettingGlobal(QStringLiteral("enable_audio_stretching"),
+                       Settings::values.enable_audio_stretching, true);
+    WriteSettingGlobal(QStringLiteral("volume"), Settings::values.volume, 1.0f);
 
     qt_config->endGroup();
 }
@@ -947,6 +969,7 @@ void Config::SaveControlValues() {
     WriteSetting(QStringLiteral("udp_input_port"), Settings::values.udp_input_port,
                  InputCommon::CemuhookUDP::DEFAULT_PORT);
     WriteSetting(QStringLiteral("udp_pad_index"), Settings::values.udp_pad_index, 0);
+    WriteSetting(QStringLiteral("use_docked_mode"), Settings::values.use_docked_mode, false);
 
     qt_config->endGroup();
 }
@@ -954,7 +977,7 @@ void Config::SaveControlValues() {
 void Config::SaveCoreValues() {
     qt_config->beginGroup(QStringLiteral("Core"));
 
-    WriteSetting(QStringLiteral("use_multi_core"), Settings::values.use_multi_core, false);
+    WriteSettingGlobal(QStringLiteral("use_multi_core"), Settings::values.use_multi_core, false);
 
     qt_config->endGroup();
 }
@@ -1011,6 +1034,7 @@ void Config::SaveDebuggingValues() {
     WriteSetting(QStringLiteral("dump_nso"), Settings::values.dump_nso, false);
     WriteSetting(QStringLiteral("quest_flag"), Settings::values.quest_flag, false);
     WriteSetting(QStringLiteral("disable_cpu_opt"), Settings::values.disable_cpu_opt, false);
+    WriteSetting(QStringLiteral("disable_macro_jit"), Settings::values.disable_macro_jit, false);
 
     qt_config->endGroup();
 }
@@ -1076,31 +1100,40 @@ void Config::SavePathValues() {
 void Config::SaveRendererValues() {
     qt_config->beginGroup(QStringLiteral("Renderer"));
 
-    WriteSetting(QStringLiteral("backend"), static_cast<int>(Settings::values.renderer_backend), 0);
+    WriteSettingGlobal(QStringLiteral("backend"),
+                       static_cast<int>(Settings::values.renderer_backend.GetValue(global)),
+                       Settings::values.renderer_backend.UsingGlobal(), 0);
     WriteSetting(QStringLiteral("debug"), Settings::values.renderer_debug, false);
-    WriteSetting(QStringLiteral("vulkan_device"), Settings::values.vulkan_device, 0);
-    WriteSetting(QStringLiteral("resolution_factor"),
-                 static_cast<double>(Settings::values.resolution_factor), 1.0);
-    WriteSetting(QStringLiteral("aspect_ratio"), Settings::values.aspect_ratio, 0);
-    WriteSetting(QStringLiteral("max_anisotropy"), Settings::values.max_anisotropy, 0);
-    WriteSetting(QStringLiteral("use_frame_limit"), Settings::values.use_frame_limit, true);
-    WriteSetting(QStringLiteral("frame_limit"), Settings::values.frame_limit, 100);
-    WriteSetting(QStringLiteral("use_disk_shader_cache"), Settings::values.use_disk_shader_cache,
-                 true);
-    WriteSetting(QStringLiteral("gpu_accuracy"), static_cast<int>(Settings::values.gpu_accuracy),
-                 0);
-    WriteSetting(QStringLiteral("use_asynchronous_gpu_emulation"),
-                 Settings::values.use_asynchronous_gpu_emulation, false);
-    WriteSetting(QStringLiteral("use_vsync"), Settings::values.use_vsync, true);
-    WriteSetting(QStringLiteral("use_assembly_shaders"), Settings::values.use_assembly_shaders,
-                 false);
-    WriteSetting(QStringLiteral("use_fast_gpu_time"), Settings::values.use_fast_gpu_time, true);
-    WriteSetting(QStringLiteral("force_30fps_mode"), Settings::values.force_30fps_mode, false);
+    WriteSettingGlobal(QStringLiteral("vulkan_device"), Settings::values.vulkan_device, 0);
+    WriteSettingGlobal(QStringLiteral("aspect_ratio"), Settings::values.aspect_ratio, 0);
+    WriteSettingGlobal(QStringLiteral("max_anisotropy"), Settings::values.max_anisotropy, 0);
+    WriteSettingGlobal(QStringLiteral("use_frame_limit"), Settings::values.use_frame_limit, true);
+    WriteSettingGlobal(QStringLiteral("frame_limit"), Settings::values.frame_limit, 100);
+    WriteSettingGlobal(QStringLiteral("use_disk_shader_cache"),
+                       Settings::values.use_disk_shader_cache, true);
+    WriteSettingGlobal(QStringLiteral("gpu_accuracy"),
+                       static_cast<int>(Settings::values.gpu_accuracy.GetValue(global)),
+                       Settings::values.gpu_accuracy.UsingGlobal(), 0);
+    WriteSettingGlobal(QStringLiteral("use_asynchronous_gpu_emulation"),
+                       Settings::values.use_asynchronous_gpu_emulation, false);
+    WriteSettingGlobal(QStringLiteral("use_vsync"), Settings::values.use_vsync, true);
+    WriteSettingGlobal(QStringLiteral("use_assembly_shaders"),
+                       Settings::values.use_assembly_shaders, false);
+    WriteSettingGlobal(QStringLiteral("use_fast_gpu_time"), Settings::values.use_fast_gpu_time,
+                       true);
+    WriteSettingGlobal(QStringLiteral("force_30fps_mode"), Settings::values.force_30fps_mode,
+                       false);
 
     // Cast to double because Qt's written float values are not human-readable
-    WriteSetting(QStringLiteral("bg_red"), static_cast<double>(Settings::values.bg_red), 0.0);
-    WriteSetting(QStringLiteral("bg_green"), static_cast<double>(Settings::values.bg_green), 0.0);
-    WriteSetting(QStringLiteral("bg_blue"), static_cast<double>(Settings::values.bg_blue), 0.0);
+    WriteSettingGlobal(QStringLiteral("bg_red"),
+                       static_cast<double>(Settings::values.bg_red.GetValue(global)),
+                       Settings::values.bg_red.UsingGlobal(), 0.0);
+    WriteSettingGlobal(QStringLiteral("bg_green"),
+                       static_cast<double>(Settings::values.bg_green.GetValue(global)),
+                       Settings::values.bg_green.UsingGlobal(), 0.0);
+    WriteSettingGlobal(QStringLiteral("bg_blue"),
+                       static_cast<double>(Settings::values.bg_blue.GetValue(global)),
+                       Settings::values.bg_blue.UsingGlobal(), 0.0);
 
     qt_config->endGroup();
 }
@@ -1128,23 +1155,28 @@ void Config::SaveShortcutValues() {
 void Config::SaveSystemValues() {
     qt_config->beginGroup(QStringLiteral("System"));
 
-    WriteSetting(QStringLiteral("use_docked_mode"), Settings::values.use_docked_mode, false);
     WriteSetting(QStringLiteral("current_user"), Settings::values.current_user, 0);
-    WriteSetting(QStringLiteral("language_index"), Settings::values.language_index, 1);
-    WriteSetting(QStringLiteral("region_index"), Settings::values.region_index, 1);
-    WriteSetting(QStringLiteral("time_zone_index"), Settings::values.time_zone_index, 0);
-
-    WriteSetting(QStringLiteral("rng_seed_enabled"), Settings::values.rng_seed.has_value(), false);
-    WriteSetting(QStringLiteral("rng_seed"), Settings::values.rng_seed.value_or(0), 0);
-
-    WriteSetting(QStringLiteral("custom_rtc_enabled"), Settings::values.custom_rtc.has_value(),
-                 false);
-    WriteSetting(QStringLiteral("custom_rtc"),
-                 QVariant::fromValue<long long>(
-                     Settings::values.custom_rtc.value_or(std::chrono::seconds{}).count()),
-                 0);
-
-    WriteSetting(QStringLiteral("sound_index"), Settings::values.sound_index, 1);
+    WriteSettingGlobal(QStringLiteral("language_index"), Settings::values.language_index, 1);
+    WriteSettingGlobal(QStringLiteral("region_index"), Settings::values.region_index, 1);
+    WriteSettingGlobal(QStringLiteral("time_zone_index"), Settings::values.time_zone_index, 0);
+
+    WriteSettingGlobal(QStringLiteral("rng_seed_enabled"),
+                       Settings::values.rng_seed.GetValue(global).has_value(),
+                       Settings::values.rng_seed.UsingGlobal(), false);
+    WriteSettingGlobal(QStringLiteral("rng_seed"),
+                       Settings::values.rng_seed.GetValue(global).value_or(0),
+                       Settings::values.rng_seed.UsingGlobal(), 0);
+
+    WriteSettingGlobal(QStringLiteral("custom_rtc_enabled"),
+                       Settings::values.custom_rtc.GetValue(global).has_value(),
+                       Settings::values.custom_rtc.UsingGlobal(), false);
+    WriteSettingGlobal(
+        QStringLiteral("custom_rtc"),
+        QVariant::fromValue<long long>(
+            Settings::values.custom_rtc.GetValue(global).value_or(std::chrono::seconds{}).count()),
+        Settings::values.custom_rtc.UsingGlobal(), 0);
+
+    WriteSettingGlobal(QStringLiteral("sound_index"), Settings::values.sound_index, 1);
 
     qt_config->endGroup();
 }
@@ -1156,8 +1188,6 @@ void Config::SaveUIValues() {
                  QString::fromUtf8(UISettings::themes[0].second));
     WriteSetting(QStringLiteral("enable_discord_presence"),
                  UISettings::values.enable_discord_presence, true);
-    WriteSetting(QStringLiteral("screenshot_resolution_factor"),
-                 UISettings::values.screenshot_resolution_factor, 0);
     WriteSetting(QStringLiteral("select_user_on_boot"), UISettings::values.select_user_on_boot,
                  false);
 
@@ -1238,6 +1268,34 @@ QVariant Config::ReadSetting(const QString& name, const QVariant& default_value)
     return result;
 }
 
+template <typename Type>
+void Config::ReadSettingGlobal(Settings::Setting<Type>& setting, const QString& name) {
+    const bool use_global = qt_config->value(name + QStringLiteral("/use_global"), true).toBool();
+    setting.SetGlobal(use_global);
+    if (global || !use_global) {
+        setting.SetValue(ReadSetting(name).value<Type>());
+    }
+}
+
+template <typename Type>
+void Config::ReadSettingGlobal(Settings::Setting<Type>& setting, const QString& name,
+                               const QVariant& default_value) {
+    const bool use_global = qt_config->value(name + QStringLiteral("/use_global"), true).toBool();
+    setting.SetGlobal(use_global);
+    if (global || !use_global) {
+        setting.SetValue(ReadSetting(name, default_value).value<Type>());
+    }
+}
+
+template <typename Type>
+void Config::ReadSettingGlobal(Type& setting, const QString& name,
+                               const QVariant& default_value) const {
+    const bool use_global = qt_config->value(name + QStringLiteral("/use_global"), true).toBool();
+    if (global || !use_global) {
+        setting = ReadSetting(name, default_value).value<Type>();
+    }
+}
+
 void Config::WriteSetting(const QString& name, const QVariant& value) {
     qt_config->setValue(name, value);
 }
@@ -1248,6 +1306,40 @@ void Config::WriteSetting(const QString& name, const QVariant& value,
     qt_config->setValue(name, value);
 }
 
+template <typename Type>
+void Config::WriteSettingGlobal(const QString& name, const Settings::Setting<Type>& setting) {
+    if (!global) {
+        qt_config->setValue(name + QStringLiteral("/use_global"), setting.UsingGlobal());
+    }
+    if (global || !setting.UsingGlobal()) {
+        qt_config->setValue(name, setting.GetValue(global));
+    }
+}
+
+template <typename Type>
+void Config::WriteSettingGlobal(const QString& name, const Settings::Setting<Type>& setting,
+                                const QVariant& default_value) {
+    if (!global) {
+        qt_config->setValue(name + QStringLiteral("/use_global"), setting.UsingGlobal());
+    }
+    if (global || !setting.UsingGlobal()) {
+        qt_config->setValue(name + QStringLiteral("/default"),
+                            setting.GetValue(global) == default_value.value<Type>());
+        qt_config->setValue(name, setting.GetValue(global));
+    }
+}
+
+void Config::WriteSettingGlobal(const QString& name, const QVariant& value, bool use_global,
+                                const QVariant& default_value) {
+    if (!global) {
+        qt_config->setValue(name + QStringLiteral("/use_global"), use_global);
+    }
+    if (global || !use_global) {
+        qt_config->setValue(name + QStringLiteral("/default"), value == default_value);
+        qt_config->setValue(name, value);
+    }
+}
+
 void Config::Reload() {
     ReadValues();
     // To apply default value changes
diff --git a/src/yuzu/configuration/config.h b/src/yuzu/configuration/config.h
index 5cd2a5feb..681f0bca5 100644
--- a/src/yuzu/configuration/config.h
+++ b/src/yuzu/configuration/config.h
@@ -7,6 +7,7 @@
 #include <array>
 #include <memory>
 #include <string>
+#include <QMetaType>
 #include <QVariant>
 #include "core/settings.h"
 #include "yuzu/uisettings.h"
@@ -15,7 +16,7 @@ class QSettings;
 
 class Config {
 public:
-    Config();
+    explicit Config(const std::string& config_loc = "qt-config.ini", bool is_global = true);
     ~Config();
 
     void Reload();
@@ -27,7 +28,7 @@ public:
         default_mouse_buttons;
     static const std::array<int, Settings::NativeKeyboard::NumKeyboardKeys> default_keyboard_keys;
     static const std::array<int, Settings::NativeKeyboard::NumKeyboardMods> default_keyboard_mods;
-    static const std::array<UISettings::Shortcut, 15> default_hotkeys;
+    static const std::array<UISettings::Shortcut, 16> default_hotkeys;
 
 private:
     void ReadValues();
@@ -82,9 +83,33 @@ private:
 
     QVariant ReadSetting(const QString& name) const;
     QVariant ReadSetting(const QString& name, const QVariant& default_value) const;
+    // ReadSettingGlobal also reads "<name>/use_global". The Setting overloads record it with
+    // SetGlobal; the value is read only for the global config or when use_global is false
+    template <typename Type>
+    void ReadSettingGlobal(Settings::Setting<Type>& setting, const QString& name);
+    template <typename Type>
+    void ReadSettingGlobal(Settings::Setting<Type>& setting, const QString& name,
+                           const QVariant& default_value);
+    template <typename Type>
+    void ReadSettingGlobal(Type& setting, const QString& name, const QVariant& default_value) const;
     void WriteSetting(const QString& name, const QVariant& value);
     void WriteSetting(const QString& name, const QVariant& value, const QVariant& default_value);
+    // WriteSettingGlobal writes "<name>/use_global" for non-global configs and skips writing the
+    // actual setting if it defers to the global value
+    template <typename Type>
+    void WriteSettingGlobal(const QString& name, const Settings::Setting<Type>& setting);
+    template <typename Type>
+    void WriteSettingGlobal(const QString& name, const Settings::Setting<Type>& setting,
+                            const QVariant& default_value);
+    void WriteSettingGlobal(const QString& name, const QVariant& value, bool use_global,
+                            const QVariant& default_value);
 
     std::unique_ptr<QSettings> qt_config;
     std::string qt_config_loc;
+
+    bool global;
 };
+
+// These metatype declarations cannot be in core/settings.h because core is devoid of QT
+Q_DECLARE_METATYPE(Settings::RendererBackend);
+Q_DECLARE_METATYPE(Settings::GPUAccuracy);
diff --git a/src/yuzu/configuration/configuration_shared.cpp b/src/yuzu/configuration/configuration_shared.cpp
new file mode 100644
index 000000000..bb47c3933
--- /dev/null
+++ b/src/yuzu/configuration/configuration_shared.cpp
@@ -0,0 +1,102 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <QCheckBox>
+#include <QComboBox>
+#include "core/settings.h"
+#include "yuzu/configuration/configuration_shared.h"
+#include "yuzu/configuration/configure_per_game.h"
+
+// Applies a tristate checkbox to a per-game setting: a partially checked box defers to the
+// global value, otherwise the checked state is stored as the per-game override.
+void ConfigurationShared::ApplyPerGameSetting(Settings::Setting<bool>* setting,
+                                              const QCheckBox* checkbox) {
+    if (checkbox->checkState() == Qt::PartiallyChecked) {
+        setting->SetGlobal(true);
+    } else {
+        setting->SetGlobal(false);
+        setting->SetValue(checkbox->checkState() == Qt::Checked);
+    }
+}
+
+// Applies a combobox to a per-game setting: USE_GLOBAL_INDEX defers to the global value,
+// otherwise the index minus USE_GLOBAL_OFFSET is stored as the per-game override.
+void ConfigurationShared::ApplyPerGameSetting(Settings::Setting<int>* setting,
+                                              const QComboBox* combobox) {
+    if (combobox->currentIndex() == ConfigurationShared::USE_GLOBAL_INDEX) {
+        setting->SetGlobal(true);
+    } else {
+        setting->SetGlobal(false);
+        setting->SetValue(combobox->currentIndex() - ConfigurationShared::USE_GLOBAL_OFFSET);
+    }
+}
+
+// Combobox variant for RendererBackend; the offset-adjusted index is cast to the enum.
+void ConfigurationShared::ApplyPerGameSetting(Settings::Setting<Settings::RendererBackend>* setting,
+                                              const QComboBox* combobox) {
+    if (combobox->currentIndex() == ConfigurationShared::USE_GLOBAL_INDEX) {
+        setting->SetGlobal(true);
+    } else {
+        setting->SetGlobal(false);
+        setting->SetValue(static_cast<Settings::RendererBackend>(
+            combobox->currentIndex() - ConfigurationShared::USE_GLOBAL_OFFSET));
+    }
+}
+
+// Combobox variant for GPUAccuracy. configuration_shared.h declares this overload, so it is
+// defined here to keep callers from hitting an unresolved symbol at link time.
+void ConfigurationShared::ApplyPerGameSetting(Settings::Setting<Settings::GPUAccuracy>* setting,
+                                              const QComboBox* combobox) {
+    if (combobox->currentIndex() == ConfigurationShared::USE_GLOBAL_INDEX) {
+        setting->SetGlobal(true);
+    } else {
+        setting->SetGlobal(false);
+        setting->SetValue(static_cast<Settings::GPUAccuracy>(
+            combobox->currentIndex() - ConfigurationShared::USE_GLOBAL_OFFSET));
+    }
+}
+
+// Shows a per-game bool setting in a checkbox: partially checked when it defers to the global
+// value, otherwise checked or unchecked according to the stored value.
+void ConfigurationShared::SetPerGameSetting(QCheckBox* checkbox,
+                                            const Settings::Setting<bool>* setting) {
+    if (setting->UsingGlobal()) {
+        checkbox->setCheckState(Qt::PartiallyChecked);
+    } else {
+        checkbox->setCheckState(setting->GetValue() ? Qt::Checked : Qt::Unchecked);
+    }
+}
+
+// Shows a per-game int setting in a combobox: USE_GLOBAL_INDEX when it defers to the global
+// value, otherwise the stored value plus USE_GLOBAL_OFFSET.
+void ConfigurationShared::SetPerGameSetting(QComboBox* combobox,
+                                            const Settings::Setting<int>* setting) {
+    combobox->setCurrentIndex(setting->UsingGlobal()
+                                  ? ConfigurationShared::USE_GLOBAL_INDEX
+                                  : setting->GetValue() + ConfigurationShared::USE_GLOBAL_OFFSET);
+}
+
+// Combobox variant for RendererBackend; the enum value is converted to an int index.
+void ConfigurationShared::SetPerGameSetting(
+    QComboBox* combobox, const Settings::Setting<Settings::RendererBackend>* setting) {
+    combobox->setCurrentIndex(setting->UsingGlobal() ? ConfigurationShared::USE_GLOBAL_INDEX
+                                                     : static_cast<int>(setting->GetValue()) +
+                                                           ConfigurationShared::USE_GLOBAL_OFFSET);
+}
+
+// Combobox variant for GPUAccuracy; the enum value is converted to an int index.
+void ConfigurationShared::SetPerGameSetting(
+    QComboBox* combobox, const Settings::Setting<Settings::GPUAccuracy>* setting) {
+    combobox->setCurrentIndex(setting->UsingGlobal() ? ConfigurationShared::USE_GLOBAL_INDEX
+                                                     : static_cast<int>(setting->GetValue()) +
+                                                           ConfigurationShared::USE_GLOBAL_OFFSET);
+}
+
+// Inserts the translated "Use global configuration" entry at USE_GLOBAL_INDEX and a separator
+// at USE_GLOBAL_SEPARATOR_INDEX of the given combobox.
+void ConfigurationShared::InsertGlobalItem(QComboBox* combobox) {
+    const QString use_global_text = ConfigurePerGame::tr("Use global configuration");
+    combobox->insertItem(ConfigurationShared::USE_GLOBAL_INDEX, use_global_text);
+    combobox->insertSeparator(ConfigurationShared::USE_GLOBAL_SEPARATOR_INDEX);
+}
diff --git a/src/yuzu/configuration/configuration_shared.h b/src/yuzu/configuration/configuration_shared.h
new file mode 100644
index 000000000..b11b1b950
--- /dev/null
+++ b/src/yuzu/configuration/configuration_shared.h
@@ -0,0 +1,36 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <QCheckBox>
+#include <QComboBox>
+#include <QString>
+#include "core/settings.h"
+
+namespace ConfigurationShared {
+
+constexpr int USE_GLOBAL_INDEX = 0;
+constexpr int USE_GLOBAL_SEPARATOR_INDEX = 1;
+constexpr int USE_GLOBAL_OFFSET = 2;
+
+// Global-aware apply and set functions
+
+void ApplyPerGameSetting(Settings::Setting<bool>* setting, const QCheckBox* checkbox);
+void ApplyPerGameSetting(Settings::Setting<int>* setting, const QComboBox* combobox);
+void ApplyPerGameSetting(Settings::Setting<Settings::RendererBackend>* setting,
+                         const QComboBox* combobox);
+void ApplyPerGameSetting(Settings::Setting<Settings::GPUAccuracy>* setting,
+                         const QComboBox* combobox);
+
+void SetPerGameSetting(QCheckBox* checkbox, const Settings::Setting<bool>* setting);
+void SetPerGameSetting(QComboBox* combobox, const Settings::Setting<int>* setting);
+void SetPerGameSetting(QComboBox* combobox,
+                       const Settings::Setting<Settings::RendererBackend>* setting);
+void SetPerGameSetting(QComboBox* combobox,
+                       const Settings::Setting<Settings::GPUAccuracy>* setting);
+
+void InsertGlobalItem(QComboBox* combobox);
+
+} // namespace ConfigurationShared
diff --git a/src/yuzu/configuration/configure_audio.cpp b/src/yuzu/configuration/configure_audio.cpp
index f370c690f..cc021beec 100644
--- a/src/yuzu/configuration/configure_audio.cpp
+++ b/src/yuzu/configuration/configure_audio.cpp
@@ -11,6 +11,7 @@
 #include "core/core.h"
 #include "core/settings.h"
 #include "ui_configure_audio.h"
+#include "yuzu/configuration/configuration_shared.h"
 #include "yuzu/configuration/configure_audio.h"
 
 ConfigureAudio::ConfigureAudio(QWidget* parent)
@@ -24,6 +25,11 @@ ConfigureAudio::ConfigureAudio(QWidget* parent)
     connect(ui->output_sink_combo_box, qOverload<int>(&QComboBox::currentIndexChanged), this,
             &ConfigureAudio::UpdateAudioDevices);
 
+    ui->volume_label->setVisible(Settings::configuring_global);
+    ui->volume_combo_box->setVisible(!Settings::configuring_global);
+
+    SetupPerGameUI();
+
     SetConfiguration();
 
     const bool is_powered_on = Core::System::GetInstance().IsPoweredOn();
@@ -41,8 +47,22 @@ void ConfigureAudio::SetConfiguration() {
 
     SetAudioDeviceFromDeviceID();
 
-    ui->toggle_audio_stretching->setChecked(Settings::values.enable_audio_stretching);
-    ui->volume_slider->setValue(Settings::values.volume * ui->volume_slider->maximum());
+    ui->volume_slider->setValue(Settings::values.volume.GetValue() * ui->volume_slider->maximum());
+
+    if (Settings::configuring_global) {
+        ui->toggle_audio_stretching->setChecked(
+            Settings::values.enable_audio_stretching.GetValue());
+    } else {
+        ConfigurationShared::SetPerGameSetting(ui->toggle_audio_stretching,
+                                               &Settings::values.enable_audio_stretching);
+        if (Settings::values.volume.UsingGlobal()) {
+            ui->volume_combo_box->setCurrentIndex(0);
+            ui->volume_slider->setEnabled(false);
+        } else {
+            ui->volume_combo_box->setCurrentIndex(1);
+            ui->volume_slider->setEnabled(true);
+        }
+    }
     SetVolumeIndicatorText(ui->volume_slider->sliderPosition());
 }
 
@@ -80,15 +100,36 @@ void ConfigureAudio::SetVolumeIndicatorText(int percentage) {
 }
 
 void ConfigureAudio::ApplyConfiguration() {
-    Settings::values.sink_id =
-        ui->output_sink_combo_box->itemText(ui->output_sink_combo_box->currentIndex())
-            .toStdString();
-    Settings::values.enable_audio_stretching = ui->toggle_audio_stretching->isChecked();
-    Settings::values.audio_device_id =
-        ui->audio_device_combo_box->itemText(ui->audio_device_combo_box->currentIndex())
-            .toStdString();
-    Settings::values.volume =
-        static_cast<float>(ui->volume_slider->sliderPosition()) / ui->volume_slider->maximum();
+    if (Settings::configuring_global) {
+        Settings::values.sink_id =
+            ui->output_sink_combo_box->itemText(ui->output_sink_combo_box->currentIndex())
+                .toStdString();
+        Settings::values.audio_device_id =
+            ui->audio_device_combo_box->itemText(ui->audio_device_combo_box->currentIndex())
+                .toStdString();
+
+        // Do not overwrite a setting that is currently using a game-specific value
+        if (Settings::values.enable_audio_stretching.UsingGlobal()) {
+            Settings::values.enable_audio_stretching.SetValue(
+                ui->toggle_audio_stretching->isChecked());
+        }
+        if (Settings::values.volume.UsingGlobal()) {
+            Settings::values.volume.SetValue(
+                static_cast<float>(ui->volume_slider->sliderPosition()) /
+                ui->volume_slider->maximum());
+        }
+    } else {
+        ConfigurationShared::ApplyPerGameSetting(&Settings::values.enable_audio_stretching,
+                                                 ui->toggle_audio_stretching);
+        if (ui->volume_combo_box->currentIndex() == 0) {
+            Settings::values.volume.SetGlobal(true);
+        } else {
+            Settings::values.volume.SetGlobal(false);
+            Settings::values.volume.SetValue(
+                static_cast<float>(ui->volume_slider->sliderPosition()) /
+                ui->volume_slider->maximum());
+        }
+    }
 }
 
 void ConfigureAudio::changeEvent(QEvent* event) {
@@ -122,3 +163,22 @@ void ConfigureAudio::RetranslateUI() {
     ui->retranslateUi(this);
     SetVolumeIndicatorText(ui->volume_slider->sliderPosition());
 }
+
+void ConfigureAudio::SetupPerGameUI() {
+    if (Settings::configuring_global) {
+        // Each widget is editable only while its setting is using the global value.
+        ui->volume_slider->setEnabled(Settings::values.volume.UsingGlobal());
+        ui->toggle_audio_stretching->setEnabled(
+            Settings::values.enable_audio_stretching.UsingGlobal());
+        return;
+    }
+
+    ui->toggle_audio_stretching->setTristate(true);
+    connect(ui->volume_combo_box, qOverload<int>(&QComboBox::activated), this,
+            [this](int index) { ui->volume_slider->setEnabled(index == 1); });
+
+    ui->output_sink_combo_box->setVisible(false);
+    ui->output_sink_label->setVisible(false);
+    ui->audio_device_combo_box->setVisible(false);
+    ui->audio_device_label->setVisible(false);
+}
diff --git a/src/yuzu/configuration/configure_audio.h b/src/yuzu/configuration/configure_audio.h
index ea83bd72d..d84f4a682 100644
--- a/src/yuzu/configuration/configure_audio.h
+++ b/src/yuzu/configuration/configure_audio.h
@@ -34,5 +34,7 @@ private:
     void SetAudioDeviceFromDeviceID();
     void SetVolumeIndicatorText(int percentage);
 
+    void SetupPerGameUI();
+
     std::unique_ptr<Ui::ConfigureAudio> ui;
 };
diff --git a/src/yuzu/configuration/configure_audio.ui b/src/yuzu/configuration/configure_audio.ui
index a098b9acc..862ccb988 100644
--- a/src/yuzu/configuration/configure_audio.ui
+++ b/src/yuzu/configuration/configure_audio.ui
@@ -6,8 +6,8 @@
    <rect>
     <x>0</x>
     <y>0</y>
-    <width>188</width>
-    <height>246</height>
+    <width>367</width>
+    <height>368</height>
    </rect>
   </property>
   <layout class="QVBoxLayout">
@@ -18,9 +18,9 @@
      </property>
      <layout class="QVBoxLayout">
       <item>
-       <layout class="QHBoxLayout">
+       <layout class="QHBoxLayout" name="_3">
         <item>
-         <widget class="QLabel" name="label_1">
+         <widget class="QLabel" name="output_sink_label">
           <property name="text">
            <string>Output Engine:</string>
           </property>
@@ -31,20 +31,20 @@
         </item>
        </layout>
       </item>
-       <item>
-         <widget class="QCheckBox" name="toggle_audio_stretching">
-           <property name="toolTip">
-             <string>This post-processing effect adjusts audio speed to match emulation speed and helps prevent audio stutter. This however increases audio latency.</string>
-           </property>
-           <property name="text">
-             <string>Enable audio stretching</string>
-           </property>
-         </widget>
-       </item>
       <item>
-       <layout class="QHBoxLayout">
+       <widget class="QCheckBox" name="toggle_audio_stretching">
+        <property name="toolTip">
+         <string>This post-processing effect adjusts audio speed to match emulation speed and helps prevent audio stutter. This however increases audio latency.</string>
+        </property>
+        <property name="text">
+         <string>Enable audio stretching</string>
+        </property>
+       </widget>
+      </item>
+      <item>
+       <layout class="QHBoxLayout" name="_2">
         <item>
-         <widget class="QLabel" name="label_2">
+         <widget class="QLabel" name="audio_device_label">
           <property name="text">
            <string>Audio Device:</string>
           </property>
@@ -61,7 +61,21 @@
          <number>0</number>
         </property>
         <item>
-         <widget class="QLabel" name="label_3">
+         <widget class="QComboBox" name="volume_combo_box">
+          <item>
+           <property name="text">
+            <string>Use global volume</string>
+           </property>
+          </item>
+          <item>
+           <property name="text">
+            <string>Set volume:</string>
+           </property>
+          </item>
+         </widget>
+        </item>
+        <item>
+         <widget class="QLabel" name="volume_label">
           <property name="text">
            <string>Volume:</string>
           </property>
@@ -74,7 +88,7 @@
           </property>
           <property name="sizeHint" stdset="0">
            <size>
-            <width>40</width>
+            <width>30</width>
             <height>20</height>
            </size>
           </property>
diff --git a/src/yuzu/configuration/configure_debug.cpp b/src/yuzu/configuration/configure_debug.cpp
index c2026763e..2c77441fd 100644
--- a/src/yuzu/configuration/configure_debug.cpp
+++ b/src/yuzu/configuration/configure_debug.cpp
@@ -39,6 +39,8 @@ void ConfigureDebug::SetConfiguration() {
     ui->disable_cpu_opt->setChecked(Settings::values.disable_cpu_opt);
     ui->enable_graphics_debugging->setEnabled(!Core::System::GetInstance().IsPoweredOn());
     ui->enable_graphics_debugging->setChecked(Settings::values.renderer_debug);
+    ui->disable_macro_jit->setEnabled(!Core::System::GetInstance().IsPoweredOn());
+    ui->disable_macro_jit->setChecked(Settings::values.disable_macro_jit);
 }
 
 void ConfigureDebug::ApplyConfiguration() {
@@ -51,6 +53,7 @@ void ConfigureDebug::ApplyConfiguration() {
     Settings::values.quest_flag = ui->quest_flag->isChecked();
     Settings::values.disable_cpu_opt = ui->disable_cpu_opt->isChecked();
     Settings::values.renderer_debug = ui->enable_graphics_debugging->isChecked();
+    Settings::values.disable_macro_jit = ui->disable_macro_jit->isChecked();
     Debugger::ToggleConsole();
     Log::Filter filter;
     filter.ParseFilterString(Settings::values.log_filter);
diff --git a/src/yuzu/configuration/configure_debug.ui b/src/yuzu/configuration/configure_debug.ui
index e0d4c4a44..46f0208c6 100644
--- a/src/yuzu/configuration/configure_debug.ui
+++ b/src/yuzu/configuration/configure_debug.ui
@@ -148,6 +148,19 @@
         </property>
        </widget>
       </item>
+      <item>
+       <widget class="QCheckBox" name="disable_macro_jit">
+        <property name="enabled">
+         <bool>true</bool>
+        </property>
+        <property name="whatsThis">
+         <string>When checked, it disables the macro Just In Time compiler. Enabling this makes games run slower.</string>
+        </property>
+        <property name="text">
+         <string>Disable Macro JIT</string>
+        </property>
+       </widget>
+      </item>
      </layout>
     </widget>
    </item>
diff --git a/src/yuzu/configuration/configure_dialog.cpp b/src/yuzu/configuration/configure_dialog.cpp
index df4473b46..5918e9972 100644
--- a/src/yuzu/configuration/configure_dialog.cpp
+++ b/src/yuzu/configuration/configure_dialog.cpp
@@ -14,6 +14,8 @@
 
 ConfigureDialog::ConfigureDialog(QWidget* parent, HotkeyRegistry& registry)
     : QDialog(parent), ui(new Ui::ConfigureDialog), registry(registry) {
+    Settings::configuring_global = true;
+
     ui->setupUi(this);
     ui->hotkeysTab->Populate(registry);
     setWindowFlags(windowFlags() & ~Qt::WindowContextHelpButtonHint);
diff --git a/src/yuzu/configuration/configure_general.cpp b/src/yuzu/configuration/configure_general.cpp
index cb95423e0..1fb62d1cf 100644
--- a/src/yuzu/configuration/configure_general.cpp
+++ b/src/yuzu/configuration/configure_general.cpp
@@ -7,40 +7,79 @@
 #include "core/core.h"
 #include "core/settings.h"
 #include "ui_configure_general.h"
+#include "yuzu/configuration/configuration_shared.h"
 #include "yuzu/configuration/configure_general.h"
 #include "yuzu/uisettings.h"
 
 ConfigureGeneral::ConfigureGeneral(QWidget* parent)
     : QWidget(parent), ui(new Ui::ConfigureGeneral) {
-
     ui->setupUi(this);
 
+    SetupPerGameUI();
+
     SetConfiguration();
 
-    connect(ui->toggle_frame_limit, &QCheckBox::toggled, ui->frame_limit, &QSpinBox::setEnabled);
+    connect(ui->toggle_frame_limit, &QCheckBox::stateChanged, ui->frame_limit, [this]() {
+        ui->frame_limit->setEnabled(ui->toggle_frame_limit->checkState() == Qt::Checked);
+    });
 }
 
 ConfigureGeneral::~ConfigureGeneral() = default;
 
 void ConfigureGeneral::SetConfiguration() {
+    const bool runtime_lock = !Core::System::GetInstance().IsPoweredOn();
+
+    ui->use_multi_core->setEnabled(runtime_lock);
+    ui->use_multi_core->setChecked(Settings::values.use_multi_core.GetValue());
+
     ui->toggle_check_exit->setChecked(UISettings::values.confirm_before_closing);
     ui->toggle_user_on_boot->setChecked(UISettings::values.select_user_on_boot);
     ui->toggle_background_pause->setChecked(UISettings::values.pause_when_in_background);
     ui->toggle_hide_mouse->setChecked(UISettings::values.hide_mouse);
 
-    ui->toggle_frame_limit->setChecked(Settings::values.use_frame_limit);
-    ui->frame_limit->setEnabled(ui->toggle_frame_limit->isChecked());
-    ui->frame_limit->setValue(Settings::values.frame_limit);
+    ui->toggle_frame_limit->setChecked(Settings::values.use_frame_limit.GetValue());
+    ui->frame_limit->setValue(Settings::values.frame_limit.GetValue());
+
+    if (!Settings::configuring_global) {
+        if (Settings::values.use_multi_core.UsingGlobal()) {
+            ui->use_multi_core->setCheckState(Qt::PartiallyChecked);
+        }
+        if (Settings::values.use_frame_limit.UsingGlobal()) {
+            ui->toggle_frame_limit->setCheckState(Qt::PartiallyChecked);
+        }
+    }
+
+    ui->frame_limit->setEnabled(ui->toggle_frame_limit->checkState() == Qt::Checked &&
+                                ui->toggle_frame_limit->isEnabled());
 }
 
 void ConfigureGeneral::ApplyConfiguration() {
-    UISettings::values.confirm_before_closing = ui->toggle_check_exit->isChecked();
-    UISettings::values.select_user_on_boot = ui->toggle_user_on_boot->isChecked();
-    UISettings::values.pause_when_in_background = ui->toggle_background_pause->isChecked();
-    UISettings::values.hide_mouse = ui->toggle_hide_mouse->isChecked();
+    if (Settings::configuring_global) {
+        UISettings::values.confirm_before_closing = ui->toggle_check_exit->isChecked();
+        UISettings::values.select_user_on_boot = ui->toggle_user_on_boot->isChecked();
+        UISettings::values.pause_when_in_background = ui->toggle_background_pause->isChecked();
+        UISettings::values.hide_mouse = ui->toggle_hide_mouse->isChecked();
+
+        // Do not overwrite a setting that is currently using a game-specific value
+        if (Settings::values.use_frame_limit.UsingGlobal()) {
+            Settings::values.use_frame_limit.SetValue(ui->toggle_frame_limit->checkState() ==
+                                                      Qt::Checked);
+            Settings::values.frame_limit.SetValue(ui->frame_limit->value());
+            Settings::values.use_multi_core.SetValue(ui->use_multi_core->isChecked());
+        }
+    } else {
+        ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_multi_core,
+                                                 ui->use_multi_core);
 
-    Settings::values.use_frame_limit = ui->toggle_frame_limit->isChecked();
-    Settings::values.frame_limit = ui->frame_limit->value();
+        bool global_frame_limit = ui->toggle_frame_limit->checkState() == Qt::PartiallyChecked;
+        Settings::values.use_frame_limit.SetGlobal(global_frame_limit);
+        Settings::values.frame_limit.SetGlobal(global_frame_limit);
+        if (!global_frame_limit) {
+            Settings::values.use_frame_limit.SetValue(ui->toggle_frame_limit->checkState() ==
+                                                      Qt::Checked);
+            Settings::values.frame_limit.SetValue(ui->frame_limit->value());
+        }
+    }
 }
 
 void ConfigureGeneral::changeEvent(QEvent* event) {
@@ -54,3 +93,20 @@ void ConfigureGeneral::changeEvent(QEvent* event) {
 void ConfigureGeneral::RetranslateUI() {
     ui->retranslateUi(this);
 }
+
+void ConfigureGeneral::SetupPerGameUI() {
+    if (Settings::configuring_global) {
+        ui->toggle_frame_limit->setEnabled(Settings::values.use_frame_limit.UsingGlobal());
+        ui->frame_limit->setEnabled(Settings::values.frame_limit.UsingGlobal());
+
+        return;
+    }
+
+    ui->toggle_check_exit->setVisible(false);
+    ui->toggle_user_on_boot->setVisible(false);
+    ui->toggle_background_pause->setVisible(false);
+    ui->toggle_hide_mouse->setVisible(false);
+
+    ui->toggle_frame_limit->setTristate(true);
+    ui->use_multi_core->setTristate(true);
+}
diff --git a/src/yuzu/configuration/configure_general.h b/src/yuzu/configuration/configure_general.h
index ef05ce065..9c785c22e 100644
--- a/src/yuzu/configuration/configure_general.h
+++ b/src/yuzu/configuration/configure_general.h
@@ -28,5 +28,7 @@ private:
 
     void SetConfiguration();
 
+    void SetupPerGameUI();
+
     std::unique_ptr<Ui::ConfigureGeneral> ui;
 };
diff --git a/src/yuzu/configuration/configure_general.ui b/src/yuzu/configuration/configure_general.ui
index fc3b7e65a..2711116a2 100644
--- a/src/yuzu/configuration/configure_general.ui
+++ b/src/yuzu/configuration/configure_general.ui
@@ -52,6 +52,13 @@
            </layout>
           </item>
           <item>
+           <widget class="QCheckBox" name="use_multi_core">
+            <property name="text">
+             <string>Multicore CPU Emulation</string>
+            </property>
+           </widget>
+          </item>
+          <item>
            <widget class="QCheckBox" name="toggle_check_exit">
             <property name="text">
              <string>Confirm exit while emulation is running</string>
diff --git a/src/yuzu/configuration/configure_graphics.cpp b/src/yuzu/configuration/configure_graphics.cpp
index ea667caef..cb4706bd6 100644
--- a/src/yuzu/configuration/configure_graphics.cpp
+++ b/src/yuzu/configuration/configure_graphics.cpp
@@ -13,65 +13,27 @@
 #include "core/core.h"
 #include "core/settings.h"
 #include "ui_configure_graphics.h"
+#include "yuzu/configuration/configuration_shared.h"
 #include "yuzu/configuration/configure_graphics.h"
 
 #ifdef HAS_VULKAN
 #include "video_core/renderer_vulkan/renderer_vulkan.h"
 #endif
 
-namespace {
-enum class Resolution : int {
-    Auto,
-    Scale1x,
-    Scale2x,
-    Scale3x,
-    Scale4x,
-};
-
-float ToResolutionFactor(Resolution option) {
-    switch (option) {
-    case Resolution::Auto:
-        return 0.f;
-    case Resolution::Scale1x:
-        return 1.f;
-    case Resolution::Scale2x:
-        return 2.f;
-    case Resolution::Scale3x:
-        return 3.f;
-    case Resolution::Scale4x:
-        return 4.f;
-    }
-    return 0.f;
-}
-
-Resolution FromResolutionFactor(float factor) {
-    if (factor == 0.f) {
-        return Resolution::Auto;
-    } else if (factor == 1.f) {
-        return Resolution::Scale1x;
-    } else if (factor == 2.f) {
-        return Resolution::Scale2x;
-    } else if (factor == 3.f) {
-        return Resolution::Scale3x;
-    } else if (factor == 4.f) {
-        return Resolution::Scale4x;
-    }
-    return Resolution::Auto;
-}
-} // Anonymous namespace
-
 ConfigureGraphics::ConfigureGraphics(QWidget* parent)
     : QWidget(parent), ui(new Ui::ConfigureGraphics) {
-    vulkan_device = Settings::values.vulkan_device;
+    vulkan_device = Settings::values.vulkan_device.GetValue();
     RetrieveVulkanDevices();
 
     ui->setupUi(this);
 
+    SetupPerGameUI();
+
     SetConfiguration();
 
-    connect(ui->api, static_cast<void (QComboBox::*)(int)>(&QComboBox::currentIndexChanged), this,
+    connect(ui->api, qOverload<int>(&QComboBox::currentIndexChanged), this,
             [this] { UpdateDeviceComboBox(); });
-    connect(ui->device, static_cast<void (QComboBox::*)(int)>(&QComboBox::activated), this,
+    connect(ui->device, qOverload<int>(&QComboBox::activated), this,
             [this](int device) { UpdateDeviceSelection(device); });
 
     connect(ui->bg_button, &QPushButton::clicked, this, [this] {
@@ -81,6 +43,9 @@ ConfigureGraphics::ConfigureGraphics(QWidget* parent)
         }
         UpdateBackgroundColorButton(new_bg_color);
     });
+
+    ui->bg_label->setVisible(Settings::configuring_global);
+    ui->bg_combobox->setVisible(!Settings::configuring_global);
 }
 
 void ConfigureGraphics::UpdateDeviceSelection(int device) {
@@ -98,31 +63,95 @@ void ConfigureGraphics::SetConfiguration() {
     const bool runtime_lock = !Core::System::GetInstance().IsPoweredOn();
 
     ui->api->setEnabled(runtime_lock);
-    ui->api->setCurrentIndex(static_cast<int>(Settings::values.renderer_backend));
-    ui->resolution_factor_combobox->setCurrentIndex(
-        static_cast<int>(FromResolutionFactor(Settings::values.resolution_factor)));
-    ui->aspect_ratio_combobox->setCurrentIndex(Settings::values.aspect_ratio);
-    ui->use_disk_shader_cache->setEnabled(runtime_lock);
-    ui->use_disk_shader_cache->setChecked(Settings::values.use_disk_shader_cache);
     ui->use_asynchronous_gpu_emulation->setEnabled(runtime_lock);
-    ui->use_asynchronous_gpu_emulation->setChecked(Settings::values.use_asynchronous_gpu_emulation);
-    UpdateBackgroundColorButton(QColor::fromRgbF(Settings::values.bg_red, Settings::values.bg_green,
-                                                 Settings::values.bg_blue));
+    ui->use_disk_shader_cache->setEnabled(runtime_lock);
+
+    if (Settings::configuring_global) {
+        ui->api->setCurrentIndex(static_cast<int>(Settings::values.renderer_backend.GetValue()));
+        ui->aspect_ratio_combobox->setCurrentIndex(Settings::values.aspect_ratio.GetValue());
+        ui->use_disk_shader_cache->setChecked(Settings::values.use_disk_shader_cache.GetValue());
+        ui->use_asynchronous_gpu_emulation->setChecked(
+            Settings::values.use_asynchronous_gpu_emulation.GetValue());
+    } else {
+        ConfigurationShared::SetPerGameSetting(ui->use_disk_shader_cache,
+                                               &Settings::values.use_disk_shader_cache);
+        ConfigurationShared::SetPerGameSetting(ui->use_asynchronous_gpu_emulation,
+                                               &Settings::values.use_asynchronous_gpu_emulation);
+
+        ConfigurationShared::SetPerGameSetting(ui->api, &Settings::values.renderer_backend);
+        ConfigurationShared::SetPerGameSetting(ui->aspect_ratio_combobox,
+                                               &Settings::values.aspect_ratio);
+
+        ui->bg_combobox->setCurrentIndex(Settings::values.bg_red.UsingGlobal() ? 0 : 1);
+        ui->bg_button->setEnabled(!Settings::values.bg_red.UsingGlobal());
+    }
+
+    UpdateBackgroundColorButton(QColor::fromRgbF(Settings::values.bg_red.GetValue(),
+                                                 Settings::values.bg_green.GetValue(),
+                                                 Settings::values.bg_blue.GetValue()));
     UpdateDeviceComboBox();
 }
 
 void ConfigureGraphics::ApplyConfiguration() {
-    Settings::values.renderer_backend = GetCurrentGraphicsBackend();
-    Settings::values.vulkan_device = vulkan_device;
-    Settings::values.resolution_factor =
-        ToResolutionFactor(static_cast<Resolution>(ui->resolution_factor_combobox->currentIndex()));
-    Settings::values.aspect_ratio = ui->aspect_ratio_combobox->currentIndex();
-    Settings::values.use_disk_shader_cache = ui->use_disk_shader_cache->isChecked();
-    Settings::values.use_asynchronous_gpu_emulation =
-        ui->use_asynchronous_gpu_emulation->isChecked();
-    Settings::values.bg_red = static_cast<float>(bg_color.redF());
-    Settings::values.bg_green = static_cast<float>(bg_color.greenF());
-    Settings::values.bg_blue = static_cast<float>(bg_color.blueF());
+    if (Settings::configuring_global) {
+        // Do not overwrite a setting that is currently using a game-specific value
+        if (Settings::values.renderer_backend.UsingGlobal()) {
+            Settings::values.renderer_backend.SetValue(GetCurrentGraphicsBackend());
+        }
+        if (Settings::values.vulkan_device.UsingGlobal()) {
+            Settings::values.vulkan_device.SetValue(vulkan_device);
+        }
+        if (Settings::values.aspect_ratio.UsingGlobal()) {
+            Settings::values.aspect_ratio.SetValue(ui->aspect_ratio_combobox->currentIndex());
+        }
+        if (Settings::values.use_disk_shader_cache.UsingGlobal()) {
+            Settings::values.use_disk_shader_cache.SetValue(ui->use_disk_shader_cache->isChecked());
+        }
+        if (Settings::values.use_asynchronous_gpu_emulation.UsingGlobal()) {
+            Settings::values.use_asynchronous_gpu_emulation.SetValue(
+                ui->use_asynchronous_gpu_emulation->isChecked());
+        }
+        if (Settings::values.bg_red.UsingGlobal()) {
+            Settings::values.bg_red.SetValue(static_cast<float>(bg_color.redF()));
+            Settings::values.bg_green.SetValue(static_cast<float>(bg_color.greenF()));
+            Settings::values.bg_blue.SetValue(static_cast<float>(bg_color.blueF()));
+        }
+    } else {
+        if (ui->api->currentIndex() == ConfigurationShared::USE_GLOBAL_INDEX) {
+            Settings::values.renderer_backend.SetGlobal(true);
+            Settings::values.vulkan_device.SetGlobal(true);
+        } else {
+            Settings::values.renderer_backend.SetGlobal(false);
+            Settings::values.renderer_backend.SetValue(GetCurrentGraphicsBackend());
+            if (GetCurrentGraphicsBackend() == Settings::RendererBackend::Vulkan) {
+                Settings::values.vulkan_device.SetGlobal(false);
+                Settings::values.vulkan_device.SetValue(vulkan_device);
+            } else {
+                Settings::values.vulkan_device.SetGlobal(true);
+            }
+        }
+
+        ConfigurationShared::ApplyPerGameSetting(&Settings::values.aspect_ratio,
+                                                 ui->aspect_ratio_combobox);
+
+        ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_disk_shader_cache,
+                                                 ui->use_disk_shader_cache);
+        ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_asynchronous_gpu_emulation,
+                                                 ui->use_asynchronous_gpu_emulation);
+
+        if (ui->bg_combobox->currentIndex() == ConfigurationShared::USE_GLOBAL_INDEX) {
+            Settings::values.bg_red.SetGlobal(true);
+            Settings::values.bg_green.SetGlobal(true);
+            Settings::values.bg_blue.SetGlobal(true);
+        } else {
+            Settings::values.bg_red.SetGlobal(false);
+            Settings::values.bg_green.SetGlobal(false);
+            Settings::values.bg_blue.SetGlobal(false);
+            Settings::values.bg_red.SetValue(static_cast<float>(bg_color.redF()));
+            Settings::values.bg_green.SetValue(static_cast<float>(bg_color.greenF()));
+            Settings::values.bg_blue.SetValue(static_cast<float>(bg_color.blueF()));
+        }
+    }
 }
 
 void ConfigureGraphics::changeEvent(QEvent* event) {
@@ -151,19 +180,27 @@ void ConfigureGraphics::UpdateDeviceComboBox() {
     ui->device->clear();
 
     bool enabled = false;
+
+    if (!Settings::configuring_global &&
+        ui->api->currentIndex() == ConfigurationShared::USE_GLOBAL_INDEX) {
+        vulkan_device = Settings::values.vulkan_device.GetValue();
+    }
     switch (GetCurrentGraphicsBackend()) {
     case Settings::RendererBackend::OpenGL:
         ui->device->addItem(tr("OpenGL Graphics Device"));
         enabled = false;
         break;
     case Settings::RendererBackend::Vulkan:
-        for (const auto device : vulkan_devices) {
+        for (const auto& device : vulkan_devices) {
             ui->device->addItem(device);
         }
         ui->device->setCurrentIndex(vulkan_device);
         enabled = !vulkan_devices.empty();
         break;
     }
+    // If in per-game config and use global is selected, don't enable.
+    enabled &= !(!Settings::configuring_global &&
+                 ui->api->currentIndex() == ConfigurationShared::USE_GLOBAL_INDEX);
     ui->device->setEnabled(enabled && !Core::System::GetInstance().IsPoweredOn());
 }
 
@@ -177,5 +214,37 @@ void ConfigureGraphics::RetrieveVulkanDevices() {
 }
 
 Settings::RendererBackend ConfigureGraphics::GetCurrentGraphicsBackend() const {
-    return static_cast<Settings::RendererBackend>(ui->api->currentIndex());
+    if (Settings::configuring_global) {
+        return static_cast<Settings::RendererBackend>(ui->api->currentIndex());
+    }
+    // NOTE(review): side effect: syncs renderer_backend's global flag with the combo box.
+    if (ui->api->currentIndex() == ConfigurationShared::USE_GLOBAL_INDEX) {
+        Settings::values.renderer_backend.SetGlobal(true);
+        return Settings::values.renderer_backend.GetValue();
+    }
+    Settings::values.renderer_backend.SetGlobal(false);
+    return static_cast<Settings::RendererBackend>(ui->api->currentIndex() -
+                                                  ConfigurationShared::USE_GLOBAL_OFFSET);
+}
+
+void ConfigureGraphics::SetupPerGameUI() {
+    if (Settings::configuring_global) {
+        // Each widget is editable only while its setting is using the global value.
+        ui->api->setEnabled(Settings::values.renderer_backend.UsingGlobal());
+        ui->device->setEnabled(Settings::values.renderer_backend.UsingGlobal());
+        ui->aspect_ratio_combobox->setEnabled(Settings::values.aspect_ratio.UsingGlobal());
+        ui->use_asynchronous_gpu_emulation->setEnabled(
+            Settings::values.use_asynchronous_gpu_emulation.UsingGlobal());
+        ui->use_disk_shader_cache->setEnabled(Settings::values.use_disk_shader_cache.UsingGlobal());
+        ui->bg_button->setEnabled(Settings::values.bg_red.UsingGlobal());
+        return;
+    }
+
+    connect(ui->bg_combobox, qOverload<int>(&QComboBox::activated), this,
+            [this](int index) { ui->bg_button->setEnabled(index == 1); });
+
+    ui->use_disk_shader_cache->setTristate(true);
+    ui->use_asynchronous_gpu_emulation->setTristate(true);
+    ConfigurationShared::InsertGlobalItem(ui->aspect_ratio_combobox);
+    ConfigurationShared::InsertGlobalItem(ui->api);
 }
diff --git a/src/yuzu/configuration/configure_graphics.h b/src/yuzu/configuration/configure_graphics.h
index 7e0596d9c..24f01c739 100644
--- a/src/yuzu/configuration/configure_graphics.h
+++ b/src/yuzu/configuration/configure_graphics.h
@@ -35,6 +35,8 @@ private:
 
     void RetrieveVulkanDevices();
 
+    void SetupPerGameUI();
+
     Settings::RendererBackend GetCurrentGraphicsBackend() const;
 
     std::unique_ptr<Ui::ConfigureGraphics> ui;
diff --git a/src/yuzu/configuration/configure_graphics.ui b/src/yuzu/configuration/configure_graphics.ui
index c816d6108..62418fc14 100644
--- a/src/yuzu/configuration/configure_graphics.ui
+++ b/src/yuzu/configuration/configure_graphics.ui
@@ -85,39 +85,34 @@
          </widget>
         </item>
         <item>
-         <layout class="QHBoxLayout" name="horizontalLayout_2">
+         <layout class="QHBoxLayout" name="horizontalLayout_6">
           <item>
-           <widget class="QLabel" name="label">
+           <widget class="QLabel" name="ar_label">
             <property name="text">
-             <string>Internal Resolution:</string>
+             <string>Aspect Ratio:</string>
             </property>
            </widget>
           </item>
           <item>
-           <widget class="QComboBox" name="resolution_factor_combobox">
-            <item>
-             <property name="text">
-              <string>Auto (Window Size)</string>
-             </property>
-            </item>
+           <widget class="QComboBox" name="aspect_ratio_combobox">
             <item>
              <property name="text">
-              <string>Native (1280x720)</string>
+              <string>Default (16:9)</string>
              </property>
             </item>
             <item>
              <property name="text">
-              <string>2x Native (2560x1440)</string>
+              <string>Force 4:3</string>
              </property>
             </item>
             <item>
              <property name="text">
-              <string>3x Native (3840x2160)</string>
+              <string>Force 21:9</string>
              </property>
             </item>
             <item>
              <property name="text">
-              <string>4x Native (5120x2880)</string>
+              <string>Stretch to Window</string>
              </property>
             </item>
            </widget>
@@ -125,42 +120,30 @@
          </layout>
         </item>
         <item>
-         <layout class="QHBoxLayout" name="horizontalLayout_6">
+         <layout class="QHBoxLayout" name="horizontalLayout_3">
           <item>
-           <widget class="QLabel" name="ar_label">
-            <property name="text">
-             <string>Aspect Ratio:</string>
+           <widget class="QComboBox" name="bg_combobox">
+            <property name="currentText">
+             <string>Use global background color</string>
+            </property>
+            <property name="currentIndex">
+             <number>0</number>
+            </property>
+            <property name="maxVisibleItems">
+             <number>10</number>
             </property>
-           </widget>
-          </item>
-          <item>
-           <widget class="QComboBox" name="aspect_ratio_combobox">
-            <item>
-             <property name="text">
-              <string>Default (16:9)</string>
-             </property>
-            </item>
-            <item>
-             <property name="text">
-              <string>Force 4:3</string>
-             </property>
-            </item>
             <item>
              <property name="text">
-              <string>Force 21:9</string>
+              <string>Use global background color</string>
              </property>
             </item>
             <item>
              <property name="text">
-              <string>Stretch to Window</string>
+              <string>Set background color:</string>
              </property>
             </item>
            </widget>
           </item>
-         </layout>
-        </item>
-        <item>
-         <layout class="QHBoxLayout" name="horizontalLayout_3">
           <item>
            <widget class="QLabel" name="bg_label">
             <property name="text">
@@ -169,6 +152,19 @@
            </widget>
           </item>
           <item>
+           <spacer name="horizontalSpacer">
+            <property name="orientation">
+             <enum>Qt::Horizontal</enum>
+            </property>
+            <property name="sizeHint" stdset="0">
+             <size>
+              <width>40</width>
+              <height>20</height>
+             </size>
+            </property>
+           </spacer>
+          </item>
+          <item>
            <widget class="QPushButton" name="bg_button">
             <property name="maximumSize">
              <size>
diff --git a/src/yuzu/configuration/configure_graphics_advanced.cpp b/src/yuzu/configuration/configure_graphics_advanced.cpp
index 37aadf7f8..7c0fa7ec5 100644
--- a/src/yuzu/configuration/configure_graphics_advanced.cpp
+++ b/src/yuzu/configuration/configure_graphics_advanced.cpp
@@ -5,6 +5,7 @@
 #include "core/core.h"
 #include "core/settings.h"
 #include "ui_configure_graphics_advanced.h"
+#include "yuzu/configuration/configuration_shared.h"
 #include "yuzu/configuration/configure_graphics_advanced.h"
 
 ConfigureGraphicsAdvanced::ConfigureGraphicsAdvanced(QWidget* parent)
@@ -12,8 +13,7 @@ ConfigureGraphicsAdvanced::ConfigureGraphicsAdvanced(QWidget* parent)
 
     ui->setupUi(this);
 
-    // TODO: Remove this after assembly shaders are fully integrated
-    ui->use_assembly_shaders->setVisible(false);
+    SetupPerGameUI();
 
     SetConfiguration();
 }
@@ -22,26 +22,81 @@ ConfigureGraphicsAdvanced::~ConfigureGraphicsAdvanced() = default;
 
 void ConfigureGraphicsAdvanced::SetConfiguration() {
     const bool runtime_lock = !Core::System::GetInstance().IsPoweredOn();
-    ui->gpu_accuracy->setCurrentIndex(static_cast<int>(Settings::values.gpu_accuracy));
     ui->use_vsync->setEnabled(runtime_lock);
-    ui->use_vsync->setChecked(Settings::values.use_vsync);
     ui->use_assembly_shaders->setEnabled(runtime_lock);
-    ui->use_assembly_shaders->setChecked(Settings::values.use_assembly_shaders);
-    ui->use_fast_gpu_time->setChecked(Settings::values.use_fast_gpu_time);
     ui->force_30fps_mode->setEnabled(runtime_lock);
-    ui->force_30fps_mode->setChecked(Settings::values.force_30fps_mode);
     ui->anisotropic_filtering_combobox->setEnabled(runtime_lock);
-    ui->anisotropic_filtering_combobox->setCurrentIndex(Settings::values.max_anisotropy);
+
+    // Per-game dialog: the shared helper is handed each widget together with its setting.
+    if (!Settings::configuring_global) {
+        ConfigurationShared::SetPerGameSetting(ui->gpu_accuracy, &Settings::values.gpu_accuracy);
+        ConfigurationShared::SetPerGameSetting(ui->use_vsync, &Settings::values.use_vsync);
+        ConfigurationShared::SetPerGameSetting(ui->use_assembly_shaders,
+                                               &Settings::values.use_assembly_shaders);
+        ConfigurationShared::SetPerGameSetting(ui->use_fast_gpu_time,
+                                               &Settings::values.use_fast_gpu_time);
+        ConfigurationShared::SetPerGameSetting(ui->force_30fps_mode,
+                                               &Settings::values.force_30fps_mode);
+        ConfigurationShared::SetPerGameSetting(ui->anisotropic_filtering_combobox,
+                                               &Settings::values.max_anisotropy);
+        return;
+    }
+    // Global dialog: show the stored values directly.
+    ui->gpu_accuracy->setCurrentIndex(static_cast<int>(Settings::values.gpu_accuracy.GetValue()));
+    ui->use_vsync->setChecked(Settings::values.use_vsync.GetValue());
+    ui->use_assembly_shaders->setChecked(Settings::values.use_assembly_shaders.GetValue());
+    ui->use_fast_gpu_time->setChecked(Settings::values.use_fast_gpu_time.GetValue());
+    ui->force_30fps_mode->setChecked(Settings::values.force_30fps_mode.GetValue());
+    ui->anisotropic_filtering_combobox->setCurrentIndex(Settings::values.max_anisotropy.GetValue());
 }
 
 void ConfigureGraphicsAdvanced::ApplyConfiguration() {
-    auto gpu_accuracy = static_cast<Settings::GPUAccuracy>(ui->gpu_accuracy->currentIndex());
-    Settings::values.gpu_accuracy = gpu_accuracy;
-    Settings::values.use_vsync = ui->use_vsync->isChecked();
-    Settings::values.use_assembly_shaders = ui->use_assembly_shaders->isChecked();
-    Settings::values.use_fast_gpu_time = ui->use_fast_gpu_time->isChecked();
-    Settings::values.force_30fps_mode = ui->force_30fps_mode->isChecked();
-    Settings::values.max_anisotropy = ui->anisotropic_filtering_combobox->currentIndex();
+    // Subtract 2 if configuring per-game (separator and "use global configuration" take 2 slots)
+    const auto gpu_accuracy = static_cast<Settings::GPUAccuracy>(
+        ui->gpu_accuracy->currentIndex() -
+        ((Settings::configuring_global) ? 0 : ConfigurationShared::USE_GLOBAL_OFFSET));
+
+    if (Settings::configuring_global) {
+        // Must guard in case of a during-game configuration when set to be game-specific.
+        if (Settings::values.gpu_accuracy.UsingGlobal()) {
+            Settings::values.gpu_accuracy.SetValue(gpu_accuracy);
+        }
+        if (Settings::values.use_vsync.UsingGlobal()) {
+            Settings::values.use_vsync.SetValue(ui->use_vsync->isChecked());
+        }
+        if (Settings::values.use_assembly_shaders.UsingGlobal()) {
+            Settings::values.use_assembly_shaders.SetValue(ui->use_assembly_shaders->isChecked());
+        }
+        if (Settings::values.use_fast_gpu_time.UsingGlobal()) {
+            Settings::values.use_fast_gpu_time.SetValue(ui->use_fast_gpu_time->isChecked());
+        }
+        if (Settings::values.force_30fps_mode.UsingGlobal()) {
+            Settings::values.force_30fps_mode.SetValue(ui->force_30fps_mode->isChecked());
+        }
+        if (Settings::values.max_anisotropy.UsingGlobal()) {
+            Settings::values.max_anisotropy.SetValue(
+                ui->anisotropic_filtering_combobox->currentIndex());
+        }
+    } else {
+        // Per-game dialog: hand each widget to the shared helper exactly once. gpu_accuracy is
+        // handled by hand below since its combobox index is first converted to the enum.
+        ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_vsync, ui->use_vsync);
+        ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_assembly_shaders,
+                                                 ui->use_assembly_shaders);
+        ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_fast_gpu_time,
+                                                 ui->use_fast_gpu_time);
+        ConfigurationShared::ApplyPerGameSetting(&Settings::values.force_30fps_mode,
+                                                 ui->force_30fps_mode);
+        ConfigurationShared::ApplyPerGameSetting(&Settings::values.max_anisotropy,
+                                                 ui->anisotropic_filtering_combobox);
+
+        if (ui->gpu_accuracy->currentIndex() == ConfigurationShared::USE_GLOBAL_INDEX) {
+            Settings::values.gpu_accuracy.SetGlobal(true);
+        } else {
+            Settings::values.gpu_accuracy.SetGlobal(false);
+            Settings::values.gpu_accuracy.SetValue(gpu_accuracy);
+        }
+    }
 }
 
 void ConfigureGraphicsAdvanced::changeEvent(QEvent* event) {
@@ -55,3 +110,25 @@ void ConfigureGraphicsAdvanced::changeEvent(QEvent* event) {
 void ConfigureGraphicsAdvanced::RetranslateUI() {
     ui->retranslateUi(this);
 }
+
+void ConfigureGraphicsAdvanced::SetupPerGameUI() {
+    // Per-game dialog: checkboxes become tristate; comboboxes get InsertGlobalItem() applied.
+    if (!Settings::configuring_global) {
+        ConfigurationShared::InsertGlobalItem(ui->gpu_accuracy);
+        ui->use_vsync->setTristate(true);
+        ui->use_assembly_shaders->setTristate(true);
+        ui->use_fast_gpu_time->setTristate(true);
+        ui->force_30fps_mode->setTristate(true);
+        ConfigurationShared::InsertGlobalItem(ui->anisotropic_filtering_combobox);
+        return;
+    }
+
+    // Global dialog: disable any widget whose setting is not global (only happens during game).
+    ui->gpu_accuracy->setEnabled(Settings::values.gpu_accuracy.UsingGlobal());
+    ui->use_vsync->setEnabled(Settings::values.use_vsync.UsingGlobal());
+    ui->use_assembly_shaders->setEnabled(Settings::values.use_assembly_shaders.UsingGlobal());
+    ui->use_fast_gpu_time->setEnabled(Settings::values.use_fast_gpu_time.UsingGlobal());
+    ui->force_30fps_mode->setEnabled(Settings::values.force_30fps_mode.UsingGlobal());
+    ui->anisotropic_filtering_combobox->setEnabled(
+        Settings::values.max_anisotropy.UsingGlobal());
+}
diff --git a/src/yuzu/configuration/configure_graphics_advanced.h b/src/yuzu/configuration/configure_graphics_advanced.h
index bbc9d4355..c043588ff 100644
--- a/src/yuzu/configuration/configure_graphics_advanced.h
+++ b/src/yuzu/configuration/configure_graphics_advanced.h
@@ -26,5 +26,7 @@ private:
 
     void SetConfiguration();
 
+    void SetupPerGameUI();
+
     std::unique_ptr<Ui::ConfigureGraphicsAdvanced> ui;
 };
diff --git a/src/yuzu/configuration/configure_input_player.cpp b/src/yuzu/configuration/configure_input_player.cpp
index e4eb5594b..00433926d 100644
--- a/src/yuzu/configuration/configure_input_player.cpp
+++ b/src/yuzu/configuration/configure_input_player.cpp
@@ -70,6 +70,20 @@ static QString ButtonToText(const Common::ParamPackage& param) {
         return GetKeyName(param.Get("code", 0));
     }
 
+    if (param.Get("engine", "") == "gcpad") {
+        if (param.Has("axis")) {
+            const QString axis_str = QString::fromStdString(param.Get("axis", ""));
+            const QString direction_str = QString::fromStdString(param.Get("direction", ""));
+
+            return QObject::tr("GC Axis %1%2").arg(axis_str, direction_str);
+        }
+        if (param.Has("button")) {
+            const QString button_str = QString::number(int(std::log2(param.Get("button", 0))));
+            return QObject::tr("GC Button %1").arg(button_str);
+        }
+        return GetKeyName(param.Get("code", 0));
+    }
+
     if (param.Get("engine", "") == "sdl") {
         if (param.Has("hat")) {
             const QString hat_str = QString::fromStdString(param.Get("hat", ""));
@@ -126,6 +140,25 @@ static QString AnalogToText(const Common::ParamPackage& param, const std::string
         return {};
     }
 
+    if (param.Get("engine", "") == "gcpad") {
+        if (dir == "modifier") {
+            return QObject::tr("[unused]");
+        }
+
+        if (dir == "left" || dir == "right") {
+            const QString axis_x_str = QString::fromStdString(param.Get("axis_x", ""));
+
+            return QObject::tr("GC Axis %1").arg(axis_x_str);
+        }
+
+        if (dir == "up" || dir == "down") {
+            const QString axis_y_str = QString::fromStdString(param.Get("axis_y", ""));
+
+            return QObject::tr("GC Axis %1").arg(axis_y_str);
+        }
+
+        return {};
+    }
     return QObject::tr("[unknown]");
 }
 
@@ -332,7 +365,8 @@ ConfigureInputPlayer::ConfigureInputPlayer(QWidget* parent, std::size_t player_i
 
         connect(analog_map_deadzone_and_modifier_slider[analog_id], &QSlider::valueChanged, [=] {
             const float slider_value = analog_map_deadzone_and_modifier_slider[analog_id]->value();
-            if (analogs_param[analog_id].Get("engine", "") == "sdl") {
+            if (analogs_param[analog_id].Get("engine", "") == "sdl" ||
+                analogs_param[analog_id].Get("engine", "") == "gcpad") {
                 analog_map_deadzone_and_modifier_slider_label[analog_id]->setText(
                     tr("Deadzone: %1%").arg(slider_value));
                 analogs_param[analog_id].Set("deadzone", slider_value / 100.0f);
@@ -352,6 +386,20 @@ ConfigureInputPlayer::ConfigureInputPlayer(QWidget* parent, std::size_t player_i
 
     connect(poll_timer.get(), &QTimer::timeout, [this] {
         Common::ParamPackage params;
+        if (InputCommon::GetGCButtons()->IsPolling()) {
+            params = InputCommon::GetGCButtons()->GetNextInput();
+            if (params.Has("engine")) {
+                SetPollingResult(params, false);
+                return;
+            }
+        }
+        if (InputCommon::GetGCAnalogs()->IsPolling()) {
+            params = InputCommon::GetGCAnalogs()->GetNextInput();
+            if (params.Has("engine")) {
+                SetPollingResult(params, false);
+                return;
+            }
+        }
         for (auto& poller : device_pollers) {
             params = poller->GetNextInput();
             if (params.Has("engine")) {
@@ -480,7 +528,9 @@ void ConfigureInputPlayer::RestoreDefaults() {
             SetAnalogButton(params, analogs_param[analog_id], analog_sub_buttons[sub_button_id]);
         }
     }
+
     UpdateButtonLabels();
+    ApplyConfiguration();
 }
 
 void ConfigureInputPlayer::ClearAll() {
@@ -505,6 +555,7 @@ void ConfigureInputPlayer::ClearAll() {
     }
 
     UpdateButtonLabels();
+    ApplyConfiguration();
 }
 
 void ConfigureInputPlayer::UpdateButtonLabels() {
@@ -531,7 +582,7 @@ void ConfigureInputPlayer::UpdateButtonLabels() {
             analog_map_deadzone_and_modifier_slider_label[analog_id];
 
         if (param.Has("engine")) {
-            if (param.Get("engine", "") == "sdl") {
+            if (param.Get("engine", "") == "sdl" || param.Get("engine", "") == "gcpad") {
                 if (!param.Has("deadzone")) {
                     param.Set("deadzone", 0.1f);
                 }
@@ -580,6 +631,11 @@ void ConfigureInputPlayer::HandleClick(
 
     grabKeyboard();
     grabMouse();
+    if (type == InputCommon::Polling::DeviceType::Button) {
+        InputCommon::GetGCButtons()->BeginConfiguration();
+    } else {
+        InputCommon::GetGCAnalogs()->BeginConfiguration();
+    }
     timeout_timer->start(5000); // Cancel after 5 seconds
     poll_timer->start(200);     // Check for new inputs every 200ms
 }
@@ -593,6 +649,9 @@ void ConfigureInputPlayer::SetPollingResult(const Common::ParamPackage& params,
         poller->Stop();
     }
 
+    InputCommon::GetGCButtons()->EndConfiguration();
+    InputCommon::GetGCAnalogs()->EndConfiguration();
+
     if (!abort) {
         (*input_setter)(params);
     }
diff --git a/src/yuzu/configuration/configure_per_game.cpp b/src/yuzu/configuration/configure_per_game.cpp
new file mode 100644
index 000000000..1e49f0787
--- /dev/null
+++ b/src/yuzu/configuration/configure_per_game.cpp
@@ -0,0 +1,140 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <memory>
+#include <utility>
+
+#include <QCheckBox>
+#include <QHeaderView>
+#include <QMenu>
+#include <QStandardItemModel>
+#include <QString>
+#include <QTimer>
+#include <QTreeView>
+
+#include "common/common_paths.h"
+#include "common/file_util.h"
+#include "core/file_sys/control_metadata.h"
+#include "core/file_sys/patch_manager.h"
+#include "core/file_sys/xts_archive.h"
+#include "core/loader/loader.h"
+#include "ui_configure_per_game.h"
+#include "yuzu/configuration/config.h"
+#include "yuzu/configuration/configure_input.h"
+#include "yuzu/configuration/configure_per_game.h"
+#include "yuzu/uisettings.h"
+#include "yuzu/util/util.h"
+
+ConfigurePerGame::ConfigurePerGame(QWidget* parent, u64 title_id)
+    : QDialog(parent), ui(std::make_unique<Ui::ConfigurePerGame>()), title_id(title_id) {
+    game_config = std::make_unique<Config>(fmt::format("{:016X}.ini", title_id), false);
+
+    // Cleared before setupUi() so the tab widgets it creates set themselves up in per-game mode.
+    Settings::configuring_global = false;
+    ui->setupUi(this);
+    setFocusPolicy(Qt::ClickFocus);
+    setWindowTitle(tr("Properties"));
+    ui->addonsTab->SetTitleId(title_id);
+
+    // Parented to the dialog: QGraphicsView::setScene() does not take ownership of the scene.
+    scene = new QGraphicsScene(this);
+    ui->icon_view->setScene(scene);
+
+    LoadConfiguration();
+}
+
+ConfigurePerGame::~ConfigurePerGame() = default; // defined here, where the Ui type is complete
+
+void ConfigurePerGame::ApplyConfiguration() {
+    ui->addonsTab->ApplyConfiguration();
+    ui->generalTab->ApplyConfiguration();
+    ui->systemTab->ApplyConfiguration();
+    ui->graphicsTab->ApplyConfiguration();
+    ui->graphicsAdvancedTab->ApplyConfiguration();
+    ui->audioTab->ApplyConfiguration();
+    // Every tab has applied its configuration by now; apply and log the combined settings.
+    Settings::Apply();
+    Settings::LogSettings();
+    // Save through the Config that the constructor created for this title's .ini name.
+    game_config->Save();
+}
+
+void ConfigurePerGame::changeEvent(QEvent* event) {
+    const bool language_changed = event->type() == QEvent::LanguageChange;
+    if (language_changed) {
+        RetranslateUI(); // refresh this dialog's own translated strings
+    }
+    QDialog::changeEvent(event); // the base class sees every event, handled here or not
+}
+
+void ConfigurePerGame::RetranslateUI() {
+    ui->retranslateUi(this); // delegate to the generated Ui class; invoked on LanguageChange
+}
+
+void ConfigurePerGame::LoadFromFile(FileSys::VirtualFile file) {
+    this->file = std::move(file); // `this->` is required: the parameter shadows the member
+    LoadConfiguration();          // refresh the dialog from the newly stored file
+}
+
+void ConfigurePerGame::LoadConfiguration() {
+    // Nothing to display until LoadFromFile() has stored a game file.
+    if (file == nullptr) {
+        return;
+    }
+
+    ui->addonsTab->LoadFromFile(file);
+
+    ui->display_title_id->setText(
+        QStringLiteral("%1").arg(title_id, 16, 16, QLatin1Char{'0'}).toUpper());
+
+    FileSys::PatchManager pm{title_id};
+    const auto control = pm.GetControlMetadata();
+    // NOTE(review): GetLoader() presumably returns null for unrecognized files (confirm); every
+    // use below is guarded so a missing loader only leaves the affected fields untouched.
+    const auto loader = Loader::GetLoader(file);
+
+    // Replaces the scene's contents with the given image bytes, scaled to the icon view's size.
+    const auto show_icon = [this](const auto& bytes) {
+        scene->clear();
+        QPixmap map;
+        map.loadFromData(bytes.data(), static_cast<u32>(bytes.size()));
+        scene->addPixmap(map.scaled(ui->icon_view->width(), ui->icon_view->height(),
+                                    Qt::IgnoreAspectRatio, Qt::SmoothTransformation));
+    };
+
+    if (control.first != nullptr) {
+        ui->display_version->setText(QString::fromStdString(control.first->GetVersionString()));
+        ui->display_name->setText(QString::fromStdString(control.first->GetApplicationName()));
+        ui->display_developer->setText(QString::fromStdString(control.first->GetDeveloperName()));
+    } else {
+        // No control metadata: fall back to what the loader reports, with a fixed version string.
+        std::string title;
+        if (loader != nullptr && loader->ReadTitle(title) == Loader::ResultStatus::Success) {
+            ui->display_name->setText(QString::fromStdString(title));
+        }
+
+        FileSys::NACP nacp;
+        if (loader != nullptr && loader->ReadControlData(nacp) == Loader::ResultStatus::Success) {
+            ui->display_developer->setText(QString::fromStdString(nacp.GetDeveloperName()));
+        }
+        ui->display_version->setText(QStringLiteral("1.0.0"));
+    }
+
+    // Prefer the icon file from the control metadata; otherwise ask the loader for one.
+    std::vector<u8> icon_bytes;
+    if (control.second != nullptr) {
+        show_icon(control.second->ReadAllBytes());
+    } else if (loader != nullptr && loader->ReadIcon(icon_bytes) == Loader::ResultStatus::Success) {
+        show_icon(icon_bytes);
+    }
+
+    // Details taken from the file itself; the format string additionally needs the loader.
+    ui->display_filename->setText(QString::fromStdString(file->GetName()));
+    if (loader != nullptr) {
+        ui->display_format->setText(
+            QString::fromStdString(Loader::GetFileTypeString(loader->GetFileType())));
+    }
+    ui->display_size->setText(ReadableByteSize(file->GetSize()));
+}
diff --git a/src/yuzu/configuration/configure_per_game.h b/src/yuzu/configuration/configure_per_game.h
new file mode 100644
index 000000000..5f9a08cef
--- /dev/null
+++ b/src/yuzu/configuration/configure_per_game.h
@@ -0,0 +1,51 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include <QDialog>
+#include <QList>
+
+#include "core/file_sys/vfs_types.h"
+#include "yuzu/configuration/config.h"
+
+class QGraphicsScene;
+class QStandardItem;
+class QStandardItemModel;
+class QTreeView;
+class QVBoxLayout;
+
+namespace Ui {
+class ConfigurePerGame;
+}
+
+class ConfigurePerGame : public QDialog {
+    Q_OBJECT
+
+public:
+    explicit ConfigurePerGame(QWidget* parent, u64 title_id);
+    ~ConfigurePerGame() override;
+
+    /// Applies every tab's configuration, then saves the per-game config.
+    void ApplyConfiguration();
+    /// Stores the game file and reloads the dialog's contents from it.
+    void LoadFromFile(FileSys::VirtualFile file);
+
+private:
+    void changeEvent(QEvent* event) override;
+    void RetranslateUI();
+    /// Fills the info widgets and icon from `file`; returns early while `file` is null.
+    void LoadConfiguration();
+
+    std::unique_ptr<Ui::ConfigurePerGame> ui;
+    FileSys::VirtualFile file;
+    u64 title_id;
+    /// Scene attached to the icon view; the game's icon pixmap is added to it.
+    QGraphicsScene* scene;
+    /// Config created in the constructor from the "<title id>.ini" file name.
+    std::unique_ptr<Config> game_config;
+};
diff --git a/src/yuzu/configuration/configure_per_game.ui b/src/yuzu/configuration/configure_per_game.ui
new file mode 100644
index 000000000..d2057c4ab
--- /dev/null
+++ b/src/yuzu/configuration/configure_per_game.ui
@@ -0,0 +1,350 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<ui version="4.0">
+ <class>ConfigurePerGame</class>
+ <widget class="QDialog" name="ConfigurePerGame">
+  <property name="geometry">
+   <rect>
+    <x>0</x>
+    <y>0</y>
+    <width>800</width>
+    <height>600</height>
+   </rect>
+  </property>
+  <property name="windowTitle">
+   <string>Dialog</string>
+  </property>
+  <layout class="QVBoxLayout" name="verticalLayout_3">
+   <item>
+    <layout class="QHBoxLayout" name="horizontalLayout">
+     <item>
+      <widget class="QGroupBox" name="groupBox">
+       <property name="sizePolicy">
+        <sizepolicy hsizetype="Maximum" vsizetype="Preferred">
+         <horstretch>0</horstretch>
+         <verstretch>0</verstretch>
+        </sizepolicy>
+       </property>
+       <property name="title">
+        <string>Info</string>
+       </property>
+       <layout class="QVBoxLayout" name="verticalLayout">
+        <item alignment="Qt::AlignHCenter">
+         <widget class="QGraphicsView" name="icon_view">
+          <property name="sizePolicy">
+           <sizepolicy hsizetype="Maximum" vsizetype="Maximum">
+            <horstretch>0</horstretch>
+            <verstretch>0</verstretch>
+           </sizepolicy>
+          </property>
+          <property name="minimumSize">
+           <size>
+            <width>256</width>
+            <height>256</height>
+           </size>
+          </property>
+          <property name="maximumSize">
+           <size>
+            <width>256</width>
+            <height>256</height>
+           </size>
+          </property>
+          <property name="verticalScrollBarPolicy">
+           <enum>Qt::ScrollBarAlwaysOff</enum>
+          </property>
+          <property name="horizontalScrollBarPolicy">
+           <enum>Qt::ScrollBarAlwaysOff</enum>
+          </property>
+          <property name="interactive">
+           <bool>false</bool>
+          </property>
+         </widget>
+        </item>
+        <item>
+         <layout class="QGridLayout" name="gridLayout_2">
+          <item row="6" column="1">
+           <widget class="QLineEdit" name="display_size">
+            <property name="enabled">
+             <bool>true</bool>
+            </property>
+            <property name="readOnly">
+             <bool>true</bool>
+            </property>
+           </widget>
+          </item>
+          <item row="3" column="1">
+           <widget class="QLineEdit" name="display_version">
+            <property name="enabled">
+             <bool>true</bool>
+            </property>
+            <property name="readOnly">
+             <bool>true</bool>
+            </property>
+           </widget>
+          </item>
+          <item row="1" column="0">
+           <widget class="QLabel" name="label">
+            <property name="text">
+             <string>Name</string>
+            </property>
+           </widget>
+          </item>
+          <item row="4" column="0">
+           <widget class="QLabel" name="label_4">
+            <property name="text">
+             <string>Title ID</string>
+            </property>
+           </widget>
+          </item>
+          <item row="4" column="1">
+           <widget class="QLineEdit" name="display_title_id">
+            <property name="enabled">
+             <bool>true</bool>
+            </property>
+            <property name="readOnly">
+             <bool>true</bool>
+            </property>
+           </widget>
+          </item>
+          <item row="7" column="1">
+           <widget class="QLineEdit" name="display_filename">
+            <property name="enabled">
+             <bool>true</bool>
+            </property>
+            <property name="readOnly">
+             <bool>true</bool>
+            </property>
+           </widget>
+          </item>
+          <item row="5" column="1">
+           <widget class="QLineEdit" name="display_format">
+            <property name="enabled">
+             <bool>true</bool>
+            </property>
+            <property name="readOnly">
+             <bool>true</bool>
+            </property>
+           </widget>
+          </item>
+          <item row="7" column="0">
+           <widget class="QLabel" name="label_7">
+            <property name="text">
+             <string>Filename</string>
+            </property>
+           </widget>
+          </item>
+          <item row="1" column="1">
+           <widget class="QLineEdit" name="display_name">
+            <property name="enabled">
+             <bool>true</bool>
+            </property>
+            <property name="readOnly">
+             <bool>true</bool>
+            </property>
+           </widget>
+          </item>
+          <item row="2" column="1">
+           <widget class="QLineEdit" name="display_developer">
+            <property name="enabled">
+             <bool>true</bool>
+            </property>
+            <property name="readOnly">
+             <bool>true</bool>
+            </property>
+           </widget>
+          </item>
+          <item row="5" column="0">
+           <widget class="QLabel" name="label_5">
+            <property name="text">
+             <string>Format</string>
+            </property>
+           </widget>
+          </item>
+          <item row="3" column="0">
+           <widget class="QLabel" name="label_3">
+            <property name="text">
+             <string>Version</string>
+            </property>
+           </widget>
+          </item>
+          <item row="6" column="0">
+           <widget class="QLabel" name="label_6">
+            <property name="text">
+             <string>Size</string>
+            </property>
+           </widget>
+          </item>
+          <item row="2" column="0">
+           <widget class="QLabel" name="label_2">
+            <property name="text">
+             <string>Developer</string>
+            </property>
+           </widget>
+          </item>
+         </layout>
+        </item>
+        <item>
+         <spacer name="verticalSpacer">
+          <property name="orientation">
+           <enum>Qt::Vertical</enum>
+          </property>
+          <property name="sizeHint" stdset="0">
+           <size>
+            <width>20</width>
+            <height>40</height>
+           </size>
+          </property>
+         </spacer>
+        </item>
+       </layout>
+      </widget>
+     </item>
+     <item>
+      <layout class="QVBoxLayout" name="VerticalLayout">
+       <item>
+        <layout class="QVBoxLayout" name="verticalLayout_2"/>
+       </item>
+       <item>
+        <widget class="QTabWidget" name="tabWidget">
+         <property name="enabled">
+          <bool>true</bool>
+         </property>
+         <property name="currentIndex">
+          <number>0</number>
+         </property>
+         <property name="usesScrollButtons">
+          <bool>true</bool>
+         </property>
+         <property name="documentMode">
+          <bool>false</bool>
+         </property>
+         <property name="tabsClosable">
+          <bool>false</bool>
+         </property>
+         <widget class="ConfigurePerGameAddons" name="addonsTab">
+          <attribute name="title">
+           <string>Add-Ons</string>
+          </attribute>
+         </widget>
+         <widget class="ConfigureGeneral" name="generalTab">
+          <attribute name="title">
+           <string>General</string>
+          </attribute>
+         </widget>
+         <widget class="ConfigureSystem" name="systemTab">
+          <attribute name="title">
+           <string>System</string>
+          </attribute>
+         </widget>
+         <widget class="ConfigureGraphics" name="graphicsTab">
+          <attribute name="title">
+           <string>Graphics</string>
+          </attribute>
+         </widget>
+         <widget class="ConfigureGraphicsAdvanced" name="graphicsAdvancedTab">
+          <attribute name="title">
+           <string>Adv. Graphics</string>
+          </attribute>
+         </widget>
+         <widget class="ConfigureAudio" name="audioTab">
+          <attribute name="title">
+           <string>Audio</string>
+          </attribute>
+         </widget>
+        </widget>
+       </item>
+      </layout>
+     </item>
+    </layout>
+   </item>
+   <item>
+    <widget class="QDialogButtonBox" name="buttonBox">
+     <property name="sizePolicy">
+      <sizepolicy hsizetype="Preferred" vsizetype="Preferred">
+       <horstretch>0</horstretch>
+       <verstretch>0</verstretch>
+      </sizepolicy>
+     </property>
+     <property name="orientation">
+      <enum>Qt::Horizontal</enum>
+     </property>
+     <property name="standardButtons">
+      <set>QDialogButtonBox::Cancel|QDialogButtonBox::Ok</set>
+     </property>
+    </widget>
+   </item>
+  </layout>
+ </widget>
+ <customwidgets>
+  <customwidget>
+   <class>ConfigureGeneral</class>
+   <extends>QWidget</extends>
+   <header>configuration/configure_general.h</header>
+   <container>1</container>
+  </customwidget>
+  <customwidget>
+   <class>ConfigureSystem</class>
+   <extends>QWidget</extends>
+   <header>configuration/configure_system.h</header>
+   <container>1</container>
+  </customwidget>
+  <customwidget>
+   <class>ConfigureAudio</class>
+   <extends>QWidget</extends>
+   <header>configuration/configure_audio.h</header>
+   <container>1</container>
+  </customwidget>
+  <customwidget>
+   <class>ConfigureGraphics</class>
+   <extends>QWidget</extends>
+   <header>configuration/configure_graphics.h</header>
+   <container>1</container>
+  </customwidget>
+  <customwidget>
+   <class>ConfigureGraphicsAdvanced</class>
+   <extends>QWidget</extends>
+   <header>configuration/configure_graphics_advanced.h</header>
+   <container>1</container>
+  </customwidget>
+  <customwidget>
+   <class>ConfigurePerGameAddons</class>
+   <extends>QWidget</extends>
+   <header>configuration/configure_per_game_addons.h</header>
+   <container>1</container>
+  </customwidget>
+ </customwidgets>
+ <resources/>
+ <connections>
+  <connection>
+   <sender>buttonBox</sender>
+   <signal>accepted()</signal>
+   <receiver>ConfigurePerGame</receiver>
+   <slot>accept()</slot>
+   <hints>
+    <hint type="sourcelabel">
+     <x>248</x>
+     <y>254</y>
+    </hint>
+    <hint type="destinationlabel">
+     <x>157</x>
+     <y>274</y>
+    </hint>
+   </hints>
+  </connection>
+  <connection>
+   <sender>buttonBox</sender>
+   <signal>rejected()</signal>
+   <receiver>ConfigurePerGame</receiver>
+   <slot>reject()</slot>
+   <hints>
+    <hint type="sourcelabel">
+     <x>316</x>
+     <y>260</y>
+    </hint>
+    <hint type="destinationlabel">
+     <x>286</x>
+     <y>274</y>
+    </hint>
+   </hints>
+  </connection>
+ </connections>
+</ui>
diff --git a/src/yuzu/configuration/configure_per_general.cpp b/src/yuzu/configuration/configure_per_game_addons.cpp
index d7f259f12..478d5d3a1 100644
--- a/src/yuzu/configuration/configure_per_general.cpp
+++ b/src/yuzu/configuration/configure_per_game_addons.cpp
@@ -15,23 +15,20 @@
 
 #include "common/common_paths.h"
 #include "common/file_util.h"
-#include "core/file_sys/control_metadata.h"
+#include "core/core.h"
 #include "core/file_sys/patch_manager.h"
 #include "core/file_sys/xts_archive.h"
 #include "core/loader/loader.h"
-#include "ui_configure_per_general.h"
+#include "ui_configure_per_game_addons.h"
 #include "yuzu/configuration/config.h"
 #include "yuzu/configuration/configure_input.h"
-#include "yuzu/configuration/configure_per_general.h"
+#include "yuzu/configuration/configure_per_game_addons.h"
 #include "yuzu/uisettings.h"
 #include "yuzu/util/util.h"
 
-ConfigurePerGameGeneral::ConfigurePerGameGeneral(QWidget* parent, u64 title_id)
-    : QDialog(parent), ui(std::make_unique<Ui::ConfigurePerGameGeneral>()), title_id(title_id) {
-
+ConfigurePerGameAddons::ConfigurePerGameAddons(QWidget* parent)
+    : QWidget(parent), ui(new Ui::ConfigurePerGameAddons) {
     ui->setupUi(this);
-    setFocusPolicy(Qt::ClickFocus);
-    setWindowTitle(tr("Properties"));
 
     layout = new QVBoxLayout;
     tree_view = new QTreeView;
@@ -52,7 +49,7 @@ ConfigurePerGameGeneral::ConfigurePerGameGeneral(QWidget* parent, u64 title_id)
     item_model->setHeaderData(1, Qt::Horizontal, tr("Version"));
 
     // We must register all custom types with the Qt Automoc system so that we are able to use it
-    // with signals/slots. In this case, QList falls under the umbrells of custom types.
+    // with signals/slots. In this case, QList falls under the umbrella of custom types.
     qRegisterMetaType<QList<QStandardItem*>>("QList<QStandardItem*>");
 
     layout->setContentsMargins(0, 0, 0, 0);
@@ -61,18 +58,15 @@ ConfigurePerGameGeneral::ConfigurePerGameGeneral(QWidget* parent, u64 title_id)
 
     ui->scrollArea->setLayout(layout);
 
-    scene = new QGraphicsScene;
-    ui->icon_view->setScene(scene);
+    ui->scrollArea->setEnabled(!Core::System::GetInstance().IsPoweredOn());
 
     connect(item_model, &QStandardItemModel::itemChanged,
             [] { UISettings::values.is_game_list_reload_pending.exchange(true); });
-
-    LoadConfiguration();
 }
 
-ConfigurePerGameGeneral::~ConfigurePerGameGeneral() = default;
+ConfigurePerGameAddons::~ConfigurePerGameAddons() = default;
 
-void ConfigurePerGameGeneral::ApplyConfiguration() {
+void ConfigurePerGameAddons::ApplyConfiguration() {
     std::vector<std::string> disabled_addons;
 
     for (const auto& item : list_items) {
@@ -92,72 +86,35 @@ void ConfigurePerGameGeneral::ApplyConfiguration() {
     Settings::values.disabled_addons[title_id] = disabled_addons;
 }
 
-void ConfigurePerGameGeneral::changeEvent(QEvent* event) {
+void ConfigurePerGameAddons::LoadFromFile(FileSys::VirtualFile file) {
+    this->file = std::move(file);
+    LoadConfiguration();
+}
+
+void ConfigurePerGameAddons::SetTitleId(u64 id) {
+    this->title_id = id;
+}
+
+void ConfigurePerGameAddons::changeEvent(QEvent* event) {
     if (event->type() == QEvent::LanguageChange) {
         RetranslateUI();
     }
 
-    QDialog::changeEvent(event);
+    QWidget::changeEvent(event);
 }
 
-void ConfigurePerGameGeneral::RetranslateUI() {
+void ConfigurePerGameAddons::RetranslateUI() {
     ui->retranslateUi(this);
 }
 
-void ConfigurePerGameGeneral::LoadFromFile(FileSys::VirtualFile file) {
-    this->file = std::move(file);
-    LoadConfiguration();
-}
-
-void ConfigurePerGameGeneral::LoadConfiguration() {
+void ConfigurePerGameAddons::LoadConfiguration() {
     if (file == nullptr) {
         return;
     }
 
-    ui->display_title_id->setText(QString::fromStdString(fmt::format("{:016X}", title_id)));
-
     FileSys::PatchManager pm{title_id};
-    const auto control = pm.GetControlMetadata();
     const auto loader = Loader::GetLoader(file);
 
-    if (control.first != nullptr) {
-        ui->display_version->setText(QString::fromStdString(control.first->GetVersionString()));
-        ui->display_name->setText(QString::fromStdString(control.first->GetApplicationName()));
-        ui->display_developer->setText(QString::fromStdString(control.first->GetDeveloperName()));
-    } else {
-        std::string title;
-        if (loader->ReadTitle(title) == Loader::ResultStatus::Success)
-            ui->display_name->setText(QString::fromStdString(title));
-
-        FileSys::NACP nacp;
-        if (loader->ReadControlData(nacp) == Loader::ResultStatus::Success)
-            ui->display_developer->setText(QString::fromStdString(nacp.GetDeveloperName()));
-
-        ui->display_version->setText(QStringLiteral("1.0.0"));
-    }
-
-    if (control.second != nullptr) {
-        scene->clear();
-
-        QPixmap map;
-        const auto bytes = control.second->ReadAllBytes();
-        map.loadFromData(bytes.data(), static_cast<u32>(bytes.size()));
-
-        scene->addPixmap(map.scaled(ui->icon_view->width(), ui->icon_view->height(),
-                                    Qt::IgnoreAspectRatio, Qt::SmoothTransformation));
-    } else {
-        std::vector<u8> bytes;
-        if (loader->ReadIcon(bytes) == Loader::ResultStatus::Success) {
-            scene->clear();
-
-            QPixmap map;
-            map.loadFromData(bytes.data(), static_cast<u32>(bytes.size()));
-
-            scene->addPixmap(map.scaled(ui->icon_view->width(), ui->icon_view->height(),
-                                        Qt::IgnoreAspectRatio, Qt::SmoothTransformation));
-        }
-    }
-
     FileSys::VirtualFile update_raw;
     loader->ReadUpdateRaw(update_raw);
 
@@ -182,12 +139,4 @@ void ConfigurePerGameGeneral::LoadConfiguration() {
     }
 
     tree_view->setColumnWidth(0, 5 * tree_view->width() / 16);
-
-    ui->display_filename->setText(QString::fromStdString(file->GetName()));
-
-    ui->display_format->setText(
-        QString::fromStdString(Loader::GetFileTypeString(loader->GetFileType())));
-
-    const auto valueText = ReadableByteSize(file->GetSize());
-    ui->display_size->setText(valueText);
 }
diff --git a/src/yuzu/configuration/configure_per_general.h b/src/yuzu/configuration/configure_per_game_addons.h
index a3b2cdeff..a00ec3539 100644
--- a/src/yuzu/configuration/configure_per_general.h
+++ b/src/yuzu/configuration/configure_per_game_addons.h
@@ -1,4 +1,4 @@
-// Copyright 2016 Citra Emulator Project
+// Copyright 2016 Citra Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
@@ -7,7 +7,6 @@
 #include <memory>
 #include <vector>
 
-#include <QDialog>
 #include <QList>
 
 #include "core/file_sys/vfs_types.h"
@@ -19,35 +18,36 @@ class QTreeView;
 class QVBoxLayout;
 
 namespace Ui {
-class ConfigurePerGameGeneral;
+class ConfigurePerGameAddons;
 }
 
-class ConfigurePerGameGeneral : public QDialog {
+class ConfigurePerGameAddons : public QWidget {
     Q_OBJECT
 
 public:
-    explicit ConfigurePerGameGeneral(QWidget* parent, u64 title_id);
-    ~ConfigurePerGameGeneral() override;
+    explicit ConfigurePerGameAddons(QWidget* parent = nullptr);
+    ~ConfigurePerGameAddons() override;
 
     /// Save all button configurations to settings file
     void ApplyConfiguration();
 
     void LoadFromFile(FileSys::VirtualFile file);
 
+    void SetTitleId(u64 id);
+
 private:
     void changeEvent(QEvent* event) override;
     void RetranslateUI();
 
     void LoadConfiguration();
 
-    std::unique_ptr<Ui::ConfigurePerGameGeneral> ui;
+    std::unique_ptr<Ui::ConfigurePerGameAddons> ui;
     FileSys::VirtualFile file;
     u64 title_id;
 
     QVBoxLayout* layout;
     QTreeView* tree_view;
     QStandardItemModel* item_model;
-    QGraphicsScene* scene;
 
     std::vector<QList<QStandardItem*>> list_items;
 };
diff --git a/src/yuzu/configuration/configure_per_game_addons.ui b/src/yuzu/configuration/configure_per_game_addons.ui
new file mode 100644
index 000000000..aefdebfcd
--- /dev/null
+++ b/src/yuzu/configuration/configure_per_game_addons.ui
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<ui version="4.0">
+ <class>ConfigurePerGameAddons</class>
+ <widget class="QWidget" name="ConfigurePerGameAddons">
+  <property name="geometry">
+   <rect>
+    <x>0</x>
+    <y>0</y>
+    <width>400</width>
+    <height>300</height>
+   </rect>
+  </property>
+  <property name="windowTitle">
+   <string>Form</string>
+  </property>
+  <layout class="QGridLayout" name="gridLayout">
+   <item row="0" column="0">
+    <widget class="QScrollArea" name="scrollArea">
+     <property name="widgetResizable">
+      <bool>true</bool>
+     </property>
+     <widget class="QWidget" name="scrollAreaWidgetContents">
+      <property name="geometry">
+       <rect>
+        <x>0</x>
+        <y>0</y>
+        <width>380</width>
+        <height>280</height>
+       </rect>
+      </property>
+     </widget>
+    </widget>
+   </item>
+  </layout>
+ </widget>
+ <resources/>
+ <connections/>
+</ui>
diff --git a/src/yuzu/configuration/configure_per_general.ui b/src/yuzu/configuration/configure_per_general.ui
deleted file mode 100644
index 8fdd96fa4..000000000
--- a/src/yuzu/configuration/configure_per_general.ui
+++ /dev/null
@@ -1,276 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<ui version="4.0">
- <class>ConfigurePerGameGeneral</class>
- <widget class="QDialog" name="ConfigurePerGameGeneral">
-  <property name="geometry">
-   <rect>
-    <x>0</x>
-    <y>0</y>
-    <width>400</width>
-    <height>520</height>
-   </rect>
-  </property>
-  <property name="windowTitle">
-   <string>ConfigurePerGameGeneral</string>
-  </property>
-  <layout class="QHBoxLayout" name="HorizontalLayout">
-   <item>
-    <layout class="QVBoxLayout" name="VerticalLayout">
-     <item>
-      <widget class="QGroupBox" name="GeneralGroupBox">
-       <property name="title">
-        <string>Info</string>
-       </property>
-       <layout class="QHBoxLayout" name="GeneralHorizontalLayout">
-        <item>
-         <layout class="QGridLayout" name="gridLayout_2">
-          <item row="6" column="1" colspan="2">
-           <widget class="QLineEdit" name="display_filename">
-            <property name="enabled">
-             <bool>true</bool>
-            </property>
-            <property name="readOnly">
-             <bool>true</bool>
-            </property>
-           </widget>
-          </item>
-          <item row="0" column="1">
-           <widget class="QLineEdit" name="display_name">
-            <property name="enabled">
-             <bool>true</bool>
-            </property>
-            <property name="readOnly">
-             <bool>true</bool>
-            </property>
-           </widget>
-          </item>
-          <item row="1" column="0">
-           <widget class="QLabel" name="label_2">
-            <property name="text">
-             <string>Developer</string>
-            </property>
-           </widget>
-          </item>
-          <item row="5" column="1" colspan="2">
-           <widget class="QLineEdit" name="display_size">
-            <property name="enabled">
-             <bool>true</bool>
-            </property>
-            <property name="readOnly">
-             <bool>true</bool>
-            </property>
-           </widget>
-          </item>
-          <item row="0" column="0">
-           <widget class="QLabel" name="label">
-            <property name="text">
-             <string>Name</string>
-            </property>
-           </widget>
-          </item>
-          <item row="6" column="0">
-           <widget class="QLabel" name="label_7">
-            <property name="text">
-             <string>Filename</string>
-            </property>
-           </widget>
-          </item>
-          <item row="2" column="0">
-           <widget class="QLabel" name="label_3">
-            <property name="text">
-             <string>Version</string>
-            </property>
-           </widget>
-          </item>
-          <item row="4" column="0">
-           <widget class="QLabel" name="label_5">
-            <property name="text">
-             <string>Format</string>
-            </property>
-           </widget>
-          </item>
-          <item row="2" column="1">
-           <widget class="QLineEdit" name="display_version">
-            <property name="enabled">
-             <bool>true</bool>
-            </property>
-            <property name="readOnly">
-             <bool>true</bool>
-            </property>
-           </widget>
-          </item>
-          <item row="4" column="1">
-           <widget class="QLineEdit" name="display_format">
-            <property name="enabled">
-             <bool>true</bool>
-            </property>
-            <property name="readOnly">
-             <bool>true</bool>
-            </property>
-           </widget>
-          </item>
-          <item row="5" column="0">
-           <widget class="QLabel" name="label_6">
-            <property name="text">
-             <string>Size</string>
-            </property>
-           </widget>
-          </item>
-          <item row="1" column="1">
-           <widget class="QLineEdit" name="display_developer">
-            <property name="enabled">
-             <bool>true</bool>
-            </property>
-            <property name="readOnly">
-             <bool>true</bool>
-            </property>
-           </widget>
-          </item>
-          <item row="3" column="0">
-           <widget class="QLabel" name="label_4">
-            <property name="text">
-             <string>Title ID</string>
-            </property>
-           </widget>
-          </item>
-          <item row="3" column="1">
-           <widget class="QLineEdit" name="display_title_id">
-            <property name="enabled">
-             <bool>true</bool>
-            </property>
-            <property name="readOnly">
-             <bool>true</bool>
-            </property>
-           </widget>
-          </item>
-          <item row="0" column="2" rowspan="5">
-           <widget class="QGraphicsView" name="icon_view">
-            <property name="sizePolicy">
-             <sizepolicy hsizetype="Maximum" vsizetype="Maximum">
-              <horstretch>0</horstretch>
-              <verstretch>0</verstretch>
-             </sizepolicy>
-            </property>
-            <property name="minimumSize">
-             <size>
-              <width>128</width>
-              <height>128</height>
-             </size>
-            </property>
-            <property name="maximumSize">
-             <size>
-              <width>128</width>
-              <height>128</height>
-             </size>
-            </property>
-            <property name="verticalScrollBarPolicy">
-             <enum>Qt::ScrollBarAlwaysOff</enum>
-            </property>
-            <property name="horizontalScrollBarPolicy">
-             <enum>Qt::ScrollBarAlwaysOff</enum>
-            </property>
-            <property name="sizeAdjustPolicy">
-             <enum>QAbstractScrollArea::AdjustToContents</enum>
-            </property>
-            <property name="interactive">
-             <bool>false</bool>
-            </property>
-           </widget>
-          </item>
-         </layout>
-        </item>
-       </layout>
-      </widget>
-     </item>
-     <item>
-      <widget class="QGroupBox" name="PerformanceGroupBox">
-       <property name="title">
-        <string>Add-Ons</string>
-       </property>
-       <layout class="QHBoxLayout" name="PerformanceHorizontalLayout">
-        <item>
-         <widget class="QScrollArea" name="scrollArea">
-          <property name="widgetResizable">
-           <bool>true</bool>
-          </property>
-          <widget class="QWidget" name="scrollAreaWidgetContents">
-           <property name="geometry">
-            <rect>
-             <x>0</x>
-             <y>0</y>
-             <width>350</width>
-             <height>169</height>
-            </rect>
-           </property>
-          </widget>
-         </widget>
-        </item>
-        <item>
-         <layout class="QVBoxLayout" name="PerformanceVerticalLayout"/>
-        </item>
-       </layout>
-      </widget>
-     </item>
-     <item>
-      <spacer name="verticalSpacer">
-       <property name="orientation">
-        <enum>Qt::Vertical</enum>
-       </property>
-       <property name="sizeType">
-        <enum>QSizePolicy::Fixed</enum>
-       </property>
-       <property name="sizeHint" stdset="0">
-        <size>
-         <width>20</width>
-         <height>40</height>
-        </size>
-       </property>
-      </spacer>
-     </item>
-     <item>
-      <widget class="QDialogButtonBox" name="buttonBox">
-       <property name="standardButtons">
-        <set>QDialogButtonBox::Cancel|QDialogButtonBox::Ok</set>
-       </property>
-      </widget>
-     </item>
-    </layout>
-   </item>
-  </layout>
- </widget>
- <resources/>
- <connections>
-  <connection>
-   <sender>buttonBox</sender>
-   <signal>accepted()</signal>
-   <receiver>ConfigurePerGameGeneral</receiver>
-   <slot>accept()</slot>
-   <hints>
-    <hint type="sourcelabel">
-     <x>269</x>
-     <y>567</y>
-    </hint>
-    <hint type="destinationlabel">
-     <x>269</x>
-     <y>294</y>
-    </hint>
-   </hints>
-  </connection>
-  <connection>
-   <sender>buttonBox</sender>
-   <signal>rejected()</signal>
-   <receiver>ConfigurePerGameGeneral</receiver>
-   <slot>reject()</slot>
-   <hints>
-    <hint type="sourcelabel">
-     <x>269</x>
-     <y>567</y>
-    </hint>
-    <hint type="destinationlabel">
-     <x>269</x>
-     <y>294</y>
-    </hint>
-   </hints>
-  </connection>
- </connections>
-</ui>
diff --git a/src/yuzu/configuration/configure_service.cpp b/src/yuzu/configuration/configure_service.cpp
index 06566e981..0de7a4f0b 100644
--- a/src/yuzu/configuration/configure_service.cpp
+++ b/src/yuzu/configuration/configure_service.cpp
@@ -68,6 +68,7 @@ void ConfigureService::SetConfiguration() {
 }
 
 std::pair<QString, QString> ConfigureService::BCATDownloadEvents() {
+#ifdef YUZU_ENABLE_BOXCAT
     std::optional<std::string> global;
     std::map<std::string, Service::BCAT::EventStatus> map;
     const auto res = Service::BCAT::Boxcat::GetStatus(global, map);
@@ -105,7 +106,10 @@ std::pair<QString, QString> ConfigureService::BCATDownloadEvents() {
                    .arg(QString::fromStdString(key))
                    .arg(FormatEventStatusString(value));
     }
-    return {QStringLiteral("Current Boxcat Events"), std::move(out)};
+    return {tr("Current Boxcat Events"), std::move(out)};
+#else
+    return {tr("Current Boxcat Events"), tr("There are currently no events on boxcat.")};
+#endif
 }
 
 void ConfigureService::OnBCATImplChanged() {
diff --git a/src/yuzu/configuration/configure_system.cpp b/src/yuzu/configuration/configure_system.cpp
index 10315e7a6..68e02738b 100644
--- a/src/yuzu/configuration/configure_system.cpp
+++ b/src/yuzu/configuration/configure_system.cpp
@@ -14,6 +14,7 @@
 #include "core/core.h"
 #include "core/settings.h"
 #include "ui_configure_system.h"
+#include "yuzu/configuration/configuration_shared.h"
 #include "yuzu/configuration/configure_system.h"
 
 ConfigureSystem::ConfigureSystem(QWidget* parent) : QWidget(parent), ui(new Ui::ConfigureSystem) {
@@ -21,20 +22,25 @@ ConfigureSystem::ConfigureSystem(QWidget* parent) : QWidget(parent), ui(new Ui::
     connect(ui->button_regenerate_console_id, &QPushButton::clicked, this,
             &ConfigureSystem::RefreshConsoleID);
 
-    connect(ui->rng_seed_checkbox, &QCheckBox::stateChanged, this, [this](bool checked) {
-        ui->rng_seed_edit->setEnabled(checked);
-        if (!checked) {
+    connect(ui->rng_seed_checkbox, &QCheckBox::stateChanged, this, [this](int state) {
+        ui->rng_seed_edit->setEnabled(state == Qt::Checked);
+        if (state != Qt::Checked) {
             ui->rng_seed_edit->setText(QStringLiteral("00000000"));
         }
     });
 
-    connect(ui->custom_rtc_checkbox, &QCheckBox::stateChanged, this, [this](bool checked) {
-        ui->custom_rtc_edit->setEnabled(checked);
-        if (!checked) {
+    connect(ui->custom_rtc_checkbox, &QCheckBox::stateChanged, this, [this](int state) {
+        ui->custom_rtc_edit->setEnabled(state == Qt::Checked);
+        if (state != Qt::Checked) {
             ui->custom_rtc_edit->setDateTime(QDateTime::currentDateTime());
         }
     });
 
+    ui->label_console_id->setVisible(Settings::configuring_global);
+    ui->button_regenerate_console_id->setVisible(Settings::configuring_global);
+
+    SetupPerGameUI();
+
     SetConfiguration();
 }
 
@@ -54,26 +60,58 @@ void ConfigureSystem::RetranslateUI() {
 
 void ConfigureSystem::SetConfiguration() {
     enabled = !Core::System::GetInstance().IsPoweredOn();
+    const auto rng_seed =
+        QStringLiteral("%1")
+            .arg(Settings::values.rng_seed.GetValue().value_or(0), 8, 16, QLatin1Char{'0'})
+            .toUpper();
+    const auto rtc_time = Settings::values.custom_rtc.GetValue().value_or(
+        std::chrono::seconds(QDateTime::currentSecsSinceEpoch()));
 
-    ui->combo_language->setCurrentIndex(Settings::values.language_index);
-    ui->combo_region->setCurrentIndex(Settings::values.region_index);
-    ui->combo_time_zone->setCurrentIndex(Settings::values.time_zone_index);
-    ui->combo_sound->setCurrentIndex(Settings::values.sound_index);
-
-    ui->rng_seed_checkbox->setChecked(Settings::values.rng_seed.has_value());
-    ui->rng_seed_edit->setEnabled(Settings::values.rng_seed.has_value());
-
-    const auto rng_seed = QStringLiteral("%1")
-                              .arg(Settings::values.rng_seed.value_or(0), 8, 16, QLatin1Char{'0'})
-                              .toUpper();
-    ui->rng_seed_edit->setText(rng_seed);
-
-    ui->custom_rtc_checkbox->setChecked(Settings::values.custom_rtc.has_value());
-    ui->custom_rtc_edit->setEnabled(Settings::values.custom_rtc.has_value());
+    if (Settings::configuring_global) {
+        ui->combo_language->setCurrentIndex(Settings::values.language_index.GetValue());
+        ui->combo_region->setCurrentIndex(Settings::values.region_index.GetValue());
+        ui->combo_time_zone->setCurrentIndex(Settings::values.time_zone_index.GetValue());
+        ui->combo_sound->setCurrentIndex(Settings::values.sound_index.GetValue());
+
+        ui->rng_seed_checkbox->setChecked(Settings::values.rng_seed.GetValue().has_value());
+        ui->rng_seed_edit->setEnabled(Settings::values.rng_seed.GetValue().has_value() &&
+                                      Settings::values.rng_seed.UsingGlobal());
+        ui->rng_seed_edit->setText(rng_seed);
+
+        ui->custom_rtc_checkbox->setChecked(Settings::values.custom_rtc.GetValue().has_value());
+        ui->custom_rtc_edit->setEnabled(Settings::values.custom_rtc.GetValue().has_value() &&
+                                        Settings::values.custom_rtc.UsingGlobal());
+        ui->custom_rtc_edit->setDateTime(QDateTime::fromSecsSinceEpoch(rtc_time.count()));
+    } else {
+        ConfigurationShared::SetPerGameSetting(ui->combo_language,
+                                               &Settings::values.language_index);
+        ConfigurationShared::SetPerGameSetting(ui->combo_region, &Settings::values.region_index);
+        ConfigurationShared::SetPerGameSetting(ui->combo_time_zone,
+                                               &Settings::values.time_zone_index);
+        ConfigurationShared::SetPerGameSetting(ui->combo_sound, &Settings::values.sound_index);
+
+        if (Settings::values.rng_seed.UsingGlobal()) {
+            ui->rng_seed_checkbox->setCheckState(Qt::PartiallyChecked);
+        } else {
+            ui->rng_seed_checkbox->setCheckState(
+                Settings::values.rng_seed.GetValue().has_value() ? Qt::Checked : Qt::Unchecked);
+            ui->rng_seed_edit->setEnabled(Settings::values.rng_seed.GetValue().has_value());
+            if (Settings::values.rng_seed.GetValue().has_value()) {
+                ui->rng_seed_edit->setText(rng_seed);
+            }
+        }
 
-    const auto rtc_time = Settings::values.custom_rtc.value_or(
-        std::chrono::seconds(QDateTime::currentSecsSinceEpoch()));
-    ui->custom_rtc_edit->setDateTime(QDateTime::fromSecsSinceEpoch(rtc_time.count()));
+        if (Settings::values.custom_rtc.UsingGlobal()) {
+            ui->custom_rtc_checkbox->setCheckState(Qt::PartiallyChecked);
+        } else {
+            ui->custom_rtc_checkbox->setCheckState(
+                Settings::values.custom_rtc.GetValue().has_value() ? Qt::Checked : Qt::Unchecked);
+            ui->custom_rtc_edit->setEnabled(Settings::values.custom_rtc.GetValue().has_value());
+            if (Settings::values.custom_rtc.GetValue().has_value()) {
+                ui->custom_rtc_edit->setDateTime(QDateTime::fromSecsSinceEpoch(rtc_time.count()));
+            }
+        }
+    }
 }
 
 void ConfigureSystem::ReadSystemSettings() {}
@@ -83,22 +121,78 @@ void ConfigureSystem::ApplyConfiguration() {
         return;
     }
 
-    Settings::values.language_index = ui->combo_language->currentIndex();
-    Settings::values.region_index = ui->combo_region->currentIndex();
-    Settings::values.time_zone_index = ui->combo_time_zone->currentIndex();
-    Settings::values.sound_index = ui->combo_sound->currentIndex();
+    if (Settings::configuring_global) {
+        // Guard if during game and set to game-specific value
+        if (Settings::values.language_index.UsingGlobal()) {
+            Settings::values.language_index.SetValue(ui->combo_language->currentIndex());
+        }
+        if (Settings::values.region_index.UsingGlobal()) {
+            Settings::values.region_index.SetValue(ui->combo_region->currentIndex());
+        }
+        if (Settings::values.time_zone_index.UsingGlobal()) {
+            Settings::values.time_zone_index.SetValue(ui->combo_time_zone->currentIndex());
+        }
+        if (Settings::values.sound_index.UsingGlobal()) {
+            Settings::values.sound_index.SetValue(ui->combo_sound->currentIndex());
+        }
+
+        if (Settings::values.rng_seed.UsingGlobal()) {
+            if (ui->rng_seed_checkbox->isChecked()) {
+                Settings::values.rng_seed.SetValue(
+                    ui->rng_seed_edit->text().toULongLong(nullptr, 16));
+            } else {
+                Settings::values.rng_seed.SetValue(std::nullopt);
+            }
+        }
 
-    if (ui->rng_seed_checkbox->isChecked()) {
-        Settings::values.rng_seed = ui->rng_seed_edit->text().toULongLong(nullptr, 16);
+        if (Settings::values.custom_rtc.UsingGlobal()) {
+            if (ui->custom_rtc_checkbox->isChecked()) {
+                Settings::values.custom_rtc.SetValue(
+                    std::chrono::seconds(ui->custom_rtc_edit->dateTime().toSecsSinceEpoch()));
+            } else {
+                Settings::values.custom_rtc.SetValue(std::nullopt);
+            }
+        }
     } else {
-        Settings::values.rng_seed = std::nullopt;
-    }
+        ConfigurationShared::ApplyPerGameSetting(&Settings::values.language_index,
+                                                 ui->combo_language);
+        ConfigurationShared::ApplyPerGameSetting(&Settings::values.region_index, ui->combo_region);
+        ConfigurationShared::ApplyPerGameSetting(&Settings::values.time_zone_index,
+                                                 ui->combo_time_zone);
+        ConfigurationShared::ApplyPerGameSetting(&Settings::values.sound_index, ui->combo_sound);
+
+        switch (ui->rng_seed_checkbox->checkState()) {
+        case Qt::Checked:
+            Settings::values.rng_seed.SetGlobal(false);
+            Settings::values.rng_seed.SetValue(ui->rng_seed_edit->text().toULongLong(nullptr, 16));
+            break;
+        case Qt::Unchecked:
+            Settings::values.rng_seed.SetGlobal(false);
+            Settings::values.rng_seed.SetValue(std::nullopt);
+            break;
+        case Qt::PartiallyChecked:
+            Settings::values.rng_seed.SetGlobal(false);
+            Settings::values.rng_seed.SetValue(std::nullopt);
+            Settings::values.rng_seed.SetGlobal(true);
+            break;
+        }
 
-    if (ui->custom_rtc_checkbox->isChecked()) {
-        Settings::values.custom_rtc =
-            std::chrono::seconds(ui->custom_rtc_edit->dateTime().toSecsSinceEpoch());
-    } else {
-        Settings::values.custom_rtc = std::nullopt;
+        switch (ui->custom_rtc_checkbox->checkState()) {
+        case Qt::Checked:
+            Settings::values.custom_rtc.SetGlobal(false);
+            Settings::values.custom_rtc.SetValue(
+                std::chrono::seconds(ui->custom_rtc_edit->dateTime().toSecsSinceEpoch()));
+            break;
+        case Qt::Unchecked:
+            Settings::values.custom_rtc.SetGlobal(false);
+            Settings::values.custom_rtc.SetValue(std::nullopt);
+            break;
+        case Qt::PartiallyChecked:
+            Settings::values.custom_rtc.SetGlobal(false);
+            Settings::values.custom_rtc.SetValue(std::nullopt);
+            Settings::values.custom_rtc.SetGlobal(true);
+            break;
+        }
     }
 
     Settings::Apply();
@@ -120,3 +214,25 @@ void ConfigureSystem::RefreshConsoleID() {
     ui->label_console_id->setText(
         tr("Console ID: 0x%1").arg(QString::number(console_id, 16).toUpper()));
 }
+
+void ConfigureSystem::SetupPerGameUI() {
+    if (Settings::configuring_global) {
+        ui->combo_language->setEnabled(Settings::values.language_index.UsingGlobal());
+        ui->combo_region->setEnabled(Settings::values.region_index.UsingGlobal());
+        ui->combo_time_zone->setEnabled(Settings::values.time_zone_index.UsingGlobal());
+        ui->combo_sound->setEnabled(Settings::values.sound_index.UsingGlobal());
+        ui->rng_seed_checkbox->setEnabled(Settings::values.rng_seed.UsingGlobal());
+        ui->rng_seed_edit->setEnabled(Settings::values.rng_seed.UsingGlobal());
+        ui->custom_rtc_checkbox->setEnabled(Settings::values.custom_rtc.UsingGlobal());
+        ui->custom_rtc_edit->setEnabled(Settings::values.custom_rtc.UsingGlobal());
+
+        return;
+    }
+
+    ConfigurationShared::InsertGlobalItem(ui->combo_language);
+    ConfigurationShared::InsertGlobalItem(ui->combo_region);
+    ConfigurationShared::InsertGlobalItem(ui->combo_time_zone);
+    ConfigurationShared::InsertGlobalItem(ui->combo_sound);
+    ui->rng_seed_checkbox->setTristate(true);
+    ui->custom_rtc_checkbox->setTristate(true);
+}
diff --git a/src/yuzu/configuration/configure_system.h b/src/yuzu/configuration/configure_system.h
index 26d42d5c5..f317ef8b5 100644
--- a/src/yuzu/configuration/configure_system.h
+++ b/src/yuzu/configuration/configure_system.h
@@ -32,6 +32,8 @@ private:
 
     void RefreshConsoleID();
 
+    void SetupPerGameUI();
+
     std::unique_ptr<Ui::ConfigureSystem> ui;
     bool enabled = false;
 
diff --git a/src/yuzu/debugger/wait_tree.cpp b/src/yuzu/debugger/wait_tree.cpp
index c1ea25fb8..9bb0a0109 100644
--- a/src/yuzu/debugger/wait_tree.cpp
+++ b/src/yuzu/debugger/wait_tree.cpp
@@ -2,10 +2,13 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <fmt/format.h>
+
 #include "yuzu/debugger/wait_tree.h"
 #include "yuzu/util/util.h"
 
 #include "common/assert.h"
+#include "core/arm/arm_interface.h"
 #include "core/core.h"
 #include "core/hle/kernel/handle_table.h"
 #include "core/hle/kernel/mutex.h"
@@ -59,8 +62,10 @@ std::vector<std::unique_ptr<WaitTreeThread>> WaitTreeItem::MakeThreadItemList()
     std::size_t row = 0;
     auto add_threads = [&](const std::vector<std::shared_ptr<Kernel::Thread>>& threads) {
         for (std::size_t i = 0; i < threads.size(); ++i) {
-            item_list.push_back(std::make_unique<WaitTreeThread>(*threads[i]));
-            item_list.back()->row = row;
+            if (!threads[i]->IsHLEThread()) {
+                item_list.push_back(std::make_unique<WaitTreeThread>(*threads[i]));
+                item_list.back()->row = row;
+            }
             ++row;
         }
     };
@@ -114,20 +119,21 @@ QString WaitTreeCallstack::GetText() const {
 std::vector<std::unique_ptr<WaitTreeItem>> WaitTreeCallstack::GetChildren() const {
     std::vector<std::unique_ptr<WaitTreeItem>> list;
 
-    constexpr std::size_t BaseRegister = 29;
-    auto& memory = Core::System::GetInstance().Memory();
-    u64 base_pointer = thread.GetContext64().cpu_registers[BaseRegister];
+    if (thread.IsHLEThread()) {
+        return list;
+    }
 
-    while (base_pointer != 0) {
-        const u64 lr = memory.Read64(base_pointer + sizeof(u64));
-        if (lr == 0) {
-            break;
-        }
+    if (thread.GetOwnerProcess() == nullptr || !thread.GetOwnerProcess()->Is64BitProcess()) {
+        return list;
+    }
 
-        list.push_back(std::make_unique<WaitTreeText>(
-            tr("0x%1").arg(lr - sizeof(u32), 16, 16, QLatin1Char{'0'})));
+    auto backtrace = Core::ARM_Interface::GetBacktraceFromContext(Core::System::GetInstance(),
+                                                                  thread.GetContext64());
 
-        base_pointer = memory.Read64(base_pointer);
+    for (auto& entry : backtrace) {
+        std::string s = fmt::format("{:20}{:016X} {:016X} {:016X} {}", entry.module, entry.address,
+                                    entry.original_address, entry.offset, entry.name);
+        list.push_back(std::make_unique<WaitTreeText>(QString::fromStdString(s)));
     }
 
     return list;
@@ -206,7 +212,15 @@ QString WaitTreeThread::GetText() const {
         status = tr("running");
         break;
     case Kernel::ThreadStatus::Ready:
-        status = tr("ready");
+        if (!thread.IsPaused()) {
+            if (thread.WasRunning()) {
+                status = tr("running");
+            } else {
+                status = tr("ready");
+            }
+        } else {
+            status = tr("paused");
+        }
         break;
     case Kernel::ThreadStatus::Paused:
         status = tr("paused");
@@ -254,7 +268,15 @@ QColor WaitTreeThread::GetColor() const {
     case Kernel::ThreadStatus::Running:
         return QColor(Qt::GlobalColor::darkGreen);
     case Kernel::ThreadStatus::Ready:
-        return QColor(Qt::GlobalColor::darkBlue);
+        if (!thread.IsPaused()) {
+            if (thread.WasRunning()) {
+                return QColor(Qt::GlobalColor::darkGreen);
+            } else {
+                return QColor(Qt::GlobalColor::darkBlue);
+            }
+        } else {
+            return QColor(Qt::GlobalColor::lightGray);
+        }
     case Kernel::ThreadStatus::Paused:
         return QColor(Qt::GlobalColor::lightGray);
     case Kernel::ThreadStatus::WaitHLEEvent:
@@ -319,7 +341,7 @@ std::vector<std::unique_ptr<WaitTreeItem>> WaitTreeThread::GetChildren() const {
 
     if (thread.GetStatus() == Kernel::ThreadStatus::WaitSynch) {
         list.push_back(std::make_unique<WaitTreeObjectList>(thread.GetSynchronizationObjects(),
-                                                            thread.IsSleepingOnWait()));
+                                                            thread.IsWaitingSync()));
     }
 
     list.push_back(std::make_unique<WaitTreeCallstack>(thread));
diff --git a/src/yuzu/main.cpp b/src/yuzu/main.cpp
index 270cccc77..4d501a8f9 100644
--- a/src/yuzu/main.cpp
+++ b/src/yuzu/main.cpp
@@ -16,7 +16,7 @@
 #include "applets/software_keyboard.h"
 #include "applets/web_browser.h"
 #include "configuration/configure_input.h"
-#include "configuration/configure_per_general.h"
+#include "configuration/configure_per_game.h"
 #include "core/file_sys/vfs.h"
 #include "core/file_sys/vfs_real.h"
 #include "core/frontend/applets/general_frontend.h"
@@ -56,6 +56,7 @@ static FileSys::VirtualFile VfsDirectoryCreateFileWrapper(const FileSys::Virtual
 #include <QShortcut>
 #include <QStatusBar>
 #include <QSysInfo>
+#include <QUrl>
 #include <QtConcurrent/QtConcurrent>
 
 #include <fmt/format.h>
@@ -217,7 +218,20 @@ GMainWindow::GMainWindow()
     LOG_INFO(Frontend, "yuzu Version: {} | {}-{}", yuzu_build_version, Common::g_scm_branch,
              Common::g_scm_desc);
 #ifdef ARCHITECTURE_x86_64
-    LOG_INFO(Frontend, "Host CPU: {}", Common::GetCPUCaps().cpu_string);
+    const auto& caps = Common::GetCPUCaps();
+    std::string cpu_string = caps.cpu_string;
+    if (caps.avx || caps.avx2 || caps.avx512) {
+        cpu_string += " | AVX";
+        if (caps.avx512) {
+            cpu_string += "512";
+        } else if (caps.avx2) {
+            cpu_string += '2';
+        }
+        if (caps.fma || caps.fma4) {
+            cpu_string += " | FMA";
+        }
+    }
+    LOG_INFO(Frontend, "Host CPU: {}", cpu_string);
 #endif
     LOG_INFO(Frontend, "Host OS: {}", QSysInfo::prettyProductName().toStdString());
     LOG_INFO(Frontend, "Host RAM: {:.2f} GB",
@@ -520,14 +534,36 @@ void GMainWindow::InitializeWidgets() {
         if (emulation_running) {
             return;
         }
-        Settings::values.use_asynchronous_gpu_emulation =
-            !Settings::values.use_asynchronous_gpu_emulation;
-        async_status_button->setChecked(Settings::values.use_asynchronous_gpu_emulation);
+        bool is_async = !Settings::values.use_asynchronous_gpu_emulation.GetValue() ||
+                        Settings::values.use_multi_core.GetValue();
+        Settings::values.use_asynchronous_gpu_emulation.SetValue(is_async);
+        async_status_button->setChecked(Settings::values.use_asynchronous_gpu_emulation.GetValue());
         Settings::Apply();
     });
     async_status_button->setText(tr("ASYNC"));
     async_status_button->setCheckable(true);
-    async_status_button->setChecked(Settings::values.use_asynchronous_gpu_emulation);
+    async_status_button->setChecked(Settings::values.use_asynchronous_gpu_emulation.GetValue());
+
+    // Setup Multicore button
+    multicore_status_button = new QPushButton();
+    multicore_status_button->setObjectName(QStringLiteral("TogglableStatusBarButton"));
+    multicore_status_button->setFocusPolicy(Qt::NoFocus);
+    connect(multicore_status_button, &QPushButton::clicked, [&] {
+        if (emulation_running) {
+            return;
+        }
+        Settings::values.use_multi_core.SetValue(!Settings::values.use_multi_core.GetValue());
+        bool is_async = Settings::values.use_asynchronous_gpu_emulation.GetValue() ||
+                        Settings::values.use_multi_core.GetValue();
+        Settings::values.use_asynchronous_gpu_emulation.SetValue(is_async);
+        async_status_button->setChecked(Settings::values.use_asynchronous_gpu_emulation.GetValue());
+        multicore_status_button->setChecked(Settings::values.use_multi_core.GetValue());
+        Settings::Apply();
+    });
+    multicore_status_button->setText(tr("MULTICORE"));
+    multicore_status_button->setCheckable(true);
+    multicore_status_button->setChecked(Settings::values.use_multi_core.GetValue());
+    statusBar()->insertPermanentWidget(0, multicore_status_button);
     statusBar()->insertPermanentWidget(0, async_status_button);
 
     // Setup Renderer API button
@@ -545,16 +581,16 @@ void GMainWindow::InitializeWidgets() {
     renderer_status_button->setCheckable(false);
     renderer_status_button->setDisabled(true);
 #else
-    renderer_status_button->setChecked(Settings::values.renderer_backend ==
+    renderer_status_button->setChecked(Settings::values.renderer_backend.GetValue() ==
                                        Settings::RendererBackend::Vulkan);
     connect(renderer_status_button, &QPushButton::clicked, [=] {
         if (emulation_running) {
             return;
         }
         if (renderer_status_button->isChecked()) {
-            Settings::values.renderer_backend = Settings::RendererBackend::Vulkan;
+            Settings::values.renderer_backend.SetValue(Settings::RendererBackend::Vulkan);
         } else {
-            Settings::values.renderer_backend = Settings::RendererBackend::OpenGL;
+            Settings::values.renderer_backend.SetValue(Settings::RendererBackend::OpenGL);
         }
 
         Settings::Apply();
@@ -653,6 +689,11 @@ void GMainWindow::InitializeHotkeys() {
     ui.action_Capture_Screenshot->setShortcutContext(
         hotkey_registry.GetShortcutContext(main_window, capture_screenshot));
 
+    ui.action_Fullscreen->setShortcut(
+        hotkey_registry.GetHotkey(main_window, fullscreen, this)->key());
+    ui.action_Fullscreen->setShortcutContext(
+        hotkey_registry.GetShortcutContext(main_window, fullscreen));
+
     connect(hotkey_registry.GetHotkey(main_window, QStringLiteral("Load File"), this),
             &QShortcut::activated, this, &GMainWindow::OnMenuLoadFile);
     connect(
@@ -686,24 +727,24 @@ void GMainWindow::InitializeHotkeys() {
             });
     connect(hotkey_registry.GetHotkey(main_window, QStringLiteral("Toggle Speed Limit"), this),
             &QShortcut::activated, this, [&] {
-                Settings::values.use_frame_limit = !Settings::values.use_frame_limit;
+                Settings::values.use_frame_limit.SetValue(
+                    !Settings::values.use_frame_limit.GetValue());
                 UpdateStatusBar();
             });
-    // TODO: Remove this comment/static whenever the next major release of
-    // MSVC occurs and we make it a requirement (see:
-    // https://developercommunity.visualstudio.com/content/problem/93922/constexprs-are-trying-to-be-captured-in-lambda-fun.html)
-    static constexpr u16 SPEED_LIMIT_STEP = 5;
+    constexpr u16 SPEED_LIMIT_STEP = 5;
     connect(hotkey_registry.GetHotkey(main_window, QStringLiteral("Increase Speed Limit"), this),
             &QShortcut::activated, this, [&] {
-                if (Settings::values.frame_limit < 9999 - SPEED_LIMIT_STEP) {
-                    Settings::values.frame_limit += SPEED_LIMIT_STEP;
+                if (Settings::values.frame_limit.GetValue() < 9999 - SPEED_LIMIT_STEP) {
+                    Settings::values.frame_limit.SetValue(SPEED_LIMIT_STEP +
+                                                          Settings::values.frame_limit.GetValue());
                     UpdateStatusBar();
                 }
             });
     connect(hotkey_registry.GetHotkey(main_window, QStringLiteral("Decrease Speed Limit"), this),
             &QShortcut::activated, this, [&] {
-                if (Settings::values.frame_limit > SPEED_LIMIT_STEP) {
-                    Settings::values.frame_limit -= SPEED_LIMIT_STEP;
+                if (Settings::values.frame_limit.GetValue() > SPEED_LIMIT_STEP) {
+                    Settings::values.frame_limit.SetValue(Settings::values.frame_limit.GetValue() -
+                                                          SPEED_LIMIT_STEP);
                     UpdateStatusBar();
                 }
             });
@@ -715,7 +756,7 @@ void GMainWindow::InitializeHotkeys() {
             });
     connect(hotkey_registry.GetHotkey(main_window, QStringLiteral("Capture Screenshot"), this),
             &QShortcut::activated, this, [&] {
-                if (emu_thread->IsRunning()) {
+                if (emu_thread != nullptr && emu_thread->IsRunning()) {
                     OnCaptureScreenshot();
                 }
             });
@@ -726,6 +767,9 @@ void GMainWindow::InitializeHotkeys() {
                                     Settings::values.use_docked_mode);
                 dock_status_button->setChecked(Settings::values.use_docked_mode);
             });
+    connect(hotkey_registry.GetHotkey(main_window, QStringLiteral("Mute Audio"), this),
+            &QShortcut::activated, this,
+            [] { Settings::values.audio_muted = !Settings::values.audio_muted; });
 }
 
 void GMainWindow::SetDefaultUIGeometry() {
@@ -826,6 +870,10 @@ void GMainWindow::ConnectMenuEvents() {
     connect(ui.action_Stop, &QAction::triggered, this, &GMainWindow::OnStopGame);
     connect(ui.action_Report_Compatibility, &QAction::triggered, this,
             &GMainWindow::OnMenuReportCompatibility);
+    connect(ui.action_Open_Mods_Page, &QAction::triggered, this, &GMainWindow::OnOpenModsPage);
+    connect(ui.action_Open_Quickstart_Guide, &QAction::triggered, this,
+            &GMainWindow::OnOpenQuickstartGuide);
+    connect(ui.action_Open_FAQ, &QAction::triggered, this, &GMainWindow::OnOpenFAQ);
     connect(ui.action_Restart, &QAction::triggered, this, [this] { BootGame(QString(game_path)); });
     connect(ui.action_Configure, &QAction::triggered, this, &GMainWindow::OnConfigure);
 
@@ -839,10 +887,6 @@ void GMainWindow::ConnectMenuEvents() {
     connect(ui.action_Reset_Window_Size, &QAction::triggered, this, &GMainWindow::ResetWindowSize);
 
     // Fullscreen
-    ui.action_Fullscreen->setShortcut(
-        hotkey_registry
-            .GetHotkey(QStringLiteral("Main Window"), QStringLiteral("Fullscreen"), this)
-            ->key());
     connect(ui.action_Fullscreen, &QAction::triggered, this, &GMainWindow::ToggleFullscreen);
 
     // Movie
@@ -910,6 +954,8 @@ bool GMainWindow::LoadROM(const QString& filename) {
         nullptr,                                     // E-Commerce
     });
 
+    system.RegisterHostThread();
+
     const Core::System::ResultStatus result{system.Load(*render_window, filename.toStdString())};
 
     const auto drd_callout =
@@ -996,6 +1042,17 @@ void GMainWindow::BootGame(const QString& filename) {
     LOG_INFO(Frontend, "yuzu starting...");
     StoreRecentFile(filename); // Put the filename on top of the list
 
+    u64 title_id{0};
+
+    const auto v_file = Core::GetGameFileFromPath(vfs, filename.toUtf8().constData());
+    const auto loader = Loader::GetLoader(v_file);
+    if (!(loader == nullptr || loader->ReadProgramId(title_id) != Loader::ResultStatus::Success)) {
+        // Load per game settings
+        Config per_game_config(fmt::format("{:016X}.ini", title_id), false);
+    }
+
+    Settings::LogSettings();
+
     if (UISettings::values.select_user_on_boot) {
         SelectAndSetCurrentUser();
     }
@@ -1020,12 +1077,14 @@ void GMainWindow::BootGame(const QString& filename) {
             &LoadingScreen::OnLoadProgress, Qt::QueuedConnection);
 
     // Update the GUI
+    UpdateStatusButtons();
     if (ui.action_Single_Window_Mode->isChecked()) {
         game_list->hide();
         game_list_placeholder->hide();
     }
     status_bar_update_timer.start(2000);
     async_status_button->setDisabled(true);
+    multicore_status_button->setDisabled(true);
     renderer_status_button->setDisabled(true);
 
     if (UISettings::values.hide_mouse) {
@@ -1034,20 +1093,20 @@ void GMainWindow::BootGame(const QString& filename) {
         ui.centralwidget->setMouseTracking(true);
     }
 
-    const u64 title_id = Core::System::GetInstance().CurrentProcess()->GetTitleID();
-
     std::string title_name;
+    std::string title_version;
     const auto res = Core::System::GetInstance().GetGameName(title_name);
-    if (res != Loader::ResultStatus::Success) {
-        const auto metadata = FileSys::PatchManager(title_id).GetControlMetadata();
-        if (metadata.first != nullptr)
-            title_name = metadata.first->GetApplicationName();
 
-        if (title_name.empty())
-            title_name = FileUtil::GetFilename(filename.toStdString());
+    const auto metadata = FileSys::PatchManager(title_id).GetControlMetadata();
+    if (metadata.first != nullptr) {
+        title_version = metadata.first->GetVersionString();
+        title_name = metadata.first->GetApplicationName();
+    }
+    if (res != Loader::ResultStatus::Success || title_name.empty()) {
+        title_name = FileUtil::GetFilename(filename.toStdString());
     }
-    LOG_INFO(Frontend, "Booting game: {:016X} | {}", title_id, title_name);
-    UpdateWindowTitle(QString::fromStdString(title_name));
+    LOG_INFO(Frontend, "Booting game: {:016X} | {} | {}", title_id, title_name, title_version);
+    UpdateWindowTitle(title_name, title_version);
 
     loading_screen->Prepare(Core::System::GetInstance().GetAppLoader());
     loading_screen->show();
@@ -1113,6 +1172,7 @@ void GMainWindow::ShutdownGame() {
     game_fps_label->setVisible(false);
     emu_frametime_label->setVisible(false);
     async_status_button->setEnabled(true);
+    multicore_status_button->setEnabled(true);
 #ifdef HAS_VULKAN
     renderer_status_button->setEnabled(true);
 #endif
@@ -1474,7 +1534,7 @@ void GMainWindow::OnGameListOpenPerGameProperties(const std::string& file) {
         return;
     }
 
-    ConfigurePerGameGeneral dialog(this, title_id);
+    ConfigurePerGame dialog(this, title_id);
     dialog.LoadFromFile(v_file);
     auto result = dialog.exec();
     if (result == QDialog::Accepted) {
@@ -1485,7 +1545,14 @@ void GMainWindow::OnGameListOpenPerGameProperties(const std::string& file) {
             game_list->PopulateAsync(UISettings::values.game_dirs);
         }
 
-        config->Save();
+        // Do not cause the global config to write local settings into the config file
+        Settings::RestoreGlobalState();
+
+        if (!Core::System::GetInstance().IsPoweredOn()) {
+            config->Save();
+        }
+    } else {
+        Settings::RestoreGlobalState();
     }
 }
 
@@ -1772,6 +1839,9 @@ void GMainWindow::OnStopGame() {
     }
 
     ShutdownGame();
+
+    Settings::RestoreGlobalState();
+    UpdateStatusButtons();
 }
 
 void GMainWindow::OnLoadComplete() {
@@ -1797,6 +1867,26 @@ void GMainWindow::OnMenuReportCompatibility() {
     }
 }
 
+void GMainWindow::OpenURL(const QUrl& url) {
+    if (QDesktopServices::openUrl(url)) {
+        return; // The desktop accepted the URL; nothing to report.
+    }
+    QMessageBox::warning(this, tr("Error opening URL"),
+                         tr("Unable to open the URL \"%1\".").arg(url.toString()));
+}
+
+void GMainWindow::OnOpenModsPage() { // Slot: forwards the Switch-Mods wiki URL to OpenURL.
+    OpenURL(QUrl(QStringLiteral("https://github.com/yuzu-emu/yuzu/wiki/Switch-Mods")));
+}
+
+void GMainWindow::OnOpenQuickstartGuide() { // Slot: forwards the quickstart URL to OpenURL.
+    OpenURL(QUrl(QStringLiteral("https://yuzu-emu.org/help/quickstart/")));
+}
+
+void GMainWindow::OnOpenFAQ() { // Slot: forwards the FAQ URL to OpenURL.
+    OpenURL(QUrl(QStringLiteral("https://yuzu-emu.org/wiki/faq/")));
+}
+
 void GMainWindow::ToggleFullscreen() {
     if (!emulation_running) {
         return;
@@ -1859,7 +1949,7 @@ void GMainWindow::ToggleWindowMode() {
 
 void GMainWindow::ResetWindowSize() {
     const auto aspect_ratio = Layout::EmulationAspectRatio(
-        static_cast<Layout::AspectRatio>(Settings::values.aspect_ratio),
+        static_cast<Layout::AspectRatio>(Settings::values.aspect_ratio.GetValue()),
         static_cast<float>(Layout::ScreenUndocked::Height) / Layout::ScreenUndocked::Width);
     if (!ui.action_Single_Window_Mode->isChecked()) {
         render_window->resize(Layout::ScreenUndocked::Height / aspect_ratio,
@@ -1907,12 +1997,7 @@ void GMainWindow::OnConfigure() {
         ui.centralwidget->setMouseTracking(false);
     }
 
-    dock_status_button->setChecked(Settings::values.use_docked_mode);
-    async_status_button->setChecked(Settings::values.use_asynchronous_gpu_emulation);
-#ifdef HAS_VULKAN
-    renderer_status_button->setChecked(Settings::values.renderer_backend ==
-                                       Settings::RendererBackend::Vulkan);
-#endif
+    UpdateStatusButtons();
 }
 
 void GMainWindow::OnLoadAmiibo() {
@@ -1995,7 +2080,8 @@ void GMainWindow::OnCaptureScreenshot() {
     OnStartGame();
 }
 
-void GMainWindow::UpdateWindowTitle(const QString& title_name) {
+void GMainWindow::UpdateWindowTitle(const std::string& title_name,
+                                    const std::string& title_version) {
     const auto full_name = std::string(Common::g_build_fullname);
     const auto branch_name = std::string(Common::g_scm_branch);
     const auto description = std::string(Common::g_scm_desc);
@@ -2004,7 +2090,7 @@ void GMainWindow::UpdateWindowTitle(const QString& title_name) {
     const auto date =
         QDateTime::currentDateTime().toString(QStringLiteral("yyyy-MM-dd")).toStdString();
 
-    if (title_name.isEmpty()) {
+    if (title_name.empty()) {
         const auto fmt = std::string(Common::g_title_bar_format_idle);
         setWindowTitle(QString::fromStdString(fmt::format(fmt.empty() ? "yuzu {0}| {1}-{2}" : fmt,
                                                           full_name, branch_name, description,
@@ -2012,8 +2098,8 @@ void GMainWindow::UpdateWindowTitle(const QString& title_name) {
     } else {
         const auto fmt = std::string(Common::g_title_bar_format_running);
         setWindowTitle(QString::fromStdString(
-            fmt::format(fmt.empty() ? "yuzu {0}| {3} | {1}-{2}" : fmt, full_name, branch_name,
-                        description, title_name.toStdString(), date, build_id)));
+            fmt::format(fmt.empty() ? "yuzu {0}| {3} | {6} | {1}-{2}" : fmt, full_name, branch_name,
+                        description, title_name, date, build_id, title_version)));
     }
 }
 
@@ -2025,21 +2111,34 @@ void GMainWindow::UpdateStatusBar() {
 
     auto results = Core::System::GetInstance().GetAndResetPerfStats();
 
-    if (Settings::values.use_frame_limit) {
+    if (Settings::values.use_frame_limit.GetValue()) {
         emu_speed_label->setText(tr("Speed: %1% / %2%")
                                      .arg(results.emulation_speed * 100.0, 0, 'f', 0)
-                                     .arg(Settings::values.frame_limit));
+                                     .arg(Settings::values.frame_limit.GetValue()));
     } else {
         emu_speed_label->setText(tr("Speed: %1%").arg(results.emulation_speed * 100.0, 0, 'f', 0));
     }
     game_fps_label->setText(tr("Game: %1 FPS").arg(results.game_fps, 0, 'f', 0));
     emu_frametime_label->setText(tr("Frame: %1 ms").arg(results.frametime * 1000.0, 0, 'f', 2));
 
-    emu_speed_label->setVisible(true);
+    emu_speed_label->setVisible(!Settings::values.use_multi_core.GetValue());
     game_fps_label->setVisible(true);
     emu_frametime_label->setVisible(true);
 }
 
+void GMainWindow::UpdateStatusButtons() {
+    const bool multicore = Settings::values.use_multi_core.GetValue();
+    auto& async_gpu = Settings::values.use_asynchronous_gpu_emulation;
+    async_gpu.SetValue(async_gpu.GetValue() || multicore); // Multicore forces async GPU on.
+    dock_status_button->setChecked(Settings::values.use_docked_mode);
+    multicore_status_button->setChecked(multicore);
+    async_status_button->setChecked(async_gpu.GetValue());
+#ifdef HAS_VULKAN
+    renderer_status_button->setChecked(Settings::values.renderer_backend.GetValue() ==
+                                       Settings::RendererBackend::Vulkan);
+#endif
+}
+
 void GMainWindow::HideMouseCursor() {
     if (emu_thread == nullptr || UISettings::values.hide_mouse == false) {
         mouse_hide_timer.stop();
@@ -2123,6 +2222,9 @@ void GMainWindow::OnCoreError(Core::System::ResultStatus result, std::string det
     if (answer == QMessageBox::Yes) {
         if (emu_thread) {
             ShutdownGame();
+
+            Settings::RestoreGlobalState();
+            UpdateStatusButtons();
         }
     } else {
         // Only show the message if the game is still running.
@@ -2154,7 +2256,7 @@ void GMainWindow::OnReinitializeKeys(ReinitializeKeyBehavior behavior) {
                          "title.keys_autogenerated");
     }
 
-    Core::Crypto::KeyManager keys{};
+    Core::Crypto::KeyManager& keys = Core::Crypto::KeyManager::Instance();
     if (keys.BaseDeriveNecessary()) {
         Core::Crypto::PartitionDataManager pdm{vfs->OpenDirectory(
             FileUtil::GetUserPath(FileUtil::UserPath::SysDataDir), FileSys::Mode::Read)};
@@ -2285,9 +2387,13 @@ void GMainWindow::closeEvent(QCloseEvent* event) {
     hotkey_registry.SaveHotkeys();
 
     // Shutdown session if the emu thread is active...
-    if (emu_thread != nullptr)
+    if (emu_thread != nullptr) {
         ShutdownGame();
 
+        Settings::RestoreGlobalState();
+        UpdateStatusButtons();
+    }
+
     render_window->close();
 
     QWidget::closeEvent(event);
@@ -2467,8 +2573,6 @@ int main(int argc, char* argv[]) {
     QObject::connect(&app, &QGuiApplication::applicationStateChanged, &main_window,
                      &GMainWindow::OnAppFocusStateChanged);
 
-    Settings::LogSettings();
-
     int result = app.exec();
     detached_tasks.WaitForAllTasks();
     return result;
diff --git a/src/yuzu/main.h b/src/yuzu/main.h
index 4f4c8ddbe..8e3d39c38 100644
--- a/src/yuzu/main.h
+++ b/src/yuzu/main.h
@@ -181,6 +181,9 @@ private slots:
     void OnPauseGame();
     void OnStopGame();
     void OnMenuReportCompatibility();
+    void OnOpenModsPage();
+    void OnOpenQuickstartGuide();
+    void OnOpenFAQ();
     /// Called whenever a user selects a game in the game list widget.
     void OnGameListLoadFile(QString game_path);
     void OnGameListOpenFolder(GameListOpenTarget target, const std::string& game_path);
@@ -215,10 +218,13 @@ private slots:
 
 private:
     std::optional<u64> SelectRomFSDumpTarget(const FileSys::ContentProvider&, u64 program_id);
-    void UpdateWindowTitle(const QString& title_name = {});
+    void UpdateWindowTitle(const std::string& title_name = {},
+                           const std::string& title_version = {});
     void UpdateStatusBar();
+    void UpdateStatusButtons();
     void HideMouseCursor();
     void ShowMouseCursor();
+    void OpenURL(const QUrl& url);
 
     Ui::MainWindow ui;
 
@@ -234,6 +240,7 @@ private:
     QLabel* game_fps_label = nullptr;
     QLabel* emu_frametime_label = nullptr;
     QPushButton* async_status_button = nullptr;
+    QPushButton* multicore_status_button = nullptr;
     QPushButton* renderer_status_button = nullptr;
     QPushButton* dock_status_button = nullptr;
     QTimer status_bar_update_timer;
diff --git a/src/yuzu/main.ui b/src/yuzu/main.ui
index 97c90f50b..bee6e107e 100644
--- a/src/yuzu/main.ui
+++ b/src/yuzu/main.ui
@@ -113,6 +113,9 @@
      <string>&amp;Help</string>
     </property>
     <addaction name="action_Report_Compatibility"/>
+    <addaction name="action_Open_Mods_Page"/>
+    <addaction name="action_Open_Quickstart_Guide"/>
+    <addaction name="action_Open_FAQ"/>
     <addaction name="separator"/>
     <addaction name="action_About"/>
    </widget>
@@ -256,6 +259,21 @@
     <bool>false</bool>
    </property>
   </action>
+  <action name="action_Open_Mods_Page">
+   <property name="text">
+    <string>Open Mods Page</string>
+   </property>
+  </action>
+  <action name="action_Open_Quickstart_Guide">
+   <property name="text">
+    <string>Open Quickstart Guide</string>
+   </property>
+  </action>
+  <action name="action_Open_FAQ">
+   <property name="text">
+    <string>FAQ</string>
+   </property>
+  </action>
   <action name="action_Open_yuzu_Folder">
    <property name="text">
     <string>Open yuzu Folder</string>
diff --git a/src/yuzu/yuzu.rc b/src/yuzu/yuzu.rc
index 1b253653f..4a3645a71 100644
--- a/src/yuzu/yuzu.rc
+++ b/src/yuzu/yuzu.rc
@@ -16,4 +16,4 @@ IDI_ICON1               ICON                    "../../dist/yuzu.ico"
 // RT_MANIFEST
 //
 
-1                       RT_MANIFEST             "../../dist/yuzu.manifest"
+0                       RT_MANIFEST             "../../dist/yuzu.manifest"
diff --git a/src/yuzu_cmd/config.cpp b/src/yuzu_cmd/config.cpp
index c20d48c42..23763144f 100644
--- a/src/yuzu_cmd/config.cpp
+++ b/src/yuzu_cmd/config.cpp
@@ -354,65 +354,72 @@ void Config::ReadValues() {
 
     const auto rng_seed_enabled = sdl2_config->GetBoolean("System", "rng_seed_enabled", false);
     if (rng_seed_enabled) {
-        Settings::values.rng_seed = sdl2_config->GetInteger("System", "rng_seed", 0);
+        Settings::values.rng_seed.SetValue(sdl2_config->GetInteger("System", "rng_seed", 0));
     } else {
-        Settings::values.rng_seed = std::nullopt;
+        Settings::values.rng_seed.SetValue(std::nullopt);
     }
 
     const auto custom_rtc_enabled = sdl2_config->GetBoolean("System", "custom_rtc_enabled", false);
     if (custom_rtc_enabled) {
-        Settings::values.custom_rtc =
-            std::chrono::seconds(sdl2_config->GetInteger("System", "custom_rtc", 0));
+        Settings::values.custom_rtc.SetValue(
+            std::chrono::seconds(sdl2_config->GetInteger("System", "custom_rtc", 0)));
     } else {
-        Settings::values.custom_rtc = std::nullopt;
+        Settings::values.custom_rtc.SetValue(std::nullopt);
     }
 
-    Settings::values.language_index = sdl2_config->GetInteger("System", "language_index", 1);
-    Settings::values.time_zone_index = sdl2_config->GetInteger("System", "time_zone_index", 0);
+    Settings::values.language_index.SetValue(
+        sdl2_config->GetInteger("System", "language_index", 1));
+    Settings::values.time_zone_index.SetValue(
+        sdl2_config->GetInteger("System", "time_zone_index", 0));
 
     // Core
-    Settings::values.use_multi_core = sdl2_config->GetBoolean("Core", "use_multi_core", false);
+    Settings::values.use_multi_core.SetValue(
+        sdl2_config->GetBoolean("Core", "use_multi_core", false));
 
     // Renderer
     const int renderer_backend = sdl2_config->GetInteger(
         "Renderer", "backend", static_cast<int>(Settings::RendererBackend::OpenGL));
-    Settings::values.renderer_backend = static_cast<Settings::RendererBackend>(renderer_backend);
+    Settings::values.renderer_backend.SetValue(
+        static_cast<Settings::RendererBackend>(renderer_backend));
     Settings::values.renderer_debug = sdl2_config->GetBoolean("Renderer", "debug", false);
-    Settings::values.vulkan_device = sdl2_config->GetInteger("Renderer", "vulkan_device", 0);
-
-    Settings::values.resolution_factor =
-        static_cast<float>(sdl2_config->GetReal("Renderer", "resolution_factor", 1.0));
-    Settings::values.aspect_ratio =
-        static_cast<int>(sdl2_config->GetInteger("Renderer", "aspect_ratio", 0));
-    Settings::values.max_anisotropy =
-        static_cast<int>(sdl2_config->GetInteger("Renderer", "max_anisotropy", 0));
-    Settings::values.use_frame_limit = sdl2_config->GetBoolean("Renderer", "use_frame_limit", true);
-    Settings::values.frame_limit =
-        static_cast<u16>(sdl2_config->GetInteger("Renderer", "frame_limit", 100));
-    Settings::values.use_disk_shader_cache =
-        sdl2_config->GetBoolean("Renderer", "use_disk_shader_cache", false);
+    Settings::values.vulkan_device.SetValue(
+        sdl2_config->GetInteger("Renderer", "vulkan_device", 0));
+
+    Settings::values.aspect_ratio.SetValue(
+        static_cast<int>(sdl2_config->GetInteger("Renderer", "aspect_ratio", 0)));
+    Settings::values.max_anisotropy.SetValue(
+        static_cast<int>(sdl2_config->GetInteger("Renderer", "max_anisotropy", 0)));
+    Settings::values.use_frame_limit.SetValue(
+        sdl2_config->GetBoolean("Renderer", "use_frame_limit", true));
+    Settings::values.frame_limit.SetValue(
+        static_cast<u16>(sdl2_config->GetInteger("Renderer", "frame_limit", 100)));
+    Settings::values.use_disk_shader_cache.SetValue(
+        sdl2_config->GetBoolean("Renderer", "use_disk_shader_cache", false));
     const int gpu_accuracy_level = sdl2_config->GetInteger("Renderer", "gpu_accuracy", 0);
-    Settings::values.gpu_accuracy = static_cast<Settings::GPUAccuracy>(gpu_accuracy_level);
-    Settings::values.use_asynchronous_gpu_emulation =
-        sdl2_config->GetBoolean("Renderer", "use_asynchronous_gpu_emulation", false);
-    Settings::values.use_vsync =
-        static_cast<u16>(sdl2_config->GetInteger("Renderer", "use_vsync", 1));
-    Settings::values.use_assembly_shaders =
-        sdl2_config->GetBoolean("Renderer", "use_assembly_shaders", false);
-    Settings::values.use_fast_gpu_time =
-        sdl2_config->GetBoolean("Renderer", "use_fast_gpu_time", true);
-
-    Settings::values.bg_red = static_cast<float>(sdl2_config->GetReal("Renderer", "bg_red", 0.0));
-    Settings::values.bg_green =
-        static_cast<float>(sdl2_config->GetReal("Renderer", "bg_green", 0.0));
-    Settings::values.bg_blue = static_cast<float>(sdl2_config->GetReal("Renderer", "bg_blue", 0.0));
+    Settings::values.gpu_accuracy.SetValue(static_cast<Settings::GPUAccuracy>(gpu_accuracy_level));
+    Settings::values.use_asynchronous_gpu_emulation.SetValue(
+        sdl2_config->GetBoolean("Renderer", "use_asynchronous_gpu_emulation", false));
+    Settings::values.use_vsync.SetValue(
+        static_cast<u16>(sdl2_config->GetInteger("Renderer", "use_vsync", 1)));
+    Settings::values.use_assembly_shaders.SetValue(
+        sdl2_config->GetBoolean("Renderer", "use_assembly_shaders", false));
+    Settings::values.use_fast_gpu_time.SetValue(
+        sdl2_config->GetBoolean("Renderer", "use_fast_gpu_time", true));
+
+    Settings::values.bg_red.SetValue(
+        static_cast<float>(sdl2_config->GetReal("Renderer", "bg_red", 0.0)));
+    Settings::values.bg_green.SetValue(
+        static_cast<float>(sdl2_config->GetReal("Renderer", "bg_green", 0.0)));
+    Settings::values.bg_blue.SetValue(
+        static_cast<float>(sdl2_config->GetReal("Renderer", "bg_blue", 0.0)));
 
     // Audio
     Settings::values.sink_id = sdl2_config->Get("Audio", "output_engine", "auto");
-    Settings::values.enable_audio_stretching =
-        sdl2_config->GetBoolean("Audio", "enable_audio_stretching", true);
+    Settings::values.enable_audio_stretching.SetValue(
+        sdl2_config->GetBoolean("Audio", "enable_audio_stretching", true));
     Settings::values.audio_device_id = sdl2_config->Get("Audio", "output_device", "auto");
-    Settings::values.volume = static_cast<float>(sdl2_config->GetReal("Audio", "volume", 1));
+    Settings::values.volume.SetValue(
+        static_cast<float>(sdl2_config->GetReal("Audio", "volume", 1)));
 
     // Miscellaneous
     Settings::values.log_filter = sdl2_config->Get("Miscellaneous", "log_filter", "*:Trace");
@@ -432,6 +439,8 @@ void Config::ReadValues() {
     Settings::values.quest_flag = sdl2_config->GetBoolean("Debugging", "quest_flag", false);
     Settings::values.disable_cpu_opt =
         sdl2_config->GetBoolean("Debugging", "disable_cpu_opt", false);
+    Settings::values.disable_macro_jit =
+        sdl2_config->GetBoolean("Debugging", "disable_macro_jit", false);
 
     const auto title_list = sdl2_config->Get("AddOns", "title_ids", "");
     std::stringstream ss(title_list);
diff --git a/src/yuzu_cmd/default_ini.h b/src/yuzu_cmd/default_ini.h
index abc6e6e65..45c07ed5d 100644
--- a/src/yuzu_cmd/default_ini.h
+++ b/src/yuzu_cmd/default_ini.h
@@ -117,11 +117,6 @@ use_hw_renderer =
 # 0: Interpreter (slow), 1 (default): JIT (fast)
 use_shader_jit =
 
-# Resolution scale factor
-# 0: Auto (scales resolution to window size), 1: Native Switch screen resolution, Otherwise a scale
-# factor for the Switch resolution
-resolution_factor =
-
 # Aspect ratio
 # 0: Default (16:9), 1: Force 4:3, 2: Force 21:9, 3: Stretch to Window
 aspect_ratio =
@@ -291,6 +286,8 @@ quest_flag =
 # Determines whether or not JIT CPU optimizations are enabled
 # false: Optimizations Enabled, true: Optimizations Disabled
 disable_cpu_opt =
+# Determines whether or not the macro JIT compiler is enabled (false: enabled, true: disabled)
+disable_macro_jit=false
 
 [WebService]
 # Whether or not to enable telemetry
diff --git a/src/yuzu_cmd/emu_window/emu_window_sdl2_gl.cpp b/src/yuzu_cmd/emu_window/emu_window_sdl2_gl.cpp
index 411e7e647..e78025737 100644
--- a/src/yuzu_cmd/emu_window/emu_window_sdl2_gl.cpp
+++ b/src/yuzu_cmd/emu_window/emu_window_sdl2_gl.cpp
@@ -98,6 +98,9 @@ EmuWindow_SDL2_GL::EmuWindow_SDL2_GL(Core::System& system, bool fullscreen)
     SDL_GL_SetAttribute(SDL_GL_BLUE_SIZE, 8);
     SDL_GL_SetAttribute(SDL_GL_ALPHA_SIZE, 0);
     SDL_GL_SetAttribute(SDL_GL_SHARE_WITH_CURRENT_CONTEXT, 1);
+    if (Settings::values.renderer_debug) {
+        SDL_GL_SetAttribute(SDL_GL_CONTEXT_FLAGS, SDL_GL_CONTEXT_DEBUG_FLAG);
+    }
     SDL_GL_SetSwapInterval(0);
 
     std::string window_title = fmt::format("yuzu {} | {}-{}", Common::g_build_fullname,
@@ -162,7 +165,7 @@ std::unique_ptr<Core::Frontend::GraphicsContext> EmuWindow_SDL2_GL::CreateShared
 
 void EmuWindow_SDL2_GL::Present() {
     SDL_GL_MakeCurrent(render_window, window_context);
-    SDL_GL_SetSwapInterval(Settings::values.use_vsync ? 1 : 0);
+    SDL_GL_SetSwapInterval(Settings::values.use_vsync.GetValue() ? 1 : 0);
     while (IsOpen()) {
         system.Renderer().TryPresent(100);
         SDL_GL_SwapWindow(render_window);
diff --git a/src/yuzu_cmd/yuzu.cpp b/src/yuzu_cmd/yuzu.cpp
index 4d2ea7e9e..512b060a7 100644
--- a/src/yuzu_cmd/yuzu.cpp
+++ b/src/yuzu_cmd/yuzu.cpp
@@ -2,6 +2,7 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <chrono>
 #include <iostream>
 #include <memory>
 #include <string>
@@ -180,7 +181,7 @@ int main(int argc, char** argv) {
     Core::System& system{Core::System::GetInstance()};
 
     std::unique_ptr<EmuWindow_SDL2> emu_window;
-    switch (Settings::values.renderer_backend) {
+    switch (Settings::values.renderer_backend.GetValue()) {
     case Settings::RendererBackend::OpenGL:
         emu_window = std::make_unique<EmuWindow_SDL2_GL>(system, fullscreen);
         break;
@@ -236,9 +237,11 @@ int main(int argc, char** argv) {
     system.Renderer().Rasterizer().LoadDiskResources();
 
     std::thread render_thread([&emu_window] { emu_window->Present(); });
+    system.Run();
     while (emu_window->IsOpen()) {
-        system.RunLoop();
+        std::this_thread::sleep_for(std::chrono::milliseconds(1));
     }
+    system.Pause();
     render_thread.join();
 
     system.Shutdown();
diff --git a/src/yuzu_cmd/yuzu.rc b/src/yuzu_cmd/yuzu.rc
index 7de8ef3d9..0cde75e2f 100644
--- a/src/yuzu_cmd/yuzu.rc
+++ b/src/yuzu_cmd/yuzu.rc
@@ -14,4 +14,4 @@ YUZU_ICON               ICON                    "../../dist/yuzu.ico"
 // RT_MANIFEST
 //
 
-1                       RT_MANIFEST             "../../dist/yuzu.manifest"
+0                       RT_MANIFEST             "../../dist/yuzu.manifest"
diff --git a/src/yuzu_tester/config.cpp b/src/yuzu_tester/config.cpp
index 3be58b15d..acb22885e 100644
--- a/src/yuzu_tester/config.cpp
+++ b/src/yuzu_tester/config.cpp
@@ -81,6 +81,9 @@ void Config::ReadValues() {
     Settings::values.touchscreen.diameter_x = 15;
     Settings::values.touchscreen.diameter_y = 15;
 
+    Settings::values.use_docked_mode =
+        sdl2_config->GetBoolean("Controls", "use_docked_mode", false);
+
     // Data Storage
     Settings::values.use_virtual_sd =
         sdl2_config->GetBoolean("Data Storage", "use_virtual_sd", true);
@@ -92,59 +95,59 @@ void Config::ReadValues() {
                                            FileUtil::GetUserPath(FileUtil::UserPath::SDMCDir)));
 
     // System
-    Settings::values.use_docked_mode = sdl2_config->GetBoolean("System", "use_docked_mode", false);
-
     Settings::values.current_user = std::clamp<int>(
         sdl2_config->GetInteger("System", "current_user", 0), 0, Service::Account::MAX_USERS - 1);
 
     const auto rng_seed_enabled = sdl2_config->GetBoolean("System", "rng_seed_enabled", false);
     if (rng_seed_enabled) {
-        Settings::values.rng_seed = sdl2_config->GetInteger("System", "rng_seed", 0);
+        Settings::values.rng_seed.SetValue(sdl2_config->GetInteger("System", "rng_seed", 0));
     } else {
-        Settings::values.rng_seed = std::nullopt;
+        Settings::values.rng_seed.SetValue(std::nullopt);
     }
 
     const auto custom_rtc_enabled = sdl2_config->GetBoolean("System", "custom_rtc_enabled", false);
     if (custom_rtc_enabled) {
-        Settings::values.custom_rtc =
-            std::chrono::seconds(sdl2_config->GetInteger("System", "custom_rtc", 0));
+        Settings::values.custom_rtc.SetValue(
+            std::chrono::seconds(sdl2_config->GetInteger("System", "custom_rtc", 0)));
     } else {
-        Settings::values.custom_rtc = std::nullopt;
+        Settings::values.custom_rtc.SetValue(std::nullopt);
     }
 
     // Core
-    Settings::values.use_multi_core = sdl2_config->GetBoolean("Core", "use_multi_core", false);
+    Settings::values.use_multi_core.SetValue(
+        sdl2_config->GetBoolean("Core", "use_multi_core", false));
 
     // Renderer
-    Settings::values.resolution_factor =
-        static_cast<float>(sdl2_config->GetReal("Renderer", "resolution_factor", 1.0));
-    Settings::values.aspect_ratio =
-        static_cast<int>(sdl2_config->GetInteger("Renderer", "aspect_ratio", 0));
-    Settings::values.max_anisotropy =
-        static_cast<int>(sdl2_config->GetInteger("Renderer", "max_anisotropy", 0));
-    Settings::values.use_frame_limit = false;
-    Settings::values.frame_limit = 100;
-    Settings::values.use_disk_shader_cache =
-        sdl2_config->GetBoolean("Renderer", "use_disk_shader_cache", false);
+    Settings::values.aspect_ratio.SetValue(
+        static_cast<int>(sdl2_config->GetInteger("Renderer", "aspect_ratio", 0)));
+    Settings::values.max_anisotropy.SetValue(
+        static_cast<int>(sdl2_config->GetInteger("Renderer", "max_anisotropy", 0)));
+    Settings::values.use_frame_limit.SetValue(false);
+    Settings::values.frame_limit.SetValue(100);
+    Settings::values.use_disk_shader_cache.SetValue(
+        sdl2_config->GetBoolean("Renderer", "use_disk_shader_cache", false));
     const int gpu_accuracy_level = sdl2_config->GetInteger("Renderer", "gpu_accuracy", 0);
-    Settings::values.gpu_accuracy = static_cast<Settings::GPUAccuracy>(gpu_accuracy_level);
-    Settings::values.use_asynchronous_gpu_emulation =
-        sdl2_config->GetBoolean("Renderer", "use_asynchronous_gpu_emulation", false);
-    Settings::values.use_fast_gpu_time =
-        sdl2_config->GetBoolean("Renderer", "use_fast_gpu_time", true);
-
-    Settings::values.bg_red = static_cast<float>(sdl2_config->GetReal("Renderer", "bg_red", 0.0));
-    Settings::values.bg_green =
-        static_cast<float>(sdl2_config->GetReal("Renderer", "bg_green", 0.0));
-    Settings::values.bg_blue = static_cast<float>(sdl2_config->GetReal("Renderer", "bg_blue", 0.0));
+    Settings::values.gpu_accuracy.SetValue(static_cast<Settings::GPUAccuracy>(gpu_accuracy_level));
+    Settings::values.use_asynchronous_gpu_emulation.SetValue(
+        sdl2_config->GetBoolean("Renderer", "use_asynchronous_gpu_emulation", false));
+    Settings::values.use_fast_gpu_time.SetValue(
+        sdl2_config->GetBoolean("Renderer", "use_fast_gpu_time", true));
+
+    Settings::values.bg_red.SetValue(
+        static_cast<float>(sdl2_config->GetReal("Renderer", "bg_red", 0.0)));
+    Settings::values.bg_green.SetValue(
+        static_cast<float>(sdl2_config->GetReal("Renderer", "bg_green", 0.0)));
+    Settings::values.bg_blue.SetValue(
+        static_cast<float>(sdl2_config->GetReal("Renderer", "bg_blue", 0.0)));
 
     // Audio
     Settings::values.sink_id = "null";
-    Settings::values.enable_audio_stretching = false;
+    Settings::values.enable_audio_stretching.SetValue(false);
     Settings::values.audio_device_id = "auto";
-    Settings::values.volume = 0;
+    Settings::values.volume.SetValue(0);
 
-    Settings::values.language_index = sdl2_config->GetInteger("System", "language_index", 1);
+    Settings::values.language_index.SetValue(
+        sdl2_config->GetInteger("System", "language_index", 1));
 
     // Miscellaneous
     Settings::values.log_filter = sdl2_config->Get("Miscellaneous", "log_filter", "*:Trace");
diff --git a/src/yuzu_tester/default_ini.h b/src/yuzu_tester/default_ini.h
index ca203b64d..41bbbbf60 100644
--- a/src/yuzu_tester/default_ini.h
+++ b/src/yuzu_tester/default_ini.h
@@ -21,11 +21,6 @@ use_hw_renderer =
 # 0: Interpreter (slow), 1 (default): JIT (fast)
 use_shader_jit =
 
-# Resolution scale factor
-# 0: Auto (scales resolution to window size), 1: Native Switch screen resolution, Otherwise a scale
-# factor for the Switch resolution
-resolution_factor =
-
 # Aspect ratio
 # 0: Default (16:9), 1: Force 4:3, 2: Force 21:9, 3: Stretch to Window
 aspect_ratio =
diff --git a/src/yuzu_tester/service/yuzutest.cpp b/src/yuzu_tester/service/yuzutest.cpp
index 85d3f436b..2d3f6e3a7 100644
--- a/src/yuzu_tester/service/yuzutest.cpp
+++ b/src/yuzu_tester/service/yuzutest.cpp
@@ -53,7 +53,7 @@ private:
 
         IPC::ResponseBuilder rb{ctx, 3};
         rb.Push(RESULT_SUCCESS);
-        rb.Push<u32>(write_size);
+        rb.Push<u32>(static_cast<u32>(write_size));
     }
 
     void StartIndividual(Kernel::HLERequestContext& ctx) {
diff --git a/src/yuzu_tester/yuzu.cpp b/src/yuzu_tester/yuzu.cpp
index 676e70ebd..083667baf 100644
--- a/src/yuzu_tester/yuzu.cpp
+++ b/src/yuzu_tester/yuzu.cpp
@@ -2,6 +2,7 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <chrono>
 #include <iostream>
 #include <memory>
 #include <string>
@@ -255,9 +256,11 @@ int main(int argc, char** argv) {
     system.GPU().Start();
     system.Renderer().Rasterizer().LoadDiskResources();
 
+    system.Run();
     while (!finished) {
-        system.RunLoop();
+        std::this_thread::sleep_for(std::chrono::milliseconds(1));
     }
+    system.Pause();
 
     detached_tasks.WaitForAllTasks();
     return return_value;
diff --git a/src/yuzu_tester/yuzu.rc b/src/yuzu_tester/yuzu.rc
index 7de8ef3d9..0cde75e2f 100644
--- a/src/yuzu_tester/yuzu.rc
+++ b/src/yuzu_tester/yuzu.rc
@@ -14,4 +14,4 @@ YUZU_ICON               ICON                    "../../dist/yuzu.ico"
 // RT_MANIFEST
 //
 
-1                       RT_MANIFEST             "../../dist/yuzu.manifest"
+0                       RT_MANIFEST             "../../dist/yuzu.manifest"