summaryrefslogtreecommitdiffstats
path: root/src/video_core
diff options
context:
space:
mode:
Diffstat (limited to 'src/video_core')
-rw-r--r--src/video_core/CMakeLists.txt11
-rw-r--r--src/video_core/dma_pusher.cpp1
-rw-r--r--src/video_core/engines/fermi_2d.cpp3
-rw-r--r--src/video_core/engines/fermi_2d.h8
-rw-r--r--src/video_core/engines/kepler_compute.h10
-rw-r--r--src/video_core/engines/kepler_memory.cpp4
-rw-r--r--src/video_core/engines/kepler_memory.h7
-rw-r--r--src/video_core/engines/maxwell_3d.cpp19
-rw-r--r--src/video_core/engines/maxwell_3d.h24
-rw-r--r--src/video_core/engines/maxwell_dma.cpp4
-rw-r--r--src/video_core/engines/maxwell_dma.h9
-rw-r--r--src/video_core/engines/shader_bytecode.h11
-rw-r--r--src/video_core/gpu.cpp2
-rw-r--r--src/video_core/gpu_asynch.cpp2
-rw-r--r--src/video_core/gpu_thread.cpp43
-rw-r--r--src/video_core/gpu_thread.h71
-rw-r--r--src/video_core/macro_interpreter.cpp20
-rw-r--r--src/video_core/memory_manager.cpp138
-rw-r--r--src/video_core/memory_manager.h16
-rw-r--r--src/video_core/renderer_opengl/gl_buffer_cache.cpp2
-rw-r--r--src/video_core/renderer_opengl/gl_global_cache.cpp44
-rw-r--r--src/video_core/renderer_opengl/gl_global_cache.h16
-rw-r--r--src/video_core/renderer_opengl/gl_primitive_assembler.cpp2
-rw-r--r--src/video_core/renderer_opengl/gl_primitive_assembler.h2
-rw-r--r--src/video_core/renderer_opengl/gl_rasterizer.cpp63
-rw-r--r--src/video_core/renderer_opengl/gl_rasterizer.h13
-rw-r--r--src/video_core/renderer_opengl/gl_rasterizer_cache.cpp44
-rw-r--r--src/video_core/renderer_opengl/gl_rasterizer_cache.h11
-rw-r--r--src/video_core/renderer_opengl/gl_shader_cache.cpp7
-rw-r--r--src/video_core/renderer_opengl/gl_shader_cache.h5
-rw-r--r--src/video_core/renderer_opengl/gl_shader_decompiler.cpp208
-rw-r--r--src/video_core/renderer_opengl/gl_shader_decompiler.h16
-rw-r--r--src/video_core/renderer_opengl/gl_shader_disk_cache.cpp59
-rw-r--r--src/video_core/renderer_opengl/gl_shader_gen.cpp1
-rw-r--r--src/video_core/renderer_opengl/gl_shader_gen.h3
-rw-r--r--src/video_core/renderer_opengl/gl_shader_manager.cpp51
-rw-r--r--src/video_core/renderer_opengl/gl_shader_manager.h73
-rw-r--r--src/video_core/renderer_opengl/gl_state.cpp610
-rw-r--r--src/video_core/renderer_opengl/gl_state.h52
-rw-r--r--src/video_core/renderer_opengl/renderer_opengl.cpp1
-rw-r--r--src/video_core/renderer_opengl/utils.cpp28
-rw-r--r--src/video_core/renderer_opengl/utils.h20
-rw-r--r--src/video_core/renderer_vulkan/vk_buffer_cache.cpp1
-rw-r--r--src/video_core/renderer_vulkan/vk_shader_decompiler.cpp1381
-rw-r--r--src/video_core/renderer_vulkan/vk_shader_decompiler.h80
-rw-r--r--src/video_core/shader/decode/memory.cpp109
-rw-r--r--src/video_core/shader/decode/texture.cpp114
-rw-r--r--src/video_core/shader/decode/xmad.cpp39
-rw-r--r--src/video_core/shader/shader_ir.h28
-rw-r--r--src/video_core/shader/track.cpp17
-rw-r--r--src/video_core/surface.cpp2
-rw-r--r--src/video_core/texture_cache.cpp386
-rw-r--r--src/video_core/texture_cache.h586
-rw-r--r--src/video_core/textures/convert.cpp1
-rw-r--r--src/video_core/textures/convert.h5
-rw-r--r--src/video_core/textures/texture.h2
56 files changed, 3682 insertions, 803 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 44c761d3e..114bed20d 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -106,6 +106,8 @@ add_library(video_core STATIC
textures/decoders.cpp
textures/decoders.h
textures/texture.h
+ texture_cache.cpp
+ texture_cache.h
video_core.cpp
video_core.h
)
@@ -127,16 +129,21 @@ if (ENABLE_VULKAN)
renderer_vulkan/vk_sampler_cache.h
renderer_vulkan/vk_scheduler.cpp
renderer_vulkan/vk_scheduler.h
+ renderer_vulkan/vk_shader_decompiler.cpp
+ renderer_vulkan/vk_shader_decompiler.h
renderer_vulkan/vk_stream_buffer.cpp
renderer_vulkan/vk_stream_buffer.h
renderer_vulkan/vk_swapchain.cpp
renderer_vulkan/vk_swapchain.h)
- target_include_directories(video_core PRIVATE ../../externals/Vulkan-Headers/include)
+ target_include_directories(video_core PRIVATE sirit ../../externals/Vulkan-Headers/include)
target_compile_definitions(video_core PRIVATE HAS_VULKAN)
endif()
create_target_directory_groups(video_core)
target_link_libraries(video_core PUBLIC common core)
-target_link_libraries(video_core PRIVATE glad lz4_static)
+target_link_libraries(video_core PRIVATE glad)
+if (ENABLE_VULKAN)
+ target_link_libraries(video_core PRIVATE sirit)
+endif()
diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp
index 8b1bea1ae..046d047cb 100644
--- a/src/video_core/dma_pusher.cpp
+++ b/src/video_core/dma_pusher.cpp
@@ -8,6 +8,7 @@
#include "video_core/dma_pusher.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/gpu.h"
+#include "video_core/memory_manager.h"
namespace Tegra {
diff --git a/src/video_core/engines/fermi_2d.cpp b/src/video_core/engines/fermi_2d.cpp
index 03b7ee5d8..55966eef1 100644
--- a/src/video_core/engines/fermi_2d.cpp
+++ b/src/video_core/engines/fermi_2d.cpp
@@ -6,12 +6,13 @@
#include "common/logging/log.h"
#include "common/math_util.h"
#include "video_core/engines/fermi_2d.h"
+#include "video_core/memory_manager.h"
#include "video_core/rasterizer_interface.h"
namespace Tegra::Engines {
Fermi2D::Fermi2D(VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager)
- : memory_manager(memory_manager), rasterizer{rasterizer} {}
+ : rasterizer{rasterizer}, memory_manager{memory_manager} {}
void Fermi2D::CallMethod(const GPU::MethodCall& method_call) {
ASSERT_MSG(method_call.method < Regs::NUM_REGS,
diff --git a/src/video_core/engines/fermi_2d.h b/src/video_core/engines/fermi_2d.h
index 80523e320..2e51b7f13 100644
--- a/src/video_core/engines/fermi_2d.h
+++ b/src/video_core/engines/fermi_2d.h
@@ -10,7 +10,10 @@
#include "common/common_funcs.h"
#include "common/common_types.h"
#include "video_core/gpu.h"
-#include "video_core/memory_manager.h"
+
+namespace Tegra {
+class MemoryManager;
+}
namespace VideoCore {
class RasterizerInterface;
@@ -115,10 +118,9 @@ public:
};
} regs{};
- MemoryManager& memory_manager;
-
private:
VideoCore::RasterizerInterface& rasterizer;
+ MemoryManager& memory_manager;
/// Performs the copy from the source surface to the destination surface as configured in the
/// registers.
diff --git a/src/video_core/engines/kepler_compute.h b/src/video_core/engines/kepler_compute.h
index 6575afd0f..fb6cdf432 100644
--- a/src/video_core/engines/kepler_compute.h
+++ b/src/video_core/engines/kepler_compute.h
@@ -9,7 +9,10 @@
#include "common/common_funcs.h"
#include "common/common_types.h"
#include "video_core/gpu.h"
-#include "video_core/memory_manager.h"
+
+namespace Tegra {
+class MemoryManager;
+}
namespace Tegra::Engines {
@@ -40,10 +43,11 @@ public:
static_assert(sizeof(Regs) == Regs::NUM_REGS * sizeof(u32),
"KeplerCompute Regs has wrong size");
- MemoryManager& memory_manager;
-
/// Write the value to the register identified by method.
void CallMethod(const GPU::MethodCall& method_call);
+
+private:
+ MemoryManager& memory_manager;
};
#define ASSERT_REG_POSITION(field_name, position) \
diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp
index e259bf46b..cd51a31d7 100644
--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -5,9 +5,9 @@
#include "common/assert.h"
#include "common/logging/log.h"
#include "core/core.h"
-#include "core/memory.h"
#include "video_core/engines/kepler_memory.h"
#include "video_core/engines/maxwell_3d.h"
+#include "video_core/memory_manager.h"
#include "video_core/rasterizer_interface.h"
#include "video_core/renderer_base.h"
@@ -15,7 +15,7 @@ namespace Tegra::Engines {
KeplerMemory::KeplerMemory(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
MemoryManager& memory_manager)
- : system{system}, memory_manager(memory_manager), rasterizer{rasterizer} {}
+ : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager} {}
KeplerMemory::~KeplerMemory() = default;
diff --git a/src/video_core/engines/kepler_memory.h b/src/video_core/engines/kepler_memory.h
index 9181e9d80..78b6c3e45 100644
--- a/src/video_core/engines/kepler_memory.h
+++ b/src/video_core/engines/kepler_memory.h
@@ -10,12 +10,15 @@
#include "common/common_funcs.h"
#include "common/common_types.h"
#include "video_core/gpu.h"
-#include "video_core/memory_manager.h"
namespace Core {
class System;
}
+namespace Tegra {
+class MemoryManager;
+}
+
namespace VideoCore {
class RasterizerInterface;
}
@@ -82,8 +85,8 @@ public:
private:
Core::System& system;
- MemoryManager& memory_manager;
VideoCore::RasterizerInterface& rasterizer;
+ MemoryManager& memory_manager;
void ProcessData(u32 data);
};
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index defcfbd3f..74403eed4 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -7,11 +7,10 @@
#include "common/assert.h"
#include "core/core.h"
#include "core/core_timing.h"
-#include "core/memory.h"
#include "video_core/debug_utils/debug_utils.h"
#include "video_core/engines/maxwell_3d.h"
+#include "video_core/memory_manager.h"
#include "video_core/rasterizer_interface.h"
-#include "video_core/renderer_base.h"
#include "video_core/textures/texture.h"
namespace Tegra::Engines {
@@ -21,8 +20,8 @@ constexpr u32 MacroRegistersStart = 0xE00;
Maxwell3D::Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
MemoryManager& memory_manager)
- : memory_manager(memory_manager), system{system}, rasterizer{rasterizer},
- macro_interpreter(*this) {
+ : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager}, macro_interpreter{
+ *this} {
InitializeRegisterDefaults();
}
@@ -250,6 +249,10 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
ProcessQueryGet();
break;
}
+ case MAXWELL3D_REG_INDEX(sync_info): {
+ ProcessSyncPoint();
+ break;
+ }
default:
break;
}
@@ -327,6 +330,14 @@ void Maxwell3D::ProcessQueryGet() {
}
}
+void Maxwell3D::ProcessSyncPoint() {
+ const u32 sync_point = regs.sync_info.sync_point.Value();
+ const u32 increment = regs.sync_info.increment.Value();
+ const u32 cache_flush = regs.sync_info.unknown.Value();
+ LOG_DEBUG(HW_GPU, "Syncpoint set {}, increment: {}, unk: {}", sync_point, increment,
+ cache_flush);
+}
+
void Maxwell3D::DrawArrays() {
LOG_DEBUG(HW_GPU, "called, topology={}, count={}", static_cast<u32>(regs.draw.topology.Value()),
regs.vertex_buffer.count);
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 7fbf1026e..321af3297 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -16,13 +16,16 @@
#include "common/math_util.h"
#include "video_core/gpu.h"
#include "video_core/macro_interpreter.h"
-#include "video_core/memory_manager.h"
#include "video_core/textures/texture.h"
namespace Core {
class System;
}
+namespace Tegra {
+class MemoryManager;
+}
+
namespace VideoCore {
class RasterizerInterface;
}
@@ -576,7 +579,17 @@ public:
u32 bind;
} macros;
- INSERT_PADDING_WORDS(0x188);
+ INSERT_PADDING_WORDS(0x69);
+
+ struct {
+ union {
+ BitField<0, 16, u32> sync_point;
+ BitField<16, 1, u32> unknown;
+ BitField<20, 1, u32> increment;
+ };
+ } sync_info;
+
+ INSERT_PADDING_WORDS(0x11E);
u32 tfb_enabled;
@@ -1093,7 +1106,6 @@ public:
};
State state{};
- MemoryManager& memory_manager;
struct DirtyFlags {
std::bitset<8> color_buffer{0xFF};
@@ -1141,6 +1153,8 @@ private:
VideoCore::RasterizerInterface& rasterizer;
+ MemoryManager& memory_manager;
+
/// Start offsets of each macro in macro_memory
std::unordered_map<u32, u32> macro_offsets;
@@ -1180,6 +1194,9 @@ private:
/// Handles a write to the QUERY_GET register.
void ProcessQueryGet();
+ /// Handles writes to syncing register.
+ void ProcessSyncPoint();
+
/// Handles a write to the CB_DATA[i] register.
void ProcessCBData(u32 value);
@@ -1195,6 +1212,7 @@ private:
"Field " #field_name " has invalid position")
ASSERT_REG_POSITION(macros, 0x45);
+ASSERT_REG_POSITION(sync_info, 0xB2);
ASSERT_REG_POSITION(tfb_enabled, 0x1D1);
ASSERT_REG_POSITION(rt, 0x200);
ASSERT_REG_POSITION(viewport_transform, 0x280);
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index 5cca5c29a..2426d0067 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -5,9 +5,9 @@
#include "common/assert.h"
#include "common/logging/log.h"
#include "core/core.h"
-#include "core/memory.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/engines/maxwell_dma.h"
+#include "video_core/memory_manager.h"
#include "video_core/rasterizer_interface.h"
#include "video_core/renderer_base.h"
#include "video_core/textures/decoders.h"
@@ -16,7 +16,7 @@ namespace Tegra::Engines {
MaxwellDMA::MaxwellDMA(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
MemoryManager& memory_manager)
- : memory_manager(memory_manager), system{system}, rasterizer{rasterizer} {}
+ : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager} {}
void MaxwellDMA::CallMethod(const GPU::MethodCall& method_call) {
ASSERT_MSG(method_call.method < Regs::NUM_REGS,
diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h
index 34c369320..c6b649842 100644
--- a/src/video_core/engines/maxwell_dma.h
+++ b/src/video_core/engines/maxwell_dma.h
@@ -10,12 +10,15 @@
#include "common/common_funcs.h"
#include "common/common_types.h"
#include "video_core/gpu.h"
-#include "video_core/memory_manager.h"
namespace Core {
class System;
}
+namespace Tegra {
+class MemoryManager;
+}
+
namespace VideoCore {
class RasterizerInterface;
}
@@ -139,13 +142,13 @@ public:
};
} regs{};
- MemoryManager& memory_manager;
-
private:
Core::System& system;
VideoCore::RasterizerInterface& rasterizer;
+ MemoryManager& memory_manager;
+
/// Performs the copy from the source buffer to the destination buffer as configured in the
/// registers.
void HandleCopy();
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index c41e3f4f0..38db4addd 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -806,6 +806,12 @@ union Instruction {
} ldg;
union {
+ BitField<48, 3, UniformType> type;
+ BitField<46, 2, u64> cache_mode;
+ BitField<20, 24, s64> immediate_offset;
+ } stg;
+
+ union {
BitField<0, 3, u64> pred0;
BitField<3, 3, u64> pred3;
BitField<7, 1, u64> abs_a;
@@ -1252,13 +1258,16 @@ union Instruction {
union {
BitField<20, 16, u64> imm20_16;
+ BitField<35, 1, u64> high_b_rr; // used on RR
BitField<36, 1, u64> product_shift_left;
BitField<37, 1, u64> merge_37;
BitField<48, 1, u64> sign_a;
BitField<49, 1, u64> sign_b;
+ BitField<50, 2, XmadMode> mode_cbf; // used by CR, RC
BitField<50, 3, XmadMode> mode;
BitField<52, 1, u64> high_b;
BitField<53, 1, u64> high_a;
+ BitField<55, 1, u64> product_shift_left_second; // used on CR
BitField<56, 1, u64> merge_56;
} xmad;
@@ -1676,7 +1685,7 @@ private:
INST("0011011-11110---", Id::BFI_IMM_R, Type::Bfi, "BFI_IMM_R"),
INST("0100110001000---", Id::LOP_C, Type::ArithmeticInteger, "LOP_C"),
INST("0101110001000---", Id::LOP_R, Type::ArithmeticInteger, "LOP_R"),
- INST("0011100001000---", Id::LOP_IMM, Type::ArithmeticInteger, "LOP_IMM"),
+ INST("0011100-01000---", Id::LOP_IMM, Type::ArithmeticInteger, "LOP_IMM"),
INST("000001----------", Id::LOP32I, Type::ArithmeticIntegerImmediate, "LOP32I"),
INST("0000001---------", Id::LOP3_C, Type::ArithmeticInteger, "LOP3_C"),
INST("0101101111100---", Id::LOP3_R, Type::ArithmeticInteger, "LOP3_R"),
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 30b29e14d..4461083ff 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -31,7 +31,7 @@ u32 FramebufferConfig::BytesPerPixel(PixelFormat format) {
GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer) : renderer{renderer} {
auto& rasterizer{renderer.Rasterizer()};
- memory_manager = std::make_unique<Tegra::MemoryManager>();
+ memory_manager = std::make_unique<Tegra::MemoryManager>(rasterizer);
dma_pusher = std::make_unique<Tegra::DmaPusher>(*this);
maxwell_3d = std::make_unique<Engines::Maxwell3D>(system, rasterizer, *memory_manager);
fermi_2d = std::make_unique<Engines::Fermi2D>(rasterizer, *memory_manager);
diff --git a/src/video_core/gpu_asynch.cpp b/src/video_core/gpu_asynch.cpp
index 8b355cf7b..db507cf04 100644
--- a/src/video_core/gpu_asynch.cpp
+++ b/src/video_core/gpu_asynch.cpp
@@ -9,7 +9,7 @@
namespace VideoCommon {
GPUAsynch::GPUAsynch(Core::System& system, VideoCore::RendererBase& renderer)
- : Tegra::GPU(system, renderer), gpu_thread{renderer, *dma_pusher} {}
+ : Tegra::GPU(system, renderer), gpu_thread{system, renderer, *dma_pusher} {}
GPUAsynch::~GPUAsynch() = default;
diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index c5dc199c5..cc56cf467 100644
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -4,6 +4,9 @@
#include "common/assert.h"
#include "common/microprofile.h"
+#include "core/core.h"
+#include "core/core_timing.h"
+#include "core/core_timing_util.h"
#include "core/frontend/scope_acquire_window_context.h"
#include "video_core/dma_pusher.h"
#include "video_core/gpu.h"
@@ -36,7 +39,6 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p
dma_pusher.Push(std::move(submit_list->entries));
dma_pusher.DispatchCalls();
} else if (const auto data = std::get_if<SwapBuffersCommand>(&next.data)) {
- state.DecrementFramesCounter();
renderer.SwapBuffers(std::move(data->framebuffer));
} else if (const auto data = std::get_if<FlushRegionCommand>(&next.data)) {
renderer.Rasterizer().FlushRegion(data->addr, data->size);
@@ -47,13 +49,18 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p
} else {
UNREACHABLE();
}
+ state.signaled_fence = next.fence;
+ state.TrySynchronize();
}
}
}
-ThreadManager::ThreadManager(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher)
- : renderer{renderer}, thread{RunThread, std::ref(renderer), std::ref(dma_pusher),
- std::ref(state)} {}
+ThreadManager::ThreadManager(Core::System& system, VideoCore::RendererBase& renderer,
+ Tegra::DmaPusher& dma_pusher)
+ : system{system}, thread{RunThread, std::ref(renderer), std::ref(dma_pusher), std::ref(state)} {
+ synchronization_event = system.CoreTiming().RegisterEvent(
+ "GPUThreadSynch", [this](u64 fence, s64) { state.WaitForSynchronization(fence); });
+}
ThreadManager::~ThreadManager() {
// Notify GPU thread that a shutdown is pending
@@ -62,14 +69,14 @@ ThreadManager::~ThreadManager() {
}
void ThreadManager::SubmitList(Tegra::CommandList&& entries) {
- PushCommand(SubmitListCommand(std::move(entries)));
+ const u64 fence{PushCommand(SubmitListCommand(std::move(entries)))};
+ const s64 synchronization_ticks{Core::Timing::usToCycles(9000)};
+ system.CoreTiming().ScheduleEvent(synchronization_ticks, synchronization_event, fence);
}
void ThreadManager::SwapBuffers(
std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) {
- state.IncrementFramesCounter();
PushCommand(SwapBuffersCommand(std::move(framebuffer)));
- state.WaitForFrames();
}
void ThreadManager::FlushRegion(CacheAddr addr, u64 size) {
@@ -79,7 +86,7 @@ void ThreadManager::FlushRegion(CacheAddr addr, u64 size) {
void ThreadManager::InvalidateRegion(CacheAddr addr, u64 size) {
if (state.queue.Empty()) {
// It's quicker to invalidate a single region on the CPU if the queue is already empty
- renderer.Rasterizer().InvalidateRegion(addr, size);
+ system.Renderer().Rasterizer().InvalidateRegion(addr, size);
} else {
PushCommand(InvalidateRegionCommand(addr, size));
}
@@ -90,9 +97,25 @@ void ThreadManager::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
InvalidateRegion(addr, size);
}
-void ThreadManager::PushCommand(CommandData&& command_data) {
- state.queue.Push(CommandDataContainer(std::move(command_data)));
+u64 ThreadManager::PushCommand(CommandData&& command_data) {
+ const u64 fence{++state.last_fence};
+ state.queue.Push(CommandDataContainer(std::move(command_data), fence));
state.SignalCommands();
+ return fence;
+}
+
+MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192));
+void SynchState::WaitForSynchronization(u64 fence) {
+ if (signaled_fence >= fence) {
+ return;
+ }
+
+ // Wait for the GPU to be idle (all commands to be executed)
+ {
+ MICROPROFILE_SCOPE(GPU_wait);
+ std::unique_lock<std::mutex> lock{synchronization_mutex};
+ synchronization_condition.wait(lock, [this, fence] { return signaled_fence >= fence; });
+ }
}
} // namespace VideoCommon::GPUThread
diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h
index 70acb2e79..62bcea5bb 100644
--- a/src/video_core/gpu_thread.h
+++ b/src/video_core/gpu_thread.h
@@ -19,9 +19,12 @@ struct FramebufferConfig;
class DmaPusher;
} // namespace Tegra
-namespace VideoCore {
-class RendererBase;
-} // namespace VideoCore
+namespace Core {
+class System;
+namespace Timing {
+struct EventType;
+} // namespace Timing
+} // namespace Core
namespace VideoCommon::GPUThread {
@@ -75,63 +78,47 @@ using CommandData =
struct CommandDataContainer {
CommandDataContainer() = default;
- CommandDataContainer(CommandData&& data) : data{std::move(data)} {}
+ CommandDataContainer(CommandData&& data, u64 next_fence)
+ : data{std::move(data)}, fence{next_fence} {}
CommandDataContainer& operator=(const CommandDataContainer& t) {
data = std::move(t.data);
+ fence = t.fence;
return *this;
}
CommandData data;
+ u64 fence{};
};
/// Struct used to synchronize the GPU thread
struct SynchState final {
std::atomic_bool is_running{true};
std::atomic_int queued_frame_count{};
- std::mutex frames_mutex;
+ std::mutex synchronization_mutex;
std::mutex commands_mutex;
std::condition_variable commands_condition;
- std::condition_variable frames_condition;
+ std::condition_variable synchronization_condition;
- void IncrementFramesCounter() {
- std::lock_guard lock{frames_mutex};
- ++queued_frame_count;
+ /// Returns true if the gap in GPU commands is small enough that we can consider the CPU and GPU
+ /// synchronized. This is entirely empirical.
+ bool IsSynchronized() const {
+ constexpr std::size_t max_queue_gap{5};
+ return queue.Size() <= max_queue_gap;
}
- void DecrementFramesCounter() {
- {
- std::lock_guard lock{frames_mutex};
- --queued_frame_count;
-
- if (queued_frame_count) {
- return;
- }
+ void TrySynchronize() {
+ if (IsSynchronized()) {
+ std::lock_guard<std::mutex> lock{synchronization_mutex};
+ synchronization_condition.notify_one();
}
- frames_condition.notify_one();
}
- void WaitForFrames() {
- {
- std::lock_guard lock{frames_mutex};
- if (!queued_frame_count) {
- return;
- }
- }
-
- // Wait for the GPU to be idle (all commands to be executed)
- {
- std::unique_lock lock{frames_mutex};
- frames_condition.wait(lock, [this] { return !queued_frame_count; });
- }
- }
+ void WaitForSynchronization(u64 fence);
void SignalCommands() {
- {
- std::unique_lock lock{commands_mutex};
- if (queue.Empty()) {
- return;
- }
+ if (queue.Empty()) {
+ return;
}
commands_condition.notify_one();
@@ -144,12 +131,15 @@ struct SynchState final {
using CommandQueue = Common::SPSCQueue<CommandDataContainer>;
CommandQueue queue;
+ u64 last_fence{};
+ std::atomic<u64> signaled_fence{};
};
/// Class used to manage the GPU thread
class ThreadManager final {
public:
- explicit ThreadManager(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher);
+ explicit ThreadManager(Core::System& system, VideoCore::RendererBase& renderer,
+ Tegra::DmaPusher& dma_pusher);
~ThreadManager();
/// Push GPU command entries to be processed
@@ -170,11 +160,12 @@ public:
private:
/// Pushes a command to be executed by the GPU thread
- void PushCommand(CommandData&& command_data);
+ u64 PushCommand(CommandData&& command_data);
private:
SynchState state;
- VideoCore::RendererBase& renderer;
+ Core::System& system;
+ Core::Timing::EventType* synchronization_event{};
std::thread thread;
std::thread::id thread_id;
};
diff --git a/src/video_core/macro_interpreter.cpp b/src/video_core/macro_interpreter.cpp
index 64f75db43..524d9ea5a 100644
--- a/src/video_core/macro_interpreter.cpp
+++ b/src/video_core/macro_interpreter.cpp
@@ -223,27 +223,21 @@ void MacroInterpreter::ProcessResult(ResultOperation operation, u32 reg, u32 res
}
u32 MacroInterpreter::FetchParameter() {
- ASSERT(next_parameter_index < parameters.size());
- return parameters[next_parameter_index++];
+ return parameters.at(next_parameter_index++);
}
u32 MacroInterpreter::GetRegister(u32 register_id) const {
- // Register 0 is supposed to always return 0.
- if (register_id == 0)
- return 0;
-
- ASSERT(register_id < registers.size());
- return registers[register_id];
+ return registers.at(register_id);
}
void MacroInterpreter::SetRegister(u32 register_id, u32 value) {
- // Register 0 is supposed to always return 0. NOP is implemented as a store to the zero
- // register.
- if (register_id == 0)
+ // Register 0 is hardwired as the zero register.
+ // Ensure no writes to it actually occur.
+ if (register_id == 0) {
return;
+ }
- ASSERT(register_id < registers.size());
- registers[register_id] = value;
+ registers.at(register_id) = value;
}
void MacroInterpreter::SetMethodAddress(u32 address) {
diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp
index e76b59842..0f4e820aa 100644
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@@ -5,16 +5,13 @@
#include "common/alignment.h"
#include "common/assert.h"
#include "common/logging/log.h"
-#include "core/core.h"
#include "core/memory.h"
-#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
#include "video_core/rasterizer_interface.h"
-#include "video_core/renderer_base.h"
namespace Tegra {
-MemoryManager::MemoryManager() {
+MemoryManager::MemoryManager(VideoCore::RasterizerInterface& rasterizer) : rasterizer{rasterizer} {
std::fill(page_table.pointers.begin(), page_table.pointers.end(), nullptr);
std::fill(page_table.attributes.begin(), page_table.attributes.end(),
Common::PageType::Unmapped);
@@ -70,23 +67,23 @@ GPUVAddr MemoryManager::UnmapBuffer(GPUVAddr gpu_addr, u64 size) {
const u64 aligned_size{Common::AlignUp(size, page_size)};
const CacheAddr cache_addr{ToCacheAddr(GetPointer(gpu_addr))};
- Core::System::GetInstance().Renderer().Rasterizer().FlushAndInvalidateRegion(cache_addr,
- aligned_size);
+ rasterizer.FlushAndInvalidateRegion(cache_addr, aligned_size);
UnmapRange(gpu_addr, aligned_size);
return gpu_addr;
}
-GPUVAddr MemoryManager::FindFreeRegion(GPUVAddr region_start, u64 size) {
+GPUVAddr MemoryManager::FindFreeRegion(GPUVAddr region_start, u64 size) const {
// Find the first Free VMA.
- const VMAHandle vma_handle{std::find_if(vma_map.begin(), vma_map.end(), [&](const auto& vma) {
- if (vma.second.type != VirtualMemoryArea::Type::Unmapped) {
- return false;
- }
+ const VMAHandle vma_handle{
+ std::find_if(vma_map.begin(), vma_map.end(), [region_start, size](const auto& vma) {
+ if (vma.second.type != VirtualMemoryArea::Type::Unmapped) {
+ return false;
+ }
- const VAddr vma_end{vma.second.base + vma.second.size};
- return vma_end > region_start && vma_end >= region_start + size;
- })};
+ const VAddr vma_end{vma.second.base + vma.second.size};
+ return vma_end > region_start && vma_end >= region_start + size;
+ })};
if (vma_handle == vma_map.end()) {
return {};
@@ -99,12 +96,12 @@ bool MemoryManager::IsAddressValid(GPUVAddr addr) const {
return (addr >> page_bits) < page_table.pointers.size();
}
-std::optional<VAddr> MemoryManager::GpuToCpuAddress(GPUVAddr addr) {
+std::optional<VAddr> MemoryManager::GpuToCpuAddress(GPUVAddr addr) const {
if (!IsAddressValid(addr)) {
return {};
}
- VAddr cpu_addr{page_table.backing_addr[addr >> page_bits]};
+ const VAddr cpu_addr{page_table.backing_addr[addr >> page_bits]};
if (cpu_addr) {
return cpu_addr + (addr & page_mask);
}
@@ -113,7 +110,7 @@ std::optional<VAddr> MemoryManager::GpuToCpuAddress(GPUVAddr addr) {
}
template <typename T>
-T MemoryManager::Read(GPUVAddr addr) {
+T MemoryManager::Read(GPUVAddr addr) const {
if (!IsAddressValid(addr)) {
return {};
}
@@ -165,10 +162,10 @@ void MemoryManager::Write(GPUVAddr addr, T data) {
}
}
-template u8 MemoryManager::Read<u8>(GPUVAddr addr);
-template u16 MemoryManager::Read<u16>(GPUVAddr addr);
-template u32 MemoryManager::Read<u32>(GPUVAddr addr);
-template u64 MemoryManager::Read<u64>(GPUVAddr addr);
+template u8 MemoryManager::Read<u8>(GPUVAddr addr) const;
+template u16 MemoryManager::Read<u16>(GPUVAddr addr) const;
+template u32 MemoryManager::Read<u32>(GPUVAddr addr) const;
+template u64 MemoryManager::Read<u64>(GPUVAddr addr) const;
template void MemoryManager::Write<u8>(GPUVAddr addr, u8 data);
template void MemoryManager::Write<u16>(GPUVAddr addr, u16 data);
template void MemoryManager::Write<u32>(GPUVAddr addr, u32 data);
@@ -179,8 +176,22 @@ u8* MemoryManager::GetPointer(GPUVAddr addr) {
return {};
}
- u8* page_pointer{page_table.pointers[addr >> page_bits]};
- if (page_pointer) {
+ u8* const page_pointer{page_table.pointers[addr >> page_bits]};
+ if (page_pointer != nullptr) {
+ return page_pointer + (addr & page_mask);
+ }
+
+ LOG_ERROR(HW_GPU, "Unknown GetPointer @ 0x{:016X}", addr);
+ return {};
+}
+
+const u8* MemoryManager::GetPointer(GPUVAddr addr) const {
+ if (!IsAddressValid(addr)) {
+ return {};
+ }
+
+ const u8* const page_pointer{page_table.pointers[addr >> page_bits]};
+ if (page_pointer != nullptr) {
return page_pointer + (addr & page_mask);
}
@@ -188,15 +199,86 @@ u8* MemoryManager::GetPointer(GPUVAddr addr) {
return {};
}
-void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t size) {
- std::memcpy(dest_buffer, GetPointer(src_addr), size);
+void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const {
+ std::size_t remaining_size{size};
+ std::size_t page_index{src_addr >> page_bits};
+ std::size_t page_offset{src_addr & page_mask};
+
+ while (remaining_size > 0) {
+ const std::size_t copy_amount{
+ std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)};
+
+ switch (page_table.attributes[page_index]) {
+ case Common::PageType::Memory: {
+ const u8* src_ptr{page_table.pointers[page_index] + page_offset};
+ rasterizer.FlushRegion(ToCacheAddr(src_ptr), copy_amount);
+ std::memcpy(dest_buffer, src_ptr, copy_amount);
+ break;
+ }
+ default:
+ UNREACHABLE();
+ }
+
+ page_index++;
+ page_offset = 0;
+ dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount;
+ remaining_size -= copy_amount;
+ }
}
+
void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, std::size_t size) {
- std::memcpy(GetPointer(dest_addr), src_buffer, size);
+ std::size_t remaining_size{size};
+ std::size_t page_index{dest_addr >> page_bits};
+ std::size_t page_offset{dest_addr & page_mask};
+
+ while (remaining_size > 0) {
+ const std::size_t copy_amount{
+ std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)};
+
+ switch (page_table.attributes[page_index]) {
+ case Common::PageType::Memory: {
+ u8* dest_ptr{page_table.pointers[page_index] + page_offset};
+ rasterizer.InvalidateRegion(ToCacheAddr(dest_ptr), copy_amount);
+ std::memcpy(dest_ptr, src_buffer, copy_amount);
+ break;
+ }
+ default:
+ UNREACHABLE();
+ }
+
+ page_index++;
+ page_offset = 0;
+ src_buffer = static_cast<const u8*>(src_buffer) + copy_amount;
+ remaining_size -= copy_amount;
+ }
}
void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size) {
- std::memcpy(GetPointer(dest_addr), GetPointer(src_addr), size);
+ std::size_t remaining_size{size};
+ std::size_t page_index{src_addr >> page_bits};
+ std::size_t page_offset{src_addr & page_mask};
+
+ while (remaining_size > 0) {
+ const std::size_t copy_amount{
+ std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)};
+
+ switch (page_table.attributes[page_index]) {
+ case Common::PageType::Memory: {
+ const u8* src_ptr{page_table.pointers[page_index] + page_offset};
+ rasterizer.FlushRegion(ToCacheAddr(src_ptr), copy_amount);
+ WriteBlock(dest_addr, src_ptr, copy_amount);
+ break;
+ }
+ default:
+ UNREACHABLE();
+ }
+
+ page_index++;
+ page_offset = 0;
+ dest_addr += static_cast<VAddr>(copy_amount);
+ src_addr += static_cast<VAddr>(copy_amount);
+ remaining_size -= copy_amount;
+ }
}
void MemoryManager::MapPages(GPUVAddr base, u64 size, u8* memory, Common::PageType type,
@@ -336,7 +418,7 @@ MemoryManager::VMAIter MemoryManager::CarveVMA(GPUVAddr base, u64 size) {
const VirtualMemoryArea& vma{vma_handle->second};
if (vma.type == VirtualMemoryArea::Type::Mapped) {
// Region is already allocated
- return {};
+ return vma_handle;
}
const VAddr start_in_vma{base - vma.base};
diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h
index 34744bb27..647cbf93a 100644
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -10,6 +10,10 @@
#include "common/common_types.h"
#include "common/page_table.h"
+namespace VideoCore {
+class RasterizerInterface;
+}
+
namespace Tegra {
/**
@@ -43,24 +47,25 @@ struct VirtualMemoryArea {
class MemoryManager final {
public:
- MemoryManager();
+ MemoryManager(VideoCore::RasterizerInterface& rasterizer);
GPUVAddr AllocateSpace(u64 size, u64 align);
GPUVAddr AllocateSpace(GPUVAddr addr, u64 size, u64 align);
GPUVAddr MapBufferEx(VAddr cpu_addr, u64 size);
GPUVAddr MapBufferEx(VAddr cpu_addr, GPUVAddr addr, u64 size);
GPUVAddr UnmapBuffer(GPUVAddr addr, u64 size);
- std::optional<VAddr> GpuToCpuAddress(GPUVAddr addr);
+ std::optional<VAddr> GpuToCpuAddress(GPUVAddr addr) const;
template <typename T>
- T Read(GPUVAddr addr);
+ T Read(GPUVAddr addr) const;
template <typename T>
void Write(GPUVAddr addr, T data);
u8* GetPointer(GPUVAddr addr);
+ const u8* GetPointer(GPUVAddr addr) const;
- void ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t size);
+ void ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const;
void WriteBlock(GPUVAddr dest_addr, const void* src_buffer, std::size_t size);
void CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size);
@@ -127,7 +132,7 @@ private:
void UpdatePageTableForVMA(const VirtualMemoryArea& vma);
/// Finds a free (unmapped region) of the specified size starting at the specified address.
- GPUVAddr FindFreeRegion(GPUVAddr region_start, u64 size);
+ GPUVAddr FindFreeRegion(GPUVAddr region_start, u64 size) const;
private:
static constexpr u64 page_bits{16};
@@ -143,6 +148,7 @@ private:
Common::PageTable page_table{page_bits};
VMAMap vma_map;
+ VideoCore::RasterizerInterface& rasterizer;
};
} // namespace Tegra
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index fd091c84c..25652e794 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -7,7 +7,7 @@
#include "common/alignment.h"
#include "core/core.h"
-#include "core/memory.h"
+#include "video_core/memory_manager.h"
#include "video_core/renderer_opengl/gl_buffer_cache.h"
#include "video_core/renderer_opengl/gl_rasterizer.h"
diff --git a/src/video_core/renderer_opengl/gl_global_cache.cpp b/src/video_core/renderer_opengl/gl_global_cache.cpp
index da9326253..ea4a593af 100644
--- a/src/video_core/renderer_opengl/gl_global_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_global_cache.cpp
@@ -4,9 +4,9 @@
#include <glad/glad.h>
-#include "common/assert.h"
#include "common/logging/log.h"
#include "core/core.h"
+#include "video_core/memory_manager.h"
#include "video_core/renderer_opengl/gl_global_cache.h"
#include "video_core/renderer_opengl/gl_rasterizer.h"
#include "video_core/renderer_opengl/gl_shader_decompiler.h"
@@ -14,28 +14,28 @@
namespace OpenGL {
-CachedGlobalRegion::CachedGlobalRegion(VAddr cpu_addr, u32 size, u8* host_ptr)
- : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, size{size} {
+CachedGlobalRegion::CachedGlobalRegion(VAddr cpu_addr, u8* host_ptr, u32 size, u32 max_size)
+ : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, host_ptr{host_ptr}, size{size},
+ max_size{max_size} {
buffer.Create();
- // Bind and unbind the buffer so it gets allocated by the driver
- glBindBuffer(GL_SHADER_STORAGE_BUFFER, buffer.handle);
- glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
LabelGLObject(GL_BUFFER, buffer.handle, cpu_addr, "GlobalMemory");
}
-void CachedGlobalRegion::Reload(u32 size_) {
- constexpr auto max_size = static_cast<u32>(RasterizerOpenGL::MaxGlobalMemorySize);
+CachedGlobalRegion::~CachedGlobalRegion() = default;
+void CachedGlobalRegion::Reload(u32 size_) {
size = size_;
if (size > max_size) {
size = max_size;
- LOG_CRITICAL(HW_GPU, "Global region size {} exceeded the expected size {}!", size_,
+ LOG_CRITICAL(HW_GPU, "Global region size {} exceeded the supported size {}!", size_,
max_size);
}
+ glNamedBufferData(buffer.handle, size, host_ptr, GL_STREAM_DRAW);
+}
- // TODO(Rodrigo): Get rid of Memory::GetPointer with a staging buffer
- glBindBuffer(GL_SHADER_STORAGE_BUFFER, buffer.handle);
- glBufferData(GL_SHADER_STORAGE_BUFFER, size, GetHostPtr(), GL_DYNAMIC_DRAW);
+void CachedGlobalRegion::Flush() {
+ LOG_DEBUG(Render_OpenGL, "Flushing {} bytes to CPU memory address 0x{:16}", size, cpu_addr);
+ glGetNamedBufferSubData(buffer.handle, 0, static_cast<GLsizeiptr>(size), host_ptr);
}
GlobalRegion GlobalRegionCacheOpenGL::TryGetReservedGlobalRegion(CacheAddr addr, u32 size) const {
@@ -46,14 +46,16 @@ GlobalRegion GlobalRegionCacheOpenGL::TryGetReservedGlobalRegion(CacheAddr addr,
return search->second;
}
-GlobalRegion GlobalRegionCacheOpenGL::GetUncachedGlobalRegion(GPUVAddr addr, u32 size,
- u8* host_ptr) {
+GlobalRegion GlobalRegionCacheOpenGL::GetUncachedGlobalRegion(GPUVAddr addr, u8* host_ptr,
+ u32 size) {
GlobalRegion region{TryGetReservedGlobalRegion(ToCacheAddr(host_ptr), size)};
if (!region) {
// No reserved surface available, create a new one and reserve it
auto& memory_manager{Core::System::GetInstance().GPU().MemoryManager()};
- const auto cpu_addr = *memory_manager.GpuToCpuAddress(addr);
- region = std::make_shared<CachedGlobalRegion>(cpu_addr, size, host_ptr);
+ const auto cpu_addr{memory_manager.GpuToCpuAddress(addr)};
+ ASSERT(cpu_addr);
+
+ region = std::make_shared<CachedGlobalRegion>(*cpu_addr, host_ptr, size, max_ssbo_size);
ReserveGlobalRegion(region);
}
region->Reload(size);
@@ -65,7 +67,11 @@ void GlobalRegionCacheOpenGL::ReserveGlobalRegion(GlobalRegion region) {
}
GlobalRegionCacheOpenGL::GlobalRegionCacheOpenGL(RasterizerOpenGL& rasterizer)
- : RasterizerCache{rasterizer} {}
+ : RasterizerCache{rasterizer} {
+ GLint max_ssbo_size_;
+ glGetIntegerv(GL_MAX_SHADER_STORAGE_BLOCK_SIZE, &max_ssbo_size_);
+ max_ssbo_size = static_cast<u32>(max_ssbo_size_);
+}
GlobalRegion GlobalRegionCacheOpenGL::GetGlobalRegion(
const GLShader::GlobalMemoryEntry& global_region,
@@ -73,7 +79,7 @@ GlobalRegion GlobalRegionCacheOpenGL::GetGlobalRegion(
auto& gpu{Core::System::GetInstance().GPU()};
auto& memory_manager{gpu.MemoryManager()};
- const auto cbufs{gpu.Maxwell3D().state.shader_stages[static_cast<u64>(stage)]};
+ const auto cbufs{gpu.Maxwell3D().state.shader_stages[static_cast<std::size_t>(stage)]};
const auto addr{cbufs.const_buffers[global_region.GetCbufIndex()].address +
global_region.GetCbufOffset()};
const auto actual_addr{memory_manager.Read<u64>(addr)};
@@ -85,7 +91,7 @@ GlobalRegion GlobalRegionCacheOpenGL::GetGlobalRegion(
if (!region) {
// No global region found - create a new one
- region = GetUncachedGlobalRegion(actual_addr, size, host_ptr);
+ region = GetUncachedGlobalRegion(actual_addr, host_ptr, size);
Register(region);
}
diff --git a/src/video_core/renderer_opengl/gl_global_cache.h b/src/video_core/renderer_opengl/gl_global_cache.h
index 5a21ab66f..196e6e278 100644
--- a/src/video_core/renderer_opengl/gl_global_cache.h
+++ b/src/video_core/renderer_opengl/gl_global_cache.h
@@ -19,7 +19,7 @@ namespace OpenGL {
namespace GLShader {
class GlobalMemoryEntry;
-} // namespace GLShader
+}
class RasterizerOpenGL;
class CachedGlobalRegion;
@@ -27,7 +27,8 @@ using GlobalRegion = std::shared_ptr<CachedGlobalRegion>;
class CachedGlobalRegion final : public RasterizerCacheObject {
public:
- explicit CachedGlobalRegion(VAddr cpu_addr, u32 size, u8* host_ptr);
+ explicit CachedGlobalRegion(VAddr cpu_addr, u8* host_ptr, u32 size, u32 max_size);
+ ~CachedGlobalRegion();
VAddr GetCpuAddr() const override {
return cpu_addr;
@@ -45,14 +46,14 @@ public:
/// Reloads the global region from guest memory
void Reload(u32 size_);
- // TODO(Rodrigo): When global memory is written (STG), implement flushing
- void Flush() override {
- UNIMPLEMENTED();
- }
+ void Flush() override;
private:
VAddr cpu_addr{};
+ u8* host_ptr{};
u32 size{};
+ u32 max_size{};
+
OGLBuffer buffer;
};
@@ -66,10 +67,11 @@ public:
private:
GlobalRegion TryGetReservedGlobalRegion(CacheAddr addr, u32 size) const;
- GlobalRegion GetUncachedGlobalRegion(GPUVAddr addr, u32 size, u8* host_ptr);
+ GlobalRegion GetUncachedGlobalRegion(GPUVAddr addr, u8* host_ptr, u32 size);
void ReserveGlobalRegion(GlobalRegion region);
std::unordered_map<CacheAddr, GlobalRegion> reserve;
+ u32 max_ssbo_size{};
};
} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_primitive_assembler.cpp b/src/video_core/renderer_opengl/gl_primitive_assembler.cpp
index 2bcbd3da2..c3e94d917 100644
--- a/src/video_core/renderer_opengl/gl_primitive_assembler.cpp
+++ b/src/video_core/renderer_opengl/gl_primitive_assembler.cpp
@@ -7,7 +7,7 @@
#include "common/assert.h"
#include "common/common_types.h"
#include "core/core.h"
-#include "core/memory.h"
+#include "video_core/memory_manager.h"
#include "video_core/renderer_opengl/gl_buffer_cache.h"
#include "video_core/renderer_opengl/gl_primitive_assembler.h"
diff --git a/src/video_core/renderer_opengl/gl_primitive_assembler.h b/src/video_core/renderer_opengl/gl_primitive_assembler.h
index 0e2e7dc36..4e87ce4d6 100644
--- a/src/video_core/renderer_opengl/gl_primitive_assembler.h
+++ b/src/video_core/renderer_opengl/gl_primitive_assembler.h
@@ -4,11 +4,9 @@
#pragma once
-#include <vector>
#include <glad/glad.h>
#include "common/common_types.h"
-#include "video_core/memory_manager.h"
namespace OpenGL {
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 046fc935b..0b1fe3494 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -17,7 +17,6 @@
#include "common/microprofile.h"
#include "common/scope_exit.h"
#include "core/core.h"
-#include "core/frontend/emu_window.h"
#include "core/hle/kernel/process.h"
#include "core/settings.h"
#include "video_core/engines/maxwell_3d.h"
@@ -26,7 +25,6 @@
#include "video_core/renderer_opengl/gl_shader_gen.h"
#include "video_core/renderer_opengl/maxwell_to_gl.h"
#include "video_core/renderer_opengl/renderer_opengl.h"
-#include "video_core/video_core.h"
namespace OpenGL {
@@ -301,6 +299,10 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
BaseBindings base_bindings;
std::array<bool, Maxwell::NumClipDistances> clip_distances{};
+ // Prepare packed bindings
+ bind_ubo_pushbuffer.Setup(base_bindings.cbuf);
+ bind_ssbo_pushbuffer.Setup(base_bindings.gmem);
+
for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) {
const auto& shader_config = gpu.regs.shader_config[index];
const Maxwell::ShaderProgram program{static_cast<Maxwell::ShaderProgram>(index)};
@@ -318,13 +320,13 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
const std::size_t stage{index == 0 ? 0 : index - 1}; // Stage indices are 0 - 5
GLShader::MaxwellUniformData ubo{};
- ubo.SetFromRegs(gpu.state.shader_stages[stage]);
+ ubo.SetFromRegs(gpu, stage);
const GLintptr offset = buffer_cache.UploadHostMemory(
&ubo, sizeof(ubo), static_cast<std::size_t>(uniform_buffer_alignment));
// Bind the emulation info buffer
- glBindBufferRange(GL_UNIFORM_BUFFER, base_bindings.cbuf, buffer_cache.GetHandle(), offset,
- static_cast<GLsizeiptr>(sizeof(ubo)));
+ bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), offset,
+ static_cast<GLsizeiptr>(sizeof(ubo)));
Shader shader{shader_cache.GetStageProgram(program)};
const auto [program_handle, next_bindings] =
@@ -368,6 +370,9 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
base_bindings = next_bindings;
}
+ bind_ubo_pushbuffer.Bind();
+ bind_ssbo_pushbuffer.Bind();
+
SyncClipEnabled(clip_distances);
gpu.dirty_flags.shaders = false;
@@ -577,9 +582,6 @@ std::pair<bool, bool> RasterizerOpenGL::ConfigureFramebuffers(
}
void RasterizerOpenGL::Clear() {
- const auto prev_state{state};
- SCOPE_EXIT({ prev_state.Apply(); });
-
const auto& regs = system.GPU().Maxwell3D().regs;
bool use_color{};
bool use_depth{};
@@ -651,7 +653,10 @@ void RasterizerOpenGL::Clear() {
clear_state.EmulateViewportWithScissor();
}
- clear_state.Apply();
+ clear_state.ApplyColorMask();
+ clear_state.ApplyDepth();
+ clear_state.ApplyStencilTest();
+ clear_state.ApplyViewport();
if (use_color) {
glClearBufferfv(GL_COLOR, regs.clear_buffers.RT, regs.clear_color);
@@ -751,6 +756,7 @@ void RasterizerOpenGL::FlushRegion(CacheAddr addr, u64 size) {
return;
}
res_cache.FlushRegion(addr, size);
+ global_cache.FlushRegion(addr, size);
}
void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) {
@@ -902,23 +908,14 @@ void RasterizerOpenGL::SetupConstBuffers(Tegra::Engines::Maxwell3D::Regs::Shader
const auto& shader_stage = maxwell3d.state.shader_stages[static_cast<std::size_t>(stage)];
const auto& entries = shader->GetShaderEntries().const_buffers;
- constexpr u64 max_binds = Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers;
- std::array<GLuint, max_binds> bind_buffers;
- std::array<GLintptr, max_binds> bind_offsets;
- std::array<GLsizeiptr, max_binds> bind_sizes;
-
- ASSERT_MSG(entries.size() <= max_binds, "Exceeded expected number of binding points.");
-
// Upload only the enabled buffers from the 16 constbuffers of each shader stage
for (u32 bindpoint = 0; bindpoint < entries.size(); ++bindpoint) {
const auto& used_buffer = entries[bindpoint];
const auto& buffer = shader_stage.const_buffers[used_buffer.GetIndex()];
if (!buffer.enabled) {
- // With disabled buffers set values as zero to unbind them
- bind_buffers[bindpoint] = 0;
- bind_offsets[bindpoint] = 0;
- bind_sizes[bindpoint] = 0;
+ // Set values to zero to unbind buffers
+ bind_ubo_pushbuffer.Push(0, 0, 0);
continue;
}
@@ -946,30 +943,22 @@ void RasterizerOpenGL::SetupConstBuffers(Tegra::Engines::Maxwell3D::Regs::Shader
const GLintptr const_buffer_offset = buffer_cache.UploadMemory(
buffer.address, size, static_cast<std::size_t>(uniform_buffer_alignment));
- // Prepare values for multibind
- bind_buffers[bindpoint] = buffer_cache.GetHandle();
- bind_offsets[bindpoint] = const_buffer_offset;
- bind_sizes[bindpoint] = size;
+ bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), const_buffer_offset, size);
}
-
- // The first binding is reserved for emulation values
- const GLuint ubo_base_binding = base_bindings.cbuf + 1;
- glBindBuffersRange(GL_UNIFORM_BUFFER, ubo_base_binding, static_cast<GLsizei>(entries.size()),
- bind_buffers.data(), bind_offsets.data(), bind_sizes.data());
}
void RasterizerOpenGL::SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
const Shader& shader, GLenum primitive_mode,
BaseBindings base_bindings) {
- // TODO(Rodrigo): Use ARB_multi_bind here
const auto& entries = shader->GetShaderEntries().global_memory_entries;
-
- for (u32 bindpoint = 0; bindpoint < static_cast<u32>(entries.size()); ++bindpoint) {
- const auto& entry = entries[bindpoint];
- const u32 current_bindpoint = base_bindings.gmem + bindpoint;
- const auto& region = global_cache.GetGlobalRegion(entry, stage);
-
- glBindBufferBase(GL_SHADER_STORAGE_BUFFER, current_bindpoint, region->GetBufferHandle());
+ for (std::size_t bindpoint = 0; bindpoint < entries.size(); ++bindpoint) {
+ const auto& entry{entries[bindpoint]};
+ const auto& region{global_cache.GetGlobalRegion(entry, stage)};
+ if (entry.IsWritten()) {
+ region->MarkAsModified(true, global_cache);
+ }
+ bind_ssbo_pushbuffer.Push(region->GetBufferHandle(), 0,
+ static_cast<GLsizeiptr>(region->GetSizeInBytes()));
}
}
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 4de565321..d4c2cf80e 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -12,15 +12,12 @@
#include <optional>
#include <tuple>
#include <utility>
-#include <vector>
#include <boost/icl/interval_map.hpp>
-#include <boost/range/iterator_range.hpp>
#include <glad/glad.h>
#include "common/common_types.h"
#include "video_core/engines/maxwell_3d.h"
-#include "video_core/memory_manager.h"
#include "video_core/rasterizer_cache.h"
#include "video_core/rasterizer_interface.h"
#include "video_core/renderer_opengl/gl_buffer_cache.h"
@@ -29,10 +26,9 @@
#include "video_core/renderer_opengl/gl_rasterizer_cache.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/renderer_opengl/gl_shader_cache.h"
-#include "video_core/renderer_opengl/gl_shader_gen.h"
#include "video_core/renderer_opengl/gl_shader_manager.h"
#include "video_core/renderer_opengl/gl_state.h"
-#include "video_core/renderer_opengl/gl_stream_buffer.h"
+#include "video_core/renderer_opengl/utils.h"
namespace Core {
class System;
@@ -75,10 +71,6 @@ public:
static_assert(MaxConstbufferSize % sizeof(GLvec4) == 0,
"The maximum size of a constbuffer must be a multiple of the size of GLvec4");
- static constexpr std::size_t MaxGlobalMemorySize = 0x10000;
- static_assert(MaxGlobalMemorySize % sizeof(float) == 0,
- "The maximum size of a global memory must be a multiple of the size of float");
-
private:
class SamplerInfo {
public:
@@ -234,6 +226,9 @@ private:
PrimitiveAssembler primitive_assembler{buffer_cache};
GLint uniform_buffer_alignment;
+ BindBuffersRangePushBuffer bind_ubo_pushbuffer{GL_UNIFORM_BUFFER};
+ BindBuffersRangePushBuffer bind_ssbo_pushbuffer{GL_SHADER_STORAGE_BUFFER};
+
std::size_t CalculateVertexArraysSize() const;
std::size_t CalculateIndexBufferSize() const;
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
index aba6ce731..7a68b8738 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -13,9 +13,9 @@
#include "common/scope_exit.h"
#include "core/core.h"
#include "core/hle/kernel/process.h"
-#include "core/memory.h"
#include "core/settings.h"
#include "video_core/engines/maxwell_3d.h"
+#include "video_core/memory_manager.h"
#include "video_core/morton.h"
#include "video_core/renderer_opengl/gl_rasterizer.h"
#include "video_core/renderer_opengl/gl_rasterizer_cache.h"
@@ -112,11 +112,26 @@ std::size_t SurfaceParams::InnerMemorySize(bool force_gl, bool layer_only,
params.pixel_format = PixelFormatFromTextureFormat(config.tic.format, config.tic.r_type.Value(),
params.srgb_conversion);
- if (params.pixel_format == PixelFormat::R16U && config.tsc.depth_compare_enabled) {
+ if (config.tsc.depth_compare_enabled) {
// Some titles create a 'R16U' (normalized 16-bit) texture with depth_compare enabled,
// then attempt to sample from it via a shadow sampler. Convert format to Z16 (which also
// causes GetFormatType to properly return 'Depth' below).
- params.pixel_format = PixelFormat::Z16;
+ if (GetFormatType(params.pixel_format) == SurfaceType::ColorTexture) {
+ switch (params.pixel_format) {
+ case PixelFormat::R16S:
+ case PixelFormat::R16U:
+ case PixelFormat::R16F:
+ params.pixel_format = PixelFormat::Z16;
+ break;
+ case PixelFormat::R32F:
+ params.pixel_format = PixelFormat::Z32F;
+ break;
+ default:
+ LOG_WARNING(HW_GPU, "Color texture format being used with depth compare: {}",
+ static_cast<u32>(params.pixel_format));
+ break;
+ }
+ }
}
params.component_type = ComponentTypeFromTexture(config.tic.r_type.Value());
@@ -266,6 +281,7 @@ std::size_t SurfaceParams::InnerMemorySize(bool force_gl, bool layer_only,
params.component_type = ComponentTypeFromRenderTarget(config.format);
params.type = GetFormatType(params.pixel_format);
params.width = config.width;
+ params.pitch = config.pitch;
params.height = config.height;
params.unaligned_height = config.height;
params.target = SurfaceTarget::Texture2D;
@@ -662,8 +678,8 @@ void CachedSurface::FlushGLBuffer() {
gl_buffer[0].resize(GetSizeInBytes());
const FormatTuple& tuple = GetFormatTuple(params.pixel_format, params.component_type);
- // Ensure no bad interactions with GL_UNPACK_ALIGNMENT
- ASSERT(params.width * GetBytesPerPixel(params.pixel_format) % 4 == 0);
+ const u32 align = std::clamp(params.RowAlign(0), 1U, 8U);
+ glPixelStorei(GL_PACK_ALIGNMENT, align);
glPixelStorei(GL_PACK_ROW_LENGTH, static_cast<GLint>(params.width));
ASSERT(!tuple.compressed);
glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
@@ -708,8 +724,8 @@ void CachedSurface::UploadGLMipmapTexture(u32 mip_map, GLuint read_fb_handle,
const FormatTuple& tuple = GetFormatTuple(params.pixel_format, params.component_type);
- // Ensure no bad interactions with GL_UNPACK_ALIGNMENT
- ASSERT(params.MipWidth(mip_map) * GetBytesPerPixel(params.pixel_format) % 4 == 0);
+ const u32 align = std::clamp(params.RowAlign(mip_map), 1U, 8U);
+ glPixelStorei(GL_UNPACK_ALIGNMENT, align);
glPixelStorei(GL_UNPACK_ROW_LENGTH, static_cast<GLint>(params.MipWidth(mip_map)));
const auto image_size = static_cast<GLsizei>(params.GetMipmapSizeGL(mip_map, false));
@@ -1175,10 +1191,16 @@ Surface RasterizerCacheOpenGL::RecreateSurface(const Surface& old_surface,
return new_surface;
}
+ const bool old_compressed =
+ GetFormatTuple(old_params.pixel_format, old_params.component_type).compressed;
+ const bool new_compressed =
+ GetFormatTuple(new_params.pixel_format, new_params.component_type).compressed;
+ const bool compatible_formats =
+ GetFormatBpp(old_params.pixel_format) == GetFormatBpp(new_params.pixel_format) &&
+ !(old_compressed || new_compressed);
// For compatible surfaces, we can just do fast glCopyImageSubData based copy
- if (old_params.target == new_params.target && old_params.type == new_params.type &&
- old_params.depth == new_params.depth && old_params.depth == 1 &&
- GetFormatBpp(old_params.pixel_format) == GetFormatBpp(new_params.pixel_format)) {
+ if (old_params.target == new_params.target && old_params.depth == new_params.depth &&
+ old_params.depth == 1 && compatible_formats) {
FastCopySurface(old_surface, new_surface);
return new_surface;
}
@@ -1193,7 +1215,7 @@ Surface RasterizerCacheOpenGL::RecreateSurface(const Surface& old_surface,
case SurfaceTarget::TextureCubemap:
case SurfaceTarget::Texture2DArray:
case SurfaceTarget::TextureCubeArray:
- if (old_params.pixel_format == new_params.pixel_format)
+ if (compatible_formats)
FastLayeredCopySurface(old_surface, new_surface);
else {
AccurateCopySurface(old_surface, new_surface);
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
index e8073579f..db280dbb3 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
@@ -5,13 +5,13 @@
#pragma once
#include <array>
-#include <map>
#include <memory>
#include <string>
-#include <unordered_set>
+#include <tuple>
#include <vector>
#include "common/alignment.h"
+#include "common/bit_util.h"
#include "common/common_types.h"
#include "common/hash.h"
#include "common/math_util.h"
@@ -206,6 +206,13 @@ struct SurfaceParams {
return bd;
}
+ u32 RowAlign(u32 mip_level) const {
+ const u32 m_width = MipWidth(mip_level);
+ const u32 bytes_per_pixel = GetBytesPerPixel(pixel_format);
+ const u32 l2 = Common::CountTrailingZeroes32(m_width * bytes_per_pixel);
+ return (1U << l2);
+ }
+
/// Creates SurfaceParams from a texture configuration
static SurfaceParams CreateForTexture(const Tegra::Texture::FullTextureInfo& config,
const GLShader::SamplerEntry& entry);
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 290e654bc..99f67494c 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -6,13 +6,12 @@
#include "common/assert.h"
#include "common/hash.h"
#include "core/core.h"
-#include "core/memory.h"
#include "video_core/engines/maxwell_3d.h"
+#include "video_core/memory_manager.h"
#include "video_core/renderer_opengl/gl_rasterizer.h"
#include "video_core/renderer_opengl/gl_shader_cache.h"
#include "video_core/renderer_opengl/gl_shader_decompiler.h"
#include "video_core/renderer_opengl/gl_shader_disk_cache.h"
-#include "video_core/renderer_opengl/gl_shader_manager.h"
#include "video_core/renderer_opengl/utils.h"
#include "video_core/shader/shader_ir.h"
@@ -41,6 +40,10 @@ GPUVAddr GetShaderAddress(Maxwell::ShaderProgram program) {
/// Gets the shader program code from memory for the specified address
ProgramCode GetShaderCode(const u8* host_ptr) {
ProgramCode program_code(VideoCommon::Shader::MAX_PROGRAM_LENGTH);
+ ASSERT_OR_EXECUTE(host_ptr != nullptr, {
+ std::fill(program_code.begin(), program_code.end(), 0);
+ return program_code;
+ });
std::memcpy(program_code.data(), host_ptr, program_code.size() * sizeof(u64));
return program_code;
}
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h
index fd1c85115..0cf8e0b3d 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -5,21 +5,20 @@
#pragma once
#include <array>
+#include <atomic>
#include <memory>
#include <set>
#include <tuple>
#include <unordered_map>
+#include <vector>
#include <glad/glad.h>
-#include "common/assert.h"
#include "common/common_types.h"
#include "video_core/rasterizer_cache.h"
-#include "video_core/renderer_base.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/renderer_opengl/gl_shader_decompiler.h"
#include "video_core/renderer_opengl/gl_shader_disk_cache.h"
-#include "video_core/renderer_opengl/gl_shader_gen.h"
namespace Core {
class System;
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 11d1169f0..445048daf 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -21,6 +21,8 @@
namespace OpenGL::GLShader {
+namespace {
+
using Tegra::Shader::Attribute;
using Tegra::Shader::AttributeUse;
using Tegra::Shader::Header;
@@ -34,13 +36,15 @@ using Maxwell = Tegra::Engines::Maxwell3D::Regs;
using ShaderStage = Tegra::Engines::Maxwell3D::Regs::ShaderStage;
using Operation = const OperationNode&;
+enum class Type { Bool, Bool2, Float, Int, Uint, HalfFloat };
+
+struct TextureAoffi {};
+using TextureArgument = std::pair<Type, Node>;
+using TextureIR = std::variant<TextureAoffi, TextureArgument>;
+
enum : u32 { POSITION_VARYING_LOCATION = 0, GENERIC_VARYING_START_LOCATION = 1 };
constexpr u32 MAX_CONSTBUFFER_ELEMENTS =
static_cast<u32>(RasterizerOpenGL::MaxConstbufferSize) / (4 * sizeof(float));
-constexpr u32 MAX_GLOBALMEMORY_ELEMENTS =
- static_cast<u32>(RasterizerOpenGL::MaxGlobalMemorySize) / sizeof(float);
-
-enum class Type { Bool, Bool2, Float, Int, Uint, HalfFloat };
class ShaderWriter {
public:
@@ -69,10 +73,10 @@ public:
shader_source += '\n';
}
- std::string GenerateTemporal() {
- std::string temporal = "tmp";
- temporal += std::to_string(temporal_index++);
- return temporal;
+ std::string GenerateTemporary() {
+ std::string temporary = "tmp";
+ temporary += std::to_string(temporary_index++);
+ return temporary;
}
std::string GetResult() {
@@ -87,11 +91,11 @@ private:
}
std::string shader_source;
- u32 temporal_index = 1;
+ u32 temporary_index = 1;
};
/// Generates code to use for a swizzle operation.
-static std::string GetSwizzle(u32 elem) {
+std::string GetSwizzle(u32 elem) {
ASSERT(elem <= 3);
std::string swizzle = ".";
swizzle += "xyzw"[elem];
@@ -99,7 +103,7 @@ static std::string GetSwizzle(u32 elem) {
}
/// Translate topology
-static std::string GetTopologyName(Tegra::Shader::OutputTopology topology) {
+std::string GetTopologyName(Tegra::Shader::OutputTopology topology) {
switch (topology) {
case Tegra::Shader::OutputTopology::PointList:
return "points";
@@ -114,7 +118,7 @@ static std::string GetTopologyName(Tegra::Shader::OutputTopology topology) {
}
/// Returns true if an object has to be treated as precise
-static bool IsPrecise(Operation operand) {
+bool IsPrecise(Operation operand) {
const auto& meta = operand.GetMeta();
if (const auto arithmetic = std::get_if<MetaArithmetic>(&meta)) {
@@ -126,7 +130,7 @@ static bool IsPrecise(Operation operand) {
return false;
}
-static bool IsPrecise(Node node) {
+bool IsPrecise(Node node) {
if (const auto operation = std::get_if<OperationNode>(node)) {
return IsPrecise(*operation);
}
@@ -202,8 +206,10 @@ public:
for (const auto& sampler : ir.GetSamplers()) {
entries.samplers.emplace_back(sampler);
}
- for (const auto& gmem : ir.GetGlobalMemoryBases()) {
- entries.global_memory_entries.emplace_back(gmem.cbuf_index, gmem.cbuf_offset);
+ for (const auto& gmem_pair : ir.GetGlobalMemory()) {
+ const auto& [base, usage] = gmem_pair;
+ entries.global_memory_entries.emplace_back(base.cbuf_index, base.cbuf_offset,
+ usage.is_read, usage.is_written);
}
entries.clip_distances = ir.GetClipDistances();
entries.shader_length = ir.GetLength();
@@ -374,12 +380,22 @@ private:
}
void DeclareGlobalMemory() {
- for (const auto& entry : ir.GetGlobalMemoryBases()) {
+ for (const auto& gmem : ir.GetGlobalMemory()) {
+ const auto& [base, usage] = gmem;
+
+ // Since we don't know how the shader will use the shader, hint the driver to disable as
+ // much optimizations as possible
+ std::string qualifier = "coherent volatile";
+ if (usage.is_read && !usage.is_written)
+ qualifier += " readonly";
+ else if (usage.is_written && !usage.is_read)
+ qualifier += " writeonly";
+
const std::string binding =
- fmt::format("GMEM_BINDING_{}_{}", entry.cbuf_index, entry.cbuf_offset);
- code.AddLine("layout (std430, binding = " + binding + ") buffer " +
- GetGlobalMemoryBlock(entry) + " {");
- code.AddLine(" float " + GetGlobalMemory(entry) + "[MAX_GLOBALMEMORY_ELEMENTS];");
+ fmt::format("GMEM_BINDING_{}_{}", base.cbuf_index, base.cbuf_offset);
+ code.AddLine("layout (std430, binding = " + binding + ") " + qualifier + " buffer " +
+ GetGlobalMemoryBlock(base) + " {");
+ code.AddLine(" float " + GetGlobalMemory(base) + "[];");
code.AddLine("};");
code.AddNewLine();
}
@@ -426,9 +442,14 @@ private:
std::string Visit(Node node) {
if (const auto operation = std::get_if<OperationNode>(node)) {
const auto operation_index = static_cast<std::size_t>(operation->GetCode());
+ if (operation_index >= operation_decompilers.size()) {
+ UNREACHABLE_MSG("Out of bounds operation: {}", operation_index);
+ return {};
+ }
const auto decompiler = operation_decompilers[operation_index];
if (decompiler == nullptr) {
- UNREACHABLE_MSG("Operation decompiler {} not defined", operation_index);
+ UNREACHABLE_MSG("Undefined operation: {}", operation_index);
+ return {};
}
return (this->*decompiler)(*operation);
@@ -540,9 +561,8 @@ private:
} else if (std::holds_alternative<OperationNode>(*offset)) {
// Indirect access
- const std::string final_offset = code.GenerateTemporal();
- code.AddLine("uint " + final_offset + " = (ftou(" + Visit(offset) + ") / 4) & " +
- std::to_string(MAX_CONSTBUFFER_ELEMENTS - 1) + ';');
+ const std::string final_offset = code.GenerateTemporary();
+ code.AddLine("uint " + final_offset + " = (ftou(" + Visit(offset) + ") / 4);");
return fmt::format("{}[{} / 4][{} % 4]", GetConstBuffer(cbuf->GetIndex()),
final_offset, final_offset);
@@ -587,9 +607,9 @@ private:
// There's a bug in NVidia's proprietary drivers that makes precise fail on fragment shaders
const std::string precise = stage != ShaderStage::Fragment ? "precise " : "";
- const std::string temporal = code.GenerateTemporal();
- code.AddLine(precise + "float " + temporal + " = " + value + ';');
- return temporal;
+ const std::string temporary = code.GenerateTemporary();
+ code.AddLine(precise + "float " + temporary + " = " + value + ';');
+ return temporary;
}
std::string VisitOperand(Operation operation, std::size_t operand_index) {
@@ -601,9 +621,9 @@ private:
return Visit(operand);
}
- const std::string temporal = code.GenerateTemporal();
- code.AddLine("float " + temporal + " = " + Visit(operand) + ';');
- return temporal;
+ const std::string temporary = code.GenerateTemporary();
+ code.AddLine("float " + temporary + " = " + Visit(operand) + ';');
+ return temporary;
}
std::string VisitOperand(Operation operation, std::size_t operand_index, Type type) {
@@ -718,8 +738,8 @@ private:
result_type));
}
- std::string GenerateTexture(Operation operation, const std::string& func,
- const std::vector<std::pair<Type, Node>>& extras) {
+ std::string GenerateTexture(Operation operation, const std::string& function_suffix,
+ const std::vector<TextureIR>& extras) {
constexpr std::array<const char*, 4> coord_constructors = {"float", "vec2", "vec3", "vec4"};
const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
@@ -729,11 +749,11 @@ private:
const bool has_array = meta->sampler.IsArray();
const bool has_shadow = meta->sampler.IsShadow();
- std::string expr = func;
- expr += '(';
- expr += GetSampler(meta->sampler);
- expr += ", ";
-
+ std::string expr = "texture" + function_suffix;
+ if (!meta->aoffi.empty()) {
+ expr += "Offset";
+ }
+ expr += '(' + GetSampler(meta->sampler) + ", ";
expr += coord_constructors.at(count + (has_array ? 1 : 0) + (has_shadow ? 1 : 0) - 1);
expr += '(';
for (std::size_t i = 0; i < count; ++i) {
@@ -751,36 +771,74 @@ private:
}
expr += ')';
- for (const auto& extra_pair : extras) {
- const auto [type, operand] = extra_pair;
- if (operand == nullptr) {
- continue;
+ for (const auto& variant : extras) {
+ if (const auto argument = std::get_if<TextureArgument>(&variant)) {
+ expr += GenerateTextureArgument(*argument);
+ } else if (std::get_if<TextureAoffi>(&variant)) {
+ expr += GenerateTextureAoffi(meta->aoffi);
+ } else {
+ UNREACHABLE();
}
- expr += ", ";
+ }
- switch (type) {
- case Type::Int:
- if (const auto immediate = std::get_if<ImmediateNode>(operand)) {
- // Inline the string as an immediate integer in GLSL (some extra arguments are
- // required to be constant)
- expr += std::to_string(static_cast<s32>(immediate->GetValue()));
- } else {
- expr += "ftoi(" + Visit(operand) + ')';
- }
- break;
- case Type::Float:
- expr += Visit(operand);
- break;
- default: {
- const auto type_int = static_cast<u32>(type);
- UNIMPLEMENTED_MSG("Unimplemented extra type={}", type_int);
- expr += '0';
- break;
+ return expr + ')';
+ }
+
+ std::string GenerateTextureArgument(TextureArgument argument) {
+ const auto [type, operand] = argument;
+ if (operand == nullptr) {
+ return {};
+ }
+
+ std::string expr = ", ";
+ switch (type) {
+ case Type::Int:
+ if (const auto immediate = std::get_if<ImmediateNode>(operand)) {
+ // Inline the string as an immediate integer in GLSL (some extra arguments are
+ // required to be constant)
+ expr += std::to_string(static_cast<s32>(immediate->GetValue()));
+ } else {
+ expr += "ftoi(" + Visit(operand) + ')';
+ }
+ break;
+ case Type::Float:
+ expr += Visit(operand);
+ break;
+ default: {
+ const auto type_int = static_cast<u32>(type);
+ UNIMPLEMENTED_MSG("Unimplemented extra type={}", type_int);
+ expr += '0';
+ break;
+ }
+ }
+ return expr;
+ }
+
+ std::string GenerateTextureAoffi(const std::vector<Node>& aoffi) {
+ if (aoffi.empty()) {
+ return {};
+ }
+ constexpr std::array<const char*, 3> coord_constructors = {"int", "ivec2", "ivec3"};
+ std::string expr = ", ";
+ expr += coord_constructors.at(aoffi.size() - 1);
+ expr += '(';
+
+ for (std::size_t index = 0; index < aoffi.size(); ++index) {
+ const auto operand{aoffi.at(index)};
+ if (const auto immediate = std::get_if<ImmediateNode>(operand)) {
+ // Inline the string as an immediate integer in GLSL (AOFFI arguments are required
+ // to be constant by the standard).
+ expr += std::to_string(static_cast<s32>(immediate->GetValue()));
+ } else {
+ expr += "ftoi(" + Visit(operand) + ')';
}
+ if (index + 1 < aoffi.size()) {
+ expr += ", ";
}
}
+ expr += ')';
- return expr + ')';
+ return expr;
}
std::string Assign(Operation operation) {
@@ -820,6 +878,12 @@ private:
} else if (const auto lmem = std::get_if<LmemNode>(dest)) {
target = GetLocalMemory() + "[ftou(" + Visit(lmem->GetAddress()) + ") / 4]";
+ } else if (const auto gmem = std::get_if<GmemNode>(dest)) {
+ const std::string real = Visit(gmem->GetRealAddress());
+ const std::string base = Visit(gmem->GetBaseAddress());
+ const std::string final_offset = "(ftou(" + real + ") - ftou(" + base + ")) / 4";
+ target = fmt::format("{}[{}]", GetGlobalMemory(gmem->GetDescriptor()), final_offset);
+
} else {
UNREACHABLE_MSG("Assign called without a proper target");
}
@@ -1159,7 +1223,8 @@ private:
const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
ASSERT(meta);
- std::string expr = GenerateTexture(operation, "texture", {{Type::Float, meta->bias}});
+ std::string expr = GenerateTexture(
+ operation, "", {TextureAoffi{}, TextureArgument{Type::Float, meta->bias}});
if (meta->sampler.IsShadow()) {
expr = "vec4(" + expr + ')';
}
@@ -1170,7 +1235,8 @@ private:
const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
ASSERT(meta);
- std::string expr = GenerateTexture(operation, "textureLod", {{Type::Float, meta->lod}});
+ std::string expr = GenerateTexture(
+ operation, "Lod", {TextureArgument{Type::Float, meta->lod}, TextureAoffi{}});
if (meta->sampler.IsShadow()) {
expr = "vec4(" + expr + ')';
}
@@ -1182,7 +1248,8 @@ private:
ASSERT(meta);
const auto type = meta->sampler.IsShadow() ? Type::Float : Type::Int;
- return GenerateTexture(operation, "textureGather", {{type, meta->component}}) +
+ return GenerateTexture(operation, "Gather",
+ {TextureArgument{type, meta->component}, TextureAoffi{}}) +
GetSwizzle(meta->element);
}
@@ -1196,11 +1263,12 @@ private:
switch (meta->element) {
case 0:
case 1:
- return "textureSize(" + sampler + ", " + lod + ')' + GetSwizzle(meta->element);
+ return "itof(int(textureSize(" + sampler + ", " + lod + ')' +
+ GetSwizzle(meta->element) + "))";
case 2:
return "0";
case 3:
- return "textureQueryLevels(" + sampler + ')';
+ return "itof(textureQueryLevels(" + sampler + "))";
}
UNREACHABLE();
return "0";
@@ -1211,8 +1279,8 @@ private:
ASSERT(meta);
if (meta->element < 2) {
- return "itof(int((" + GenerateTexture(operation, "textureQueryLod", {}) +
- " * vec2(256))" + GetSwizzle(meta->element) + "))";
+ return "itof(int((" + GenerateTexture(operation, "QueryLod", {}) + " * vec2(256))" +
+ GetSwizzle(meta->element) + "))";
}
return "0";
}
@@ -1565,11 +1633,11 @@ private:
ShaderWriter code;
};
+} // Anonymous namespace
+
std::string GetCommonDeclarations() {
const auto cbuf = std::to_string(MAX_CONSTBUFFER_ELEMENTS);
- const auto gmem = std::to_string(MAX_GLOBALMEMORY_ELEMENTS);
return "#define MAX_CONSTBUFFER_ELEMENTS " + cbuf + "\n" +
- "#define MAX_GLOBALMEMORY_ELEMENTS " + gmem + "\n" +
"#define ftoi floatBitsToInt\n"
"#define ftou floatBitsToUint\n"
"#define itof intBitsToFloat\n"
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h
index 72aca4938..55b3d4d7b 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.h
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h
@@ -5,7 +5,6 @@
#pragma once
#include <array>
-#include <set>
#include <string>
#include <utility>
#include <vector>
@@ -40,8 +39,9 @@ private:
class GlobalMemoryEntry {
public:
- explicit GlobalMemoryEntry(u32 cbuf_index, u32 cbuf_offset)
- : cbuf_index{cbuf_index}, cbuf_offset{cbuf_offset} {}
+ explicit GlobalMemoryEntry(u32 cbuf_index, u32 cbuf_offset, bool is_read, bool is_written)
+ : cbuf_index{cbuf_index}, cbuf_offset{cbuf_offset}, is_read{is_read}, is_written{
+ is_written} {}
u32 GetCbufIndex() const {
return cbuf_index;
@@ -51,9 +51,19 @@ public:
return cbuf_offset;
}
+ bool IsRead() const {
+ return is_read;
+ }
+
+ bool IsWritten() const {
+ return is_written;
+ }
+
private:
u32 cbuf_index{};
u32 cbuf_offset{};
+ bool is_read{};
+ bool is_written{};
};
struct ShaderEntries {
diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
index 82fc4d44b..d5890a375 100644
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
@@ -4,7 +4,6 @@
#include <cstring>
#include <fmt/format.h>
-#include <lz4.h>
#include "common/assert.h"
#include "common/common_paths.h"
@@ -12,6 +11,7 @@
#include "common/file_util.h"
#include "common/logging/log.h"
#include "common/scm_rev.h"
+#include "common/zstd_compression.h"
#include "core/core.h"
#include "core/hle/kernel/process.h"
@@ -49,39 +49,6 @@ ShaderCacheVersionHash GetShaderCacheVersionHash() {
return hash;
}
-template <typename T>
-std::vector<u8> CompressData(const T* source, std::size_t source_size) {
- if (source_size > LZ4_MAX_INPUT_SIZE) {
- // Source size exceeds LZ4 maximum input size
- return {};
- }
- const auto source_size_int = static_cast<int>(source_size);
- const int max_compressed_size = LZ4_compressBound(source_size_int);
- std::vector<u8> compressed(max_compressed_size);
- const int compressed_size = LZ4_compress_default(reinterpret_cast<const char*>(source),
- reinterpret_cast<char*>(compressed.data()),
- source_size_int, max_compressed_size);
- if (compressed_size <= 0) {
- // Compression failed
- return {};
- }
- compressed.resize(compressed_size);
- return compressed;
-}
-
-std::vector<u8> DecompressData(const std::vector<u8>& compressed, std::size_t uncompressed_size) {
- std::vector<u8> uncompressed(uncompressed_size);
- const int size_check = LZ4_decompress_safe(reinterpret_cast<const char*>(compressed.data()),
- reinterpret_cast<char*>(uncompressed.data()),
- static_cast<int>(compressed.size()),
- static_cast<int>(uncompressed.size()));
- if (static_cast<int>(uncompressed_size) != size_check) {
- // Decompression failed
- return {};
- }
- return uncompressed;
-}
-
} // namespace
ShaderDiskCacheRaw::ShaderDiskCacheRaw(u64 unique_identifier, Maxwell::ShaderProgram program_type,
@@ -292,7 +259,7 @@ ShaderDiskCacheOpenGL::LoadPrecompiledFile(FileUtil::IOFile& file) {
return {};
}
- dump.binary = DecompressData(compressed_binary, binary_length);
+ dump.binary = Common::Compression::DecompressDataZSTD(compressed_binary);
if (dump.binary.empty()) {
return {};
}
@@ -321,7 +288,7 @@ std::optional<ShaderDiskCacheDecompiled> ShaderDiskCacheOpenGL::LoadDecompiledEn
return {};
}
- const std::vector<u8> code = DecompressData(compressed_code, code_size);
+ const std::vector<u8> code = Common::Compression::DecompressDataZSTD(compressed_code);
if (code.empty()) {
return {};
}
@@ -370,11 +337,16 @@ std::optional<ShaderDiskCacheDecompiled> ShaderDiskCacheOpenGL::LoadDecompiledEn
for (u32 i = 0; i < global_memory_count; ++i) {
u32 cbuf_index{};
u32 cbuf_offset{};
+ u8 is_read{};
+ u8 is_written{};
if (file.ReadBytes(&cbuf_index, sizeof(u32)) != sizeof(u32) ||
- file.ReadBytes(&cbuf_offset, sizeof(u32)) != sizeof(u32)) {
+ file.ReadBytes(&cbuf_offset, sizeof(u32)) != sizeof(u32) ||
+ file.ReadBytes(&is_read, sizeof(u8)) != sizeof(u8) ||
+ file.ReadBytes(&is_written, sizeof(u8)) != sizeof(u8)) {
return {};
}
- entry.entries.global_memory_entries.emplace_back(cbuf_index, cbuf_offset);
+ entry.entries.global_memory_entries.emplace_back(cbuf_index, cbuf_offset, is_read != 0,
+ is_written != 0);
}
for (auto& clip_distance : entry.entries.clip_distances) {
@@ -430,7 +402,9 @@ bool ShaderDiskCacheOpenGL::SaveDecompiledFile(FileUtil::IOFile& file, u64 uniqu
return false;
for (const auto& gmem : entries.global_memory_entries) {
if (file.WriteObject(static_cast<u32>(gmem.GetCbufIndex())) != 1 ||
- file.WriteObject(static_cast<u32>(gmem.GetCbufOffset())) != 1) {
+ file.WriteObject(static_cast<u32>(gmem.GetCbufOffset())) != 1 ||
+ file.WriteObject(static_cast<u8>(gmem.IsRead() ? 1 : 0)) != 1 ||
+ file.WriteObject(static_cast<u8>(gmem.IsWritten() ? 1 : 0)) != 1) {
return false;
}
}
@@ -507,7 +481,8 @@ void ShaderDiskCacheOpenGL::SaveDecompiled(u64 unique_identifier, const std::str
if (!IsUsable())
return;
- const std::vector<u8> compressed_code{CompressData(code.data(), code.size())};
+ const std::vector<u8> compressed_code{Common::Compression::CompressDataZSTDDefault(
+ reinterpret_cast<const u8*>(code.data()), code.size())};
if (compressed_code.empty()) {
LOG_ERROR(Render_OpenGL, "Failed to compress GLSL code - skipping shader {:016x}",
unique_identifier);
@@ -537,7 +512,9 @@ void ShaderDiskCacheOpenGL::SaveDump(const ShaderDiskCacheUsage& usage, GLuint p
std::vector<u8> binary(binary_length);
glGetProgramBinary(program, binary_length, nullptr, &binary_format, binary.data());
- const std::vector<u8> compressed_binary = CompressData(binary.data(), binary.size());
+ const std::vector<u8> compressed_binary =
+ Common::Compression::CompressDataZSTDDefault(binary.data(), binary.size());
+
if (compressed_binary.empty()) {
LOG_ERROR(Render_OpenGL, "Failed to compress binary program in shader={:016x}",
usage.unique_identifier);
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp
index 7d96649af..8763d9c71 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -3,7 +3,6 @@
// Refer to the license.txt file included.
#include <fmt/format.h>
-#include "common/assert.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/renderer_opengl/gl_shader_decompiler.h"
#include "video_core/renderer_opengl/gl_shader_gen.h"
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.h b/src/video_core/renderer_opengl/gl_shader_gen.h
index fba8e681b..fad346b48 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.h
+++ b/src/video_core/renderer_opengl/gl_shader_gen.h
@@ -4,12 +4,9 @@
#pragma once
-#include <array>
-#include <string>
#include <vector>
#include "common/common_types.h"
-#include "video_core/engines/shader_bytecode.h"
#include "video_core/renderer_opengl/gl_shader_decompiler.h"
#include "video_core/shader/shader_ir.h"
diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp
index 6a30c28d2..05ab01dcb 100644
--- a/src/video_core/renderer_opengl/gl_shader_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp
@@ -2,23 +2,55 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
-#include "core/core.h"
+#include "common/common_types.h"
+#include "video_core/engines/maxwell_3d.h"
#include "video_core/renderer_opengl/gl_shader_manager.h"
namespace OpenGL::GLShader {
-void MaxwellUniformData::SetFromRegs(const Maxwell3D::State::ShaderStageInfo& shader_stage) {
- const auto& gpu = Core::System::GetInstance().GPU().Maxwell3D();
- const auto& regs = gpu.regs;
- const auto& state = gpu.state;
+using Tegra::Engines::Maxwell3D;
+
+ProgramManager::ProgramManager() {
+ pipeline.Create();
+}
+
+ProgramManager::~ProgramManager() = default;
+
+void ProgramManager::ApplyTo(OpenGLState& state) {
+ UpdatePipeline();
+ state.draw.shader_program = 0;
+ state.draw.program_pipeline = pipeline.handle;
+}
+
+void ProgramManager::UpdatePipeline() {
+ // Avoid updating the pipeline when values have no changed
+ if (old_state == current_state) {
+ return;
+ }
+
+ // Workaround for AMD bug
+ constexpr GLenum all_used_stages{GL_VERTEX_SHADER_BIT | GL_GEOMETRY_SHADER_BIT |
+ GL_FRAGMENT_SHADER_BIT};
+ glUseProgramStages(pipeline.handle, all_used_stages, 0);
+
+ glUseProgramStages(pipeline.handle, GL_VERTEX_SHADER_BIT, current_state.vertex_shader);
+ glUseProgramStages(pipeline.handle, GL_GEOMETRY_SHADER_BIT, current_state.geometry_shader);
+ glUseProgramStages(pipeline.handle, GL_FRAGMENT_SHADER_BIT, current_state.fragment_shader);
+
+ old_state = current_state;
+}
+
+void MaxwellUniformData::SetFromRegs(const Maxwell3D& maxwell, std::size_t shader_stage) {
+ const auto& regs = maxwell.regs;
+ const auto& state = maxwell.state;
// TODO(bunnei): Support more than one viewport
viewport_flip[0] = regs.viewport_transform[0].scale_x < 0.0 ? -1.0f : 1.0f;
viewport_flip[1] = regs.viewport_transform[0].scale_y < 0.0 ? -1.0f : 1.0f;
- u32 func = static_cast<u32>(regs.alpha_test_func);
+ auto func{static_cast<u32>(regs.alpha_test_func)};
// Normalize the gl variants of opCompare to be the same as the normal variants
- u32 op_gl_variant_base = static_cast<u32>(Tegra::Engines::Maxwell3D::Regs::ComparisonOp::Never);
+ const u32 op_gl_variant_base = static_cast<u32>(Maxwell3D::Regs::ComparisonOp::Never);
if (func >= op_gl_variant_base) {
func = func - op_gl_variant_base + 1U;
}
@@ -31,8 +63,9 @@ void MaxwellUniformData::SetFromRegs(const Maxwell3D::State::ShaderStageInfo& sh
// Assign in which stage the position has to be flipped
// (the last stage before the fragment shader).
- if (gpu.regs.shader_config[static_cast<u32>(Maxwell3D::Regs::ShaderProgram::Geometry)].enable) {
- flip_stage = static_cast<u32>(Maxwell3D::Regs::ShaderProgram::Geometry);
+ constexpr u32 geometry_index = static_cast<u32>(Maxwell3D::Regs::ShaderProgram::Geometry);
+ if (maxwell.regs.shader_config[geometry_index].enable) {
+ flip_stage = geometry_index;
} else {
flip_stage = static_cast<u32>(Maxwell3D::Regs::ShaderProgram::VertexB);
}
diff --git a/src/video_core/renderer_opengl/gl_shader_manager.h b/src/video_core/renderer_opengl/gl_shader_manager.h
index 4970aafed..cec18a832 100644
--- a/src/video_core/renderer_opengl/gl_shader_manager.h
+++ b/src/video_core/renderer_opengl/gl_shader_manager.h
@@ -4,6 +4,8 @@
#pragma once
+#include <cstddef>
+
#include <glad/glad.h>
#include "video_core/renderer_opengl/gl_resource_manager.h"
@@ -12,14 +14,13 @@
namespace OpenGL::GLShader {
-using Tegra::Engines::Maxwell3D;
-
/// Uniform structure for the Uniform Buffer Object, all vectors must be 16-byte aligned
-// NOTE: Always keep a vec4 at the end. The GL spec is not clear whether the alignment at
-// the end of a uniform block is included in UNIFORM_BLOCK_DATA_SIZE or not.
-// Not following that rule will cause problems on some AMD drivers.
+/// @note Always keep a vec4 at the end. The GL spec is not clear whether the alignment at
+/// the end of a uniform block is included in UNIFORM_BLOCK_DATA_SIZE or not.
+/// Not following that rule will cause problems on some AMD drivers.
struct MaxwellUniformData {
- void SetFromRegs(const Maxwell3D::State::ShaderStageInfo& shader_stage);
+ void SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell, std::size_t shader_stage);
+
alignas(16) GLvec4 viewport_flip;
struct alignas(16) {
GLuint instance_id;
@@ -39,56 +40,48 @@ static_assert(sizeof(MaxwellUniformData) < 16384,
class ProgramManager {
public:
- ProgramManager() {
- pipeline.Create();
- }
+ explicit ProgramManager();
+ ~ProgramManager();
+
+ void ApplyTo(OpenGLState& state);
void UseProgrammableVertexShader(GLuint program) {
- vs = program;
+ current_state.vertex_shader = program;
}
void UseProgrammableGeometryShader(GLuint program) {
- gs = program;
+ current_state.geometry_shader = program;
}
void UseProgrammableFragmentShader(GLuint program) {
- fs = program;
+ current_state.fragment_shader = program;
}
void UseTrivialGeometryShader() {
- gs = 0;
- }
-
- void ApplyTo(OpenGLState& state) {
- UpdatePipeline();
- state.draw.shader_program = 0;
- state.draw.program_pipeline = pipeline.handle;
- state.geometry_shaders.enabled = (gs != 0);
+ current_state.geometry_shader = 0;
}
private:
- void UpdatePipeline() {
- // Avoid updating the pipeline when values have no changed
- if (old_vs == vs && old_fs == fs && old_gs == gs)
- return;
- // Workaround for AMD bug
- glUseProgramStages(pipeline.handle,
- GL_VERTEX_SHADER_BIT | GL_GEOMETRY_SHADER_BIT | GL_FRAGMENT_SHADER_BIT,
- 0);
-
- glUseProgramStages(pipeline.handle, GL_VERTEX_SHADER_BIT, vs);
- glUseProgramStages(pipeline.handle, GL_GEOMETRY_SHADER_BIT, gs);
- glUseProgramStages(pipeline.handle, GL_FRAGMENT_SHADER_BIT, fs);
-
- // Update the old values
- old_vs = vs;
- old_fs = fs;
- old_gs = gs;
- }
+ struct PipelineState {
+ bool operator==(const PipelineState& rhs) const {
+ return vertex_shader == rhs.vertex_shader && fragment_shader == rhs.fragment_shader &&
+ geometry_shader == rhs.geometry_shader;
+ }
+
+ bool operator!=(const PipelineState& rhs) const {
+ return !operator==(rhs);
+ }
+
+ GLuint vertex_shader{};
+ GLuint fragment_shader{};
+ GLuint geometry_shader{};
+ };
+
+ void UpdatePipeline();
OGLPipeline pipeline;
- GLuint vs{}, fs{}, gs{};
- GLuint old_vs{}, old_fs{}, old_gs{};
+ PipelineState current_state;
+ PipelineState old_state;
};
} // namespace OpenGL::GLShader
diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp
index 9419326a3..52d569a1b 100644
--- a/src/video_core/renderer_opengl/gl_state.cpp
+++ b/src/video_core/renderer_opengl/gl_state.cpp
@@ -10,16 +10,62 @@
namespace OpenGL {
-OpenGLState OpenGLState::cur_state;
+using Maxwell = Tegra::Engines::Maxwell3D::Regs;
+OpenGLState OpenGLState::cur_state;
bool OpenGLState::s_rgb_used;
+namespace {
+
+template <typename T>
+bool UpdateValue(T& current_value, const T new_value) {
+ const bool changed = current_value != new_value;
+ current_value = new_value;
+ return changed;
+}
+
+template <typename T1, typename T2>
+bool UpdateTie(T1 current_value, const T2 new_value) {
+ const bool changed = current_value != new_value;
+ current_value = new_value;
+ return changed;
+}
+
+void Enable(GLenum cap, bool enable) {
+ if (enable) {
+ glEnable(cap);
+ } else {
+ glDisable(cap);
+ }
+}
+
+void Enable(GLenum cap, GLuint index, bool enable) {
+ if (enable) {
+ glEnablei(cap, index);
+ } else {
+ glDisablei(cap, index);
+ }
+}
+
+void Enable(GLenum cap, bool& current_value, bool new_value) {
+ if (UpdateValue(current_value, new_value))
+ Enable(cap, new_value);
+}
+
+void Enable(GLenum cap, GLuint index, bool& current_value, bool new_value) {
+ if (UpdateValue(current_value, new_value))
+ Enable(cap, index, new_value);
+}
+
+} // namespace
+
OpenGLState::OpenGLState() {
// These all match default OpenGL values
- geometry_shaders.enabled = false;
framebuffer_srgb.enabled = false;
+
multisample_control.alpha_to_coverage = false;
multisample_control.alpha_to_one = false;
+
cull.enabled = false;
cull.mode = GL_BACK;
cull.front_face = GL_CCW;
@@ -30,14 +76,15 @@ OpenGLState::OpenGLState() {
primitive_restart.enabled = false;
primitive_restart.index = 0;
+
for (auto& item : color_mask) {
item.red_enabled = GL_TRUE;
item.green_enabled = GL_TRUE;
item.blue_enabled = GL_TRUE;
item.alpha_enabled = GL_TRUE;
}
- stencil.test_enabled = false;
- auto reset_stencil = [](auto& config) {
+
+ const auto ResetStencil = [](auto& config) {
config.test_func = GL_ALWAYS;
config.test_ref = 0;
config.test_mask = 0xFFFFFFFF;
@@ -46,8 +93,10 @@ OpenGLState::OpenGLState() {
config.action_depth_pass = GL_KEEP;
config.action_stencil_fail = GL_KEEP;
};
- reset_stencil(stencil.front);
- reset_stencil(stencil.back);
+ stencil.test_enabled = false;
+ ResetStencil(stencil.front);
+ ResetStencil(stencil.back);
+
for (auto& item : viewports) {
item.x = 0;
item.y = 0;
@@ -61,6 +110,7 @@ OpenGLState::OpenGLState() {
item.scissor.width = 0;
item.scissor.height = 0;
}
+
for (auto& item : blend) {
item.enabled = true;
item.rgb_equation = GL_FUNC_ADD;
@@ -70,11 +120,14 @@ OpenGLState::OpenGLState() {
item.src_a_func = GL_ONE;
item.dst_a_func = GL_ZERO;
}
+
independant_blend.enabled = false;
+
blend_color.red = 0.0f;
blend_color.green = 0.0f;
blend_color.blue = 0.0f;
blend_color.alpha = 0.0f;
+
logic_op.enabled = false;
logic_op.operation = GL_COPY;
@@ -91,9 +144,12 @@ OpenGLState::OpenGLState() {
clip_distance = {};
point.size = 1;
+
fragment_color_clamp.enabled = false;
+
depth_clamp.far_plane = false;
depth_clamp.near_plane = false;
+
polygon_offset.fill_enable = false;
polygon_offset.line_enable = false;
polygon_offset.point_enable = false;
@@ -103,260 +159,255 @@ OpenGLState::OpenGLState() {
}
void OpenGLState::ApplyDefaultState() {
+ glEnable(GL_BLEND);
glDisable(GL_FRAMEBUFFER_SRGB);
glDisable(GL_CULL_FACE);
glDisable(GL_DEPTH_TEST);
glDisable(GL_PRIMITIVE_RESTART);
glDisable(GL_STENCIL_TEST);
- glEnable(GL_BLEND);
glDisable(GL_COLOR_LOGIC_OP);
glDisable(GL_SCISSOR_TEST);
}
+void OpenGLState::ApplyFramebufferState() const {
+ if (UpdateValue(cur_state.draw.read_framebuffer, draw.read_framebuffer)) {
+ glBindFramebuffer(GL_READ_FRAMEBUFFER, draw.read_framebuffer);
+ }
+ if (UpdateValue(cur_state.draw.draw_framebuffer, draw.draw_framebuffer)) {
+ glBindFramebuffer(GL_DRAW_FRAMEBUFFER, draw.draw_framebuffer);
+ }
+}
+
+void OpenGLState::ApplyVertexArrayState() const {
+ if (UpdateValue(cur_state.draw.vertex_array, draw.vertex_array)) {
+ glBindVertexArray(draw.vertex_array);
+ }
+}
+
+void OpenGLState::ApplyShaderProgram() const {
+ if (UpdateValue(cur_state.draw.shader_program, draw.shader_program)) {
+ glUseProgram(draw.shader_program);
+ }
+}
+
+void OpenGLState::ApplyProgramPipeline() const {
+ if (UpdateValue(cur_state.draw.program_pipeline, draw.program_pipeline)) {
+ glBindProgramPipeline(draw.program_pipeline);
+ }
+}
+
+void OpenGLState::ApplyClipDistances() const {
+ for (std::size_t i = 0; i < clip_distance.size(); ++i) {
+ Enable(GL_CLIP_DISTANCE0 + static_cast<GLenum>(i), cur_state.clip_distance[i],
+ clip_distance[i]);
+ }
+}
+
+void OpenGLState::ApplyPointSize() const {
+ if (UpdateValue(cur_state.point.size, point.size)) {
+ glPointSize(point.size);
+ }
+}
+
+void OpenGLState::ApplyFragmentColorClamp() const {
+ if (UpdateValue(cur_state.fragment_color_clamp.enabled, fragment_color_clamp.enabled)) {
+ glClampColor(GL_CLAMP_FRAGMENT_COLOR_ARB,
+ fragment_color_clamp.enabled ? GL_TRUE : GL_FALSE);
+ }
+}
+
+void OpenGLState::ApplyMultisample() const {
+ Enable(GL_SAMPLE_ALPHA_TO_COVERAGE, cur_state.multisample_control.alpha_to_coverage,
+ multisample_control.alpha_to_coverage);
+ Enable(GL_SAMPLE_ALPHA_TO_ONE, cur_state.multisample_control.alpha_to_one,
+ multisample_control.alpha_to_one);
+}
+
+void OpenGLState::ApplyDepthClamp() const {
+ if (depth_clamp.far_plane == cur_state.depth_clamp.far_plane &&
+ depth_clamp.near_plane == cur_state.depth_clamp.near_plane) {
+ return;
+ }
+ cur_state.depth_clamp = depth_clamp;
+
+ UNIMPLEMENTED_IF_MSG(depth_clamp.far_plane != depth_clamp.near_plane,
+ "Unimplemented Depth Clamp Separation!");
+
+ Enable(GL_DEPTH_CLAMP, depth_clamp.far_plane || depth_clamp.near_plane);
+}
+
void OpenGLState::ApplySRgb() const {
- if (framebuffer_srgb.enabled != cur_state.framebuffer_srgb.enabled) {
- if (framebuffer_srgb.enabled) {
- // Track if sRGB is used
- s_rgb_used = true;
- glEnable(GL_FRAMEBUFFER_SRGB);
- } else {
- glDisable(GL_FRAMEBUFFER_SRGB);
- }
+ if (cur_state.framebuffer_srgb.enabled == framebuffer_srgb.enabled)
+ return;
+ cur_state.framebuffer_srgb.enabled = framebuffer_srgb.enabled;
+ if (framebuffer_srgb.enabled) {
+ // Track if sRGB is used
+ s_rgb_used = true;
+ glEnable(GL_FRAMEBUFFER_SRGB);
+ } else {
+ glDisable(GL_FRAMEBUFFER_SRGB);
}
}
void OpenGLState::ApplyCulling() const {
- if (cull.enabled != cur_state.cull.enabled) {
- if (cull.enabled) {
- glEnable(GL_CULL_FACE);
- } else {
- glDisable(GL_CULL_FACE);
- }
- }
+ Enable(GL_CULL_FACE, cur_state.cull.enabled, cull.enabled);
- if (cull.mode != cur_state.cull.mode) {
+ if (UpdateValue(cur_state.cull.mode, cull.mode)) {
glCullFace(cull.mode);
}
- if (cull.front_face != cur_state.cull.front_face) {
+ if (UpdateValue(cur_state.cull.front_face, cull.front_face)) {
glFrontFace(cull.front_face);
}
}
void OpenGLState::ApplyColorMask() const {
- if (independant_blend.enabled) {
- for (size_t i = 0; i < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; i++) {
- const auto& updated = color_mask[i];
- const auto& current = cur_state.color_mask[i];
- if (updated.red_enabled != current.red_enabled ||
- updated.green_enabled != current.green_enabled ||
- updated.blue_enabled != current.blue_enabled ||
- updated.alpha_enabled != current.alpha_enabled) {
- glColorMaski(static_cast<GLuint>(i), updated.red_enabled, updated.green_enabled,
- updated.blue_enabled, updated.alpha_enabled);
- }
- }
- } else {
- const auto& updated = color_mask[0];
- const auto& current = cur_state.color_mask[0];
+ for (std::size_t i = 0; i < Maxwell::NumRenderTargets; ++i) {
+ const auto& updated = color_mask[i];
+ auto& current = cur_state.color_mask[i];
if (updated.red_enabled != current.red_enabled ||
updated.green_enabled != current.green_enabled ||
updated.blue_enabled != current.blue_enabled ||
updated.alpha_enabled != current.alpha_enabled) {
- glColorMask(updated.red_enabled, updated.green_enabled, updated.blue_enabled,
- updated.alpha_enabled);
+ current = updated;
+ glColorMaski(static_cast<GLuint>(i), updated.red_enabled, updated.green_enabled,
+ updated.blue_enabled, updated.alpha_enabled);
}
}
}
void OpenGLState::ApplyDepth() const {
- if (depth.test_enabled != cur_state.depth.test_enabled) {
- if (depth.test_enabled) {
- glEnable(GL_DEPTH_TEST);
- } else {
- glDisable(GL_DEPTH_TEST);
- }
- }
+ Enable(GL_DEPTH_TEST, cur_state.depth.test_enabled, depth.test_enabled);
- if (depth.test_func != cur_state.depth.test_func) {
+ if (cur_state.depth.test_func != depth.test_func) {
+ cur_state.depth.test_func = depth.test_func;
glDepthFunc(depth.test_func);
}
- if (depth.write_mask != cur_state.depth.write_mask) {
+ if (cur_state.depth.write_mask != depth.write_mask) {
+ cur_state.depth.write_mask = depth.write_mask;
glDepthMask(depth.write_mask);
}
}
void OpenGLState::ApplyPrimitiveRestart() const {
- if (primitive_restart.enabled != cur_state.primitive_restart.enabled) {
- if (primitive_restart.enabled) {
- glEnable(GL_PRIMITIVE_RESTART);
- } else {
- glDisable(GL_PRIMITIVE_RESTART);
- }
- }
+ Enable(GL_PRIMITIVE_RESTART, cur_state.primitive_restart.enabled, primitive_restart.enabled);
- if (primitive_restart.index != cur_state.primitive_restart.index) {
+ if (cur_state.primitive_restart.index != primitive_restart.index) {
+ cur_state.primitive_restart.index = primitive_restart.index;
glPrimitiveRestartIndex(primitive_restart.index);
}
}
void OpenGLState::ApplyStencilTest() const {
- if (stencil.test_enabled != cur_state.stencil.test_enabled) {
- if (stencil.test_enabled) {
- glEnable(GL_STENCIL_TEST);
- } else {
- glDisable(GL_STENCIL_TEST);
- }
- }
-
- const auto ConfigStencil = [](GLenum face, const auto& config, const auto& prev_config) {
- if (config.test_func != prev_config.test_func || config.test_ref != prev_config.test_ref ||
- config.test_mask != prev_config.test_mask) {
+ Enable(GL_STENCIL_TEST, cur_state.stencil.test_enabled, stencil.test_enabled);
+
+ const auto ConfigStencil = [](GLenum face, const auto& config, auto& current) {
+ if (current.test_func != config.test_func || current.test_ref != config.test_ref ||
+ current.test_mask != config.test_mask) {
+ current.test_func = config.test_func;
+ current.test_ref = config.test_ref;
+ current.test_mask = config.test_mask;
glStencilFuncSeparate(face, config.test_func, config.test_ref, config.test_mask);
}
- if (config.action_depth_fail != prev_config.action_depth_fail ||
- config.action_depth_pass != prev_config.action_depth_pass ||
- config.action_stencil_fail != prev_config.action_stencil_fail) {
+ if (current.action_depth_fail != config.action_depth_fail ||
+ current.action_depth_pass != config.action_depth_pass ||
+ current.action_stencil_fail != config.action_stencil_fail) {
+ current.action_depth_fail = config.action_depth_fail;
+ current.action_depth_pass = config.action_depth_pass;
+ current.action_stencil_fail = config.action_stencil_fail;
glStencilOpSeparate(face, config.action_stencil_fail, config.action_depth_fail,
config.action_depth_pass);
}
- if (config.write_mask != prev_config.write_mask) {
+ if (current.write_mask != config.write_mask) {
+ current.write_mask = config.write_mask;
glStencilMaskSeparate(face, config.write_mask);
}
};
ConfigStencil(GL_FRONT, stencil.front, cur_state.stencil.front);
ConfigStencil(GL_BACK, stencil.back, cur_state.stencil.back);
}
-// Viewport does not affects glClearBuffer so emulate viewport using scissor test
-void OpenGLState::EmulateViewportWithScissor() {
- auto& current = viewports[0];
- if (current.scissor.enabled) {
- const GLint left = std::max(current.x, current.scissor.x);
- const GLint right =
- std::max(current.x + current.width, current.scissor.x + current.scissor.width);
- const GLint bottom = std::max(current.y, current.scissor.y);
- const GLint top =
- std::max(current.y + current.height, current.scissor.y + current.scissor.height);
- current.scissor.x = std::max(left, 0);
- current.scissor.y = std::max(bottom, 0);
- current.scissor.width = std::max(right - left, 0);
- current.scissor.height = std::max(top - bottom, 0);
- } else {
- current.scissor.enabled = true;
- current.scissor.x = current.x;
- current.scissor.y = current.y;
- current.scissor.width = current.width;
- current.scissor.height = current.height;
- }
-}
void OpenGLState::ApplyViewport() const {
- if (geometry_shaders.enabled) {
- for (GLuint i = 0; i < static_cast<GLuint>(Tegra::Engines::Maxwell3D::Regs::NumViewports);
- i++) {
- const auto& current = cur_state.viewports[i];
- const auto& updated = viewports[i];
- if (updated.x != current.x || updated.y != current.y ||
- updated.width != current.width || updated.height != current.height) {
- glViewportIndexedf(
- i, static_cast<GLfloat>(updated.x), static_cast<GLfloat>(updated.y),
- static_cast<GLfloat>(updated.width), static_cast<GLfloat>(updated.height));
- }
- if (updated.depth_range_near != current.depth_range_near ||
- updated.depth_range_far != current.depth_range_far) {
- glDepthRangeIndexed(i, updated.depth_range_near, updated.depth_range_far);
- }
-
- if (updated.scissor.enabled != current.scissor.enabled) {
- if (updated.scissor.enabled) {
- glEnablei(GL_SCISSOR_TEST, i);
- } else {
- glDisablei(GL_SCISSOR_TEST, i);
- }
- }
-
- if (updated.scissor.x != current.scissor.x || updated.scissor.y != current.scissor.y ||
- updated.scissor.width != current.scissor.width ||
- updated.scissor.height != current.scissor.height) {
- glScissorIndexed(i, updated.scissor.x, updated.scissor.y, updated.scissor.width,
- updated.scissor.height);
- }
- }
- } else {
- const auto& current = cur_state.viewports[0];
- const auto& updated = viewports[0];
- if (updated.x != current.x || updated.y != current.y || updated.width != current.width ||
- updated.height != current.height) {
- glViewport(updated.x, updated.y, updated.width, updated.height);
- }
-
- if (updated.depth_range_near != current.depth_range_near ||
- updated.depth_range_far != current.depth_range_far) {
- glDepthRange(updated.depth_range_near, updated.depth_range_far);
+ for (GLuint i = 0; i < static_cast<GLuint>(Maxwell::NumViewports); ++i) {
+ const auto& updated = viewports[i];
+ auto& current = cur_state.viewports[i];
+
+ if (current.x != updated.x || current.y != updated.y || current.width != updated.width ||
+ current.height != updated.height) {
+ current.x = updated.x;
+ current.y = updated.y;
+ current.width = updated.width;
+ current.height = updated.height;
+ glViewportIndexedf(i, static_cast<GLfloat>(updated.x), static_cast<GLfloat>(updated.y),
+ static_cast<GLfloat>(updated.width),
+ static_cast<GLfloat>(updated.height));
}
-
- if (updated.scissor.enabled != current.scissor.enabled) {
- if (updated.scissor.enabled) {
- glEnable(GL_SCISSOR_TEST);
- } else {
- glDisable(GL_SCISSOR_TEST);
- }
+ if (current.depth_range_near != updated.depth_range_near ||
+ current.depth_range_far != updated.depth_range_far) {
+ current.depth_range_near = updated.depth_range_near;
+ current.depth_range_far = updated.depth_range_far;
+ glDepthRangeIndexed(i, updated.depth_range_near, updated.depth_range_far);
}
- if (updated.scissor.x != current.scissor.x || updated.scissor.y != current.scissor.y ||
- updated.scissor.width != current.scissor.width ||
- updated.scissor.height != current.scissor.height) {
- glScissor(updated.scissor.x, updated.scissor.y, updated.scissor.width,
- updated.scissor.height);
+ Enable(GL_SCISSOR_TEST, i, current.scissor.enabled, updated.scissor.enabled);
+
+ if (current.scissor.x != updated.scissor.x || current.scissor.y != updated.scissor.y ||
+ current.scissor.width != updated.scissor.width ||
+ current.scissor.height != updated.scissor.height) {
+ current.scissor.x = updated.scissor.x;
+ current.scissor.y = updated.scissor.y;
+ current.scissor.width = updated.scissor.width;
+ current.scissor.height = updated.scissor.height;
+ glScissorIndexed(i, updated.scissor.x, updated.scissor.y, updated.scissor.width,
+ updated.scissor.height);
}
}
}
void OpenGLState::ApplyGlobalBlending() const {
- const Blend& current = cur_state.blend[0];
const Blend& updated = blend[0];
- if (updated.enabled != current.enabled) {
- if (updated.enabled) {
- glEnable(GL_BLEND);
- } else {
- glDisable(GL_BLEND);
- }
- }
- if (!updated.enabled) {
- return;
- }
- if (updated.src_rgb_func != current.src_rgb_func ||
- updated.dst_rgb_func != current.dst_rgb_func || updated.src_a_func != current.src_a_func ||
- updated.dst_a_func != current.dst_a_func) {
+ Blend& current = cur_state.blend[0];
+
+ Enable(GL_BLEND, current.enabled, updated.enabled);
+
+ if (current.src_rgb_func != updated.src_rgb_func ||
+ current.dst_rgb_func != updated.dst_rgb_func || current.src_a_func != updated.src_a_func ||
+ current.dst_a_func != updated.dst_a_func) {
+ current.src_rgb_func = updated.src_rgb_func;
+ current.dst_rgb_func = updated.dst_rgb_func;
+ current.src_a_func = updated.src_a_func;
+ current.dst_a_func = updated.dst_a_func;
glBlendFuncSeparate(updated.src_rgb_func, updated.dst_rgb_func, updated.src_a_func,
updated.dst_a_func);
}
- if (updated.rgb_equation != current.rgb_equation || updated.a_equation != current.a_equation) {
+ if (current.rgb_equation != updated.rgb_equation || current.a_equation != updated.a_equation) {
+ current.rgb_equation = updated.rgb_equation;
+ current.a_equation = updated.a_equation;
glBlendEquationSeparate(updated.rgb_equation, updated.a_equation);
}
}
void OpenGLState::ApplyTargetBlending(std::size_t target, bool force) const {
const Blend& updated = blend[target];
- const Blend& current = cur_state.blend[target];
- if (updated.enabled != current.enabled || force) {
- if (updated.enabled) {
- glEnablei(GL_BLEND, static_cast<GLuint>(target));
- } else {
- glDisablei(GL_BLEND, static_cast<GLuint>(target));
- }
+ Blend& current = cur_state.blend[target];
+
+ if (current.enabled != updated.enabled || force) {
+ current.enabled = updated.enabled;
+ Enable(GL_BLEND, static_cast<GLuint>(target), updated.enabled);
}
- if (updated.src_rgb_func != current.src_rgb_func ||
- updated.dst_rgb_func != current.dst_rgb_func || updated.src_a_func != current.src_a_func ||
- updated.dst_a_func != current.dst_a_func) {
+ if (UpdateTie(std::tie(current.src_rgb_func, current.dst_rgb_func, current.src_a_func,
+ current.dst_a_func),
+ std::tie(updated.src_rgb_func, updated.dst_rgb_func, updated.src_a_func,
+ updated.dst_a_func))) {
glBlendFuncSeparatei(static_cast<GLuint>(target), updated.src_rgb_func,
updated.dst_rgb_func, updated.src_a_func, updated.dst_a_func);
}
- if (updated.rgb_equation != current.rgb_equation || updated.a_equation != current.a_equation) {
+ if (UpdateTie(std::tie(current.rgb_equation, current.a_equation),
+ std::tie(updated.rgb_equation, updated.a_equation))) {
glBlendEquationSeparatei(static_cast<GLuint>(target), updated.rgb_equation,
updated.a_equation);
}
@@ -364,77 +415,48 @@ void OpenGLState::ApplyTargetBlending(std::size_t target, bool force) const {
void OpenGLState::ApplyBlending() const {
if (independant_blend.enabled) {
- for (size_t i = 0; i < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; i++) {
- ApplyTargetBlending(i,
- independant_blend.enabled != cur_state.independant_blend.enabled);
+ const bool force = independant_blend.enabled != cur_state.independant_blend.enabled;
+ for (std::size_t target = 0; target < Maxwell::NumRenderTargets; ++target) {
+ ApplyTargetBlending(target, force);
}
} else {
ApplyGlobalBlending();
}
- if (blend_color.red != cur_state.blend_color.red ||
- blend_color.green != cur_state.blend_color.green ||
- blend_color.blue != cur_state.blend_color.blue ||
- blend_color.alpha != cur_state.blend_color.alpha) {
+ cur_state.independant_blend.enabled = independant_blend.enabled;
+
+ if (UpdateTie(
+ std::tie(cur_state.blend_color.red, cur_state.blend_color.green,
+ cur_state.blend_color.blue, cur_state.blend_color.alpha),
+ std::tie(blend_color.red, blend_color.green, blend_color.blue, blend_color.alpha))) {
glBlendColor(blend_color.red, blend_color.green, blend_color.blue, blend_color.alpha);
}
}
void OpenGLState::ApplyLogicOp() const {
- if (logic_op.enabled != cur_state.logic_op.enabled) {
- if (logic_op.enabled) {
- glEnable(GL_COLOR_LOGIC_OP);
- } else {
- glDisable(GL_COLOR_LOGIC_OP);
- }
- }
+ Enable(GL_COLOR_LOGIC_OP, cur_state.logic_op.enabled, logic_op.enabled);
- if (logic_op.operation != cur_state.logic_op.operation) {
+ if (UpdateValue(cur_state.logic_op.operation, logic_op.operation)) {
glLogicOp(logic_op.operation);
}
}
void OpenGLState::ApplyPolygonOffset() const {
- const bool fill_enable_changed =
- polygon_offset.fill_enable != cur_state.polygon_offset.fill_enable;
- const bool line_enable_changed =
- polygon_offset.line_enable != cur_state.polygon_offset.line_enable;
- const bool point_enable_changed =
- polygon_offset.point_enable != cur_state.polygon_offset.point_enable;
- const bool factor_changed = polygon_offset.factor != cur_state.polygon_offset.factor;
- const bool units_changed = polygon_offset.units != cur_state.polygon_offset.units;
- const bool clamp_changed = polygon_offset.clamp != cur_state.polygon_offset.clamp;
-
- if (fill_enable_changed) {
- if (polygon_offset.fill_enable) {
- glEnable(GL_POLYGON_OFFSET_FILL);
- } else {
- glDisable(GL_POLYGON_OFFSET_FILL);
- }
- }
-
- if (line_enable_changed) {
- if (polygon_offset.line_enable) {
- glEnable(GL_POLYGON_OFFSET_LINE);
- } else {
- glDisable(GL_POLYGON_OFFSET_LINE);
- }
- }
-
- if (point_enable_changed) {
- if (polygon_offset.point_enable) {
- glEnable(GL_POLYGON_OFFSET_POINT);
- } else {
- glDisable(GL_POLYGON_OFFSET_POINT);
- }
- }
-
- if (factor_changed || units_changed || clamp_changed) {
+ Enable(GL_POLYGON_OFFSET_FILL, cur_state.polygon_offset.fill_enable,
+ polygon_offset.fill_enable);
+ Enable(GL_POLYGON_OFFSET_LINE, cur_state.polygon_offset.line_enable,
+ polygon_offset.line_enable);
+ Enable(GL_POLYGON_OFFSET_POINT, cur_state.polygon_offset.point_enable,
+ polygon_offset.point_enable);
+
+ if (UpdateTie(std::tie(cur_state.polygon_offset.factor, cur_state.polygon_offset.units,
+ cur_state.polygon_offset.clamp),
+ std::tie(polygon_offset.factor, polygon_offset.units, polygon_offset.clamp))) {
if (GLAD_GL_EXT_polygon_offset_clamp && polygon_offset.clamp != 0) {
glPolygonOffsetClamp(polygon_offset.factor, polygon_offset.units, polygon_offset.clamp);
} else {
- glPolygonOffset(polygon_offset.factor, polygon_offset.units);
UNIMPLEMENTED_IF_MSG(polygon_offset.clamp != 0,
"Unimplemented Depth polygon offset clamp.");
+ glPolygonOffset(polygon_offset.factor, polygon_offset.units);
}
}
}
@@ -443,22 +465,21 @@ void OpenGLState::ApplyTextures() const {
bool has_delta{};
std::size_t first{};
std::size_t last{};
- std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::NumTextureSamplers> textures;
+ std::array<GLuint, Maxwell::NumTextureSamplers> textures;
for (std::size_t i = 0; i < std::size(texture_units); ++i) {
const auto& texture_unit = texture_units[i];
- const auto& cur_state_texture_unit = cur_state.texture_units[i];
+ auto& cur_state_texture_unit = cur_state.texture_units[i];
textures[i] = texture_unit.texture;
-
- if (textures[i] != cur_state_texture_unit.texture) {
- if (!has_delta) {
- first = i;
- has_delta = true;
- }
- last = i;
+ if (cur_state_texture_unit.texture == textures[i])
+ continue;
+ cur_state_texture_unit.texture = textures[i];
+ if (!has_delta) {
+ first = i;
+ has_delta = true;
}
+ last = i;
}
-
if (has_delta) {
glBindTextures(static_cast<GLuint>(first), static_cast<GLsizei>(last - first + 1),
textures.data() + first);
@@ -469,16 +490,18 @@ void OpenGLState::ApplySamplers() const {
bool has_delta{};
std::size_t first{};
std::size_t last{};
- std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::NumTextureSamplers> samplers;
+ std::array<GLuint, Maxwell::NumTextureSamplers> samplers;
+
for (std::size_t i = 0; i < std::size(samplers); ++i) {
+ if (cur_state.texture_units[i].sampler == texture_units[i].sampler)
+ continue;
+ cur_state.texture_units[i].sampler = texture_units[i].sampler;
samplers[i] = texture_units[i].sampler;
- if (samplers[i] != cur_state.texture_units[i].sampler) {
- if (!has_delta) {
- first = i;
- has_delta = true;
- }
- last = i;
+ if (!has_delta) {
+ first = i;
+ has_delta = true;
}
+ last = i;
}
if (has_delta) {
glBindSamplers(static_cast<GLuint>(first), static_cast<GLsizei>(last - first + 1),
@@ -486,81 +509,15 @@ void OpenGLState::ApplySamplers() const {
}
}
-void OpenGLState::ApplyFramebufferState() const {
- if (draw.read_framebuffer != cur_state.draw.read_framebuffer) {
- glBindFramebuffer(GL_READ_FRAMEBUFFER, draw.read_framebuffer);
- }
- if (draw.draw_framebuffer != cur_state.draw.draw_framebuffer) {
- glBindFramebuffer(GL_DRAW_FRAMEBUFFER, draw.draw_framebuffer);
- }
-}
-
-void OpenGLState::ApplyVertexArrayState() const {
- if (draw.vertex_array != cur_state.draw.vertex_array) {
- glBindVertexArray(draw.vertex_array);
- }
-}
-
-void OpenGLState::ApplyDepthClamp() const {
- if (depth_clamp.far_plane == cur_state.depth_clamp.far_plane &&
- depth_clamp.near_plane == cur_state.depth_clamp.near_plane) {
- return;
- }
- UNIMPLEMENTED_IF_MSG(depth_clamp.far_plane != depth_clamp.near_plane,
- "Unimplemented Depth Clamp Separation!");
-
- if (depth_clamp.far_plane || depth_clamp.near_plane) {
- glEnable(GL_DEPTH_CLAMP);
- } else {
- glDisable(GL_DEPTH_CLAMP);
- }
-}
-
void OpenGLState::Apply() const {
ApplyFramebufferState();
ApplyVertexArrayState();
-
- // Shader program
- if (draw.shader_program != cur_state.draw.shader_program) {
- glUseProgram(draw.shader_program);
- }
-
- // Program pipeline
- if (draw.program_pipeline != cur_state.draw.program_pipeline) {
- glBindProgramPipeline(draw.program_pipeline);
- }
- // Clip distance
- for (std::size_t i = 0; i < clip_distance.size(); ++i) {
- if (clip_distance[i] != cur_state.clip_distance[i]) {
- if (clip_distance[i]) {
- glEnable(GL_CLIP_DISTANCE0 + static_cast<GLenum>(i));
- } else {
- glDisable(GL_CLIP_DISTANCE0 + static_cast<GLenum>(i));
- }
- }
- }
- // Point
- if (point.size != cur_state.point.size) {
- glPointSize(point.size);
- }
- if (fragment_color_clamp.enabled != cur_state.fragment_color_clamp.enabled) {
- glClampColor(GL_CLAMP_FRAGMENT_COLOR_ARB,
- fragment_color_clamp.enabled ? GL_TRUE : GL_FALSE);
- }
- if (multisample_control.alpha_to_coverage != cur_state.multisample_control.alpha_to_coverage) {
- if (multisample_control.alpha_to_coverage) {
- glEnable(GL_SAMPLE_ALPHA_TO_COVERAGE);
- } else {
- glDisable(GL_SAMPLE_ALPHA_TO_COVERAGE);
- }
- }
- if (multisample_control.alpha_to_one != cur_state.multisample_control.alpha_to_one) {
- if (multisample_control.alpha_to_one) {
- glEnable(GL_SAMPLE_ALPHA_TO_ONE);
- } else {
- glDisable(GL_SAMPLE_ALPHA_TO_ONE);
- }
- }
+ ApplyShaderProgram();
+ ApplyProgramPipeline();
+ ApplyClipDistances();
+ ApplyPointSize();
+ ApplyFragmentColorClamp();
+ ApplyMultisample();
ApplyDepthClamp();
ApplyColorMask();
ApplyViewport();
@@ -574,7 +531,28 @@ void OpenGLState::Apply() const {
ApplyTextures();
ApplySamplers();
ApplyPolygonOffset();
- cur_state = *this;
+}
+
+void OpenGLState::EmulateViewportWithScissor() {
+ auto& current = viewports[0];
+ if (current.scissor.enabled) {
+ const GLint left = std::max(current.x, current.scissor.x);
+ const GLint right =
+ std::max(current.x + current.width, current.scissor.x + current.scissor.width);
+ const GLint bottom = std::max(current.y, current.scissor.y);
+ const GLint top =
+ std::max(current.y + current.height, current.scissor.y + current.scissor.height);
+ current.scissor.x = std::max(left, 0);
+ current.scissor.y = std::max(bottom, 0);
+ current.scissor.width = std::max(right - left, 0);
+ current.scissor.height = std::max(top - bottom, 0);
+ } else {
+ current.scissor.enabled = true;
+ current.scissor.x = current.x;
+ current.scissor.y = current.y;
+ current.scissor.width = current.width;
+ current.scissor.height = current.height;
+ }
}
OpenGLState& OpenGLState::UnbindTexture(GLuint handle) {
diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h
index 9e1eda5b1..41418a7b8 100644
--- a/src/video_core/renderer_opengl/gl_state.h
+++ b/src/video_core/renderer_opengl/gl_state.h
@@ -54,10 +54,6 @@ public:
} depth_clamp; // GL_DEPTH_CLAMP
struct {
- bool enabled; // viewports arrays are only supported when geometry shaders are enabled.
- } geometry_shaders;
-
- struct {
bool enabled; // GL_CULL_FACE
GLenum mode; // GL_CULL_FACE_MODE
GLenum front_face; // GL_FRONT_FACE
@@ -184,34 +180,26 @@ public:
static OpenGLState GetCurState() {
return cur_state;
}
+
static bool GetsRGBUsed() {
return s_rgb_used;
}
+
static void ClearsRGBUsed() {
s_rgb_used = false;
}
+
/// Apply this state as the current OpenGL state
void Apply() const;
- /// Apply only the state affecting the framebuffer
+
void ApplyFramebufferState() const;
- /// Apply only the state affecting the vertex array
void ApplyVertexArrayState() const;
- /// Set the initial OpenGL state
- static void ApplyDefaultState();
- /// Resets any references to the given resource
- OpenGLState& UnbindTexture(GLuint handle);
- OpenGLState& ResetSampler(GLuint handle);
- OpenGLState& ResetProgram(GLuint handle);
- OpenGLState& ResetPipeline(GLuint handle);
- OpenGLState& ResetVertexArray(GLuint handle);
- OpenGLState& ResetFramebuffer(GLuint handle);
- void EmulateViewportWithScissor();
-
-private:
- static OpenGLState cur_state;
- // Workaround for sRGB problems caused by
- // QT not supporting srgb output
- static bool s_rgb_used;
+ void ApplyShaderProgram() const;
+ void ApplyProgramPipeline() const;
+ void ApplyClipDistances() const;
+ void ApplyPointSize() const;
+ void ApplyFragmentColorClamp() const;
+ void ApplyMultisample() const;
void ApplySRgb() const;
void ApplyCulling() const;
void ApplyColorMask() const;
@@ -227,6 +215,26 @@ private:
void ApplySamplers() const;
void ApplyDepthClamp() const;
void ApplyPolygonOffset() const;
+
+ /// Set the initial OpenGL state
+ static void ApplyDefaultState();
+
+ /// Resets any references to the given resource
+ OpenGLState& UnbindTexture(GLuint handle);
+ OpenGLState& ResetSampler(GLuint handle);
+ OpenGLState& ResetProgram(GLuint handle);
+ OpenGLState& ResetPipeline(GLuint handle);
+ OpenGLState& ResetVertexArray(GLuint handle);
+ OpenGLState& ResetFramebuffer(GLuint handle);
+
+ /// Viewport does not affects glClearBuffer so emulate viewport using scissor test
+ void EmulateViewportWithScissor();
+
+private:
+ static OpenGLState cur_state;
+
+ // Workaround for sRGB problems caused by QT not supporting srgb output
+ static bool s_rgb_used;
};
} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index a01efeb05..d69cba9c3 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -5,7 +5,6 @@
#include <algorithm>
#include <cstddef>
#include <cstdlib>
-#include <cstring>
#include <memory>
#include <glad/glad.h>
#include "common/assert.h"
diff --git a/src/video_core/renderer_opengl/utils.cpp b/src/video_core/renderer_opengl/utils.cpp
index d84634cb3..84a987371 100644
--- a/src/video_core/renderer_opengl/utils.cpp
+++ b/src/video_core/renderer_opengl/utils.cpp
@@ -5,11 +5,39 @@
#include <string>
#include <fmt/format.h>
#include <glad/glad.h>
+#include "common/assert.h"
#include "common/common_types.h"
#include "video_core/renderer_opengl/utils.h"
namespace OpenGL {
+BindBuffersRangePushBuffer::BindBuffersRangePushBuffer(GLenum target) : target{target} {}
+
+BindBuffersRangePushBuffer::~BindBuffersRangePushBuffer() = default;
+
+void BindBuffersRangePushBuffer::Setup(GLuint first_) {
+ first = first_;
+ buffers.clear();
+ offsets.clear();
+ sizes.clear();
+}
+
+void BindBuffersRangePushBuffer::Push(GLuint buffer, GLintptr offset, GLsizeiptr size) {
+ buffers.push_back(buffer);
+ offsets.push_back(offset);
+ sizes.push_back(size);
+}
+
+void BindBuffersRangePushBuffer::Bind() const {
+ const std::size_t count{buffers.size()};
+ DEBUG_ASSERT(count == offsets.size() && count == sizes.size());
+ if (count == 0) {
+ return;
+ }
+ glBindBuffersRange(target, first, static_cast<GLsizei>(count), buffers.data(), offsets.data(),
+ sizes.data());
+}
+
void LabelGLObject(GLenum identifier, GLuint handle, VAddr addr, std::string extra_info) {
if (!GLAD_GL_KHR_debug) {
return; // We don't need to throw an error as this is just for debugging
diff --git a/src/video_core/renderer_opengl/utils.h b/src/video_core/renderer_opengl/utils.h
index 1fcb6fc11..aef45c9dc 100644
--- a/src/video_core/renderer_opengl/utils.h
+++ b/src/video_core/renderer_opengl/utils.h
@@ -5,11 +5,31 @@
#pragma once
#include <string>
+#include <vector>
#include <glad/glad.h>
#include "common/common_types.h"
namespace OpenGL {
+class BindBuffersRangePushBuffer {
+public:
+ BindBuffersRangePushBuffer(GLenum target);
+ ~BindBuffersRangePushBuffer();
+
+ void Setup(GLuint first_);
+
+ void Push(GLuint buffer, GLintptr offset, GLsizeiptr size);
+
+ void Bind() const;
+
+private:
+ GLenum target;
+ GLuint first;
+ std::vector<GLuint> buffers;
+ std::vector<GLintptr> offsets;
+ std::vector<GLsizeiptr> sizes;
+};
+
void LabelGLObject(GLenum identifier, GLuint handle, VAddr addr, std::string extra_info = "");
} // namespace OpenGL \ No newline at end of file
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
index 388b5ffd5..02a9f5ecb 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
@@ -10,6 +10,7 @@
#include "common/alignment.h"
#include "common/assert.h"
#include "core/memory.h"
+#include "video_core/memory_manager.h"
#include "video_core/renderer_vulkan/declarations.h"
#include "video_core/renderer_vulkan/vk_buffer_cache.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
new file mode 100644
index 000000000..25500f9a3
--- /dev/null
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -0,0 +1,1381 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <functional>
+#include <map>
+#include <set>
+
+#include <fmt/format.h>
+
+#include <sirit/sirit.h>
+
+#include "common/alignment.h"
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "common/logging/log.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/engines/shader_bytecode.h"
+#include "video_core/engines/shader_header.h"
+#include "video_core/renderer_vulkan/vk_shader_decompiler.h"
+#include "video_core/shader/shader_ir.h"
+
+namespace Vulkan::VKShader {
+
+using Sirit::Id;
+using Tegra::Shader::Attribute;
+using Tegra::Shader::AttributeUse;
+using Tegra::Shader::Register;
+using namespace VideoCommon::Shader;
+
+using Maxwell = Tegra::Engines::Maxwell3D::Regs;
+using ShaderStage = Tegra::Engines::Maxwell3D::Regs::ShaderStage;
+using Operation = const OperationNode&;
+
+// TODO(Rodrigo): Use rasterizer's value
+constexpr u32 MAX_CONSTBUFFER_ELEMENTS = 0x1000;
+constexpr u32 STAGE_BINDING_STRIDE = 0x100;
+
+enum class Type { Bool, Bool2, Float, Int, Uint, HalfFloat };
+
+struct SamplerImage {
+ Id image_type;
+ Id sampled_image_type;
+ Id sampler;
+};
+
+namespace {
+
+spv::Dim GetSamplerDim(const Sampler& sampler) {
+ switch (sampler.GetType()) {
+ case Tegra::Shader::TextureType::Texture1D:
+ return spv::Dim::Dim1D;
+ case Tegra::Shader::TextureType::Texture2D:
+ return spv::Dim::Dim2D;
+ case Tegra::Shader::TextureType::Texture3D:
+ return spv::Dim::Dim3D;
+ case Tegra::Shader::TextureType::TextureCube:
+ return spv::Dim::Cube;
+ default:
+ UNIMPLEMENTED_MSG("Unimplemented sampler type={}", static_cast<u32>(sampler.GetType()));
+ return spv::Dim::Dim2D;
+ }
+}
+
+/// Returns true if an attribute index is one of the 32 generic attributes
+constexpr bool IsGenericAttribute(Attribute::Index attribute) {
+ return attribute >= Attribute::Index::Attribute_0 &&
+ attribute <= Attribute::Index::Attribute_31;
+}
+
+/// Returns the location of a generic attribute
+constexpr u32 GetGenericAttributeLocation(Attribute::Index attribute) {
+ ASSERT(IsGenericAttribute(attribute));
+ return static_cast<u32>(attribute) - static_cast<u32>(Attribute::Index::Attribute_0);
+}
+
+/// Returns true if an object has to be treated as precise
+bool IsPrecise(Operation operand) {
+ const auto& meta = operand.GetMeta();
+
+ if (std::holds_alternative<MetaArithmetic>(meta)) {
+ return std::get<MetaArithmetic>(meta).precise;
+ }
+ if (std::holds_alternative<MetaHalfArithmetic>(meta)) {
+ return std::get<MetaHalfArithmetic>(meta).precise;
+ }
+ return false;
+}
+
+} // namespace
+
+class SPIRVDecompiler : public Sirit::Module {
+public:
+ explicit SPIRVDecompiler(const ShaderIR& ir, ShaderStage stage)
+ : Module(0x00010300), ir{ir}, stage{stage}, header{ir.GetHeader()} {
+ AddCapability(spv::Capability::Shader);
+ AddExtension("SPV_KHR_storage_buffer_storage_class");
+ AddExtension("SPV_KHR_variable_pointers");
+ }
+
+ void Decompile() {
+ AllocateBindings();
+ AllocateLabels();
+
+ DeclareVertex();
+ DeclareGeometry();
+ DeclareFragment();
+ DeclareRegisters();
+ DeclarePredicates();
+ DeclareLocalMemory();
+ DeclareInternalFlags();
+ DeclareInputAttributes();
+ DeclareOutputAttributes();
+ DeclareConstantBuffers();
+ DeclareGlobalBuffers();
+ DeclareSamplers();
+
+ execute_function =
+ Emit(OpFunction(t_void, spv::FunctionControlMask::Inline, TypeFunction(t_void)));
+ Emit(OpLabel());
+
+ const u32 first_address = ir.GetBasicBlocks().begin()->first;
+ const Id loop_label = OpLabel("loop");
+ const Id merge_label = OpLabel("merge");
+ const Id dummy_label = OpLabel();
+ const Id jump_label = OpLabel();
+ continue_label = OpLabel("continue");
+
+ std::vector<Sirit::Literal> literals;
+ std::vector<Id> branch_labels;
+ for (const auto& pair : labels) {
+ const auto [literal, label] = pair;
+ literals.push_back(literal);
+ branch_labels.push_back(label);
+ }
+
+ // TODO(Rodrigo): Figure out the actual depth of the flow stack, for now it seems unlikely
+ // that shaders will use 20 nested SSYs and PBKs.
+ constexpr u32 FLOW_STACK_SIZE = 20;
+ const Id flow_stack_type = TypeArray(t_uint, Constant(t_uint, FLOW_STACK_SIZE));
+ jmp_to = Emit(OpVariable(TypePointer(spv::StorageClass::Function, t_uint),
+ spv::StorageClass::Function, Constant(t_uint, first_address)));
+ flow_stack = Emit(OpVariable(TypePointer(spv::StorageClass::Function, flow_stack_type),
+ spv::StorageClass::Function, ConstantNull(flow_stack_type)));
+ flow_stack_top =
+ Emit(OpVariable(t_func_uint, spv::StorageClass::Function, Constant(t_uint, 0)));
+
+ Name(jmp_to, "jmp_to");
+ Name(flow_stack, "flow_stack");
+ Name(flow_stack_top, "flow_stack_top");
+
+ Emit(OpBranch(loop_label));
+ Emit(loop_label);
+ Emit(OpLoopMerge(merge_label, continue_label, spv::LoopControlMask::Unroll));
+ Emit(OpBranch(dummy_label));
+
+ Emit(dummy_label);
+ const Id default_branch = OpLabel();
+ const Id jmp_to_load = Emit(OpLoad(t_uint, jmp_to));
+ Emit(OpSelectionMerge(jump_label, spv::SelectionControlMask::MaskNone));
+ Emit(OpSwitch(jmp_to_load, default_branch, literals, branch_labels));
+
+ Emit(default_branch);
+ Emit(OpReturn());
+
+ for (const auto& pair : ir.GetBasicBlocks()) {
+ const auto& [address, bb] = pair;
+ Emit(labels.at(address));
+
+ VisitBasicBlock(bb);
+
+ const auto next_it = labels.lower_bound(address + 1);
+ const Id next_label = next_it != labels.end() ? next_it->second : default_branch;
+ Emit(OpBranch(next_label));
+ }
+
+ Emit(jump_label);
+ Emit(OpBranch(continue_label));
+ Emit(continue_label);
+ Emit(OpBranch(loop_label));
+ Emit(merge_label);
+ Emit(OpReturn());
+ Emit(OpFunctionEnd());
+ }
+
+ ShaderEntries GetShaderEntries() const {
+ ShaderEntries entries;
+ entries.const_buffers_base_binding = const_buffers_base_binding;
+ entries.global_buffers_base_binding = global_buffers_base_binding;
+ entries.samplers_base_binding = samplers_base_binding;
+ for (const auto& cbuf : ir.GetConstantBuffers()) {
+ entries.const_buffers.emplace_back(cbuf.second, cbuf.first);
+ }
+ for (const auto& gmem_pair : ir.GetGlobalMemory()) {
+ const auto& [base, usage] = gmem_pair;
+ entries.global_buffers.emplace_back(base.cbuf_index, base.cbuf_offset);
+ }
+ for (const auto& sampler : ir.GetSamplers()) {
+ entries.samplers.emplace_back(sampler);
+ }
+ for (const auto& attr : ir.GetInputAttributes()) {
+ entries.attributes.insert(GetGenericAttributeLocation(attr.first));
+ }
+ entries.clip_distances = ir.GetClipDistances();
+ entries.shader_length = ir.GetLength();
+ entries.entry_function = execute_function;
+ entries.interfaces = interfaces;
+ return entries;
+ }
+
+private:
+ using OperationDecompilerFn = Id (SPIRVDecompiler::*)(Operation);
+ using OperationDecompilersArray =
+ std::array<OperationDecompilerFn, static_cast<std::size_t>(OperationCode::Amount)>;
+
+ static constexpr auto INTERNAL_FLAGS_COUNT = static_cast<std::size_t>(InternalFlag::Amount);
+ static constexpr u32 CBUF_STRIDE = 16;
+
+ void AllocateBindings() {
+ const u32 binding_base = static_cast<u32>(stage) * STAGE_BINDING_STRIDE;
+ u32 binding_iterator = binding_base;
+
+ const auto Allocate = [&binding_iterator](std::size_t count) {
+ const u32 current_binding = binding_iterator;
+ binding_iterator += static_cast<u32>(count);
+ return current_binding;
+ };
+ const_buffers_base_binding = Allocate(ir.GetConstantBuffers().size());
+ global_buffers_base_binding = Allocate(ir.GetGlobalMemory().size());
+ samplers_base_binding = Allocate(ir.GetSamplers().size());
+
+ ASSERT_MSG(binding_iterator - binding_base < STAGE_BINDING_STRIDE,
+ "Stage binding stride is too small");
+ }
+
+ void AllocateLabels() {
+ for (const auto& pair : ir.GetBasicBlocks()) {
+ const u32 address = pair.first;
+ labels.emplace(address, OpLabel(fmt::format("label_0x{:x}", address)));
+ }
+ }
+
+ void DeclareVertex() {
+ if (stage != ShaderStage::Vertex)
+ return;
+
+ DeclareVertexRedeclarations();
+ }
+
+ void DeclareGeometry() {
+ if (stage != ShaderStage::Geometry)
+ return;
+
+ UNIMPLEMENTED();
+ }
+
+ void DeclareFragment() {
+ if (stage != ShaderStage::Fragment)
+ return;
+
+ for (u32 rt = 0; rt < static_cast<u32>(frag_colors.size()); ++rt) {
+ if (!IsRenderTargetUsed(rt)) {
+ continue;
+ }
+
+ const Id id = AddGlobalVariable(OpVariable(t_out_float4, spv::StorageClass::Output));
+ Name(id, fmt::format("frag_color{}", rt));
+ Decorate(id, spv::Decoration::Location, rt);
+
+ frag_colors[rt] = id;
+ interfaces.push_back(id);
+ }
+
+ if (header.ps.omap.depth) {
+ frag_depth = AddGlobalVariable(OpVariable(t_out_float, spv::StorageClass::Output));
+ Name(frag_depth, "frag_depth");
+ Decorate(frag_depth, spv::Decoration::BuiltIn,
+ static_cast<u32>(spv::BuiltIn::FragDepth));
+
+ interfaces.push_back(frag_depth);
+ }
+
+ frag_coord = DeclareBuiltIn(spv::BuiltIn::FragCoord, spv::StorageClass::Input, t_in_float4,
+ "frag_coord");
+ front_facing = DeclareBuiltIn(spv::BuiltIn::FrontFacing, spv::StorageClass::Input,
+ t_in_bool, "front_facing");
+ }
+
+ void DeclareRegisters() {
+ for (const u32 gpr : ir.GetRegisters()) {
+ const Id id = OpVariable(t_prv_float, spv::StorageClass::Private, v_float_zero);
+ Name(id, fmt::format("gpr_{}", gpr));
+ registers.emplace(gpr, AddGlobalVariable(id));
+ }
+ }
+
+ void DeclarePredicates() {
+ for (const auto pred : ir.GetPredicates()) {
+ const Id id = OpVariable(t_prv_bool, spv::StorageClass::Private, v_false);
+ Name(id, fmt::format("pred_{}", static_cast<u32>(pred)));
+ predicates.emplace(pred, AddGlobalVariable(id));
+ }
+ }
+
+ void DeclareLocalMemory() {
+ if (const u64 local_memory_size = header.GetLocalMemorySize(); local_memory_size > 0) {
+ const auto element_count = static_cast<u32>(Common::AlignUp(local_memory_size, 4) / 4);
+ const Id type_array = TypeArray(t_float, Constant(t_uint, element_count));
+ const Id type_pointer = TypePointer(spv::StorageClass::Private, type_array);
+ Name(type_pointer, "LocalMemory");
+
+ local_memory =
+ OpVariable(type_pointer, spv::StorageClass::Private, ConstantNull(type_array));
+ AddGlobalVariable(Name(local_memory, "local_memory"));
+ }
+ }
+
+ void DeclareInternalFlags() {
+ constexpr std::array<const char*, INTERNAL_FLAGS_COUNT> names = {"zero", "sign", "carry",
+ "overflow"};
+ for (std::size_t flag = 0; flag < INTERNAL_FLAGS_COUNT; ++flag) {
+ const auto flag_code = static_cast<InternalFlag>(flag);
+ const Id id = OpVariable(t_prv_bool, spv::StorageClass::Private, v_false);
+ internal_flags[flag] = AddGlobalVariable(Name(id, names[flag]));
+ }
+ }
+
+ void DeclareInputAttributes() {
+ for (const auto element : ir.GetInputAttributes()) {
+ const Attribute::Index index = element.first;
+ if (!IsGenericAttribute(index)) {
+ continue;
+ }
+
+ UNIMPLEMENTED_IF(stage == ShaderStage::Geometry);
+
+ const u32 location = GetGenericAttributeLocation(index);
+ const Id id = OpVariable(t_in_float4, spv::StorageClass::Input);
+ Name(AddGlobalVariable(id), fmt::format("in_attr{}", location));
+ input_attributes.emplace(index, id);
+ interfaces.push_back(id);
+
+ Decorate(id, spv::Decoration::Location, location);
+
+ if (stage != ShaderStage::Fragment) {
+ continue;
+ }
+ switch (header.ps.GetAttributeUse(location)) {
+ case AttributeUse::Constant:
+ Decorate(id, spv::Decoration::Flat);
+ break;
+ case AttributeUse::ScreenLinear:
+ Decorate(id, spv::Decoration::NoPerspective);
+ break;
+ case AttributeUse::Perspective:
+ // Default
+ break;
+ default:
+ UNREACHABLE_MSG("Unused attribute being fetched");
+ }
+ }
+ }
+
+ void DeclareOutputAttributes() {
+ for (const auto index : ir.GetOutputAttributes()) {
+ if (!IsGenericAttribute(index)) {
+ continue;
+ }
+ const auto location = GetGenericAttributeLocation(index);
+ const Id id = OpVariable(t_out_float4, spv::StorageClass::Output);
+ Name(AddGlobalVariable(id), fmt::format("out_attr{}", location));
+ output_attributes.emplace(index, id);
+ interfaces.push_back(id);
+
+ Decorate(id, spv::Decoration::Location, location);
+ }
+ }
+
+ void DeclareConstantBuffers() {
+ u32 binding = const_buffers_base_binding;
+ for (const auto& entry : ir.GetConstantBuffers()) {
+ const auto [index, size] = entry;
+ const Id id = OpVariable(t_cbuf_ubo, spv::StorageClass::Uniform);
+ AddGlobalVariable(Name(id, fmt::format("cbuf_{}", index)));
+
+ Decorate(id, spv::Decoration::Binding, binding++);
+ Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET);
+ constant_buffers.emplace(index, id);
+ }
+ }
+
+ void DeclareGlobalBuffers() {
+ u32 binding = global_buffers_base_binding;
+ for (const auto& entry : ir.GetGlobalMemory()) {
+ const auto [base, usage] = entry;
+ const Id id = OpVariable(t_gmem_ssbo, spv::StorageClass::StorageBuffer);
+ AddGlobalVariable(
+ Name(id, fmt::format("gmem_{}_{}", base.cbuf_index, base.cbuf_offset)));
+
+ Decorate(id, spv::Decoration::Binding, binding++);
+ Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET);
+ global_buffers.emplace(base, id);
+ }
+ }
+
+ void DeclareSamplers() {
+ u32 binding = samplers_base_binding;
+ for (const auto& sampler : ir.GetSamplers()) {
+ const auto dim = GetSamplerDim(sampler);
+ const int depth = sampler.IsShadow() ? 1 : 0;
+ const int arrayed = sampler.IsArray() ? 1 : 0;
+ // TODO(Rodrigo): Sampled 1 indicates that the image will be used with a sampler. When
+ // SULD and SUST instructions are implemented, replace this value.
+ const int sampled = 1;
+ const Id image_type =
+ TypeImage(t_float, dim, depth, arrayed, false, sampled, spv::ImageFormat::Unknown);
+ const Id sampled_image_type = TypeSampledImage(image_type);
+ const Id pointer_type =
+ TypePointer(spv::StorageClass::UniformConstant, sampled_image_type);
+ const Id id = OpVariable(pointer_type, spv::StorageClass::UniformConstant);
+ AddGlobalVariable(Name(id, fmt::format("sampler_{}", sampler.GetIndex())));
+
+ sampler_images.insert(
+ {static_cast<u32>(sampler.GetIndex()), {image_type, sampled_image_type, id}});
+
+ Decorate(id, spv::Decoration::Binding, binding++);
+ Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET);
+ }
+ }
+
+ void DeclareVertexRedeclarations() {
+ vertex_index = DeclareBuiltIn(spv::BuiltIn::VertexIndex, spv::StorageClass::Input,
+ t_in_uint, "vertex_index");
+ instance_index = DeclareBuiltIn(spv::BuiltIn::InstanceIndex, spv::StorageClass::Input,
+ t_in_uint, "instance_index");
+
+ bool is_point_size_declared = false;
+ bool is_clip_distances_declared = false;
+ for (const auto index : ir.GetOutputAttributes()) {
+ if (index == Attribute::Index::PointSize) {
+ is_point_size_declared = true;
+ } else if (index == Attribute::Index::ClipDistances0123 ||
+ index == Attribute::Index::ClipDistances4567) {
+ is_clip_distances_declared = true;
+ }
+ }
+
+ std::vector<Id> members;
+ members.push_back(t_float4);
+ if (is_point_size_declared) {
+ members.push_back(t_float);
+ }
+ if (is_clip_distances_declared) {
+ members.push_back(TypeArray(t_float, Constant(t_uint, 8)));
+ }
+
+ const Id gl_per_vertex_struct = Name(TypeStruct(members), "PerVertex");
+ Decorate(gl_per_vertex_struct, spv::Decoration::Block);
+
+ u32 declaration_index = 0;
+ const auto MemberDecorateBuiltIn = [&](spv::BuiltIn builtin, std::string name,
+ bool condition) {
+ if (!condition)
+ return u32{};
+ MemberName(gl_per_vertex_struct, declaration_index, name);
+ MemberDecorate(gl_per_vertex_struct, declaration_index, spv::Decoration::BuiltIn,
+ static_cast<u32>(builtin));
+ return declaration_index++;
+ };
+
+ position_index = MemberDecorateBuiltIn(spv::BuiltIn::Position, "position", true);
+ point_size_index =
+ MemberDecorateBuiltIn(spv::BuiltIn::PointSize, "point_size", is_point_size_declared);
+ clip_distances_index = MemberDecorateBuiltIn(spv::BuiltIn::ClipDistance, "clip_distances",
+ is_clip_distances_declared);
+
+ const Id type_pointer = TypePointer(spv::StorageClass::Output, gl_per_vertex_struct);
+ per_vertex = OpVariable(type_pointer, spv::StorageClass::Output);
+ AddGlobalVariable(Name(per_vertex, "per_vertex"));
+ interfaces.push_back(per_vertex);
+ }
+
+ void VisitBasicBlock(const NodeBlock& bb) {
+ for (const Node node : bb) {
+ static_cast<void>(Visit(node));
+ }
+ }
+
+ Id Visit(Node node) {
+ if (const auto operation = std::get_if<OperationNode>(node)) {
+ const auto operation_index = static_cast<std::size_t>(operation->GetCode());
+ const auto decompiler = operation_decompilers[operation_index];
+ if (decompiler == nullptr) {
+ UNREACHABLE_MSG("Operation decompiler {} not defined", operation_index);
+ }
+ return (this->*decompiler)(*operation);
+
+ } else if (const auto gpr = std::get_if<GprNode>(node)) {
+ const u32 index = gpr->GetIndex();
+ if (index == Register::ZeroIndex) {
+ return Constant(t_float, 0.0f);
+ }
+ return Emit(OpLoad(t_float, registers.at(index)));
+
+ } else if (const auto immediate = std::get_if<ImmediateNode>(node)) {
+ return BitcastTo<Type::Float>(Constant(t_uint, immediate->GetValue()));
+
+ } else if (const auto predicate = std::get_if<PredicateNode>(node)) {
+ const auto value = [&]() -> Id {
+ switch (const auto index = predicate->GetIndex(); index) {
+ case Tegra::Shader::Pred::UnusedIndex:
+ return v_true;
+ case Tegra::Shader::Pred::NeverExecute:
+ return v_false;
+ default:
+ return Emit(OpLoad(t_bool, predicates.at(index)));
+ }
+ }();
+ if (predicate->IsNegated()) {
+ return Emit(OpLogicalNot(t_bool, value));
+ }
+ return value;
+
+ } else if (const auto abuf = std::get_if<AbufNode>(node)) {
+ const auto attribute = abuf->GetIndex();
+ const auto element = abuf->GetElement();
+
+ switch (attribute) {
+ case Attribute::Index::Position:
+ if (stage != ShaderStage::Fragment) {
+ UNIMPLEMENTED();
+ break;
+ } else {
+ if (element == 3) {
+ return Constant(t_float, 1.0f);
+ }
+ return Emit(OpLoad(t_float, AccessElement(t_in_float, frag_coord, element)));
+ }
+ case Attribute::Index::TessCoordInstanceIDVertexID:
+ // TODO(Subv): Find out what the values are for the first two elements when inside a
+ // vertex shader, and what's the value of the fourth element when inside a Tess Eval
+ // shader.
+ ASSERT(stage == ShaderStage::Vertex);
+ switch (element) {
+ case 2:
+ return BitcastFrom<Type::Uint>(Emit(OpLoad(t_uint, instance_index)));
+ case 3:
+ return BitcastFrom<Type::Uint>(Emit(OpLoad(t_uint, vertex_index)));
+ }
+ UNIMPLEMENTED_MSG("Unmanaged TessCoordInstanceIDVertexID element={}", element);
+ return Constant(t_float, 0);
+ case Attribute::Index::FrontFacing:
+ // TODO(Subv): Find out what the values are for the other elements.
+ ASSERT(stage == ShaderStage::Fragment);
+ if (element == 3) {
+ const Id is_front_facing = Emit(OpLoad(t_bool, front_facing));
+ const Id true_value =
+ BitcastTo<Type::Float>(Constant(t_int, static_cast<s32>(-1)));
+ const Id false_value = BitcastTo<Type::Float>(Constant(t_int, 0));
+ return Emit(OpSelect(t_float, is_front_facing, true_value, false_value));
+ }
+ UNIMPLEMENTED_MSG("Unmanaged FrontFacing element={}", element);
+ return Constant(t_float, 0.0f);
+ default:
+ if (IsGenericAttribute(attribute)) {
+ const Id pointer =
+ AccessElement(t_in_float, input_attributes.at(attribute), element);
+ return Emit(OpLoad(t_float, pointer));
+ }
+ break;
+ }
+ UNIMPLEMENTED_MSG("Unhandled input attribute: {}", static_cast<u32>(attribute));
+
+ } else if (const auto cbuf = std::get_if<CbufNode>(node)) {
+ const Node offset = cbuf->GetOffset();
+ const Id buffer_id = constant_buffers.at(cbuf->GetIndex());
+
+ Id buffer_index{};
+ Id buffer_element{};
+
+ if (const auto immediate = std::get_if<ImmediateNode>(offset)) {
+ // Direct access
+ const u32 offset_imm = immediate->GetValue();
+ ASSERT(offset_imm % 4 == 0);
+ buffer_index = Constant(t_uint, offset_imm / 16);
+ buffer_element = Constant(t_uint, (offset_imm / 4) % 4);
+
+ } else if (std::holds_alternative<OperationNode>(*offset)) {
+ // Indirect access
+ // TODO(Rodrigo): Use a uniform buffer stride of 4 and drop this slow math (which
+ // emits sub-optimal code on GLSL from my testing).
+ const Id offset_id = BitcastTo<Type::Uint>(Visit(offset));
+ const Id unsafe_offset = Emit(OpUDiv(t_uint, offset_id, Constant(t_uint, 4)));
+ const Id final_offset = Emit(
+ OpUMod(t_uint, unsafe_offset, Constant(t_uint, MAX_CONSTBUFFER_ELEMENTS - 1)));
+ buffer_index = Emit(OpUDiv(t_uint, final_offset, Constant(t_uint, 4)));
+ buffer_element = Emit(OpUMod(t_uint, final_offset, Constant(t_uint, 4)));
+
+ } else {
+ UNREACHABLE_MSG("Unmanaged offset node type");
+ }
+
+ const Id pointer = Emit(OpAccessChain(t_cbuf_float, buffer_id, Constant(t_uint, 0),
+ buffer_index, buffer_element));
+ return Emit(OpLoad(t_float, pointer));
+
+ } else if (const auto gmem = std::get_if<GmemNode>(node)) {
+ const Id gmem_buffer = global_buffers.at(gmem->GetDescriptor());
+ const Id real = BitcastTo<Type::Uint>(Visit(gmem->GetRealAddress()));
+ const Id base = BitcastTo<Type::Uint>(Visit(gmem->GetBaseAddress()));
+
+ Id offset = Emit(OpISub(t_uint, real, base));
+ offset = Emit(OpUDiv(t_uint, offset, Constant(t_uint, 4u)));
+ return Emit(OpLoad(t_float, Emit(OpAccessChain(t_gmem_float, gmem_buffer,
+ Constant(t_uint, 0u), offset))));
+
+ } else if (const auto conditional = std::get_if<ConditionalNode>(node)) {
+ // It's invalid to call conditional on nested nodes, use an operation instead
+ const Id true_label = OpLabel();
+ const Id skip_label = OpLabel();
+ Emit(OpBranchConditional(Visit(conditional->GetCondition()), true_label, skip_label));
+ Emit(true_label);
+
+ VisitBasicBlock(conditional->GetCode());
+
+ Emit(OpBranch(skip_label));
+ Emit(skip_label);
+ return {};
+
+ } else if (const auto comment = std::get_if<CommentNode>(node)) {
+ Name(Emit(OpUndef(t_void)), comment->GetText());
+ return {};
+ }
+
+ UNREACHABLE();
+ return {};
+ }
+
+ template <Id (Module::*func)(Id, Id), Type result_type, Type type_a = result_type>
+ Id Unary(Operation operation) {
+ const Id type_def = GetTypeDefinition(result_type);
+ const Id op_a = VisitOperand<type_a>(operation, 0);
+
+ const Id value = BitcastFrom<result_type>(Emit((this->*func)(type_def, op_a)));
+ if (IsPrecise(operation)) {
+ Decorate(value, spv::Decoration::NoContraction);
+ }
+ return value;
+ }
+
+ template <Id (Module::*func)(Id, Id, Id), Type result_type, Type type_a = result_type,
+ Type type_b = type_a>
+ Id Binary(Operation operation) {
+ const Id type_def = GetTypeDefinition(result_type);
+ const Id op_a = VisitOperand<type_a>(operation, 0);
+ const Id op_b = VisitOperand<type_b>(operation, 1);
+
+ const Id value = BitcastFrom<result_type>(Emit((this->*func)(type_def, op_a, op_b)));
+ if (IsPrecise(operation)) {
+ Decorate(value, spv::Decoration::NoContraction);
+ }
+ return value;
+ }
+
+ template <Id (Module::*func)(Id, Id, Id, Id), Type result_type, Type type_a = result_type,
+ Type type_b = type_a, Type type_c = type_b>
+ Id Ternary(Operation operation) {
+ const Id type_def = GetTypeDefinition(result_type);
+ const Id op_a = VisitOperand<type_a>(operation, 0);
+ const Id op_b = VisitOperand<type_b>(operation, 1);
+ const Id op_c = VisitOperand<type_c>(operation, 2);
+
+ const Id value = BitcastFrom<result_type>(Emit((this->*func)(type_def, op_a, op_b, op_c)));
+ if (IsPrecise(operation)) {
+ Decorate(value, spv::Decoration::NoContraction);
+ }
+ return value;
+ }
+
+ template <Id (Module::*func)(Id, Id, Id, Id, Id), Type result_type, Type type_a = result_type,
+ Type type_b = type_a, Type type_c = type_b, Type type_d = type_c>
+ Id Quaternary(Operation operation) {
+ const Id type_def = GetTypeDefinition(result_type);
+ const Id op_a = VisitOperand<type_a>(operation, 0);
+ const Id op_b = VisitOperand<type_b>(operation, 1);
+ const Id op_c = VisitOperand<type_c>(operation, 2);
+ const Id op_d = VisitOperand<type_d>(operation, 3);
+
+ const Id value =
+ BitcastFrom<result_type>(Emit((this->*func)(type_def, op_a, op_b, op_c, op_d)));
+ if (IsPrecise(operation)) {
+ Decorate(value, spv::Decoration::NoContraction);
+ }
+ return value;
+ }
+
+ Id Assign(Operation operation) {
+ const Node dest = operation[0];
+ const Node src = operation[1];
+
+ Id target{};
+ if (const auto gpr = std::get_if<GprNode>(dest)) {
+ if (gpr->GetIndex() == Register::ZeroIndex) {
+ // Writing to Register::ZeroIndex is a no op
+ return {};
+ }
+ target = registers.at(gpr->GetIndex());
+
+ } else if (const auto abuf = std::get_if<AbufNode>(dest)) {
+ target = [&]() -> Id {
+ switch (const auto attribute = abuf->GetIndex(); attribute) {
+ case Attribute::Index::Position:
+ return AccessElement(t_out_float, per_vertex, position_index,
+ abuf->GetElement());
+ case Attribute::Index::PointSize:
+ return AccessElement(t_out_float, per_vertex, point_size_index);
+ case Attribute::Index::ClipDistances0123:
+ return AccessElement(t_out_float, per_vertex, clip_distances_index,
+ abuf->GetElement());
+ case Attribute::Index::ClipDistances4567:
+ return AccessElement(t_out_float, per_vertex, clip_distances_index,
+ abuf->GetElement() + 4);
+ default:
+ if (IsGenericAttribute(attribute)) {
+ return AccessElement(t_out_float, output_attributes.at(attribute),
+ abuf->GetElement());
+ }
+ UNIMPLEMENTED_MSG("Unhandled output attribute: {}",
+ static_cast<u32>(attribute));
+ return {};
+ }
+ }();
+
+ } else if (const auto lmem = std::get_if<LmemNode>(dest)) {
+ Id address = BitcastTo<Type::Uint>(Visit(lmem->GetAddress()));
+ address = Emit(OpUDiv(t_uint, address, Constant(t_uint, 4)));
+ target = Emit(OpAccessChain(t_prv_float, local_memory, {address}));
+ }
+
+ Emit(OpStore(target, Visit(src)));
+ return {};
+ }
+
+ Id HNegate(Operation operation) {
+ UNIMPLEMENTED();
+ return {};
+ }
+
+ Id HMergeF32(Operation operation) {
+ UNIMPLEMENTED();
+ return {};
+ }
+
+ Id HMergeH0(Operation operation) {
+ UNIMPLEMENTED();
+ return {};
+ }
+
+ Id HMergeH1(Operation operation) {
+ UNIMPLEMENTED();
+ return {};
+ }
+
+ Id HPack2(Operation operation) {
+ UNIMPLEMENTED();
+ return {};
+ }
+
+ Id LogicalAssign(Operation operation) {
+ const Node dest = operation[0];
+ const Node src = operation[1];
+
+ Id target{};
+ if (const auto pred = std::get_if<PredicateNode>(dest)) {
+ ASSERT_MSG(!pred->IsNegated(), "Negating logical assignment");
+
+ const auto index = pred->GetIndex();
+ switch (index) {
+ case Tegra::Shader::Pred::NeverExecute:
+ case Tegra::Shader::Pred::UnusedIndex:
+ // Writing to these predicates is a no-op
+ return {};
+ }
+ target = predicates.at(index);
+
+ } else if (const auto flag = std::get_if<InternalFlagNode>(dest)) {
+ target = internal_flags.at(static_cast<u32>(flag->GetFlag()));
+ }
+
+ Emit(OpStore(target, Visit(src)));
+ return {};
+ }
+
+ Id LogicalPick2(Operation operation) {
+ UNIMPLEMENTED();
+ return {};
+ }
+
+ Id LogicalAll2(Operation operation) {
+ UNIMPLEMENTED();
+ return {};
+ }
+
+ Id LogicalAny2(Operation operation) {
+ UNIMPLEMENTED();
+ return {};
+ }
+
+ Id GetTextureSampler(Operation operation) {
+ const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
+ const auto entry = sampler_images.at(static_cast<u32>(meta->sampler.GetIndex()));
+ return Emit(OpLoad(entry.sampled_image_type, entry.sampler));
+ }
+
+ Id GetTextureImage(Operation operation) {
+ const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
+ const auto entry = sampler_images.at(static_cast<u32>(meta->sampler.GetIndex()));
+ return Emit(OpImage(entry.image_type, GetTextureSampler(operation)));
+ }
+
+ Id GetTextureCoordinates(Operation operation) {
+ const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
+ std::vector<Id> coords;
+ for (std::size_t i = 0; i < operation.GetOperandsCount(); ++i) {
+ coords.push_back(Visit(operation[i]));
+ }
+ if (meta->sampler.IsArray()) {
+ const Id array_integer = BitcastTo<Type::Int>(Visit(meta->array));
+ coords.push_back(Emit(OpConvertSToF(t_float, array_integer)));
+ }
+ if (meta->sampler.IsShadow()) {
+ coords.push_back(Visit(meta->depth_compare));
+ }
+
+ const std::array<Id, 4> t_float_lut = {nullptr, t_float2, t_float3, t_float4};
+ return coords.size() == 1
+ ? coords[0]
+ : Emit(OpCompositeConstruct(t_float_lut.at(coords.size() - 1), coords));
+ }
+
+ Id GetTextureElement(Operation operation, Id sample_value) {
+ const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
+ ASSERT(meta);
+ return Emit(OpCompositeExtract(t_float, sample_value, meta->element));
+ }
+
+ Id Texture(Operation operation) {
+ const Id texture = Emit(OpImageSampleImplicitLod(t_float4, GetTextureSampler(operation),
+ GetTextureCoordinates(operation)));
+ return GetTextureElement(operation, texture);
+ }
+
+ Id TextureLod(Operation operation) {
+ const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
+ const Id texture = Emit(OpImageSampleExplicitLod(
+ t_float4, GetTextureSampler(operation), GetTextureCoordinates(operation),
+ spv::ImageOperandsMask::Lod, Visit(meta->lod)));
+ return GetTextureElement(operation, texture);
+ }
+
+ Id TextureGather(Operation operation) {
+ const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
+ const auto coords = GetTextureCoordinates(operation);
+
+ Id texture;
+ if (meta->sampler.IsShadow()) {
+ texture = Emit(OpImageDrefGather(t_float4, GetTextureSampler(operation), coords,
+ Visit(meta->component)));
+ } else {
+ u32 component_value = 0;
+ if (meta->component) {
+ const auto component = std::get_if<ImmediateNode>(meta->component);
+ ASSERT_MSG(component, "Component is not an immediate value");
+ component_value = component->GetValue();
+ }
+ texture = Emit(OpImageGather(t_float4, GetTextureSampler(operation), coords,
+ Constant(t_uint, component_value)));
+ }
+
+ return GetTextureElement(operation, texture);
+ }
+
+ Id TextureQueryDimensions(Operation operation) {
+ const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
+ const auto image_id = GetTextureImage(operation);
+ AddCapability(spv::Capability::ImageQuery);
+
+ if (meta->element == 3) {
+ return BitcastTo<Type::Float>(Emit(OpImageQueryLevels(t_int, image_id)));
+ }
+
+ const Id lod = VisitOperand<Type::Uint>(operation, 0);
+ const std::size_t coords_count = [&]() {
+ switch (const auto type = meta->sampler.GetType(); type) {
+ case Tegra::Shader::TextureType::Texture1D:
+ return 1;
+ case Tegra::Shader::TextureType::Texture2D:
+ case Tegra::Shader::TextureType::TextureCube:
+ return 2;
+ case Tegra::Shader::TextureType::Texture3D:
+ return 3;
+ default:
+ UNREACHABLE_MSG("Invalid texture type={}", static_cast<u32>(type));
+ return 2;
+ }
+ }();
+
+ if (meta->element >= coords_count) {
+ return Constant(t_float, 0.0f);
+ }
+
+ const std::array<Id, 3> types = {t_int, t_int2, t_int3};
+ const Id sizes = Emit(OpImageQuerySizeLod(types.at(coords_count - 1), image_id, lod));
+ const Id size = Emit(OpCompositeExtract(t_int, sizes, meta->element));
+ return BitcastTo<Type::Float>(size);
+ }
+
+ Id TextureQueryLod(Operation operation) {
+ UNIMPLEMENTED();
+ return {};
+ }
+
+ Id TexelFetch(Operation operation) {
+ UNIMPLEMENTED();
+ return {};
+ }
+
+ Id Branch(Operation operation) {
+ const auto target = std::get_if<ImmediateNode>(operation[0]);
+ UNIMPLEMENTED_IF(!target);
+
+ Emit(OpStore(jmp_to, Constant(t_uint, target->GetValue())));
+ BranchingOp([&]() { Emit(OpBranch(continue_label)); });
+ return {};
+ }
+
+ Id PushFlowStack(Operation operation) {
+ const auto target = std::get_if<ImmediateNode>(operation[0]);
+ ASSERT(target);
+
+ const Id current = Emit(OpLoad(t_uint, flow_stack_top));
+ const Id next = Emit(OpIAdd(t_uint, current, Constant(t_uint, 1)));
+ const Id access = Emit(OpAccessChain(t_func_uint, flow_stack, current));
+
+ Emit(OpStore(access, Constant(t_uint, target->GetValue())));
+ Emit(OpStore(flow_stack_top, next));
+ return {};
+ }
+
+ Id PopFlowStack(Operation operation) {
+ const Id current = Emit(OpLoad(t_uint, flow_stack_top));
+ const Id previous = Emit(OpISub(t_uint, current, Constant(t_uint, 1)));
+ const Id access = Emit(OpAccessChain(t_func_uint, flow_stack, previous));
+ const Id target = Emit(OpLoad(t_uint, access));
+
+ Emit(OpStore(flow_stack_top, previous));
+ Emit(OpStore(jmp_to, target));
+ BranchingOp([&]() { Emit(OpBranch(continue_label)); });
+ return {};
+ }
+
+ Id Exit(Operation operation) {
+ switch (stage) {
+ case ShaderStage::Vertex: {
+ // TODO(Rodrigo): We should use VK_EXT_depth_range_unrestricted instead, but it doesn't
+ // seem to be working on Nvidia's drivers and Intel (mesa and blob) doesn't support it.
+ const Id position = AccessElement(t_float4, per_vertex, position_index);
+ Id depth = Emit(OpLoad(t_float, AccessElement(t_out_float, position, 2)));
+ depth = Emit(OpFAdd(t_float, depth, Constant(t_float, 1.0f)));
+ depth = Emit(OpFMul(t_float, depth, Constant(t_float, 0.5f)));
+ Emit(OpStore(AccessElement(t_out_float, position, 2), depth));
+ break;
+ }
+ case ShaderStage::Fragment: {
+ const auto SafeGetRegister = [&](u32 reg) {
+ // TODO(Rodrigo): Replace with contains once C++20 releases
+ if (const auto it = registers.find(reg); it != registers.end()) {
+ return Emit(OpLoad(t_float, it->second));
+ }
+ return Constant(t_float, 0.0f);
+ };
+
+ UNIMPLEMENTED_IF_MSG(header.ps.omap.sample_mask != 0,
+ "Sample mask write is unimplemented");
+
+ // TODO(Rodrigo): Alpha testing
+
+ // Write the color outputs using the data in the shader registers, disabled
+ // rendertargets/components are skipped in the register assignment.
+ u32 current_reg = 0;
+ for (u32 rt = 0; rt < Maxwell::NumRenderTargets; ++rt) {
+ // TODO(Subv): Figure out how dual-source blending is configured in the Switch.
+ for (u32 component = 0; component < 4; ++component) {
+ if (header.ps.IsColorComponentOutputEnabled(rt, component)) {
+ Emit(OpStore(AccessElement(t_out_float, frag_colors.at(rt), component),
+ SafeGetRegister(current_reg)));
+ ++current_reg;
+ }
+ }
+ }
+ if (header.ps.omap.depth) {
+ // The depth output is always 2 registers after the last color output, and
+ // current_reg already contains one past the last color register.
+ Emit(OpStore(frag_depth, SafeGetRegister(current_reg + 1)));
+ }
+ break;
+ }
+ }
+
+ BranchingOp([&]() { Emit(OpReturn()); });
+ return {};
+ }
+
+ Id Discard(Operation operation) {
+ BranchingOp([&]() { Emit(OpKill()); });
+ return {};
+ }
+
+ Id EmitVertex(Operation operation) {
+ UNIMPLEMENTED();
+ return {};
+ }
+
+ Id EndPrimitive(Operation operation) {
+ UNIMPLEMENTED();
+ return {};
+ }
+
+ Id YNegate(Operation operation) {
+ UNIMPLEMENTED();
+ return {};
+ }
+
+ Id DeclareBuiltIn(spv::BuiltIn builtin, spv::StorageClass storage, Id type,
+ const std::string& name) {
+ const Id id = OpVariable(type, storage);
+ Decorate(id, spv::Decoration::BuiltIn, static_cast<u32>(builtin));
+ AddGlobalVariable(Name(id, name));
+ interfaces.push_back(id);
+ return id;
+ }
+
+ bool IsRenderTargetUsed(u32 rt) const {
+ for (u32 component = 0; component < 4; ++component) {
+ if (header.ps.IsColorComponentOutputEnabled(rt, component)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ template <typename... Args>
+ Id AccessElement(Id pointer_type, Id composite, Args... elements_) {
+ std::vector<Id> members;
+ auto elements = {elements_...};
+ for (const auto element : elements) {
+ members.push_back(Constant(t_uint, element));
+ }
+
+ return Emit(OpAccessChain(pointer_type, composite, members));
+ }
+
+ template <Type type>
+ Id VisitOperand(Operation operation, std::size_t operand_index) {
+ const Id value = Visit(operation[operand_index]);
+
+ switch (type) {
+ case Type::Bool:
+ case Type::Bool2:
+ case Type::Float:
+ return value;
+ case Type::Int:
+ return Emit(OpBitcast(t_int, value));
+ case Type::Uint:
+ return Emit(OpBitcast(t_uint, value));
+ case Type::HalfFloat:
+ UNIMPLEMENTED();
+ }
+ UNREACHABLE();
+ return value;
+ }
+
+ template <Type type>
+ Id BitcastFrom(Id value) {
+ switch (type) {
+ case Type::Bool:
+ case Type::Bool2:
+ case Type::Float:
+ return value;
+ case Type::Int:
+ case Type::Uint:
+ return Emit(OpBitcast(t_float, value));
+ case Type::HalfFloat:
+ UNIMPLEMENTED();
+ }
+ UNREACHABLE();
+ return value;
+ }
+
+ template <Type type>
+ Id BitcastTo(Id value) {
+ switch (type) {
+ case Type::Bool:
+ case Type::Bool2:
+ UNREACHABLE();
+ case Type::Float:
+ return Emit(OpBitcast(t_float, value));
+ case Type::Int:
+ return Emit(OpBitcast(t_int, value));
+ case Type::Uint:
+ return Emit(OpBitcast(t_uint, value));
+ case Type::HalfFloat:
+ UNIMPLEMENTED();
+ }
+ UNREACHABLE();
+ return value;
+ }
+
+ Id GetTypeDefinition(Type type) {
+ switch (type) {
+ case Type::Bool:
+ return t_bool;
+ case Type::Bool2:
+ return t_bool2;
+ case Type::Float:
+ return t_float;
+ case Type::Int:
+ return t_int;
+ case Type::Uint:
+ return t_uint;
+ case Type::HalfFloat:
+ UNIMPLEMENTED();
+ }
+ UNREACHABLE();
+ return {};
+ }
+
+ void BranchingOp(std::function<void()> call) {
+ const Id true_label = OpLabel();
+ const Id skip_label = OpLabel();
+ Emit(OpSelectionMerge(skip_label, spv::SelectionControlMask::Flatten));
+ Emit(OpBranchConditional(v_true, true_label, skip_label, 1, 0));
+ Emit(true_label);
+ call();
+
+ Emit(skip_label);
+ }
+
+ static constexpr OperationDecompilersArray operation_decompilers = {
+ &SPIRVDecompiler::Assign,
+
+ &SPIRVDecompiler::Ternary<&Module::OpSelect, Type::Float, Type::Bool, Type::Float,
+ Type::Float>,
+
+ &SPIRVDecompiler::Binary<&Module::OpFAdd, Type::Float>,
+ &SPIRVDecompiler::Binary<&Module::OpFMul, Type::Float>,
+ &SPIRVDecompiler::Binary<&Module::OpFDiv, Type::Float>,
+ &SPIRVDecompiler::Ternary<&Module::OpFma, Type::Float>,
+ &SPIRVDecompiler::Unary<&Module::OpFNegate, Type::Float>,
+ &SPIRVDecompiler::Unary<&Module::OpFAbs, Type::Float>,
+ &SPIRVDecompiler::Ternary<&Module::OpFClamp, Type::Float>,
+ &SPIRVDecompiler::Binary<&Module::OpFMin, Type::Float>,
+ &SPIRVDecompiler::Binary<&Module::OpFMax, Type::Float>,
+ &SPIRVDecompiler::Unary<&Module::OpCos, Type::Float>,
+ &SPIRVDecompiler::Unary<&Module::OpSin, Type::Float>,
+ &SPIRVDecompiler::Unary<&Module::OpExp2, Type::Float>,
+ &SPIRVDecompiler::Unary<&Module::OpLog2, Type::Float>,
+ &SPIRVDecompiler::Unary<&Module::OpInverseSqrt, Type::Float>,
+ &SPIRVDecompiler::Unary<&Module::OpSqrt, Type::Float>,
+ &SPIRVDecompiler::Unary<&Module::OpRoundEven, Type::Float>,
+ &SPIRVDecompiler::Unary<&Module::OpFloor, Type::Float>,
+ &SPIRVDecompiler::Unary<&Module::OpCeil, Type::Float>,
+ &SPIRVDecompiler::Unary<&Module::OpTrunc, Type::Float>,
+ &SPIRVDecompiler::Unary<&Module::OpConvertSToF, Type::Float, Type::Int>,
+ &SPIRVDecompiler::Unary<&Module::OpConvertUToF, Type::Float, Type::Uint>,
+
+ &SPIRVDecompiler::Binary<&Module::OpIAdd, Type::Int>,
+ &SPIRVDecompiler::Binary<&Module::OpIMul, Type::Int>,
+ &SPIRVDecompiler::Binary<&Module::OpSDiv, Type::Int>,
+ &SPIRVDecompiler::Unary<&Module::OpSNegate, Type::Int>,
+ &SPIRVDecompiler::Unary<&Module::OpSAbs, Type::Int>,
+ &SPIRVDecompiler::Binary<&Module::OpSMin, Type::Int>,
+ &SPIRVDecompiler::Binary<&Module::OpSMax, Type::Int>,
+
+ &SPIRVDecompiler::Unary<&Module::OpConvertFToS, Type::Int, Type::Float>,
+ &SPIRVDecompiler::Unary<&Module::OpBitcast, Type::Int, Type::Uint>,
+ &SPIRVDecompiler::Binary<&Module::OpShiftLeftLogical, Type::Int, Type::Int, Type::Uint>,
+ &SPIRVDecompiler::Binary<&Module::OpShiftRightLogical, Type::Int, Type::Int, Type::Uint>,
+ &SPIRVDecompiler::Binary<&Module::OpShiftRightArithmetic, Type::Int, Type::Int, Type::Uint>,
+ &SPIRVDecompiler::Binary<&Module::OpBitwiseAnd, Type::Int>,
+ &SPIRVDecompiler::Binary<&Module::OpBitwiseOr, Type::Int>,
+ &SPIRVDecompiler::Binary<&Module::OpBitwiseXor, Type::Int>,
+ &SPIRVDecompiler::Unary<&Module::OpNot, Type::Int>,
+ &SPIRVDecompiler::Quaternary<&Module::OpBitFieldInsert, Type::Int>,
+ &SPIRVDecompiler::Ternary<&Module::OpBitFieldSExtract, Type::Int>,
+ &SPIRVDecompiler::Unary<&Module::OpBitCount, Type::Int>,
+
+ &SPIRVDecompiler::Binary<&Module::OpIAdd, Type::Uint>,
+ &SPIRVDecompiler::Binary<&Module::OpIMul, Type::Uint>,
+ &SPIRVDecompiler::Binary<&Module::OpUDiv, Type::Uint>,
+ &SPIRVDecompiler::Binary<&Module::OpUMin, Type::Uint>,
+ &SPIRVDecompiler::Binary<&Module::OpUMax, Type::Uint>,
+ &SPIRVDecompiler::Unary<&Module::OpConvertFToU, Type::Uint, Type::Float>,
+ &SPIRVDecompiler::Unary<&Module::OpBitcast, Type::Uint, Type::Int>,
+ &SPIRVDecompiler::Binary<&Module::OpShiftLeftLogical, Type::Uint>,
+ &SPIRVDecompiler::Binary<&Module::OpShiftRightLogical, Type::Uint>,
+ &SPIRVDecompiler::Binary<&Module::OpShiftRightArithmetic, Type::Uint>,
+ &SPIRVDecompiler::Binary<&Module::OpBitwiseAnd, Type::Uint>,
+ &SPIRVDecompiler::Binary<&Module::OpBitwiseOr, Type::Uint>,
+ &SPIRVDecompiler::Binary<&Module::OpBitwiseXor, Type::Uint>,
+ &SPIRVDecompiler::Unary<&Module::OpNot, Type::Uint>,
+ &SPIRVDecompiler::Quaternary<&Module::OpBitFieldInsert, Type::Uint>,
+ &SPIRVDecompiler::Ternary<&Module::OpBitFieldUExtract, Type::Uint>,
+ &SPIRVDecompiler::Unary<&Module::OpBitCount, Type::Uint>,
+
+ &SPIRVDecompiler::Binary<&Module::OpFAdd, Type::HalfFloat>,
+ &SPIRVDecompiler::Binary<&Module::OpFMul, Type::HalfFloat>,
+ &SPIRVDecompiler::Ternary<&Module::OpFma, Type::HalfFloat>,
+ &SPIRVDecompiler::Unary<&Module::OpFAbs, Type::HalfFloat>,
+ &SPIRVDecompiler::HNegate,
+ &SPIRVDecompiler::HMergeF32,
+ &SPIRVDecompiler::HMergeH0,
+ &SPIRVDecompiler::HMergeH1,
+ &SPIRVDecompiler::HPack2,
+
+ &SPIRVDecompiler::LogicalAssign,
+ &SPIRVDecompiler::Binary<&Module::OpLogicalAnd, Type::Bool>,
+ &SPIRVDecompiler::Binary<&Module::OpLogicalOr, Type::Bool>,
+ &SPIRVDecompiler::Binary<&Module::OpLogicalNotEqual, Type::Bool>,
+ &SPIRVDecompiler::Unary<&Module::OpLogicalNot, Type::Bool>,
+ &SPIRVDecompiler::LogicalPick2,
+ &SPIRVDecompiler::LogicalAll2,
+ &SPIRVDecompiler::LogicalAny2,
+
+ &SPIRVDecompiler::Binary<&Module::OpFOrdLessThan, Type::Bool, Type::Float>,
+ &SPIRVDecompiler::Binary<&Module::OpFOrdEqual, Type::Bool, Type::Float>,
+ &SPIRVDecompiler::Binary<&Module::OpFOrdLessThanEqual, Type::Bool, Type::Float>,
+ &SPIRVDecompiler::Binary<&Module::OpFOrdGreaterThan, Type::Bool, Type::Float>,
+ &SPIRVDecompiler::Binary<&Module::OpFOrdNotEqual, Type::Bool, Type::Float>,
+ &SPIRVDecompiler::Binary<&Module::OpFOrdGreaterThanEqual, Type::Bool, Type::Float>,
+ &SPIRVDecompiler::Unary<&Module::OpIsNan, Type::Bool>,
+
+ &SPIRVDecompiler::Binary<&Module::OpSLessThan, Type::Bool, Type::Int>,
+ &SPIRVDecompiler::Binary<&Module::OpIEqual, Type::Bool, Type::Int>,
+ &SPIRVDecompiler::Binary<&Module::OpSLessThanEqual, Type::Bool, Type::Int>,
+ &SPIRVDecompiler::Binary<&Module::OpSGreaterThan, Type::Bool, Type::Int>,
+ &SPIRVDecompiler::Binary<&Module::OpINotEqual, Type::Bool, Type::Int>,
+ &SPIRVDecompiler::Binary<&Module::OpSGreaterThanEqual, Type::Bool, Type::Int>,
+
+ &SPIRVDecompiler::Binary<&Module::OpULessThan, Type::Bool, Type::Uint>,
+ &SPIRVDecompiler::Binary<&Module::OpIEqual, Type::Bool, Type::Uint>,
+ &SPIRVDecompiler::Binary<&Module::OpULessThanEqual, Type::Bool, Type::Uint>,
+ &SPIRVDecompiler::Binary<&Module::OpUGreaterThan, Type::Bool, Type::Uint>,
+ &SPIRVDecompiler::Binary<&Module::OpINotEqual, Type::Bool, Type::Uint>,
+ &SPIRVDecompiler::Binary<&Module::OpUGreaterThanEqual, Type::Bool, Type::Uint>,
+
+ &SPIRVDecompiler::Binary<&Module::OpFOrdLessThan, Type::Bool, Type::HalfFloat>,
+ &SPIRVDecompiler::Binary<&Module::OpFOrdEqual, Type::Bool, Type::HalfFloat>,
+ &SPIRVDecompiler::Binary<&Module::OpFOrdLessThanEqual, Type::Bool, Type::HalfFloat>,
+ &SPIRVDecompiler::Binary<&Module::OpFOrdGreaterThan, Type::Bool, Type::HalfFloat>,
+ &SPIRVDecompiler::Binary<&Module::OpFOrdNotEqual, Type::Bool, Type::HalfFloat>,
+ &SPIRVDecompiler::Binary<&Module::OpFOrdGreaterThanEqual, Type::Bool, Type::HalfFloat>,
+
+ &SPIRVDecompiler::Texture,
+ &SPIRVDecompiler::TextureLod,
+ &SPIRVDecompiler::TextureGather,
+ &SPIRVDecompiler::TextureQueryDimensions,
+ &SPIRVDecompiler::TextureQueryLod,
+ &SPIRVDecompiler::TexelFetch,
+
+ &SPIRVDecompiler::Branch,
+ &SPIRVDecompiler::PushFlowStack,
+ &SPIRVDecompiler::PopFlowStack,
+ &SPIRVDecompiler::Exit,
+ &SPIRVDecompiler::Discard,
+
+ &SPIRVDecompiler::EmitVertex,
+ &SPIRVDecompiler::EndPrimitive,
+
+ &SPIRVDecompiler::YNegate,
+ };
+
+ const ShaderIR& ir;
+ const ShaderStage stage;
+ const Tegra::Shader::Header header;
+
+ const Id t_void = Name(TypeVoid(), "void");
+
+ const Id t_bool = Name(TypeBool(), "bool");
+ const Id t_bool2 = Name(TypeVector(t_bool, 2), "bool2");
+
+ const Id t_int = Name(TypeInt(32, true), "int");
+ const Id t_int2 = Name(TypeVector(t_int, 2), "int2");
+ const Id t_int3 = Name(TypeVector(t_int, 3), "int3");
+ const Id t_int4 = Name(TypeVector(t_int, 4), "int4");
+
+ const Id t_uint = Name(TypeInt(32, false), "uint");
+ const Id t_uint2 = Name(TypeVector(t_uint, 2), "uint2");
+ const Id t_uint3 = Name(TypeVector(t_uint, 3), "uint3");
+ const Id t_uint4 = Name(TypeVector(t_uint, 4), "uint4");
+
+ const Id t_float = Name(TypeFloat(32), "float");
+ const Id t_float2 = Name(TypeVector(t_float, 2), "float2");
+ const Id t_float3 = Name(TypeVector(t_float, 3), "float3");
+ const Id t_float4 = Name(TypeVector(t_float, 4), "float4");
+
+ const Id t_prv_bool = Name(TypePointer(spv::StorageClass::Private, t_bool), "prv_bool");
+ const Id t_prv_float = Name(TypePointer(spv::StorageClass::Private, t_float), "prv_float");
+
+ const Id t_func_uint = Name(TypePointer(spv::StorageClass::Function, t_uint), "func_uint");
+
+ const Id t_in_bool = Name(TypePointer(spv::StorageClass::Input, t_bool), "in_bool");
+ const Id t_in_uint = Name(TypePointer(spv::StorageClass::Input, t_uint), "in_uint");
+ const Id t_in_float = Name(TypePointer(spv::StorageClass::Input, t_float), "in_float");
+ const Id t_in_float4 = Name(TypePointer(spv::StorageClass::Input, t_float4), "in_float4");
+
+ const Id t_out_float = Name(TypePointer(spv::StorageClass::Output, t_float), "out_float");
+ const Id t_out_float4 = Name(TypePointer(spv::StorageClass::Output, t_float4), "out_float4");
+
+ const Id t_cbuf_float = TypePointer(spv::StorageClass::Uniform, t_float);
+ const Id t_cbuf_array =
+ Decorate(Name(TypeArray(t_float4, Constant(t_uint, MAX_CONSTBUFFER_ELEMENTS)), "CbufArray"),
+ spv::Decoration::ArrayStride, CBUF_STRIDE);
+ const Id t_cbuf_struct = MemberDecorate(
+ Decorate(TypeStruct(t_cbuf_array), spv::Decoration::Block), 0, spv::Decoration::Offset, 0);
+ const Id t_cbuf_ubo = TypePointer(spv::StorageClass::Uniform, t_cbuf_struct);
+
+ const Id t_gmem_float = TypePointer(spv::StorageClass::StorageBuffer, t_float);
+ const Id t_gmem_array =
+ Name(Decorate(TypeRuntimeArray(t_float), spv::Decoration::ArrayStride, 4u), "GmemArray");
+ const Id t_gmem_struct = MemberDecorate(
+ Decorate(TypeStruct(t_gmem_array), spv::Decoration::Block), 0, spv::Decoration::Offset, 0);
+ const Id t_gmem_ssbo = TypePointer(spv::StorageClass::StorageBuffer, t_gmem_struct);
+
+ const Id v_float_zero = Constant(t_float, 0.0f);
+ const Id v_true = ConstantTrue(t_bool);
+ const Id v_false = ConstantFalse(t_bool);
+
+ Id per_vertex{};
+ std::map<u32, Id> registers;
+ std::map<Tegra::Shader::Pred, Id> predicates;
+ Id local_memory{};
+ std::array<Id, INTERNAL_FLAGS_COUNT> internal_flags{};
+ std::map<Attribute::Index, Id> input_attributes;
+ std::map<Attribute::Index, Id> output_attributes;
+ std::map<u32, Id> constant_buffers;
+ std::map<GlobalMemoryBase, Id> global_buffers;
+ std::map<u32, SamplerImage> sampler_images;
+
+ Id instance_index{};
+ Id vertex_index{};
+ std::array<Id, Maxwell::NumRenderTargets> frag_colors{};
+ Id frag_depth{};
+ Id frag_coord{};
+ Id front_facing{};
+
+ u32 position_index{};
+ u32 point_size_index{};
+ u32 clip_distances_index{};
+
+ std::vector<Id> interfaces;
+
+ u32 const_buffers_base_binding{};
+ u32 global_buffers_base_binding{};
+ u32 samplers_base_binding{};
+
+ Id execute_function{};
+ Id jmp_to{};
+ Id flow_stack_top{};
+ Id flow_stack{};
+ Id continue_label{};
+ std::map<u32, Id> labels;
+};
+
+DecompilerResult Decompile(const VideoCommon::Shader::ShaderIR& ir, Maxwell::ShaderStage stage) {
+ auto decompiler = std::make_unique<SPIRVDecompiler>(ir, stage);
+ decompiler->Decompile();
+ return {std::move(decompiler), decompiler->GetShaderEntries()};
+}
+
+} // namespace Vulkan::VKShader
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.h b/src/video_core/renderer_vulkan/vk_shader_decompiler.h
new file mode 100644
index 000000000..329d8fa38
--- /dev/null
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.h
@@ -0,0 +1,80 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <memory>
+#include <set>
+#include <utility>
+#include <vector>
+
+#include <sirit/sirit.h>
+
+#include "common/common_types.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/shader/shader_ir.h"
+
+namespace VideoCommon::Shader {
+class ShaderIR;
+}
+
+namespace Vulkan::VKShader {
+
+using Maxwell = Tegra::Engines::Maxwell3D::Regs;
+
+using SamplerEntry = VideoCommon::Shader::Sampler;
+
+constexpr u32 DESCRIPTOR_SET = 0;
+
+class ConstBufferEntry : public VideoCommon::Shader::ConstBuffer {
+public:
+ explicit constexpr ConstBufferEntry(const VideoCommon::Shader::ConstBuffer& entry, u32 index)
+ : VideoCommon::Shader::ConstBuffer{entry}, index{index} {}
+
+ constexpr u32 GetIndex() const {
+ return index;
+ }
+
+private:
+ u32 index{};
+};
+
+class GlobalBufferEntry {
+public:
+ explicit GlobalBufferEntry(u32 cbuf_index, u32 cbuf_offset)
+ : cbuf_index{cbuf_index}, cbuf_offset{cbuf_offset} {}
+
+ u32 GetCbufIndex() const {
+ return cbuf_index;
+ }
+
+ u32 GetCbufOffset() const {
+ return cbuf_offset;
+ }
+
+private:
+ u32 cbuf_index{};
+ u32 cbuf_offset{};
+};
+
+struct ShaderEntries {
+ u32 const_buffers_base_binding{};
+ u32 global_buffers_base_binding{};
+ u32 samplers_base_binding{};
+ std::vector<ConstBufferEntry> const_buffers;
+ std::vector<GlobalBufferEntry> global_buffers;
+ std::vector<SamplerEntry> samplers;
+ std::set<u32> attributes;
+ std::array<bool, Maxwell::NumClipDistances> clip_distances{};
+ std::size_t shader_length{};
+ Sirit::Id entry_function{};
+ std::vector<Sirit::Id> interfaces;
+};
+
+using DecompilerResult = std::pair<std::unique_ptr<Sirit::Module>, ShaderEntries>;
+
+DecompilerResult Decompile(const VideoCommon::Shader::ShaderIR& ir, Maxwell::ShaderStage stage);
+
+} // namespace Vulkan::VKShader
diff --git a/src/video_core/shader/decode/memory.cpp b/src/video_core/shader/decode/memory.cpp
index b2b706cb8..ea1092db1 100644
--- a/src/video_core/shader/decode/memory.cpp
+++ b/src/video_core/shader/decode/memory.cpp
@@ -19,6 +19,23 @@ using Tegra::Shader::Instruction;
using Tegra::Shader::OpCode;
using Tegra::Shader::Register;
+namespace {
+u32 GetUniformTypeElementsCount(Tegra::Shader::UniformType uniform_type) {
+ switch (uniform_type) {
+ case Tegra::Shader::UniformType::Single:
+ return 1;
+ case Tegra::Shader::UniformType::Double:
+ return 2;
+ case Tegra::Shader::UniformType::Quad:
+ case Tegra::Shader::UniformType::UnsignedQuad:
+ return 4;
+ default:
+ UNIMPLEMENTED_MSG("Unimplemented size={}!", static_cast<u32>(uniform_type));
+ return 1;
+ }
+}
+} // namespace
+
u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
const Instruction instr = {program_code[pc]};
const auto opcode = OpCode::Decode(instr);
@@ -127,45 +144,15 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
break;
}
case OpCode::Id::LDG: {
- const u32 count = [&]() {
- switch (instr.ldg.type) {
- case Tegra::Shader::UniformType::Single:
- return 1;
- case Tegra::Shader::UniformType::Double:
- return 2;
- case Tegra::Shader::UniformType::Quad:
- case Tegra::Shader::UniformType::UnsignedQuad:
- return 4;
- default:
- UNIMPLEMENTED_MSG("Unimplemented LDG size!");
- return 1;
- }
- }();
-
- const Node addr_register = GetRegister(instr.gpr8);
- const Node base_address =
- TrackCbuf(addr_register, global_code, static_cast<s64>(global_code.size()));
- const auto cbuf = std::get_if<CbufNode>(base_address);
- ASSERT(cbuf != nullptr);
- const auto cbuf_offset_imm = std::get_if<ImmediateNode>(cbuf->GetOffset());
- ASSERT(cbuf_offset_imm != nullptr);
- const auto cbuf_offset = cbuf_offset_imm->GetValue();
-
- bb.push_back(Comment(
- fmt::format("Base address is c[0x{:x}][0x{:x}]", cbuf->GetIndex(), cbuf_offset)));
-
- const GlobalMemoryBase descriptor{cbuf->GetIndex(), cbuf_offset};
- used_global_memory_bases.insert(descriptor);
-
- const Node immediate_offset =
- Immediate(static_cast<u32>(instr.ldg.immediate_offset.Value()));
- const Node base_real_address =
- Operation(OperationCode::UAdd, NO_PRECISE, immediate_offset, addr_register);
+ const auto [real_address_base, base_address, descriptor] =
+ TrackAndGetGlobalMemory(bb, GetRegister(instr.gpr8),
+ static_cast<u32>(instr.ldg.immediate_offset.Value()), false);
+ const u32 count = GetUniformTypeElementsCount(instr.ldg.type);
for (u32 i = 0; i < count; ++i) {
const Node it_offset = Immediate(i * 4);
const Node real_address =
- Operation(OperationCode::UAdd, NO_PRECISE, base_real_address, it_offset);
+ Operation(OperationCode::UAdd, NO_PRECISE, real_address_base, it_offset);
const Node gmem = StoreNode(GmemNode(real_address, base_address, descriptor));
SetTemporal(bb, i, gmem);
@@ -175,6 +162,28 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
}
break;
}
+ case OpCode::Id::STG: {
+ const auto [real_address_base, base_address, descriptor] =
+ TrackAndGetGlobalMemory(bb, GetRegister(instr.gpr8),
+ static_cast<u32>(instr.stg.immediate_offset.Value()), true);
+
+ // Encode in temporary registers like this: real_base_address, {registers_to_be_written...}
+ SetTemporal(bb, 0, real_address_base);
+
+ const u32 count = GetUniformTypeElementsCount(instr.stg.type);
+ for (u32 i = 0; i < count; ++i) {
+ SetTemporal(bb, i + 1, GetRegister(instr.gpr0.Value() + i));
+ }
+ for (u32 i = 0; i < count; ++i) {
+ const Node it_offset = Immediate(i * 4);
+ const Node real_address =
+ Operation(OperationCode::UAdd, NO_PRECISE, real_address_base, it_offset);
+ const Node gmem = StoreNode(GmemNode(real_address, base_address, descriptor));
+
+ bb.push_back(Operation(OperationCode::Assign, gmem, GetTemporal(i + 1)));
+ }
+ break;
+ }
case OpCode::Id::ST_A: {
UNIMPLEMENTED_IF_MSG(instr.gpr8.Value() != Register::ZeroIndex,
"Indirect attribute loads are not supported");
@@ -237,4 +246,34 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
return pc;
}
+std::tuple<Node, Node, GlobalMemoryBase> ShaderIR::TrackAndGetGlobalMemory(NodeBlock& bb,
+ Node addr_register,
+ u32 immediate_offset,
+ bool is_write) {
+ const Node base_address{
+ TrackCbuf(addr_register, global_code, static_cast<s64>(global_code.size()))};
+ const auto cbuf = std::get_if<CbufNode>(base_address);
+ ASSERT(cbuf != nullptr);
+ const auto cbuf_offset_imm = std::get_if<ImmediateNode>(cbuf->GetOffset());
+ ASSERT(cbuf_offset_imm != nullptr);
+ const auto cbuf_offset = cbuf_offset_imm->GetValue();
+
+ bb.push_back(
+ Comment(fmt::format("Base address is c[0x{:x}][0x{:x}]", cbuf->GetIndex(), cbuf_offset)));
+
+ const GlobalMemoryBase descriptor{cbuf->GetIndex(), cbuf_offset};
+ const auto& [entry, is_new] = used_global_memory.try_emplace(descriptor);
+ auto& usage = entry->second;
+ if (is_write) {
+ usage.is_written = true;
+ } else {
+ usage.is_read = true;
+ }
+
+ const auto real_address =
+ Operation(OperationCode::UAdd, NO_PRECISE, Immediate(immediate_offset), addr_register);
+
+ return {real_address, base_address, descriptor};
+}
+
} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp
index a99ae19bf..a775b402b 100644
--- a/src/video_core/shader/decode/texture.cpp
+++ b/src/video_core/shader/decode/texture.cpp
@@ -7,7 +7,9 @@
#include <fmt/format.h>
#include "common/assert.h"
+#include "common/bit_field.h"
#include "common/common_types.h"
+#include "common/logging/log.h"
#include "video_core/engines/shader_bytecode.h"
#include "video_core/shader/shader_ir.h"
@@ -41,19 +43,18 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
switch (opcode->get().GetId()) {
case OpCode::Id::TEX: {
- UNIMPLEMENTED_IF_MSG(instr.tex.UsesMiscMode(TextureMiscMode::AOFFI),
- "AOFFI is not implemented");
-
if (instr.tex.UsesMiscMode(TextureMiscMode::NODEP)) {
LOG_WARNING(HW_GPU, "TEX.NODEP implementation is incomplete");
}
const TextureType texture_type{instr.tex.texture_type};
const bool is_array = instr.tex.array != 0;
+ const bool is_aoffi = instr.tex.UsesMiscMode(TextureMiscMode::AOFFI);
const bool depth_compare = instr.tex.UsesMiscMode(TextureMiscMode::DC);
const auto process_mode = instr.tex.GetTextureProcessMode();
WriteTexInstructionFloat(
- bb, instr, GetTexCode(instr, texture_type, process_mode, depth_compare, is_array));
+ bb, instr,
+ GetTexCode(instr, texture_type, process_mode, depth_compare, is_array, is_aoffi));
break;
}
case OpCode::Id::TEXS: {
@@ -78,8 +79,6 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
}
case OpCode::Id::TLD4: {
ASSERT(instr.tld4.array == 0);
- UNIMPLEMENTED_IF_MSG(instr.tld4.UsesMiscMode(TextureMiscMode::AOFFI),
- "AOFFI is not implemented");
UNIMPLEMENTED_IF_MSG(instr.tld4.UsesMiscMode(TextureMiscMode::NDV),
"NDV is not implemented");
UNIMPLEMENTED_IF_MSG(instr.tld4.UsesMiscMode(TextureMiscMode::PTP),
@@ -92,8 +91,9 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
const auto texture_type = instr.tld4.texture_type.Value();
const bool depth_compare = instr.tld4.UsesMiscMode(TextureMiscMode::DC);
const bool is_array = instr.tld4.array != 0;
- WriteTexInstructionFloat(bb, instr,
- GetTld4Code(instr, texture_type, depth_compare, is_array));
+ const bool is_aoffi = instr.tld4.UsesMiscMode(TextureMiscMode::AOFFI);
+ WriteTexInstructionFloat(
+ bb, instr, GetTld4Code(instr, texture_type, depth_compare, is_array, is_aoffi));
break;
}
case OpCode::Id::TLD4S: {
@@ -127,7 +127,7 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
Node4 values;
for (u32 element = 0; element < values.size(); ++element) {
auto coords_copy = coords;
- MetaTexture meta{sampler, {}, {}, {}, {}, component, element};
+ MetaTexture meta{sampler, {}, {}, {}, {}, {}, component, element};
values[element] = Operation(OperationCode::TextureGather, meta, std::move(coords_copy));
}
@@ -152,7 +152,7 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
if (!instr.txq.IsComponentEnabled(element)) {
continue;
}
- MetaTexture meta{sampler, {}, {}, {}, {}, {}, element};
+ MetaTexture meta{sampler, {}, {}, {}, {}, {}, {}, element};
const Node value =
Operation(OperationCode::TextureQueryDimensions, meta, GetRegister(instr.gpr8));
SetTemporal(bb, indexer++, value);
@@ -202,7 +202,7 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
for (u32 element = 0; element < 2; ++element) {
auto params = coords;
- MetaTexture meta{sampler, {}, {}, {}, {}, {}, element};
+ MetaTexture meta{sampler, {}, {}, {}, {}, {}, {}, element};
const Node value = Operation(OperationCode::TextureQueryLod, meta, std::move(params));
SetTemporal(bb, element, value);
}
@@ -325,7 +325,8 @@ void ShaderIR::WriteTexsInstructionHalfFloat(NodeBlock& bb, Instruction instr,
Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type,
TextureProcessMode process_mode, std::vector<Node> coords,
- Node array, Node depth_compare, u32 bias_offset) {
+ Node array, Node depth_compare, u32 bias_offset,
+ std::vector<Node> aoffi) {
const bool is_array = array;
const bool is_shadow = depth_compare;
@@ -374,7 +375,7 @@ Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type,
Node4 values;
for (u32 element = 0; element < values.size(); ++element) {
auto copy_coords = coords;
- MetaTexture meta{sampler, array, depth_compare, bias, lod, {}, element};
+ MetaTexture meta{sampler, array, depth_compare, aoffi, bias, lod, {}, element};
values[element] = Operation(read_method, meta, std::move(copy_coords));
}
@@ -382,9 +383,15 @@ Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type,
}
Node4 ShaderIR::GetTexCode(Instruction instr, TextureType texture_type,
- TextureProcessMode process_mode, bool depth_compare, bool is_array) {
- const bool lod_bias_enabled =
- (process_mode != TextureProcessMode::None && process_mode != TextureProcessMode::LZ);
+ TextureProcessMode process_mode, bool depth_compare, bool is_array,
+ bool is_aoffi) {
+ const bool lod_bias_enabled{
+ (process_mode != TextureProcessMode::None && process_mode != TextureProcessMode::LZ)};
+
+ u64 parameter_register = instr.gpr20.Value();
+ if (lod_bias_enabled) {
+ ++parameter_register;
+ }
const auto [coord_count, total_coord_count] = ValidateAndGetCoordinateElement(
texture_type, depth_compare, is_array, lod_bias_enabled, 4, 5);
@@ -404,15 +411,19 @@ Node4 ShaderIR::GetTexCode(Instruction instr, TextureType texture_type,
const Node array = is_array ? GetRegister(array_register) : nullptr;
+ std::vector<Node> aoffi;
+ if (is_aoffi) {
+ aoffi = GetAoffiCoordinates(GetRegister(parameter_register++), coord_count, false);
+ }
+
Node dc{};
if (depth_compare) {
// Depth is always stored in the register signaled by gpr20 or in the next register if lod
// or bias are used
- const u64 depth_register = instr.gpr20.Value() + (lod_bias_enabled ? 1 : 0);
- dc = GetRegister(depth_register);
+ dc = GetRegister(parameter_register++);
}
- return GetTextureCode(instr, texture_type, process_mode, coords, array, dc, 0);
+ return GetTextureCode(instr, texture_type, process_mode, coords, array, dc, 0, aoffi);
}
Node4 ShaderIR::GetTexsCode(Instruction instr, TextureType texture_type,
@@ -448,11 +459,11 @@ Node4 ShaderIR::GetTexsCode(Instruction instr, TextureType texture_type,
dc = GetRegister(depth_register);
}
- return GetTextureCode(instr, texture_type, process_mode, coords, array, dc, bias_offset);
+ return GetTextureCode(instr, texture_type, process_mode, coords, array, dc, bias_offset, {});
}
Node4 ShaderIR::GetTld4Code(Instruction instr, TextureType texture_type, bool depth_compare,
- bool is_array) {
+ bool is_array, bool is_aoffi) {
const std::size_t coord_count = GetCoordCount(texture_type);
const std::size_t total_coord_count = coord_count + (is_array ? 1 : 0);
const std::size_t total_reg_count = total_coord_count + (depth_compare ? 1 : 0);
@@ -463,15 +474,27 @@ Node4 ShaderIR::GetTld4Code(Instruction instr, TextureType texture_type, bool de
const u64 coord_register = array_register + (is_array ? 1 : 0);
std::vector<Node> coords;
- for (size_t i = 0; i < coord_count; ++i)
+ for (std::size_t i = 0; i < coord_count; ++i) {
coords.push_back(GetRegister(coord_register + i));
+ }
+
+ u64 parameter_register = instr.gpr20.Value();
+ std::vector<Node> aoffi;
+ if (is_aoffi) {
+ aoffi = GetAoffiCoordinates(GetRegister(parameter_register++), coord_count, true);
+ }
+
+ Node dc{};
+ if (depth_compare) {
+ dc = GetRegister(parameter_register++);
+ }
const auto& sampler = GetSampler(instr.sampler, texture_type, is_array, depth_compare);
Node4 values;
for (u32 element = 0; element < values.size(); ++element) {
auto coords_copy = coords;
- MetaTexture meta{sampler, GetRegister(array_register), {}, {}, {}, {}, element};
+ MetaTexture meta{sampler, GetRegister(array_register), dc, aoffi, {}, {}, {}, element};
values[element] = Operation(OperationCode::TextureGather, meta, std::move(coords_copy));
}
@@ -507,7 +530,7 @@ Node4 ShaderIR::GetTldsCode(Instruction instr, TextureType texture_type, bool is
Node4 values;
for (u32 element = 0; element < values.size(); ++element) {
auto coords_copy = coords;
- MetaTexture meta{sampler, array, {}, {}, lod, {}, element};
+ MetaTexture meta{sampler, array, {}, {}, {}, lod, {}, element};
values[element] = Operation(OperationCode::TexelFetch, meta, std::move(coords_copy));
}
return values;
@@ -531,4 +554,45 @@ std::tuple<std::size_t, std::size_t> ShaderIR::ValidateAndGetCoordinateElement(
return {coord_count, total_coord_count};
}
-} // namespace VideoCommon::Shader \ No newline at end of file
+std::vector<Node> ShaderIR::GetAoffiCoordinates(Node aoffi_reg, std::size_t coord_count,
+ bool is_tld4) {
+ const auto [coord_offsets, size, wrap_value,
+ diff_value] = [is_tld4]() -> std::tuple<std::array<u32, 3>, u32, s32, s32> {
+ if (is_tld4) {
+ return {{0, 8, 16}, 6, 32, 64};
+ } else {
+ return {{0, 4, 8}, 4, 8, 16};
+ }
+ }();
+ const u32 mask = (1U << size) - 1;
+
+ std::vector<Node> aoffi;
+ aoffi.reserve(coord_count);
+
+ const auto aoffi_immediate{
+ TrackImmediate(aoffi_reg, global_code, static_cast<s64>(global_code.size()))};
+ if (!aoffi_immediate) {
+ // Variable access, not supported on AMD.
+ LOG_WARNING(HW_GPU,
+ "AOFFI constant folding failed, some hardware might have graphical issues");
+ for (std::size_t coord = 0; coord < coord_count; ++coord) {
+ const Node value = BitfieldExtract(aoffi_reg, coord_offsets.at(coord), size);
+ const Node condition =
+ Operation(OperationCode::LogicalIGreaterEqual, value, Immediate(wrap_value));
+ const Node negative = Operation(OperationCode::IAdd, value, Immediate(-diff_value));
+ aoffi.push_back(Operation(OperationCode::Select, condition, negative, value));
+ }
+ return aoffi;
+ }
+
+ for (std::size_t coord = 0; coord < coord_count; ++coord) {
+ s32 value = (*aoffi_immediate >> coord_offsets.at(coord)) & mask;
+ if (value >= wrap_value) {
+ value -= diff_value;
+ }
+ aoffi.push_back(Immediate(value));
+ }
+ return aoffi;
+}
+
+} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/decode/xmad.cpp b/src/video_core/shader/decode/xmad.cpp
index c34843307..db15c0718 100644
--- a/src/video_core/shader/decode/xmad.cpp
+++ b/src/video_core/shader/decode/xmad.cpp
@@ -29,39 +29,55 @@ u32 ShaderIR::DecodeXmad(NodeBlock& bb, u32 pc) {
const bool is_signed_b = instr.xmad.sign_b == 1;
const bool is_signed_c = is_signed_a;
- auto [is_merge, op_b, op_c] = [&]() -> std::tuple<bool, Node, Node> {
+ auto [is_merge, is_psl, is_high_b, mode, op_b,
+ op_c] = [&]() -> std::tuple<bool, bool, bool, Tegra::Shader::XmadMode, Node, Node> {
switch (opcode->get().GetId()) {
case OpCode::Id::XMAD_CR:
return {instr.xmad.merge_56,
+ instr.xmad.product_shift_left_second,
+ instr.xmad.high_b,
+ instr.xmad.mode_cbf,
GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset()),
GetRegister(instr.gpr39)};
case OpCode::Id::XMAD_RR:
- return {instr.xmad.merge_37, GetRegister(instr.gpr20), GetRegister(instr.gpr39)};
+ return {instr.xmad.merge_37, instr.xmad.product_shift_left, instr.xmad.high_b_rr,
+ instr.xmad.mode, GetRegister(instr.gpr20), GetRegister(instr.gpr39)};
case OpCode::Id::XMAD_RC:
- return {false, GetRegister(instr.gpr39),
+ return {false,
+ false,
+ instr.xmad.high_b,
+ instr.xmad.mode_cbf,
+ GetRegister(instr.gpr39),
GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset())};
case OpCode::Id::XMAD_IMM:
- return {instr.xmad.merge_37, Immediate(static_cast<u32>(instr.xmad.imm20_16)),
+ return {instr.xmad.merge_37,
+ instr.xmad.product_shift_left,
+ false,
+ instr.xmad.mode,
+ Immediate(static_cast<u32>(instr.xmad.imm20_16)),
GetRegister(instr.gpr39)};
}
UNIMPLEMENTED_MSG("Unhandled XMAD instruction: {}", opcode->get().GetName());
- return {false, Immediate(0), Immediate(0)};
+ return {false, false, false, Tegra::Shader::XmadMode::None, Immediate(0), Immediate(0)};
}();
op_a = BitfieldExtract(op_a, instr.xmad.high_a ? 16 : 0, 16);
const Node original_b = op_b;
- op_b = BitfieldExtract(op_b, instr.xmad.high_b ? 16 : 0, 16);
+ op_b = BitfieldExtract(op_b, is_high_b ? 16 : 0, 16);
// TODO(Rodrigo): Use an appropiate sign for this operation
Node product = Operation(OperationCode::IMul, NO_PRECISE, op_a, op_b);
- if (instr.xmad.product_shift_left) {
+ if (is_psl) {
product = Operation(OperationCode::ILogicalShiftLeft, NO_PRECISE, product, Immediate(16));
}
+ SetTemporal(bb, 0, product);
+ product = GetTemporal(0);
const Node original_c = op_c;
+ const Tegra::Shader::XmadMode set_mode = mode; // Workaround to clang compile error
op_c = [&]() {
- switch (instr.xmad.mode) {
+ switch (set_mode) {
case Tegra::Shader::XmadMode::None:
return original_c;
case Tegra::Shader::XmadMode::CLo:
@@ -80,8 +96,13 @@ u32 ShaderIR::DecodeXmad(NodeBlock& bb, u32 pc) {
}
}();
+ SetTemporal(bb, 1, op_c);
+ op_c = GetTemporal(1);
+
// TODO(Rodrigo): Use an appropiate sign for this operation
Node sum = Operation(OperationCode::IAdd, product, op_c);
+ SetTemporal(bb, 2, sum);
+ sum = GetTemporal(2);
if (is_merge) {
const Node a = BitfieldExtract(sum, 0, 16);
const Node b =
@@ -95,4 +116,4 @@ u32 ShaderIR::DecodeXmad(NodeBlock& bb, u32 pc) {
return pc;
}
-} // namespace VideoCommon::Shader \ No newline at end of file
+} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h
index 5bc3a3900..1afab08c0 100644
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -7,6 +7,7 @@
#include <array>
#include <cstring>
#include <map>
+#include <optional>
#include <set>
#include <string>
#include <tuple>
@@ -275,6 +276,11 @@ struct GlobalMemoryBase {
}
};
+struct GlobalMemoryUsage {
+ bool is_read{};
+ bool is_written{};
+};
+
struct MetaArithmetic {
bool precise{};
};
@@ -290,6 +296,7 @@ struct MetaTexture {
const Sampler& sampler;
Node array{};
Node depth_compare{};
+ std::vector<Node> aoffi;
Node bias{};
Node lod{};
Node component{};
@@ -576,8 +583,8 @@ public:
return used_clip_distances;
}
- const std::set<GlobalMemoryBase>& GetGlobalMemoryBases() const {
- return used_global_memory_bases;
+ const std::map<GlobalMemoryBase, GlobalMemoryUsage>& GetGlobalMemory() const {
+ return used_global_memory;
}
std::size_t GetLength() const {
@@ -741,14 +748,14 @@ private:
Node4 GetTexCode(Tegra::Shader::Instruction instr, Tegra::Shader::TextureType texture_type,
Tegra::Shader::TextureProcessMode process_mode, bool depth_compare,
- bool is_array);
+ bool is_array, bool is_aoffi);
Node4 GetTexsCode(Tegra::Shader::Instruction instr, Tegra::Shader::TextureType texture_type,
Tegra::Shader::TextureProcessMode process_mode, bool depth_compare,
bool is_array);
Node4 GetTld4Code(Tegra::Shader::Instruction instr, Tegra::Shader::TextureType texture_type,
- bool depth_compare, bool is_array);
+ bool depth_compare, bool is_array, bool is_aoffi);
Node4 GetTldsCode(Tegra::Shader::Instruction instr, Tegra::Shader::TextureType texture_type,
bool is_array);
@@ -757,9 +764,11 @@ private:
Tegra::Shader::TextureType texture_type, bool depth_compare, bool is_array,
bool lod_bias_enabled, std::size_t max_coords, std::size_t max_inputs);
+ std::vector<Node> GetAoffiCoordinates(Node aoffi_reg, std::size_t coord_count, bool is_tld4);
+
Node4 GetTextureCode(Tegra::Shader::Instruction instr, Tegra::Shader::TextureType texture_type,
Tegra::Shader::TextureProcessMode process_mode, std::vector<Node> coords,
- Node array, Node depth_compare, u32 bias_offset);
+ Node array, Node depth_compare, u32 bias_offset, std::vector<Node> aoffi);
Node GetVideoOperand(Node op, bool is_chunk, bool is_signed, Tegra::Shader::VideoType type,
u64 byte_height);
@@ -773,8 +782,15 @@ private:
Node TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor);
+ std::optional<u32> TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor);
+
std::pair<Node, s64> TrackRegister(const GprNode* tracked, const NodeBlock& code, s64 cursor);
+ std::tuple<Node, Node, GlobalMemoryBase> TrackAndGetGlobalMemory(NodeBlock& bb,
+ Node addr_register,
+ u32 immediate_offset,
+ bool is_write);
+
template <typename... T>
Node Operation(OperationCode code, const T*... operands) {
return StoreNode(OperationNode(code, operands...));
@@ -828,7 +844,7 @@ private:
std::map<u32, ConstBuffer> used_cbufs;
std::set<Sampler> used_samplers;
std::array<bool, Tegra::Engines::Maxwell3D::Regs::NumClipDistances> used_clip_distances{};
- std::set<GlobalMemoryBase> used_global_memory_bases;
+ std::map<GlobalMemoryBase, GlobalMemoryUsage> used_global_memory;
Tegra::Shader::Header header;
};
diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp
index 33b071747..4505667ff 100644
--- a/src/video_core/shader/track.cpp
+++ b/src/video_core/shader/track.cpp
@@ -6,6 +6,7 @@
#include <utility>
#include <variant>
+#include "common/common_types.h"
#include "video_core/shader/shader_ir.h"
namespace VideoCommon::Shader {
@@ -14,7 +15,7 @@ namespace {
std::pair<Node, s64> FindOperation(const NodeBlock& code, s64 cursor,
OperationCode operation_code) {
for (; cursor >= 0; --cursor) {
- const Node node = code[cursor];
+ const Node node = code.at(cursor);
if (const auto operation = std::get_if<OperationNode>(node)) {
if (operation->GetCode() == operation_code)
return {node, cursor};
@@ -64,6 +65,20 @@ Node ShaderIR::TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) {
return nullptr;
}
+std::optional<u32> ShaderIR::TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor) {
+ // Reduce the cursor in one to avoid infinite loops when the instruction sets the same register
+ // that it uses as operand
+ const auto [found, found_cursor] =
+ TrackRegister(&std::get<GprNode>(*tracked), code, cursor - 1);
+ if (!found) {
+ return {};
+ }
+ if (const auto immediate = std::get_if<ImmediateNode>(found)) {
+ return immediate->GetValue();
+ }
+ return {};
+}
+
std::pair<Node, s64> ShaderIR::TrackRegister(const GprNode* tracked, const NodeBlock& code,
s64 cursor) {
for (; cursor >= 0; --cursor) {
diff --git a/src/video_core/surface.cpp b/src/video_core/surface.cpp
index a7ac26d71..3b022a456 100644
--- a/src/video_core/surface.cpp
+++ b/src/video_core/surface.cpp
@@ -294,6 +294,8 @@ PixelFormat PixelFormatFromTextureFormat(Tegra::Texture::TextureFormat format,
return PixelFormat::Z16;
case Tegra::Texture::TextureFormat::Z24S8:
return PixelFormat::Z24S8;
+ case Tegra::Texture::TextureFormat::ZF32_X24S8:
+ return PixelFormat::Z32FS8;
case Tegra::Texture::TextureFormat::DXT1:
return is_srgb ? PixelFormat::DXT1_SRGB : PixelFormat::DXT1;
case Tegra::Texture::TextureFormat::DXT23:
diff --git a/src/video_core/texture_cache.cpp b/src/video_core/texture_cache.cpp
new file mode 100644
index 000000000..e96eba7cc
--- /dev/null
+++ b/src/video_core/texture_cache.cpp
@@ -0,0 +1,386 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/alignment.h"
+#include "common/assert.h"
+#include "common/cityhash.h"
+#include "common/common_types.h"
+#include "core/core.h"
+#include "video_core/surface.h"
+#include "video_core/texture_cache.h"
+#include "video_core/textures/decoders.h"
+#include "video_core/textures/texture.h"
+
+namespace VideoCommon {
+
+using VideoCore::Surface::SurfaceTarget;
+
+using VideoCore::Surface::ComponentTypeFromDepthFormat;
+using VideoCore::Surface::ComponentTypeFromRenderTarget;
+using VideoCore::Surface::ComponentTypeFromTexture;
+using VideoCore::Surface::PixelFormatFromDepthFormat;
+using VideoCore::Surface::PixelFormatFromRenderTargetFormat;
+using VideoCore::Surface::PixelFormatFromTextureFormat;
+using VideoCore::Surface::SurfaceTargetFromTextureType;
+
+constexpr u32 GetMipmapSize(bool uncompressed, u32 mip_size, u32 tile) {
+ return uncompressed ? mip_size : std::max(1U, (mip_size + tile - 1) / tile);
+}
+
+SurfaceParams SurfaceParams::CreateForTexture(Core::System& system,
+ const Tegra::Texture::FullTextureInfo& config) {
+ SurfaceParams params;
+ params.is_tiled = config.tic.IsTiled();
+ params.block_width = params.is_tiled ? config.tic.BlockWidth() : 0,
+ params.block_height = params.is_tiled ? config.tic.BlockHeight() : 0,
+ params.block_depth = params.is_tiled ? config.tic.BlockDepth() : 0,
+ params.tile_width_spacing = params.is_tiled ? (1 << config.tic.tile_width_spacing.Value()) : 1;
+ params.pixel_format =
+ PixelFormatFromTextureFormat(config.tic.format, config.tic.r_type.Value(), false);
+ params.component_type = ComponentTypeFromTexture(config.tic.r_type.Value());
+ params.type = GetFormatType(params.pixel_format);
+ params.target = SurfaceTargetFromTextureType(config.tic.texture_type);
+ params.width = Common::AlignUp(config.tic.Width(), GetCompressionFactor(params.pixel_format));
+ params.height = Common::AlignUp(config.tic.Height(), GetCompressionFactor(params.pixel_format));
+ params.depth = config.tic.Depth();
+ if (params.target == SurfaceTarget::TextureCubemap ||
+ params.target == SurfaceTarget::TextureCubeArray) {
+ params.depth *= 6;
+ }
+ params.pitch = params.is_tiled ? 0 : config.tic.Pitch();
+ params.unaligned_height = config.tic.Height();
+ params.num_levels = config.tic.max_mip_level + 1;
+
+ params.CalculateCachedValues();
+ return params;
+}
+
+SurfaceParams SurfaceParams::CreateForDepthBuffer(
+ Core::System& system, u32 zeta_width, u32 zeta_height, Tegra::DepthFormat format,
+ u32 block_width, u32 block_height, u32 block_depth,
+ Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout type) {
+ SurfaceParams params;
+ params.is_tiled = type == Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout::BlockLinear;
+ params.block_width = 1 << std::min(block_width, 5U);
+ params.block_height = 1 << std::min(block_height, 5U);
+ params.block_depth = 1 << std::min(block_depth, 5U);
+ params.tile_width_spacing = 1;
+ params.pixel_format = PixelFormatFromDepthFormat(format);
+ params.component_type = ComponentTypeFromDepthFormat(format);
+ params.type = GetFormatType(params.pixel_format);
+ params.width = zeta_width;
+ params.height = zeta_height;
+ params.unaligned_height = zeta_height;
+ params.target = SurfaceTarget::Texture2D;
+ params.depth = 1;
+ params.num_levels = 1;
+
+ params.CalculateCachedValues();
+ return params;
+}
+
+SurfaceParams SurfaceParams::CreateForFramebuffer(Core::System& system, std::size_t index) {
+ const auto& config{system.GPU().Maxwell3D().regs.rt[index]};
+ SurfaceParams params;
+ params.is_tiled =
+ config.memory_layout.type == Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout::BlockLinear;
+ params.block_width = 1 << config.memory_layout.block_width;
+ params.block_height = 1 << config.memory_layout.block_height;
+ params.block_depth = 1 << config.memory_layout.block_depth;
+ params.tile_width_spacing = 1;
+ params.pixel_format = PixelFormatFromRenderTargetFormat(config.format);
+ params.component_type = ComponentTypeFromRenderTarget(config.format);
+ params.type = GetFormatType(params.pixel_format);
+ if (params.is_tiled) {
+ params.width = config.width;
+ } else {
+ const u32 bpp = GetFormatBpp(params.pixel_format) / CHAR_BIT;
+ params.pitch = config.width;
+ params.width = params.pitch / bpp;
+ }
+ params.height = config.height;
+ params.depth = 1;
+ params.unaligned_height = config.height;
+ params.target = SurfaceTarget::Texture2D;
+ params.num_levels = 1;
+
+ params.CalculateCachedValues();
+ return params;
+}
+
+SurfaceParams SurfaceParams::CreateForFermiCopySurface(
+ const Tegra::Engines::Fermi2D::Regs::Surface& config) {
+ SurfaceParams params{};
+ params.is_tiled = !config.linear;
+ params.block_width = params.is_tiled ? std::min(config.BlockWidth(), 32U) : 0,
+ params.block_height = params.is_tiled ? std::min(config.BlockHeight(), 32U) : 0,
+ params.block_depth = params.is_tiled ? std::min(config.BlockDepth(), 32U) : 0,
+ params.tile_width_spacing = 1;
+ params.pixel_format = PixelFormatFromRenderTargetFormat(config.format);
+ params.component_type = ComponentTypeFromRenderTarget(config.format);
+ params.type = GetFormatType(params.pixel_format);
+ params.width = config.width;
+ params.height = config.height;
+ params.unaligned_height = config.height;
+ // TODO(Rodrigo): Try to guess the surface target from depth and layer parameters
+ params.target = SurfaceTarget::Texture2D;
+ params.depth = 1;
+ params.num_levels = 1;
+
+ params.CalculateCachedValues();
+ return params;
+}
+
+u32 SurfaceParams::GetMipWidth(u32 level) const {
+ return std::max(1U, width >> level);
+}
+
+u32 SurfaceParams::GetMipHeight(u32 level) const {
+ return std::max(1U, height >> level);
+}
+
+u32 SurfaceParams::GetMipDepth(u32 level) const {
+ return IsLayered() ? depth : std::max(1U, depth >> level);
+}
+
+bool SurfaceParams::IsLayered() const {
+ switch (target) {
+ case SurfaceTarget::Texture1DArray:
+ case SurfaceTarget::Texture2DArray:
+ case SurfaceTarget::TextureCubeArray:
+ case SurfaceTarget::TextureCubemap:
+ return true;
+ default:
+ return false;
+ }
+}
+
+u32 SurfaceParams::GetMipBlockHeight(u32 level) const {
+ // Auto block resizing algorithm from:
+ // https://cgit.freedesktop.org/mesa/mesa/tree/src/gallium/drivers/nouveau/nv50/nv50_miptree.c
+ if (level == 0) {
+ return block_height;
+ }
+ const u32 height{GetMipHeight(level)};
+ const u32 default_block_height{GetDefaultBlockHeight(pixel_format)};
+ const u32 blocks_in_y{(height + default_block_height - 1) / default_block_height};
+ u32 block_height = 16;
+ while (block_height > 1 && blocks_in_y <= block_height * 4) {
+ block_height >>= 1;
+ }
+ return block_height;
+}
+
+u32 SurfaceParams::GetMipBlockDepth(u32 level) const {
+ if (level == 0)
+ return block_depth;
+ if (target != SurfaceTarget::Texture3D)
+ return 1;
+
+ const u32 depth{GetMipDepth(level)};
+ u32 block_depth = 32;
+ while (block_depth > 1 && depth * 2 <= block_depth) {
+ block_depth >>= 1;
+ }
+ if (block_depth == 32 && GetMipBlockHeight(level) >= 4) {
+ return 16;
+ }
+ return block_depth;
+}
+
+std::size_t SurfaceParams::GetGuestMipmapLevelOffset(u32 level) const {
+ std::size_t offset = 0;
+ for (u32 i = 0; i < level; i++) {
+ offset += GetInnerMipmapMemorySize(i, false, IsLayered(), false);
+ }
+ return offset;
+}
+
+std::size_t SurfaceParams::GetHostMipmapLevelOffset(u32 level) const {
+ std::size_t offset = 0;
+ for (u32 i = 0; i < level; i++) {
+ offset += GetInnerMipmapMemorySize(i, true, false, false);
+ }
+ return offset;
+}
+
+std::size_t SurfaceParams::GetGuestLayerSize() const {
+ return GetInnerMemorySize(false, true, false);
+}
+
+std::size_t SurfaceParams::GetHostLayerSize(u32 level) const {
+ return GetInnerMipmapMemorySize(level, true, IsLayered(), false);
+}
+
+bool SurfaceParams::IsFamiliar(const SurfaceParams& view_params) const {
+ if (std::tie(is_tiled, tile_width_spacing, pixel_format, component_type, type) !=
+ std::tie(view_params.is_tiled, view_params.tile_width_spacing, view_params.pixel_format,
+ view_params.component_type, view_params.type)) {
+ return false;
+ }
+
+ const SurfaceTarget view_target{view_params.target};
+ if (view_target == target) {
+ return true;
+ }
+
+ switch (target) {
+ case SurfaceTarget::Texture1D:
+ case SurfaceTarget::Texture2D:
+ case SurfaceTarget::Texture3D:
+ return false;
+ case SurfaceTarget::Texture1DArray:
+ return view_target == SurfaceTarget::Texture1D;
+ case SurfaceTarget::Texture2DArray:
+ return view_target == SurfaceTarget::Texture2D;
+ case SurfaceTarget::TextureCubemap:
+ return view_target == SurfaceTarget::Texture2D ||
+ view_target == SurfaceTarget::Texture2DArray;
+ case SurfaceTarget::TextureCubeArray:
+ return view_target == SurfaceTarget::Texture2D ||
+ view_target == SurfaceTarget::Texture2DArray ||
+ view_target == SurfaceTarget::TextureCubemap;
+ default:
+ UNIMPLEMENTED_MSG("Unimplemented texture family={}", static_cast<u32>(target));
+ return false;
+ }
+}
+
+bool SurfaceParams::IsPixelFormatZeta() const {
+ return pixel_format >= VideoCore::Surface::PixelFormat::MaxColorFormat &&
+ pixel_format < VideoCore::Surface::PixelFormat::MaxDepthStencilFormat;
+}
+
+void SurfaceParams::CalculateCachedValues() {
+ guest_size_in_bytes = GetInnerMemorySize(false, false, false);
+
+ // ASTC is uncompressed in software, in emulated as RGBA8
+ if (IsPixelFormatASTC(pixel_format)) {
+ host_size_in_bytes = width * height * depth * 4;
+ } else {
+ host_size_in_bytes = GetInnerMemorySize(true, false, false);
+ }
+
+ switch (target) {
+ case SurfaceTarget::Texture1D:
+ case SurfaceTarget::Texture2D:
+ case SurfaceTarget::Texture3D:
+ num_layers = 1;
+ break;
+ case SurfaceTarget::Texture1DArray:
+ case SurfaceTarget::Texture2DArray:
+ case SurfaceTarget::TextureCubemap:
+ case SurfaceTarget::TextureCubeArray:
+ num_layers = depth;
+ break;
+ default:
+ UNREACHABLE();
+ }
+}
+
+std::size_t SurfaceParams::GetInnerMipmapMemorySize(u32 level, bool as_host_size, bool layer_only,
+ bool uncompressed) const {
+ const bool tiled{as_host_size ? false : is_tiled};
+ const u32 tile_x{GetDefaultBlockWidth(pixel_format)};
+ const u32 tile_y{GetDefaultBlockHeight(pixel_format)};
+ const u32 width{GetMipmapSize(uncompressed, GetMipWidth(level), tile_x)};
+ const u32 height{GetMipmapSize(uncompressed, GetMipHeight(level), tile_y)};
+ const u32 depth{layer_only ? 1U : GetMipDepth(level)};
+ return Tegra::Texture::CalculateSize(tiled, GetBytesPerPixel(pixel_format), width, height,
+ depth, GetMipBlockHeight(level), GetMipBlockDepth(level));
+}
+
+std::size_t SurfaceParams::GetInnerMemorySize(bool as_host_size, bool layer_only,
+ bool uncompressed) const {
+ std::size_t size = 0;
+ for (u32 level = 0; level < num_levels; ++level) {
+ size += GetInnerMipmapMemorySize(level, as_host_size, layer_only, uncompressed);
+ }
+ if (!as_host_size && is_tiled) {
+ size = Common::AlignUp(size, Tegra::Texture::GetGOBSize() * block_height * block_depth);
+ }
+ return size;
+}
+
+std::map<u64, std::pair<u32, u32>> SurfaceParams::CreateViewOffsetMap() const {
+ std::map<u64, std::pair<u32, u32>> view_offset_map;
+ switch (target) {
+ case SurfaceTarget::Texture1D:
+ case SurfaceTarget::Texture2D:
+ case SurfaceTarget::Texture3D: {
+ constexpr u32 layer = 0;
+ for (u32 level = 0; level < num_levels; ++level) {
+ const std::size_t offset{GetGuestMipmapLevelOffset(level)};
+ view_offset_map.insert({offset, {layer, level}});
+ }
+ break;
+ }
+ case SurfaceTarget::Texture1DArray:
+ case SurfaceTarget::Texture2DArray:
+ case SurfaceTarget::TextureCubemap:
+ case SurfaceTarget::TextureCubeArray: {
+ const std::size_t layer_size{GetGuestLayerSize()};
+ for (u32 level = 0; level < num_levels; ++level) {
+ const std::size_t level_offset{GetGuestMipmapLevelOffset(level)};
+ for (u32 layer = 0; layer < num_layers; ++layer) {
+ const auto layer_offset{static_cast<std::size_t>(layer_size * layer)};
+ const std::size_t offset{level_offset + layer_offset};
+ view_offset_map.insert({offset, {layer, level}});
+ }
+ }
+ break;
+ }
+ default:
+ UNIMPLEMENTED_MSG("Unimplemented surface target {}", static_cast<u32>(target));
+ }
+ return view_offset_map;
+}
+
+bool SurfaceParams::IsViewValid(const SurfaceParams& view_params, u32 layer, u32 level) const {
+ return IsDimensionValid(view_params, level) && IsDepthValid(view_params, level) &&
+ IsInBounds(view_params, layer, level);
+}
+
+bool SurfaceParams::IsDimensionValid(const SurfaceParams& view_params, u32 level) const {
+ return view_params.width == GetMipWidth(level) && view_params.height == GetMipHeight(level);
+}
+
+bool SurfaceParams::IsDepthValid(const SurfaceParams& view_params, u32 level) const {
+ if (view_params.target != SurfaceTarget::Texture3D) {
+ return true;
+ }
+ return view_params.depth == GetMipDepth(level);
+}
+
+bool SurfaceParams::IsInBounds(const SurfaceParams& view_params, u32 layer, u32 level) const {
+ return layer + view_params.num_layers <= num_layers &&
+ level + view_params.num_levels <= num_levels;
+}
+
+std::size_t HasheableSurfaceParams::Hash() const {
+ return static_cast<std::size_t>(
+ Common::CityHash64(reinterpret_cast<const char*>(this), sizeof(*this)));
+}
+
+bool HasheableSurfaceParams::operator==(const HasheableSurfaceParams& rhs) const {
+ return std::tie(is_tiled, block_width, block_height, block_depth, tile_width_spacing, width,
+ height, depth, pitch, unaligned_height, num_levels, pixel_format,
+ component_type, type, target) ==
+ std::tie(rhs.is_tiled, rhs.block_width, rhs.block_height, rhs.block_depth,
+ rhs.tile_width_spacing, rhs.width, rhs.height, rhs.depth, rhs.pitch,
+ rhs.unaligned_height, rhs.num_levels, rhs.pixel_format, rhs.component_type,
+ rhs.type, rhs.target);
+}
+
+std::size_t ViewKey::Hash() const {
+ return static_cast<std::size_t>(
+ Common::CityHash64(reinterpret_cast<const char*>(this), sizeof(*this)));
+}
+
+bool ViewKey::operator==(const ViewKey& rhs) const {
+ return std::tie(base_layer, num_layers, base_level, num_levels) ==
+ std::tie(rhs.base_layer, rhs.num_layers, rhs.base_level, rhs.num_levels);
+}
+
+} // namespace VideoCommon
diff --git a/src/video_core/texture_cache.h b/src/video_core/texture_cache.h
new file mode 100644
index 000000000..041551691
--- /dev/null
+++ b/src/video_core/texture_cache.h
@@ -0,0 +1,586 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <list>
+#include <memory>
+#include <set>
+#include <tuple>
+#include <type_traits>
+#include <unordered_map>
+
+#include <boost/icl/interval_map.hpp>
+#include <boost/range/iterator_range.hpp>
+
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "core/memory.h"
+#include "video_core/engines/fermi_2d.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/gpu.h"
+#include "video_core/rasterizer_interface.h"
+#include "video_core/surface.h"
+
+namespace Core {
+class System;
+}
+
+namespace Tegra::Texture {
+struct FullTextureInfo;
+}
+
+namespace VideoCore {
+class RasterizerInterface;
+}
+
+namespace VideoCommon {
+
+class HasheableSurfaceParams {
+public:
+ std::size_t Hash() const;
+
+ bool operator==(const HasheableSurfaceParams& rhs) const;
+
+protected:
+ // Avoid creation outside of a managed environment.
+ HasheableSurfaceParams() = default;
+
+ bool is_tiled;
+ u32 block_width;
+ u32 block_height;
+ u32 block_depth;
+ u32 tile_width_spacing;
+ u32 width;
+ u32 height;
+ u32 depth;
+ u32 pitch;
+ u32 unaligned_height;
+ u32 num_levels;
+ VideoCore::Surface::PixelFormat pixel_format;
+ VideoCore::Surface::ComponentType component_type;
+ VideoCore::Surface::SurfaceType type;
+ VideoCore::Surface::SurfaceTarget target;
+};
+
+class SurfaceParams final : public HasheableSurfaceParams {
+public:
+ /// Creates SurfaceCachedParams from a texture configuration.
+ static SurfaceParams CreateForTexture(Core::System& system,
+ const Tegra::Texture::FullTextureInfo& config);
+
+ /// Creates SurfaceCachedParams for a depth buffer configuration.
+ static SurfaceParams CreateForDepthBuffer(
+ Core::System& system, u32 zeta_width, u32 zeta_height, Tegra::DepthFormat format,
+ u32 block_width, u32 block_height, u32 block_depth,
+ Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout type);
+
+ /// Creates SurfaceCachedParams from a framebuffer configuration.
+ static SurfaceParams CreateForFramebuffer(Core::System& system, std::size_t index);
+
+ /// Creates SurfaceCachedParams from a Fermi2D surface configuration.
+ static SurfaceParams CreateForFermiCopySurface(
+ const Tegra::Engines::Fermi2D::Regs::Surface& config);
+
+ bool IsTiled() const {
+ return is_tiled;
+ }
+
+ u32 GetBlockWidth() const {
+ return block_width;
+ }
+
+ u32 GetTileWidthSpacing() const {
+ return tile_width_spacing;
+ }
+
+ u32 GetWidth() const {
+ return width;
+ }
+
+ u32 GetHeight() const {
+ return height;
+ }
+
+ u32 GetDepth() const {
+ return depth;
+ }
+
+ u32 GetPitch() const {
+ return pitch;
+ }
+
+ u32 GetNumLevels() const {
+ return num_levels;
+ }
+
+ VideoCore::Surface::PixelFormat GetPixelFormat() const {
+ return pixel_format;
+ }
+
+ VideoCore::Surface::ComponentType GetComponentType() const {
+ return component_type;
+ }
+
+ VideoCore::Surface::SurfaceTarget GetTarget() const {
+ return target;
+ }
+
+ VideoCore::Surface::SurfaceType GetType() const {
+ return type;
+ }
+
+ std::size_t GetGuestSizeInBytes() const {
+ return guest_size_in_bytes;
+ }
+
+ std::size_t GetHostSizeInBytes() const {
+ return host_size_in_bytes;
+ }
+
+ u32 GetNumLayers() const {
+ return num_layers;
+ }
+
+ /// Returns the width of a given mipmap level.
+ u32 GetMipWidth(u32 level) const;
+
+ /// Returns the height of a given mipmap level.
+ u32 GetMipHeight(u32 level) const;
+
+ /// Returns the depth of a given mipmap level.
+ u32 GetMipDepth(u32 level) const;
+
+ /// Returns true if these parameters are from a layered surface.
+ bool IsLayered() const;
+
+ /// Returns the block height of a given mipmap level.
+ u32 GetMipBlockHeight(u32 level) const;
+
+ /// Returns the block depth of a given mipmap level.
+ u32 GetMipBlockDepth(u32 level) const;
+
+ /// Returns the offset in bytes in guest memory of a given mipmap level.
+ std::size_t GetGuestMipmapLevelOffset(u32 level) const;
+
+ /// Returns the offset in bytes in host memory (linear) of a given mipmap level.
+ std::size_t GetHostMipmapLevelOffset(u32 level) const;
+
+ /// Returns the size of a layer in bytes in guest memory.
+ std::size_t GetGuestLayerSize() const;
+
+ /// Returns the size of a layer in bytes in host memory for a given mipmap level.
+ std::size_t GetHostLayerSize(u32 level) const;
+
+ /// Returns true if another surface can be familiar with this. This is a loosely defined term
+ /// that reflects the possibility of these two surface parameters potentially being part of a
+ /// bigger superset.
+ bool IsFamiliar(const SurfaceParams& view_params) const;
+
+ /// Returns true if the pixel format is a depth and/or stencil format.
+ bool IsPixelFormatZeta() const;
+
+ /// Creates a map that redirects an address difference to a layer and mipmap level.
+ std::map<u64, std::pair<u32, u32>> CreateViewOffsetMap() const;
+
+ /// Returns true if the passed surface view parameters is equal or a valid subset of this.
+ bool IsViewValid(const SurfaceParams& view_params, u32 layer, u32 level) const;
+
+private:
+ /// Calculates values that can be deduced from HasheableSurfaceParams.
+ void CalculateCachedValues();
+
+ /// Returns the size of a given mipmap level.
+ std::size_t GetInnerMipmapMemorySize(u32 level, bool as_host_size, bool layer_only,
+ bool uncompressed) const;
+
+ /// Returns the size of all mipmap levels and aligns as needed.
+ std::size_t GetInnerMemorySize(bool as_host_size, bool layer_only, bool uncompressed) const;
+
+ /// Returns true if the passed view width and height match the size of this params in a given
+ /// mipmap level.
+ bool IsDimensionValid(const SurfaceParams& view_params, u32 level) const;
+
+ /// Returns true if the passed view depth match the size of this params in a given mipmap level.
+ bool IsDepthValid(const SurfaceParams& view_params, u32 level) const;
+
+ /// Returns true if the passed view layers and mipmap levels are in bounds.
+ bool IsInBounds(const SurfaceParams& view_params, u32 layer, u32 level) const;
+
+ std::size_t guest_size_in_bytes;
+ std::size_t host_size_in_bytes;
+ u32 num_layers;
+};
+
+struct ViewKey {
+ std::size_t Hash() const;
+
+ bool operator==(const ViewKey& rhs) const;
+
+ u32 base_layer{};
+ u32 num_layers{};
+ u32 base_level{};
+ u32 num_levels{};
+};
+
+} // namespace VideoCommon
+
+namespace std {
+
+template <>
+struct hash<VideoCommon::SurfaceParams> {
+ std::size_t operator()(const VideoCommon::SurfaceParams& k) const noexcept {
+ return k.Hash();
+ }
+};
+
+template <>
+struct hash<VideoCommon::ViewKey> {
+ std::size_t operator()(const VideoCommon::ViewKey& k) const noexcept {
+ return k.Hash();
+ }
+};
+
+} // namespace std
+
+namespace VideoCommon {
+
+template <typename TView, typename TExecutionContext>
+class SurfaceBase {
+ static_assert(std::is_trivially_copyable_v<TExecutionContext>);
+
+public:
+ virtual void LoadBuffer() = 0;
+
+ virtual TExecutionContext FlushBuffer(TExecutionContext exctx) = 0;
+
+ virtual TExecutionContext UploadTexture(TExecutionContext exctx) = 0;
+
+ TView* TryGetView(VAddr view_addr, const SurfaceParams& view_params) {
+ if (view_addr < cpu_addr || !params.IsFamiliar(view_params)) {
+ // It can't be a view if it's in a prior address.
+ return {};
+ }
+
+ const auto relative_offset{static_cast<u64>(view_addr - cpu_addr)};
+ const auto it{view_offset_map.find(relative_offset)};
+ if (it == view_offset_map.end()) {
+ // Couldn't find an aligned view.
+ return {};
+ }
+ const auto [layer, level] = it->second;
+
+ if (!params.IsViewValid(view_params, layer, level)) {
+ return {};
+ }
+
+ return GetView(layer, view_params.GetNumLayers(), level, view_params.GetNumLevels());
+ }
+
+ VAddr GetCpuAddr() const {
+ ASSERT(is_registered);
+ return cpu_addr;
+ }
+
+ u8* GetHostPtr() const {
+ ASSERT(is_registered);
+ return host_ptr;
+ }
+
+ CacheAddr GetCacheAddr() const {
+ ASSERT(is_registered);
+ return cache_addr;
+ }
+
+ std::size_t GetSizeInBytes() const {
+ return params.GetGuestSizeInBytes();
+ }
+
+ void MarkAsModified(bool is_modified_) {
+ is_modified = is_modified_;
+ }
+
+ const SurfaceParams& GetSurfaceParams() const {
+ return params;
+ }
+
+ TView* GetView(VAddr view_addr, const SurfaceParams& view_params) {
+ TView* view{TryGetView(view_addr, view_params)};
+ ASSERT(view != nullptr);
+ return view;
+ }
+
+ void Register(VAddr cpu_addr_, u8* host_ptr_) {
+ ASSERT(!is_registered);
+ is_registered = true;
+ cpu_addr = cpu_addr_;
+ host_ptr = host_ptr_;
+ cache_addr = ToCacheAddr(host_ptr_);
+ }
+
+ void Register(VAddr cpu_addr_) {
+ Register(cpu_addr_, Memory::GetPointer(cpu_addr_));
+ }
+
+ void Unregister() {
+ ASSERT(is_registered);
+ is_registered = false;
+ }
+
+ bool IsRegistered() const {
+ return is_registered;
+ }
+
+protected:
+ explicit SurfaceBase(const SurfaceParams& params)
+ : params{params}, view_offset_map{params.CreateViewOffsetMap()} {}
+
+ ~SurfaceBase() = default;
+
+ virtual std::unique_ptr<TView> CreateView(const ViewKey& view_key) = 0;
+
+ bool IsModified() const {
+ return is_modified;
+ }
+
+ const SurfaceParams params;
+
+private:
+ TView* GetView(u32 base_layer, u32 num_layers, u32 base_level, u32 num_levels) {
+ const ViewKey key{base_layer, num_layers, base_level, num_levels};
+ const auto [entry, is_cache_miss] = views.try_emplace(key);
+ auto& view{entry->second};
+ if (is_cache_miss) {
+ view = CreateView(key);
+ }
+ return view.get();
+ }
+
+ const std::map<u64, std::pair<u32, u32>> view_offset_map;
+
+ VAddr cpu_addr{};
+ u8* host_ptr{};
+ CacheAddr cache_addr{};
+ bool is_modified{};
+ bool is_registered{};
+ std::unordered_map<ViewKey, std::unique_ptr<TView>> views;
+};
+
+template <typename TSurface, typename TView, typename TExecutionContext>
+class TextureCache {
+ static_assert(std::is_trivially_copyable_v<TExecutionContext>);
+ using ResultType = std::tuple<TView*, TExecutionContext>;
+ using IntervalMap = boost::icl::interval_map<CacheAddr, std::set<TSurface*>>;
+ using IntervalType = typename IntervalMap::interval_type;
+
+public:
+ void InvalidateRegion(CacheAddr addr, std::size_t size) {
+ for (TSurface* surface : GetSurfacesInRegion(addr, size)) {
+ if (!surface->IsRegistered()) {
+ // Skip duplicates
+ continue;
+ }
+ Unregister(surface);
+ }
+ }
+
+ ResultType GetTextureSurface(TExecutionContext exctx,
+ const Tegra::Texture::FullTextureInfo& config) {
+ auto& memory_manager{system.GPU().MemoryManager()};
+ const auto cpu_addr{memory_manager.GpuToCpuAddress(config.tic.Address())};
+ if (!cpu_addr) {
+ return {{}, exctx};
+ }
+ const auto params{SurfaceParams::CreateForTexture(system, config)};
+ return GetSurfaceView(exctx, *cpu_addr, params, true);
+ }
+
+ ResultType GetDepthBufferSurface(TExecutionContext exctx, bool preserve_contents) {
+ const auto& regs{system.GPU().Maxwell3D().regs};
+ if (!regs.zeta.Address() || !regs.zeta_enable) {
+ return {{}, exctx};
+ }
+
+ auto& memory_manager{system.GPU().MemoryManager()};
+ const auto cpu_addr{memory_manager.GpuToCpuAddress(regs.zeta.Address())};
+ if (!cpu_addr) {
+ return {{}, exctx};
+ }
+
+ const auto depth_params{SurfaceParams::CreateForDepthBuffer(
+ system, regs.zeta_width, regs.zeta_height, regs.zeta.format,
+ regs.zeta.memory_layout.block_width, regs.zeta.memory_layout.block_height,
+ regs.zeta.memory_layout.block_depth, regs.zeta.memory_layout.type)};
+ return GetSurfaceView(exctx, *cpu_addr, depth_params, preserve_contents);
+ }
+
+ ResultType GetColorBufferSurface(TExecutionContext exctx, std::size_t index,
+ bool preserve_contents) {
+ ASSERT(index < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets);
+
+ const auto& regs{system.GPU().Maxwell3D().regs};
+ if (index >= regs.rt_control.count || regs.rt[index].Address() == 0 ||
+ regs.rt[index].format == Tegra::RenderTargetFormat::NONE) {
+ return {{}, exctx};
+ }
+
+ auto& memory_manager{system.GPU().MemoryManager()};
+ const auto& config{system.GPU().Maxwell3D().regs.rt[index]};
+ const auto cpu_addr{memory_manager.GpuToCpuAddress(
+ config.Address() + config.base_layer * config.layer_stride * sizeof(u32))};
+ if (!cpu_addr) {
+ return {{}, exctx};
+ }
+
+ return GetSurfaceView(exctx, *cpu_addr, SurfaceParams::CreateForFramebuffer(system, index),
+ preserve_contents);
+ }
+
+ ResultType GetFermiSurface(TExecutionContext exctx,
+ const Tegra::Engines::Fermi2D::Regs::Surface& config) {
+ const auto cpu_addr{system.GPU().MemoryManager().GpuToCpuAddress(config.Address())};
+ ASSERT(cpu_addr);
+ return GetSurfaceView(exctx, *cpu_addr, SurfaceParams::CreateForFermiCopySurface(config),
+ true);
+ }
+
+ TSurface* TryFindFramebufferSurface(const u8* host_ptr) const {
+ const auto it{registered_surfaces.find(ToCacheAddr(host_ptr))};
+ return it != registered_surfaces.end() ? *it->second.begin() : nullptr;
+ }
+
+protected:
+ TextureCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer)
+ : system{system}, rasterizer{rasterizer} {}
+
+ ~TextureCache() = default;
+
+ virtual ResultType TryFastGetSurfaceView(TExecutionContext exctx, VAddr cpu_addr, u8* host_ptr,
+ const SurfaceParams& params, bool preserve_contents,
+ const std::vector<TSurface*>& overlaps) = 0;
+
+ virtual std::unique_ptr<TSurface> CreateSurface(const SurfaceParams& params) = 0;
+
+ void Register(TSurface* surface, VAddr cpu_addr, u8* host_ptr) {
+ surface->Register(cpu_addr, host_ptr);
+ registered_surfaces.add({GetSurfaceInterval(surface), {surface}});
+ rasterizer.UpdatePagesCachedCount(surface->GetCpuAddr(), surface->GetSizeInBytes(), 1);
+ }
+
+ void Unregister(TSurface* surface) {
+ registered_surfaces.subtract({GetSurfaceInterval(surface), {surface}});
+ rasterizer.UpdatePagesCachedCount(surface->GetCpuAddr(), surface->GetSizeInBytes(), -1);
+ surface->Unregister();
+ }
+
+ TSurface* GetUncachedSurface(const SurfaceParams& params) {
+ if (TSurface* surface = TryGetReservedSurface(params); surface)
+ return surface;
+ // No reserved surface available, create a new one and reserve it
+ auto new_surface{CreateSurface(params)};
+ TSurface* surface{new_surface.get()};
+ ReserveSurface(params, std::move(new_surface));
+ return surface;
+ }
+
+ Core::System& system;
+
+private:
+ ResultType GetSurfaceView(TExecutionContext exctx, VAddr cpu_addr, const SurfaceParams& params,
+ bool preserve_contents) {
+ const auto host_ptr{Memory::GetPointer(cpu_addr)};
+ const auto cache_addr{ToCacheAddr(host_ptr)};
+ const auto overlaps{GetSurfacesInRegion(cache_addr, params.GetGuestSizeInBytes())};
+ if (overlaps.empty()) {
+ return LoadSurfaceView(exctx, cpu_addr, host_ptr, params, preserve_contents);
+ }
+
+ if (overlaps.size() == 1) {
+ if (TView* view = overlaps[0]->TryGetView(cpu_addr, params); view)
+ return {view, exctx};
+ }
+
+ TView* fast_view;
+ std::tie(fast_view, exctx) =
+ TryFastGetSurfaceView(exctx, cpu_addr, host_ptr, params, preserve_contents, overlaps);
+
+ for (TSurface* surface : overlaps) {
+ if (!fast_view) {
+ // Flush even when we don't care about the contents, to preserve memory not written
+ // by the new surface.
+ exctx = surface->FlushBuffer(exctx);
+ }
+ Unregister(surface);
+ }
+
+ if (fast_view) {
+ return {fast_view, exctx};
+ }
+
+ return LoadSurfaceView(exctx, cpu_addr, host_ptr, params, preserve_contents);
+ }
+
+ ResultType LoadSurfaceView(TExecutionContext exctx, VAddr cpu_addr, u8* host_ptr,
+ const SurfaceParams& params, bool preserve_contents) {
+ TSurface* new_surface{GetUncachedSurface(params)};
+ Register(new_surface, cpu_addr, host_ptr);
+ if (preserve_contents) {
+ exctx = LoadSurface(exctx, new_surface);
+ }
+ return {new_surface->GetView(cpu_addr, params), exctx};
+ }
+
+ TExecutionContext LoadSurface(TExecutionContext exctx, TSurface* surface) {
+ surface->LoadBuffer();
+ exctx = surface->UploadTexture(exctx);
+ surface->MarkAsModified(false);
+ return exctx;
+ }
+
+ std::vector<TSurface*> GetSurfacesInRegion(CacheAddr cache_addr, std::size_t size) const {
+ if (size == 0) {
+ return {};
+ }
+ const IntervalType interval{cache_addr, cache_addr + size};
+
+ std::vector<TSurface*> surfaces;
+ for (auto& pair : boost::make_iterator_range(registered_surfaces.equal_range(interval))) {
+ surfaces.push_back(*pair.second.begin());
+ }
+ return surfaces;
+ }
+
+ void ReserveSurface(const SurfaceParams& params, std::unique_ptr<TSurface> surface) {
+ surface_reserve[params].push_back(std::move(surface));
+ }
+
+ TSurface* TryGetReservedSurface(const SurfaceParams& params) {
+ auto search{surface_reserve.find(params)};
+ if (search == surface_reserve.end()) {
+ return {};
+ }
+ for (auto& surface : search->second) {
+ if (!surface->IsRegistered()) {
+ return surface.get();
+ }
+ }
+ return {};
+ }
+
+ IntervalType GetSurfaceInterval(TSurface* surface) const {
+ return IntervalType::right_open(surface->GetCacheAddr(),
+ surface->GetCacheAddr() + surface->GetSizeInBytes());
+ }
+
+ VideoCore::RasterizerInterface& rasterizer;
+
+ IntervalMap registered_surfaces;
+
+ /// The surface reserve is a "backup" cache, this is where we put unique surfaces that have
+ /// previously been used. This is to prevent surfaces from being constantly created and
+ /// destroyed when used with different surface parameters.
+ std::unordered_map<SurfaceParams, std::list<std::unique_ptr<TSurface>>> surface_reserve;
+};
+
+} // namespace VideoCommon
diff --git a/src/video_core/textures/convert.cpp b/src/video_core/textures/convert.cpp
index 5e439f036..82050bd51 100644
--- a/src/video_core/textures/convert.cpp
+++ b/src/video_core/textures/convert.cpp
@@ -10,6 +10,7 @@
#include "common/assert.h"
#include "common/common_types.h"
#include "common/logging/log.h"
+#include "video_core/surface.h"
#include "video_core/textures/astc.h"
#include "video_core/textures/convert.h"
diff --git a/src/video_core/textures/convert.h b/src/video_core/textures/convert.h
index 07cd8b5da..12542e71c 100644
--- a/src/video_core/textures/convert.h
+++ b/src/video_core/textures/convert.h
@@ -5,7 +5,10 @@
#pragma once
#include "common/common_types.h"
-#include "video_core/surface.h"
+
+namespace VideoCore::Surface {
+enum class PixelFormat;
+}
namespace Tegra::Texture {
diff --git a/src/video_core/textures/texture.h b/src/video_core/textures/texture.h
index 93ecc6e31..bea0d5bc2 100644
--- a/src/video_core/textures/texture.h
+++ b/src/video_core/textures/texture.h
@@ -7,9 +7,7 @@
#include <array>
#include "common/assert.h"
#include "common/bit_field.h"
-#include "common/common_funcs.h"
#include "common/common_types.h"
-#include "video_core/memory_manager.h"
namespace Tegra::Texture {