summary | refs | log | tree | commit | diff | stats
path: root/src/video_core
diff options
context:
space:
mode:
Diffstat (limited to 'src/video_core')
-rw-r--r-- src/video_core/engines/maxwell_3d.cpp 2
-rw-r--r-- src/video_core/engines/shader_bytecode.h 31
-rw-r--r-- src/video_core/renderer_opengl/gl_shader_cache.cpp 3
-rw-r--r-- src/video_core/renderer_opengl/gl_shader_decompiler.cpp 110
-rw-r--r-- src/video_core/renderer_opengl/maxwell_to_gl.h 4
-rw-r--r-- src/video_core/renderer_vulkan/vk_shader_decompiler.cpp 50
-rw-r--r-- src/video_core/shader/decode/arithmetic_integer.cpp 29
-rw-r--r-- src/video_core/shader/decode/memory.cpp 48
-rw-r--r-- src/video_core/shader/decode/warp.cpp 47
-rw-r--r-- src/video_core/shader/node.h 26
-rw-r--r-- src/video_core/shader/shader_ir.cpp 9
-rw-r--r-- src/video_core/shader/shader_ir.h 6
12 files changed, 322 insertions, 43 deletions
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index c7a3c85a0..fb3d1112c 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -541,7 +541,7 @@ void Maxwell3D::ProcessSyncPoint() {
}
void Maxwell3D::DrawArrays() {
- LOG_DEBUG(HW_GPU, "called, topology={}, count={}", static_cast<u32>(regs.draw.topology.Value()),
+ LOG_TRACE(HW_GPU, "called, topology={}, count={}", static_cast<u32>(regs.draw.topology.Value()),
regs.vertex_buffer.count);
ASSERT_MSG(!(regs.index_array.count && regs.vertex_buffer.count), "Both indexed and direct?");
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index 052e6d24e..28272ef6f 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -566,6 +566,13 @@ enum class ImageAtomicOperation : u64 {
Exch = 8,
};
+enum class ShuffleOperation : u64 { // SHFL mode selector, decoded from the 2-bit `operation` field of Instruction::shfl
+ Idx = 0, // Indexed shuffle; emitted as GLSL shuffleNV
+ Up = 1, // Shuffle up; emitted as GLSL shuffleUpNV
+ Down = 2, // Shuffle down; emitted as GLSL shuffleDownNV
+ Bfly = 3, // Butterfly shuffle; emitted as GLSL shuffleXorNV
+};
+
union Instruction {
Instruction& operator=(const Instruction& instr) {
value = instr.value;
@@ -600,6 +607,15 @@ union Instruction {
} vote;
union {
+ BitField<30, 2, ShuffleOperation> operation; // Which shuffle variant the instruction performs
+ BitField<48, 3, u64> pred48; // Predicate that the decoder sets to the in-range result
+ BitField<28, 1, u64> is_index_imm; // Non-zero: the index is index_imm; otherwise it is read from gpr20
+ BitField<29, 1, u64> is_mask_imm; // Non-zero: the mask is mask_imm; otherwise it is read from gpr39
+ BitField<20, 5, u64> index_imm; // Immediate index operand
+ BitField<34, 13, u64> mask_imm; // Immediate mask operand (the decoder converts it into a shuffle width)
+ } shfl;
+
+ union {
BitField<8, 8, Register> gpr;
BitField<20, 24, s64> offset;
} gmem;
@@ -934,6 +950,11 @@ union Instruction {
} isetp;
union {
+ BitField<48, 1, u64> is_signed; // Non-zero is passed on as the signedness flag of the integer comparison
+ BitField<49, 3, PredCondition> cond; // Condition the ICMP decoder uses to compare its test operand against zero
+ } icmp;
+
+ union {
BitField<0, 3, u64> pred0;
BitField<3, 3, u64> pred3;
BitField<12, 3, u64> pred12;
@@ -1542,6 +1563,7 @@ public:
BRK,
DEPBAR,
VOTE,
+ SHFL,
BFE_C,
BFE_R,
BFE_IMM,
@@ -1628,6 +1650,10 @@ public:
SEL_C,
SEL_R,
SEL_IMM,
+ ICMP_RC,
+ ICMP_R,
+ ICMP_CR,
+ ICMP_IMM,
MUFU, // Multi-Function Operator
RRO_C, // Range Reduction Operator
RRO_R,
@@ -1833,6 +1859,7 @@ private:
INST("111000110000----", Id::EXIT, Type::Flow, "EXIT"),
INST("1111000011110---", Id::DEPBAR, Type::Synch, "DEPBAR"),
INST("0101000011011---", Id::VOTE, Type::Warp, "VOTE"),
+ INST("1110111100010---", Id::SHFL, Type::Warp, "SHFL"),
INST("1110111111011---", Id::LD_A, Type::Memory, "LD_A"),
INST("1110111101001---", Id::LD_S, Type::Memory, "LD_S"),
INST("1110111101000---", Id::LD_L, Type::Memory, "LD_L"),
@@ -1892,6 +1919,10 @@ private:
INST("0100110010100---", Id::SEL_C, Type::ArithmeticInteger, "SEL_C"),
INST("0101110010100---", Id::SEL_R, Type::ArithmeticInteger, "SEL_R"),
INST("0011100-10100---", Id::SEL_IMM, Type::ArithmeticInteger, "SEL_IMM"),
+ INST("010100110100----", Id::ICMP_RC, Type::ArithmeticInteger, "ICMP_RC"),
+ INST("010110110100----", Id::ICMP_R, Type::ArithmeticInteger, "ICMP_R"),
+ INST("010010110100----", Id::ICMP_CR, Type::ArithmeticInteger, "ICMP_CR"),
+ INST("0011011-0100----", Id::ICMP_IMM, Type::ArithmeticInteger, "ICMP_IMM"),
INST("0101101111011---", Id::LEA_R2, Type::ArithmeticInteger, "LEA_R2"),
INST("0101101111010---", Id::LEA_R1, Type::ArithmeticInteger, "LEA_R1"),
INST("001101101101----", Id::LEA_IMM, Type::ArithmeticInteger, "LEA_IMM"),
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 909ccb82c..0dbc4c02f 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -214,7 +214,8 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn
std::string source = "#version 430 core\n"
"#extension GL_ARB_separate_shader_objects : enable\n"
"#extension GL_NV_gpu_shader5 : enable\n"
- "#extension GL_NV_shader_thread_group : enable\n";
+ "#extension GL_NV_shader_thread_group : enable\n"
+ "#extension GL_NV_shader_thread_shuffle : enable\n";
if (entries.shader_viewport_layer_array) {
source += "#extension GL_ARB_shader_viewport_layer_array : enable\n";
}
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 137b23740..76439e7ab 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -325,6 +325,7 @@ public:
DeclareRegisters();
DeclarePredicates();
DeclareLocalMemory();
+ DeclareSharedMemory();
DeclareInternalFlags();
DeclareInputAttributes();
DeclareOutputAttributes();
@@ -499,6 +500,13 @@ private:
code.AddNewLine();
}
+ void DeclareSharedMemory() { // Emits the GLSL declaration of the shared memory array
+ if (stage != ProgramType::Compute) { // Nothing is declared for non-compute stages
+ return;
+ }
+ code.AddLine("shared uint {}[];", GetSharedMemory()); // NOTE(review): the array is declared unsized - confirm the GLSL compiler accepts dynamic indexing into it
+ }
+
void DeclareInternalFlags() {
for (u32 flag = 0; flag < static_cast<u32>(InternalFlag::Amount); flag++) {
const auto flag_code = static_cast<InternalFlag>(flag);
@@ -881,6 +889,12 @@ private:
Type::Uint};
}
+ if (const auto smem = std::get_if<SmemNode>(&*node)) {
+ return {
+ fmt::format("{}[{} >> 2]", GetSharedMemory(), Visit(smem->GetAddress()).AsUint()),
+ Type::Uint};
+ }
+
if (const auto internal_flag = std::get_if<InternalFlagNode>(&*node)) {
return {GetInternalFlag(internal_flag->GetFlag()), Type::Bool};
}
@@ -1007,10 +1021,10 @@ private:
return {std::move(temporary), value.GetType()};
}
- Expression GetOutputAttribute(const AbufNode* abuf) {
+ std::optional<Expression> GetOutputAttribute(const AbufNode* abuf) {
switch (const auto attribute = abuf->GetIndex()) {
case Attribute::Index::Position:
- return {"gl_Position"s + GetSwizzle(abuf->GetElement()), Type::Float};
+ return {{"gl_Position"s + GetSwizzle(abuf->GetElement()), Type::Float}};
case Attribute::Index::LayerViewportPointSize:
switch (abuf->GetElement()) {
case 0:
@@ -1020,25 +1034,25 @@ private:
if (IsVertexShader(stage) && !device.HasVertexViewportLayer()) {
return {};
}
- return {"gl_Layer", Type::Int};
+ return {{"gl_Layer", Type::Int}};
case 2:
if (IsVertexShader(stage) && !device.HasVertexViewportLayer()) {
return {};
}
- return {"gl_ViewportIndex", Type::Int};
+ return {{"gl_ViewportIndex", Type::Int}};
case 3:
UNIMPLEMENTED_MSG("Requires some state changes for gl_PointSize to work in shader");
- return {"gl_PointSize", Type::Float};
+ return {{"gl_PointSize", Type::Float}};
}
return {};
case Attribute::Index::ClipDistances0123:
- return {fmt::format("gl_ClipDistance[{}]", abuf->GetElement()), Type::Float};
+ return {{fmt::format("gl_ClipDistance[{}]", abuf->GetElement()), Type::Float}};
case Attribute::Index::ClipDistances4567:
- return {fmt::format("gl_ClipDistance[{}]", abuf->GetElement() + 4), Type::Float};
+ return {{fmt::format("gl_ClipDistance[{}]", abuf->GetElement() + 4), Type::Float}};
default:
if (IsGenericAttribute(attribute)) {
- return {GetOutputAttribute(attribute) + GetSwizzle(abuf->GetElement()),
- Type::Float};
+ return {
+ {GetOutputAttribute(attribute) + GetSwizzle(abuf->GetElement()), Type::Float}};
}
UNIMPLEMENTED_MSG("Unhandled output attribute: {}", static_cast<u32>(attribute));
return {};
@@ -1278,7 +1292,11 @@ private:
target = {GetRegister(gpr->GetIndex()), Type::Float};
} else if (const auto abuf = std::get_if<AbufNode>(&*dest)) {
UNIMPLEMENTED_IF(abuf->IsPhysicalBuffer());
- target = GetOutputAttribute(abuf);
+ auto output = GetOutputAttribute(abuf);
+ if (!output) {
+ return {};
+ }
+ target = std::move(*output);
} else if (const auto lmem = std::get_if<LmemNode>(&*dest)) {
if (stage == ProgramType::Compute) {
LOG_WARNING(Render_OpenGL, "Local memory is stubbed on compute shaders");
@@ -1286,6 +1304,11 @@ private:
target = {
fmt::format("{}[{} >> 2]", GetLocalMemory(), Visit(lmem->GetAddress()).AsUint()),
Type::Uint};
+ } else if (const auto smem = std::get_if<SmemNode>(&*dest)) {
+ ASSERT(stage == ProgramType::Compute);
+ target = {
+ fmt::format("{}[{} >> 2]", GetSharedMemory(), Visit(smem->GetAddress()).AsUint()),
+ Type::Uint};
} else if (const auto gmem = std::get_if<GmemNode>(&*dest)) {
const std::string real = Visit(gmem->GetRealAddress()).AsUint();
const std::string base = Visit(gmem->GetBaseAddress()).AsUint();
@@ -1934,8 +1957,7 @@ private:
Expression BallotThread(Operation operation) {
const std::string value = VisitOperand(operation, 0).AsBool();
if (!device.HasWarpIntrinsics()) {
- LOG_ERROR(Render_OpenGL,
- "Nvidia warp intrinsics are not available and its required by a shader");
+ LOG_ERROR(Render_OpenGL, "Nvidia vote intrinsics are required by this shader");
// Stub on non-Nvidia devices by simulating all threads voting the same as the active
// one.
return {fmt::format("({} ? 0xFFFFFFFFU : 0U)", value), Type::Uint};
@@ -1946,8 +1968,7 @@ private:
Expression Vote(Operation operation, const char* func) {
const std::string value = VisitOperand(operation, 0).AsBool();
if (!device.HasWarpIntrinsics()) {
- LOG_ERROR(Render_OpenGL,
- "Nvidia vote intrinsics are not available and its required by a shader");
+ LOG_ERROR(Render_OpenGL, "Nvidia vote intrinsics are required by this shader");
// Stub with a warp size of one.
return {value, Type::Bool};
}
@@ -1964,15 +1985,54 @@ private:
Expression VoteEqual(Operation operation) {
if (!device.HasWarpIntrinsics()) {
- LOG_ERROR(Render_OpenGL,
- "Nvidia vote intrinsics are not available and its required by a shader");
- // We must return true here since a stub for a theoretical warp size of 1 will always
- // return an equal result for all its votes.
+ LOG_ERROR(Render_OpenGL, "Nvidia vote intrinsics are required by this shader");
+ // We must return true here: a stub for a theoretical warp size of 1 will always
+ // return an equal result across all of its votes.
return {"true", Type::Bool};
}
return Vote(operation, "allThreadsEqualNV");
}
+ template <const std::string_view& func> // func: name of the GLSL shuffle intrinsic to emit
+ Expression Shuffle(Operation operation) { // Builds the GLSL expression for a shuffle of operand 0
+ const std::string value = VisitOperand(operation, 0).AsFloat(); // Operand 0: the value to exchange, read as float
+ if (!device.HasWarpIntrinsics()) {
+ LOG_ERROR(Render_OpenGL, "Nvidia shuffle intrinsics are required by this shader");
+ // On a "single-thread" device we are either on the same thread or out of bounds. Both
+ // cases return the passed value.
+ return {value, Type::Float};
+ }
+
+ const std::string index = VisitOperand(operation, 1).AsUint(); // Operand 1: shuffle index
+ const std::string width = VisitOperand(operation, 2).AsUint(); // Operand 2: shuffle width
+ return {fmt::format("{}({}, {}, {})", func, value, index, width), Type::Float}; // Emits func(value, index, width)
+ }
+
+ template <const std::string_view& func> // func: name of the GLSL shuffle intrinsic to emit
+ Expression InRangeShuffle(Operation operation) { // Builds a bool expression telling whether the shuffle is in range
+ const std::string index = VisitOperand(operation, 0).AsUint(); // Operand 0: shuffle index
+ const std::string width = VisitOperand(operation, 1).AsUint(); // Operand 1: shuffle width
+ if (!device.HasWarpIntrinsics()) {
+ // On a "single-thread" device we are only in bounds when the requested index is 0.
+ return {fmt::format("({} == 0U)", index), Type::Bool};
+ }
+
+ const std::string in_range = code.GenerateTemporary(); // Temporary bool that receives the flag
+ code.AddLine("bool {};", in_range);
+ code.AddLine("{}(0U, {}, {}, {});", func, index, width, in_range); // A dummy 0U is shuffled; only the flag passed as the last argument is used
+ return {in_range, Type::Bool};
+ }
+
+ struct Func final { // Non-instantiable holder of GLSL intrinsic names used as template arguments
+ Func() = delete;
+ ~Func() = delete;
+
+ static constexpr std::string_view ShuffleIndexed = "shuffleNV"; // Used for the indexed shuffle operations
+ static constexpr std::string_view ShuffleUp = "shuffleUpNV"; // Used for the shuffle-up operations
+ static constexpr std::string_view ShuffleDown = "shuffleDownNV"; // Used for the shuffle-down operations
+ static constexpr std::string_view ShuffleButterfly = "shuffleXorNV"; // Used for the butterfly shuffle operations
+ };
+
static constexpr std::array operation_decompilers = {
&GLSLDecompiler::Assign,
@@ -2135,6 +2195,16 @@ private:
&GLSLDecompiler::VoteAll,
&GLSLDecompiler::VoteAny,
&GLSLDecompiler::VoteEqual,
+
+ &GLSLDecompiler::Shuffle<Func::ShuffleIndexed>,
+ &GLSLDecompiler::Shuffle<Func::ShuffleUp>,
+ &GLSLDecompiler::Shuffle<Func::ShuffleDown>,
+ &GLSLDecompiler::Shuffle<Func::ShuffleButterfly>,
+
+ &GLSLDecompiler::InRangeShuffle<Func::ShuffleIndexed>,
+ &GLSLDecompiler::InRangeShuffle<Func::ShuffleUp>,
+ &GLSLDecompiler::InRangeShuffle<Func::ShuffleDown>,
+ &GLSLDecompiler::InRangeShuffle<Func::ShuffleButterfly>,
};
static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));
@@ -2175,6 +2245,10 @@ private:
return "lmem_" + suffix;
}
+ std::string GetSharedMemory() const { // Returns the GLSL name of the shared memory array: "smem_" followed by suffix
+ return fmt::format("smem_{}", suffix);
+ }
+
std::string GetInternalFlag(InternalFlag flag) const {
constexpr std::array InternalFlagNames = {"zero_flag", "sign_flag", "carry_flag",
"overflow_flag"};
diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index ea77dd211..9ed738171 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -145,7 +145,7 @@ inline GLenum TextureFilterMode(Tegra::Texture::TextureFilter filter_mode,
case Tegra::Texture::TextureMipmapFilter::None:
return GL_LINEAR;
case Tegra::Texture::TextureMipmapFilter::Nearest:
- return GL_NEAREST_MIPMAP_LINEAR;
+ return GL_LINEAR_MIPMAP_NEAREST;
case Tegra::Texture::TextureMipmapFilter::Linear:
return GL_LINEAR_MIPMAP_LINEAR;
}
@@ -157,7 +157,7 @@ inline GLenum TextureFilterMode(Tegra::Texture::TextureFilter filter_mode,
case Tegra::Texture::TextureMipmapFilter::Nearest:
return GL_NEAREST_MIPMAP_NEAREST;
case Tegra::Texture::TextureMipmapFilter::Linear:
- return GL_LINEAR_MIPMAP_NEAREST;
+ return GL_NEAREST_MIPMAP_LINEAR;
}
}
}
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index b9153934e..f7fbbb6e4 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -1127,6 +1127,46 @@ private:
return {};
}
+ Id ShuffleIndexed(Operation) { // Stub: indexed shuffle is not implemented in the SPIR-V decompiler
+ UNIMPLEMENTED();
+ return {}; // Default-constructed Id placeholder
+ }
+
+ Id ShuffleUp(Operation) { // Stub: shuffle up is not implemented in the SPIR-V decompiler
+ UNIMPLEMENTED();
+ return {}; // Default-constructed Id placeholder
+ }
+
+ Id ShuffleDown(Operation) { // Stub: shuffle down is not implemented in the SPIR-V decompiler
+ UNIMPLEMENTED();
+ return {}; // Default-constructed Id placeholder
+ }
+
+ Id ShuffleButterfly(Operation) { // Stub: butterfly shuffle is not implemented in the SPIR-V decompiler
+ UNIMPLEMENTED();
+ return {}; // Default-constructed Id placeholder
+ }
+
+ Id InRangeShuffleIndexed(Operation) { // Stub: in-range check for indexed shuffle is not implemented in the SPIR-V decompiler
+ UNIMPLEMENTED();
+ return {}; // Default-constructed Id placeholder
+ }
+
+ Id InRangeShuffleUp(Operation) { // Stub: in-range check for shuffle up is not implemented in the SPIR-V decompiler
+ UNIMPLEMENTED();
+ return {}; // Default-constructed Id placeholder
+ }
+
+ Id InRangeShuffleDown(Operation) { // Stub: in-range check for shuffle down is not implemented in the SPIR-V decompiler
+ UNIMPLEMENTED();
+ return {}; // Default-constructed Id placeholder
+ }
+
+ Id InRangeShuffleButterfly(Operation) { // Stub: in-range check for butterfly shuffle is not implemented in the SPIR-V decompiler
+ UNIMPLEMENTED();
+ return {}; // Default-constructed Id placeholder
+ }
+
Id DeclareBuiltIn(spv::BuiltIn builtin, spv::StorageClass storage, Id type,
const std::string& name) {
const Id id = OpVariable(type, storage);
@@ -1431,6 +1471,16 @@ private:
&SPIRVDecompiler::VoteAll,
&SPIRVDecompiler::VoteAny,
&SPIRVDecompiler::VoteEqual,
+
+ &SPIRVDecompiler::ShuffleIndexed,
+ &SPIRVDecompiler::ShuffleUp,
+ &SPIRVDecompiler::ShuffleDown,
+ &SPIRVDecompiler::ShuffleButterfly,
+
+ &SPIRVDecompiler::InRangeShuffleIndexed,
+ &SPIRVDecompiler::InRangeShuffleUp,
+ &SPIRVDecompiler::InRangeShuffleDown,
+ &SPIRVDecompiler::InRangeShuffleButterfly,
};
static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));
diff --git a/src/video_core/shader/decode/arithmetic_integer.cpp b/src/video_core/shader/decode/arithmetic_integer.cpp
index c8c1a7f40..b73f6536e 100644
--- a/src/video_core/shader/decode/arithmetic_integer.cpp
+++ b/src/video_core/shader/decode/arithmetic_integer.cpp
@@ -138,6 +138,35 @@ u32 ShaderIR::DecodeArithmeticInteger(NodeBlock& bb, u32 pc) {
SetRegister(bb, instr.gpr0, value);
break;
}
+ case OpCode::Id::ICMP_CR:
+ case OpCode::Id::ICMP_R:
+ case OpCode::Id::ICMP_RC:
+ case OpCode::Id::ICMP_IMM: {
+ const Node zero = Immediate(0);
+
+ const auto [op_b, test] = [&]() -> std::pair<Node, Node> {
+ switch (opcode->get().GetId()) {
+ case OpCode::Id::ICMP_CR:
+ return {GetConstBuffer(instr.cbuf34.index, instr.cbuf34.offset),
+ GetRegister(instr.gpr39)};
+ case OpCode::Id::ICMP_R:
+ return {GetRegister(instr.gpr20), GetRegister(instr.gpr39)};
+ case OpCode::Id::ICMP_RC:
+ return {GetRegister(instr.gpr39),
+ GetConstBuffer(instr.cbuf34.index, instr.cbuf34.offset)};
+ case OpCode::Id::ICMP_IMM:
+ return {Immediate(instr.alu.GetSignedImm20_20()), GetRegister(instr.gpr39)};
+ default:
+ UNREACHABLE();
+ return {zero, zero};
+ }
+ }();
+ const Node op_a = GetRegister(instr.gpr8);
+ const Node comparison =
+ GetPredicateComparisonInteger(instr.icmp.cond, instr.icmp.is_signed != 0, test, zero);
+ SetRegister(bb, instr.gpr0, Operation(OperationCode::Select, comparison, op_a, op_b));
+ break;
+ }
case OpCode::Id::LOP_C:
case OpCode::Id::LOP_R:
case OpCode::Id::LOP_IMM: {
diff --git a/src/video_core/shader/decode/memory.cpp b/src/video_core/shader/decode/memory.cpp
index ed108bea8..7923d4d69 100644
--- a/src/video_core/shader/decode/memory.cpp
+++ b/src/video_core/shader/decode/memory.cpp
@@ -35,7 +35,7 @@ u32 GetUniformTypeElementsCount(Tegra::Shader::UniformType uniform_type) {
return 1;
}
}
-} // namespace
+} // Anonymous namespace
u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
const Instruction instr = {program_code[pc]};
@@ -106,16 +106,17 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
}
break;
}
- case OpCode::Id::LD_L: {
- LOG_DEBUG(HW_GPU, "LD_L cache management mode: {}",
- static_cast<u64>(instr.ld_l.unknown.Value()));
-
- const auto GetLmem = [&](s32 offset) {
+ case OpCode::Id::LD_L:
+ LOG_DEBUG(HW_GPU, "LD_L cache management mode: {}", static_cast<u64>(instr.ld_l.unknown));
+ [[fallthrough]];
+ case OpCode::Id::LD_S: {
+ const auto GetMemory = [&](s32 offset) {
ASSERT(offset % 4 == 0);
const Node immediate_offset = Immediate(static_cast<s32>(instr.smem_imm) + offset);
const Node address = Operation(OperationCode::IAdd, NO_PRECISE, GetRegister(instr.gpr8),
immediate_offset);
- return GetLocalMemory(address);
+ return opcode->get().GetId() == OpCode::Id::LD_S ? GetSharedMemory(address)
+ : GetLocalMemory(address);
};
switch (instr.ldst_sl.type.Value()) {
@@ -135,14 +136,16 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
return 0;
}
}();
- for (u32 i = 0; i < count; ++i)
- SetTemporary(bb, i, GetLmem(i * 4));
- for (u32 i = 0; i < count; ++i)
+ for (u32 i = 0; i < count; ++i) {
+ SetTemporary(bb, i, GetMemory(i * 4));
+ }
+ for (u32 i = 0; i < count; ++i) {
SetRegister(bb, instr.gpr0.Value() + i, GetTemporary(i));
+ }
break;
}
default:
- UNIMPLEMENTED_MSG("LD_L Unhandled type: {}",
+ UNIMPLEMENTED_MSG("{} Unhandled type: {}", opcode->get().GetName(),
static_cast<u32>(instr.ldst_sl.type.Value()));
}
break;
@@ -209,27 +212,34 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
break;
}
- case OpCode::Id::ST_L: {
+ case OpCode::Id::ST_L:
LOG_DEBUG(HW_GPU, "ST_L cache management mode: {}",
static_cast<u64>(instr.st_l.cache_management.Value()));
-
- const auto GetLmemAddr = [&](s32 offset) {
+ [[fallthrough]];
+ case OpCode::Id::ST_S: {
+ const auto GetAddress = [&](s32 offset) {
ASSERT(offset % 4 == 0);
const Node immediate = Immediate(static_cast<s32>(instr.smem_imm) + offset);
return Operation(OperationCode::IAdd, NO_PRECISE, GetRegister(instr.gpr8), immediate);
};
+ const auto set_memory = opcode->get().GetId() == OpCode::Id::ST_L
+ ? &ShaderIR::SetLocalMemory
+ : &ShaderIR::SetSharedMemory;
+
switch (instr.ldst_sl.type.Value()) {
case Tegra::Shader::StoreType::Bits128:
- SetLocalMemory(bb, GetLmemAddr(12), GetRegister(instr.gpr0.Value() + 3));
- SetLocalMemory(bb, GetLmemAddr(8), GetRegister(instr.gpr0.Value() + 2));
+ (this->*set_memory)(bb, GetAddress(12), GetRegister(instr.gpr0.Value() + 3));
+ (this->*set_memory)(bb, GetAddress(8), GetRegister(instr.gpr0.Value() + 2));
+ [[fallthrough]];
case Tegra::Shader::StoreType::Bits64:
- SetLocalMemory(bb, GetLmemAddr(4), GetRegister(instr.gpr0.Value() + 1));
+ (this->*set_memory)(bb, GetAddress(4), GetRegister(instr.gpr0.Value() + 1));
+ [[fallthrough]];
case Tegra::Shader::StoreType::Bits32:
- SetLocalMemory(bb, GetLmemAddr(0), GetRegister(instr.gpr0));
+ (this->*set_memory)(bb, GetAddress(0), GetRegister(instr.gpr0));
break;
default:
- UNIMPLEMENTED_MSG("ST_L Unhandled type: {}",
+ UNIMPLEMENTED_MSG("{} unhandled type: {}", opcode->get().GetName(),
static_cast<u32>(instr.ldst_sl.type.Value()));
}
break;
diff --git a/src/video_core/shader/decode/warp.cpp b/src/video_core/shader/decode/warp.cpp
index 04ca74f46..a8e481b3c 100644
--- a/src/video_core/shader/decode/warp.cpp
+++ b/src/video_core/shader/decode/warp.cpp
@@ -13,6 +13,7 @@ namespace VideoCommon::Shader {
using Tegra::Shader::Instruction;
using Tegra::Shader::OpCode;
using Tegra::Shader::Pred;
+using Tegra::Shader::ShuffleOperation;
using Tegra::Shader::VoteOperation;
namespace {
@@ -44,6 +45,52 @@ u32 ShaderIR::DecodeWarp(NodeBlock& bb, u32 pc) {
SetPredicate(bb, instr.vote.dest_pred, vote);
break;
}
+ case OpCode::Id::SHFL: {
+ Node mask = instr.shfl.is_mask_imm ? Immediate(static_cast<u32>(instr.shfl.mask_imm))
+ : GetRegister(instr.gpr39);
+ Node width = [&] {
+ // Convert the obscure SHFL mask back into GL_NV_shader_thread_shuffle's width. This has
+ // been done reversing Nvidia's math. It won't work on all cases due to SHFL having
+ // different parameters that don't properly map to GLSL's interface, but it should work
+ // for cases emitted by Nvidia's compiler.
+ if (instr.shfl.operation == ShuffleOperation::Up) {
+ return Operation(
+ OperationCode::ILogicalShiftRight,
+ Operation(OperationCode::IAdd, std::move(mask), Immediate(-0x2000)),
+ Immediate(8));
+ } else {
+ return Operation(OperationCode::ILogicalShiftRight,
+ Operation(OperationCode::IAdd, Immediate(0x201F),
+ Operation(OperationCode::INegate, std::move(mask))),
+ Immediate(8));
+ }
+ }();
+
+ const auto [operation, in_range] = [instr]() -> std::pair<OperationCode, OperationCode> {
+ switch (instr.shfl.operation) {
+ case ShuffleOperation::Idx:
+ return {OperationCode::ShuffleIndexed, OperationCode::InRangeShuffleIndexed};
+ case ShuffleOperation::Up:
+ return {OperationCode::ShuffleUp, OperationCode::InRangeShuffleUp};
+ case ShuffleOperation::Down:
+ return {OperationCode::ShuffleDown, OperationCode::InRangeShuffleDown};
+ case ShuffleOperation::Bfly:
+ return {OperationCode::ShuffleButterfly, OperationCode::InRangeShuffleButterfly};
+ }
+ UNREACHABLE_MSG("Invalid SHFL operation: {}",
+ static_cast<u64>(instr.shfl.operation.Value()));
+ return {};
+ }();
+
+ // Setting the predicate before the register is intentional to avoid overwriting.
+ Node index = instr.shfl.is_index_imm ? Immediate(static_cast<u32>(instr.shfl.index_imm))
+ : GetRegister(instr.gpr20);
+ SetPredicate(bb, instr.shfl.pred48, Operation(in_range, index, width));
+ SetRegister(
+ bb, instr.gpr0,
+ Operation(operation, GetRegister(instr.gpr8), std::move(index), std::move(width)));
+ break;
+ }
default:
UNIMPLEMENTED_MSG("Unhandled warp instruction: {}", opcode->get().GetName());
break;
diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h
index b47b201cf..abf2cb1ab 100644
--- a/src/video_core/shader/node.h
+++ b/src/video_core/shader/node.h
@@ -181,6 +181,16 @@ enum class OperationCode {
VoteAny, /// (bool) -> bool
VoteEqual, /// (bool) -> bool
+ ShuffleIndexed, /// (uint value, uint index, uint width) -> uint
+ ShuffleUp, /// (uint value, uint index, uint width) -> uint
+ ShuffleDown, /// (uint value, uint index, uint width) -> uint
+ ShuffleButterfly, /// (uint value, uint index, uint width) -> uint
+
+ InRangeShuffleIndexed, /// (uint index, uint width) -> bool
+ InRangeShuffleUp, /// (uint index, uint width) -> bool
+ InRangeShuffleDown, /// (uint index, uint width) -> bool
+ InRangeShuffleButterfly, /// (uint index, uint width) -> bool
+
Amount,
};
@@ -206,12 +216,13 @@ class PredicateNode;
class AbufNode;
class CbufNode;
class LmemNode;
+class SmemNode;
class GmemNode;
class CommentNode;
using NodeData =
std::variant<OperationNode, ConditionalNode, GprNode, ImmediateNode, InternalFlagNode,
- PredicateNode, AbufNode, CbufNode, LmemNode, GmemNode, CommentNode>;
+ PredicateNode, AbufNode, CbufNode, LmemNode, SmemNode, GmemNode, CommentNode>;
using Node = std::shared_ptr<NodeData>;
using Node4 = std::array<Node, 4>;
using NodeBlock = std::vector<Node>;
@@ -583,6 +594,19 @@ private:
Node address;
};
+/// Shared memory node: wraps the node that evaluates to the accessed shared memory address
+class SmemNode final {
+public:
+ explicit SmemNode(Node address) : address{std::move(address)} {} // Takes ownership of the address node
+
+ const Node& GetAddress() const { // Returns the stored address node
+ return address;
+ }
+
+private:
+ Node address; // Address expression; the GLSL decompiler shifts it right by 2 to index a uint array
+};
+
/// Global memory node
class GmemNode final {
public:
diff --git a/src/video_core/shader/shader_ir.cpp b/src/video_core/shader/shader_ir.cpp
index 1e5c7f660..bbbab0bca 100644
--- a/src/video_core/shader/shader_ir.cpp
+++ b/src/video_core/shader/shader_ir.cpp
@@ -137,6 +137,10 @@ Node ShaderIR::GetLocalMemory(Node address) {
return MakeNode<LmemNode>(std::move(address));
}
+Node ShaderIR::GetSharedMemory(Node address) { // Wraps the given address node in a new SmemNode
+ return MakeNode<SmemNode>(std::move(address));
+}
+
Node ShaderIR::GetTemporary(u32 id) {
return GetRegister(Register::ZeroIndex + 1 + id);
}
@@ -378,6 +382,11 @@ void ShaderIR::SetLocalMemory(NodeBlock& bb, Node address, Node value) {
Operation(OperationCode::Assign, GetLocalMemory(std::move(address)), std::move(value)));
}
+void ShaderIR::SetSharedMemory(NodeBlock& bb, Node address, Node value) { // Appends to bb an Assign of value to the shared memory node at address
+ bb.push_back(
+ Operation(OperationCode::Assign, GetSharedMemory(std::move(address)), std::move(value)));
+}
+
void ShaderIR::SetTemporary(NodeBlock& bb, u32 id, Node value) {
SetRegister(bb, Register::ZeroIndex + 1 + id, std::move(value));
}
diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h
index 62816bd56..6aed9bb84 100644
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -208,6 +208,8 @@ private:
Node GetInternalFlag(InternalFlag flag, bool negated = false);
/// Generates a node representing a local memory address
Node GetLocalMemory(Node address);
+ /// Generates a node representing a shared memory address
+ Node GetSharedMemory(Node address);
/// Generates a temporary, internally it uses a post-RZ register
Node GetTemporary(u32 id);
@@ -217,8 +219,10 @@ private:
void SetPredicate(NodeBlock& bb, u64 dest, Node src);
/// Sets an internal flag. src value must be a bool-evaluated node
void SetInternalFlag(NodeBlock& bb, InternalFlag flag, Node value);
- /// Sets a local memory address. address and value must be a number-evaluated node
+ /// Sets a local memory address with a value.
void SetLocalMemory(NodeBlock& bb, Node address, Node value);
+ /// Sets a shared memory address with a value.
+ void SetSharedMemory(NodeBlock& bb, Node address, Node value);
/// Sets a temporary. Internally it uses a post-RZ register
void SetTemporary(NodeBlock& bb, u32 id, Node value);