Diffstat (limited to 'src/video_core')
43 files changed, 418 insertions, 157 deletions
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 367f30517..6d8955ca3 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -131,6 +131,8 @@ public: void DownloadMemory(VAddr cpu_addr, u64 size); + bool InlineMemory(VAddr dest_address, size_t copy_size, std::span<u8> inlined_buffer); + void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size); void DisableGraphicsUniformBuffer(size_t stage, u32 index); @@ -808,6 +810,8 @@ void BufferCache<P>::CommitAsyncFlushesHigh() { return; } MICROPROFILE_SCOPE(GPU_DownloadMemory); + const bool is_accuracy_normal = + Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::Normal; boost::container::small_vector<std::pair<BufferCopy, BufferId>, 1> downloads; u64 total_size_bytes = 0; @@ -819,6 +823,9 @@ void BufferCache<P>::CommitAsyncFlushesHigh() { ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) { buffer.ForEachDownloadRangeAndClear( cpu_addr, size, [&](u64 range_offset, u64 range_size) { + if (is_accuracy_normal) { + return; + } const VAddr buffer_addr = buffer.CpuAddr(); const auto add_download = [&](VAddr start, VAddr end) { const u64 new_offset = start - buffer_addr; @@ -1417,10 +1424,8 @@ void BufferCache<P>::MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 s const IntervalType base_interval{cpu_addr, cpu_addr + size}; common_ranges.add(base_interval); - const bool is_accuracy_high = - Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High; const bool is_async = Settings::values.use_asynchronous_gpu_emulation.GetValue(); - if (!is_async && !is_accuracy_high) { + if (!is_async) { return; } uncommitted_ranges.add(base_interval); @@ -1652,6 +1657,42 @@ void BufferCache<P>::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, } template <class P> +bool BufferCache<P>::InlineMemory(VAddr dest_address, size_t copy_size, + std::span<u8> inlined_buffer) { + const bool is_dirty = IsRegionRegistered(dest_address, copy_size); + if (!is_dirty) { + return false; + } + if (!IsRegionGpuModified(dest_address, copy_size)) { + return false; + } + + const IntervalType subtract_interval{dest_address, dest_address + copy_size}; + ClearDownload(subtract_interval); + common_ranges.subtract(subtract_interval); + + BufferId buffer_id = FindBuffer(dest_address, static_cast<u32>(copy_size)); + auto& buffer = slot_buffers[buffer_id]; + SynchronizeBuffer(buffer, dest_address, static_cast<u32>(copy_size)); + + if constexpr (USE_MEMORY_MAPS) { + std::array copies{BufferCopy{ + .src_offset = 0, + .dst_offset = buffer.Offset(dest_address), + .size = copy_size, + }}; + auto upload_staging = runtime.UploadStagingBuffer(copy_size); + u8* const src_pointer = upload_staging.mapped_span.data(); + std::memcpy(src_pointer, inlined_buffer.data(), copy_size); + runtime.CopyBuffer(buffer, upload_staging.buffer, copies); + } else { + buffer.ImmediateUpload(buffer.Offset(dest_address), inlined_buffer.first(copy_size)); + } + + return true; +} + +template <class P> void BufferCache<P>::DownloadBufferMemory(Buffer& buffer) { DownloadBufferMemory(buffer, buffer.CpuAddr(), buffer.SizeBytes()); } diff --git a/src/video_core/engines/engine_upload.cpp b/src/video_core/engines/engine_upload.cpp index 71d7e1473..351b110fe 100644 --- a/src/video_core/engines/engine_upload.cpp +++ b/src/video_core/engines/engine_upload.cpp @@ -7,6 +7,7 @@ #include "common/assert.h" #include 
"video_core/engines/engine_upload.h" #include "video_core/memory_manager.h" +#include "video_core/rasterizer_interface.h" #include "video_core/textures/decoders.h" namespace Tegra::Engines::Upload { @@ -16,6 +17,10 @@ State::State(MemoryManager& memory_manager_, Registers& regs_) State::~State() = default; +void State::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) { + rasterizer = rasterizer_; +} + void State::ProcessExec(const bool is_linear_) { write_offset = 0; copy_size = regs.line_length_in * regs.line_count; @@ -32,7 +37,7 @@ void State::ProcessData(const u32 data, const bool is_last_call) { } const GPUVAddr address{regs.dest.Address()}; if (is_linear) { - memory_manager.WriteBlock(address, inner_buffer.data(), copy_size); + rasterizer->AccelerateInlineToMemory(address, copy_size, inner_buffer); } else { UNIMPLEMENTED_IF(regs.dest.z != 0); UNIMPLEMENTED_IF(regs.dest.depth != 1); diff --git a/src/video_core/engines/engine_upload.h b/src/video_core/engines/engine_upload.h index 1c7f1effa..c9c5ec8c3 100644 --- a/src/video_core/engines/engine_upload.h +++ b/src/video_core/engines/engine_upload.h @@ -12,6 +12,10 @@ namespace Tegra { class MemoryManager; } +namespace VideoCore { +class RasterizerInterface; +} + namespace Tegra::Engines::Upload { struct Registers { @@ -60,6 +64,9 @@ public: void ProcessExec(bool is_linear_); void ProcessData(u32 data, bool is_last_call); + /// Binds a rasterizer to this engine. + void BindRasterizer(VideoCore::RasterizerInterface* rasterizer); + private: u32 write_offset = 0; u32 copy_size = 0; @@ -68,6 +75,7 @@ private: bool is_linear = false; Registers& regs; MemoryManager& memory_manager; + VideoCore::RasterizerInterface* rasterizer = nullptr; }; } // namespace Tegra::Engines::Upload diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp index 492b4c5a3..5a1c12076 100644 --- a/src/video_core/engines/kepler_compute.cpp +++ b/src/video_core/engines/kepler_compute.cpp @@ -22,6 +22,7 @@ KeplerCompute::~KeplerCompute() = default; void KeplerCompute::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) { rasterizer = rasterizer_; + upload_state.BindRasterizer(rasterizer); } void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_call) { diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp index 560551157..8aed16caa 100644 --- a/src/video_core/engines/kepler_memory.cpp +++ b/src/video_core/engines/kepler_memory.cpp @@ -19,6 +19,10 @@ KeplerMemory::KeplerMemory(Core::System& system_, MemoryManager& memory_manager) KeplerMemory::~KeplerMemory() = default; +void KeplerMemory::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) { + upload_state.BindRasterizer(rasterizer_); +} + void KeplerMemory::CallMethod(u32 method, u32 method_argument, bool is_last_call) { ASSERT_MSG(method < Regs::NUM_REGS, "Invalid KeplerMemory register, increase the size of the Regs structure"); diff --git a/src/video_core/engines/kepler_memory.h b/src/video_core/engines/kepler_memory.h index 0d8ea09a9..949e2fae1 100644 --- a/src/video_core/engines/kepler_memory.h +++ b/src/video_core/engines/kepler_memory.h @@ -22,6 +22,10 @@ namespace Tegra { class MemoryManager; } +namespace VideoCore { +class RasterizerInterface; +} + namespace Tegra::Engines { /** @@ -38,6 +42,9 @@ public: explicit KeplerMemory(Core::System& system_, MemoryManager& memory_manager); ~KeplerMemory() override; + /// Binds a rasterizer to this engine. 
+ void BindRasterizer(VideoCore::RasterizerInterface* rasterizer); + /// Write the value to the register identified by method. void CallMethod(u32 method, u32 method_argument, bool is_last_call) override; diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index c38ebd670..54a902f56 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -7,6 +7,7 @@ #include "common/assert.h" #include "core/core.h" #include "core/core_timing.h" +#include "video_core/dirty_flags.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/gpu.h" #include "video_core/memory_manager.h" @@ -31,6 +32,7 @@ Maxwell3D::~Maxwell3D() = default; void Maxwell3D::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) { rasterizer = rasterizer_; + upload_state.BindRasterizer(rasterizer_); } void Maxwell3D::InitializeRegisterDefaults() { @@ -194,7 +196,7 @@ void Maxwell3D::ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argume case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 13: case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 14: case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 15: - return StartCBData(method); + return ProcessCBData(argument); case MAXWELL3D_REG_INDEX(cb_bind[0]): return ProcessCBBind(0); case MAXWELL3D_REG_INDEX(cb_bind[1]): @@ -207,6 +209,14 @@ void Maxwell3D::ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argume return ProcessCBBind(4); case MAXWELL3D_REG_INDEX(draw.vertex_end_gl): return DrawArrays(); + case MAXWELL3D_REG_INDEX(small_index): + regs.index_array.count = regs.small_index.count; + regs.index_array.first = regs.small_index.first; + dirty.flags[VideoCommon::Dirty::IndexBuffer] = true; + return DrawArrays(); + case MAXWELL3D_REG_INDEX(topology_override): + use_topology_override = true; + return; case MAXWELL3D_REG_INDEX(clear_buffers): return ProcessClearBuffers(); case MAXWELL3D_REG_INDEX(query.query_get): @@ -247,14 +257,6 @@ void Maxwell3D::CallMacroMethod(u32 method, const std::vector<u32>& parameters) } void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) { - if (method == cb_data_state.current) { - regs.reg_array[method] = method_argument; - ProcessCBData(method_argument); - return; - } else if (cb_data_state.current != null_cb_data) { - FinishCBData(); - } - // It is an error to write to a register other than the current macro's ARG register before it // has finished execution. 
if (executing_macro != 0) { @@ -301,7 +303,7 @@ void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount, case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 13: case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 14: case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 15: - ProcessCBMultiData(method, base_start, amount); + ProcessCBMultiData(base_start, amount); break; default: for (std::size_t i = 0; i < amount; i++) { @@ -359,6 +361,35 @@ void Maxwell3D::CallMethodFromMME(u32 method, u32 method_argument) { } } +void Maxwell3D::ProcessTopologyOverride() { + using PrimitiveTopology = Maxwell3D::Regs::PrimitiveTopology; + using PrimitiveTopologyOverride = Maxwell3D::Regs::PrimitiveTopologyOverride; + + PrimitiveTopology topology{}; + + switch (regs.topology_override) { + case PrimitiveTopologyOverride::None: + topology = regs.draw.topology; + break; + case PrimitiveTopologyOverride::Points: + topology = PrimitiveTopology::Points; + break; + case PrimitiveTopologyOverride::Lines: + topology = PrimitiveTopology::Lines; + break; + case PrimitiveTopologyOverride::LineStrip: + topology = PrimitiveTopology::LineStrip; + break; + default: + topology = static_cast<PrimitiveTopology>(regs.topology_override); + break; + } + + if (use_topology_override) { + regs.draw.topology.Assign(topology); + } +} + void Maxwell3D::FlushMMEInlineDraw() { LOG_TRACE(HW_GPU, "called, topology={}, count={}", regs.draw.topology.Value(), regs.vertex_buffer.count); @@ -369,6 +400,8 @@ void Maxwell3D::FlushMMEInlineDraw() { ASSERT_MSG(!regs.draw.instance_next || !regs.draw.instance_cont, "Illegal combination of instancing parameters"); + ProcessTopologyOverride(); + const bool is_indexed = mme_draw.current_mode == MMEDrawMode::Indexed; if (ShouldExecute()) { rasterizer->Draw(is_indexed, true); @@ -528,6 +561,8 @@ void Maxwell3D::DrawArrays() { ASSERT_MSG(!regs.draw.instance_next || !regs.draw.instance_cont, "Illegal combination of instancing parameters"); + ProcessTopologyOverride(); + if (regs.draw.instance_next) { // Increment the current instance *before* drawing. state.current_instance += 1; @@ -586,46 +621,7 @@ void Maxwell3D::ProcessCBBind(size_t stage_index) { rasterizer->BindGraphicsUniformBuffer(stage_index, bind_data.index, gpu_addr, size); } -void Maxwell3D::ProcessCBData(u32 value) { - const u32 id = cb_data_state.id; - cb_data_state.buffer[id][cb_data_state.counter] = value; - // Increment the current buffer position. 
- regs.const_buffer.cb_pos = regs.const_buffer.cb_pos + 4; - cb_data_state.counter++; -} - -void Maxwell3D::StartCBData(u32 method) { - constexpr u32 first_cb_data = MAXWELL3D_REG_INDEX(const_buffer.cb_data); - cb_data_state.start_pos = regs.const_buffer.cb_pos; - cb_data_state.id = method - first_cb_data; - cb_data_state.current = method; - cb_data_state.counter = 0; - ProcessCBData(regs.const_buffer.cb_data[cb_data_state.id]); -} - -void Maxwell3D::ProcessCBMultiData(u32 method, const u32* start_base, u32 amount) { - if (cb_data_state.current != method) { - if (cb_data_state.current != null_cb_data) { - FinishCBData(); - } - constexpr u32 first_cb_data = MAXWELL3D_REG_INDEX(const_buffer.cb_data); - cb_data_state.start_pos = regs.const_buffer.cb_pos; - cb_data_state.id = method - first_cb_data; - cb_data_state.current = method; - cb_data_state.counter = 0; - } - const std::size_t id = cb_data_state.id; - const std::size_t size = amount; - std::size_t i = 0; - for (; i < size; i++) { - cb_data_state.buffer[id][cb_data_state.counter] = start_base[i]; - cb_data_state.counter++; - } - // Increment the current buffer position. - regs.const_buffer.cb_pos = regs.const_buffer.cb_pos + 4 * amount; -} - -void Maxwell3D::FinishCBData() { +void Maxwell3D::ProcessCBMultiData(const u32* start_base, u32 amount) { // Write the input value to the current const buffer at the current position. const GPUVAddr buffer_address = regs.const_buffer.BufferAddress(); ASSERT(buffer_address != 0); @@ -633,14 +629,16 @@ void Maxwell3D::FinishCBData() { // Don't allow writing past the end of the buffer. ASSERT(regs.const_buffer.cb_pos <= regs.const_buffer.cb_size); - const GPUVAddr address{buffer_address + cb_data_state.start_pos}; - const std::size_t size = regs.const_buffer.cb_pos - cb_data_state.start_pos; + const GPUVAddr address{buffer_address + regs.const_buffer.cb_pos}; + const size_t copy_size = amount * sizeof(u32); + memory_manager.WriteBlock(address, start_base, copy_size); - const u32 id = cb_data_state.id; - memory_manager.WriteBlock(address, cb_data_state.buffer[id].data(), size); + // Increment the current buffer position. 
+ regs.const_buffer.cb_pos += static_cast<u32>(copy_size); +} - cb_data_state.id = null_cb_data; - cb_data_state.current = null_cb_data; +void Maxwell3D::ProcessCBData(u32 value) { + ProcessCBMultiData(&value, 1); } Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const { diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index f22342dfb..357a74c70 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -367,6 +367,22 @@ public: Patches = 0xe, }; + // Constants as from NVC0_3D_UNK1970_D3D + // https://gitlab.freedesktop.org/mesa/mesa/-/blob/main/src/gallium/drivers/nouveau/nvc0/nvc0_3d.xml.h#L1598 + enum class PrimitiveTopologyOverride : u32 { + None = 0x0, + Points = 0x1, + Lines = 0x2, + LineStrip = 0x3, + Triangles = 0x4, + TriangleStrip = 0x5, + LinesAdjacency = 0xa, + LineStripAdjacency = 0xb, + TrianglesAdjacency = 0xc, + TriangleStripAdjacency = 0xd, + Patches = 0xe, + }; + enum class IndexFormat : u32 { UnsignedByte = 0x0, UnsignedShort = 0x1, @@ -1200,7 +1216,12 @@ public: } } index_array; - INSERT_PADDING_WORDS_NOINIT(0x7); + union { + BitField<0, 16, u32> first; + BitField<16, 16, u32> count; + } small_index; + + INSERT_PADDING_WORDS_NOINIT(0x6); INSERT_PADDING_WORDS_NOINIT(0x1F); @@ -1244,7 +1265,11 @@ public: BitField<11, 1, u32> depth_clamp_disabled; } view_volume_clip_control; - INSERT_PADDING_WORDS_NOINIT(0x1F); + INSERT_PADDING_WORDS_NOINIT(0xC); + + PrimitiveTopologyOverride topology_override; + + INSERT_PADDING_WORDS_NOINIT(0x12); u32 depth_bounds_enable; @@ -1520,10 +1545,8 @@ private: void ProcessSyncPoint(); /// Handles a write to the CB_DATA[i] register. - void StartCBData(u32 method); void ProcessCBData(u32 value); - void ProcessCBMultiData(u32 method, const u32* start_base, u32 amount); - void FinishCBData(); + void ProcessCBMultiData(const u32* start_base, u32 amount); /// Handles a write to the CB_BIND register. void ProcessCBBind(size_t stage_index); @@ -1531,6 +1554,9 @@ private: /// Handles a write to the VERTEX_END_GL register, triggering a draw. void DrawArrays(); + /// Handles use of topology overrides (e.g., to avoid using a topology assigned from a macro) + void ProcessTopologyOverride(); + // Handles a instance drawcall from MME void StepInstance(MMEDrawMode expected_mode, u32 count); @@ -1555,19 +1581,10 @@ private: /// Interpreter for the macro codes uploaded to the GPU. 
std::unique_ptr<MacroEngine> macro_engine; - static constexpr u32 null_cb_data = 0xFFFFFFFF; - struct CBDataState { - std::array<std::array<u32, 0x4000>, 16> buffer; - u32 current{null_cb_data}; - u32 id{null_cb_data}; - u32 start_pos{}; - u32 counter{}; - }; - CBDataState cb_data_state; - Upload::State upload_state; bool execute_on{true}; + bool use_topology_override{false}; }; #define ASSERT_REG_POSITION(field_name, position) \ @@ -1684,6 +1701,7 @@ ASSERT_REG_POSITION(draw, 0x585); ASSERT_REG_POSITION(primitive_restart, 0x591); ASSERT_REG_POSITION(provoking_vertex_last, 0x5A1); ASSERT_REG_POSITION(index_array, 0x5F2); +ASSERT_REG_POSITION(small_index, 0x5F9); ASSERT_REG_POSITION(polygon_offset_clamp, 0x61F); ASSERT_REG_POSITION(instanced_arrays, 0x620); ASSERT_REG_POSITION(vp_point_size, 0x644); @@ -1693,6 +1711,7 @@ ASSERT_REG_POSITION(cull_face, 0x648); ASSERT_REG_POSITION(pixel_center_integer, 0x649); ASSERT_REG_POSITION(viewport_transform_enabled, 0x64B); ASSERT_REG_POSITION(view_volume_clip_control, 0x64F); +ASSERT_REG_POSITION(topology_override, 0x65C); ASSERT_REG_POSITION(depth_bounds_enable, 0x66F); ASSERT_REG_POSITION(logic_op, 0x671); ASSERT_REG_POSITION(clear_buffers, 0x674); diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index 67388d980..1fc1358bc 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -53,7 +53,6 @@ void MaxwellDMA::Launch() { // TODO(Subv): Perform more research and implement all features of this engine. const LaunchDMA& launch = regs.launch_dma; - ASSERT(launch.semaphore_type == LaunchDMA::SemaphoreType::NONE); ASSERT(launch.interrupt_type == LaunchDMA::InterruptType::NONE); ASSERT(launch.data_transfer_type == LaunchDMA::DataTransferType::NON_PIPELINED); ASSERT(regs.dst_params.origin.x == 0); @@ -79,6 +78,7 @@ void MaxwellDMA::Launch() { CopyPitchToBlockLinear(); } } + ReleaseSemaphore(); } void MaxwellDMA::CopyPitchToPitch() { @@ -244,4 +244,22 @@ void MaxwellDMA::FastCopyBlockLinearToPitch() { memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size); } +void MaxwellDMA::ReleaseSemaphore() { + const auto type = regs.launch_dma.semaphore_type; + const GPUVAddr address = regs.semaphore.address; + switch (type) { + case LaunchDMA::SemaphoreType::NONE: + break; + case LaunchDMA::SemaphoreType::RELEASE_ONE_WORD_SEMAPHORE: + memory_manager.Write<u32>(address, regs.semaphore.payload); + break; + case LaunchDMA::SemaphoreType::RELEASE_FOUR_WORD_SEMAPHORE: + memory_manager.Write<u64>(address, static_cast<u64>(regs.semaphore.payload)); + memory_manager.Write<u64>(address + 8, system.GPU().GetTicks()); + break; + default: + UNREACHABLE_MSG("Unknown semaphore type: {}", static_cast<u32>(type.Value())); + } +} + } // namespace Tegra::Engines diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h index a04514425..2692cac8a 100644 --- a/src/video_core/engines/maxwell_dma.h +++ b/src/video_core/engines/maxwell_dma.h @@ -224,6 +224,8 @@ private: void FastCopyBlockLinearToPitch(); + void ReleaseSemaphore(); + Core::System& system; MemoryManager& memory_manager; diff --git a/src/video_core/fence_manager.h b/src/video_core/fence_manager.h index 34dc6c596..f80d62c80 100644 --- a/src/video_core/fence_manager.h +++ b/src/video_core/fence_manager.h @@ -8,8 +8,6 @@ #include <queue> #include "common/common_types.h" -#include "common/settings.h" -#include "core/core.h" #include "video_core/delayed_destruction_ring.h" #include 
"video_core/gpu.h" #include "video_core/memory_manager.h" diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 705765c99..ba9ba082f 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -59,6 +59,7 @@ struct GPU::Impl { maxwell_3d->BindRasterizer(rasterizer); fermi_2d->BindRasterizer(rasterizer); kepler_compute->BindRasterizer(rasterizer); + kepler_memory->BindRasterizer(rasterizer); maxwell_dma->BindRasterizer(rasterizer); } @@ -502,8 +503,13 @@ struct GPU::Impl { case BufferMethods::SemaphoreAddressHigh: case BufferMethods::SemaphoreAddressLow: case BufferMethods::SemaphoreSequence: + break; case BufferMethods::UnkCacheFlush: + rasterizer->SyncGuestHost(); + break; case BufferMethods::WrcacheFlush: + rasterizer->SignalReference(); + break; case BufferMethods::FenceValue: break; case BufferMethods::RefCnt: @@ -513,7 +519,7 @@ struct GPU::Impl { ProcessFenceActionMethod(); break; case BufferMethods::WaitForInterrupt: - ProcessWaitForInterruptMethod(); + rasterizer->WaitForIdle(); break; case BufferMethods::SemaphoreTrigger: { ProcessSemaphoreTriggerMethod(); diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index fd3e41434..af05d47d1 100644 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt @@ -14,6 +14,7 @@ set(SHADER_FILES convert_d24s8_to_abgr8.frag convert_depth_to_float.frag convert_float_to_depth.frag + convert_s8d24_to_abgr8.frag full_screen_triangle.vert fxaa.frag fxaa.vert diff --git a/src/video_core/host_shaders/convert_s8d24_to_abgr8.frag b/src/video_core/host_shaders/convert_s8d24_to_abgr8.frag new file mode 100644 index 000000000..c8a1683b8 --- /dev/null +++ b/src/video_core/host_shaders/convert_s8d24_to_abgr8.frag @@ -0,0 +1,23 @@ +// Copyright 2022 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#version 450 + +layout(binding = 0) uniform sampler2D depth_tex; +layout(binding = 1) uniform isampler2D stencil_tex; + +layout(location = 0) out vec4 color; + +void main() { + ivec2 coord = ivec2(gl_FragCoord.xy); + + highp uint depth_val = + uint(textureLod(depth_tex, coord, 0).r * (exp2(32.0) - 1.0)); + lowp uint stencil_val = textureLod(stencil_tex, coord, 0).r; + highp uvec4 components = + uvec4((uvec3(depth_val) >> uvec3(24u, 16u, 8u)) & 0x000000FFu, stencil_val); + color.rgba = vec4(components) / (exp2(8.0) - 1.0); +} diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index 38d8d9d74..61bfe47c7 100644 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -143,6 +143,8 @@ public: [[nodiscard]] GPUVAddr Allocate(std::size_t size, std::size_t align); void Unmap(GPUVAddr gpu_addr, std::size_t size); + void FlushRegion(GPUVAddr gpu_addr, size_t size) const; + private: [[nodiscard]] PageEntry GetPageEntry(GPUVAddr gpu_addr) const; void SetPageEntry(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size = page_size); @@ -153,8 +155,6 @@ private: void TryLockPage(PageEntry page_entry, std::size_t size); void TryUnlockPage(PageEntry page_entry, std::size_t size); - void FlushRegion(GPUVAddr gpu_addr, size_t size) const; - void ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size, bool is_safe) const; void WriteBlockImpl(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size, diff --git a/src/video_core/query_cache.h b/src/video_core/query_cache.h index 392f82eb7..0173b54d8 100644 --- a/src/video_core/query_cache.h +++ b/src/video_core/query_cache.h @@ -18,7 +18,6 @@ #include "common/assert.h" #include "common/settings.h" -#include "core/core.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/gpu.h" #include "video_core/memory_manager.h" diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index b094fc064..1f1f12291 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -123,6 +123,9 @@ public: [[nodiscard]] virtual Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() = 0; + virtual void AccelerateInlineToMemory(GPUVAddr address, size_t copy_size, + std::span<u8> memory) = 0; + /// Attempt to use a faster method to display the framebuffer to screen [[nodiscard]] virtual bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, u32 pixel_stride) { diff --git a/src/video_core/renderer_base.h b/src/video_core/renderer_base.h index bb204454e..c5f974080 100644 --- a/src/video_core/renderer_base.h +++ b/src/video_core/renderer_base.h @@ -5,9 +5,10 @@ #pragma once #include <atomic> +#include <functional> #include <memory> -#include <optional> +#include "common/common_funcs.h" #include "common/common_types.h" #include "core/frontend/emu_window.h" #include "video_core/gpu.h" @@ -28,8 +29,11 @@ struct RendererSettings { Layout::FramebufferLayout screenshot_framebuffer_layout; }; -class RendererBase : NonCopyable { +class RendererBase { public: + YUZU_NON_COPYABLE(RendererBase); + YUZU_NON_MOVEABLE(RendererBase); + explicit RendererBase(Core::Frontend::EmuWindow& window, std::unique_ptr<Core::Frontend::GraphicsContext> context); virtual ~RendererBase(); diff --git a/src/video_core/renderer_opengl/gl_fence_manager.cpp
b/src/video_core/renderer_opengl/gl_fence_manager.cpp index 151290101..293ad7d59 100644 --- a/src/video_core/renderer_opengl/gl_fence_manager.cpp +++ b/src/video_core/renderer_opengl/gl_fence_manager.cpp @@ -31,9 +31,8 @@ bool GLInnerFence::IsSignaled() const { return true; } ASSERT(sync_object.handle != 0); - GLsizei length; GLint sync_status; - glGetSynciv(sync_object.handle, GL_SYNC_STATUS, sizeof(GLint), &length, &sync_status); + glGetSynciv(sync_object.handle, GL_SYNC_STATUS, 1, nullptr, &sync_status); return sync_status == GL_SIGNALED; } diff --git a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp index f8495896c..9e6732abd 100644 --- a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp +++ b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp @@ -243,10 +243,6 @@ GraphicsPipeline::GraphicsPipeline( case Settings::ShaderBackend::GLASM: if (!sources[stage].empty()) { assembly_programs[stage] = CompileProgram(sources[stage], AssemblyStage(stage)); - if (in_parallel) { - // Make sure program is built before continuing when building in parallel - glGetString(GL_PROGRAM_ERROR_STRING_NV); - } } break; case Settings::ShaderBackend::SPIRV: @@ -256,20 +252,18 @@ GraphicsPipeline::GraphicsPipeline( break; } } - if (in_parallel && backend != Settings::ShaderBackend::GLASM) { - // Make sure programs have built if we are building shaders in parallel - for (OGLProgram& program : source_programs) { - if (program.handle != 0) { - GLint status{}; - glGetProgramiv(program.handle, GL_LINK_STATUS, &status); - } - } + if (in_parallel) { + std::lock_guard lock{built_mutex}; + built_fence.Create(); + // Flush this context to ensure compilation commands and fence are in the GPU pipe. + glFlush(); + built_condvar.notify_one(); + } else { + is_built = true; } if (shader_notify) { shader_notify->MarkShaderComplete(); } - is_built = true; - built_condvar.notify_one(); }}; if (thread_worker) { thread_worker->QueueWork(std::move(func)); @@ -440,7 +434,7 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { buffer_cache.UpdateGraphicsBuffers(is_indexed); buffer_cache.BindHostGeometryBuffers(is_indexed); - if (!is_built.load(std::memory_order::relaxed)) { + if (!IsBuilt()) { WaitForBuild(); } const bool use_assembly{assembly_programs[0].handle != 0}; @@ -585,8 +579,26 @@ void GraphicsPipeline::GenerateTransformFeedbackState() { } void GraphicsPipeline::WaitForBuild() { - std::unique_lock lock{built_mutex}; - built_condvar.wait(lock, [this] { return is_built.load(std::memory_order::relaxed); }); + if (built_fence.handle == 0) { + std::unique_lock lock{built_mutex}; + built_condvar.wait(lock, [this] { return built_fence.handle != 0; }); + } + ASSERT(glClientWaitSync(built_fence.handle, 0, GL_TIMEOUT_IGNORED) != GL_WAIT_FAILED); + is_built = true; +} + +bool GraphicsPipeline::IsBuilt() noexcept { + if (is_built) { + return true; + } + if (built_fence.handle == 0) { + return false; + } + // Timeout of zero means this is non-blocking + const auto sync_status = glClientWaitSync(built_fence.handle, 0, 0); + ASSERT(sync_status != GL_WAIT_FAILED); + is_built = sync_status != GL_TIMEOUT_EXPIRED; + return is_built; } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_graphics_pipeline.h b/src/video_core/renderer_opengl/gl_graphics_pipeline.h index 4e28d9a42..311d49f3f 100644 --- a/src/video_core/renderer_opengl/gl_graphics_pipeline.h +++ b/src/video_core/renderer_opengl/gl_graphics_pipeline.h @@ -100,9 +100,7 @@ public: return 
writes_global_memory; } - [[nodiscard]] bool IsBuilt() const noexcept { - return is_built.load(std::memory_order::relaxed); - } + [[nodiscard]] bool IsBuilt() noexcept; template <typename Spec> static auto MakeConfigureSpecFunc() { @@ -154,7 +152,8 @@ private: std::mutex built_mutex; std::condition_variable built_condvar; - std::atomic_bool is_built{false}; + OGLSync built_fence{}; + bool is_built{false}; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 9b516c64f..142412a8e 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -484,6 +484,28 @@ Tegra::Engines::AccelerateDMAInterface& RasterizerOpenGL::AccessAccelerateDMA() return accelerate_dma; } +void RasterizerOpenGL::AccelerateInlineToMemory(GPUVAddr address, size_t copy_size, + std::span<u8> memory) { + auto cpu_addr = gpu_memory.GpuToCpuAddress(address); + if (!cpu_addr) [[unlikely]] { + gpu_memory.WriteBlock(address, memory.data(), copy_size); + return; + } + gpu_memory.WriteBlockUnsafe(address, memory.data(), copy_size); + { + std::unique_lock<std::mutex> lock{buffer_cache.mutex}; + if (!buffer_cache.InlineMemory(*cpu_addr, copy_size, memory)) { + buffer_cache.WriteMemory(*cpu_addr, copy_size); + } + } + { + std::scoped_lock lock_texture{texture_cache.mutex}; + texture_cache.WriteMemory(*cpu_addr, copy_size); + } + shader_cache.InvalidateRegion(*cpu_addr, copy_size); + query_cache.InvalidateRegion(*cpu_addr, copy_size); +} + bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, u32 pixel_stride) { if (framebuffer_addr == 0) { diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index d0397b745..98f6fd342 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -106,6 +106,8 @@ public: const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Config& copy_config) override; Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() override; + void AccelerateInlineToMemory(GPUVAddr address, size_t copy_size, + std::span<u8> memory) override; bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, u32 pixel_stride) override; void LoadDiskResources(u64 title_id, std::stop_token stop_loading, diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h index b2d5bfd3b..84e07f8bd 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.h +++ b/src/video_core/renderer_opengl/gl_resource_manager.h @@ -7,12 +7,14 @@ #include <string_view> #include <utility> #include <glad/glad.h> -#include "common/common_types.h" +#include "common/common_funcs.h" namespace OpenGL { -class OGLRenderbuffer : private NonCopyable { +class OGLRenderbuffer final { public: + YUZU_NON_COPYABLE(OGLRenderbuffer); + OGLRenderbuffer() = default; OGLRenderbuffer(OGLRenderbuffer&& o) noexcept : handle(std::exchange(o.handle, 0)) {} @@ -36,8 +38,10 @@ public: GLuint handle = 0; }; -class OGLTexture : private NonCopyable { +class OGLTexture final { public: + YUZU_NON_COPYABLE(OGLTexture); + OGLTexture() = default; OGLTexture(OGLTexture&& o) noexcept : handle(std::exchange(o.handle, 0)) {} @@ -61,8 +65,10 @@ public: GLuint handle = 0; }; -class OGLTextureView : private NonCopyable { +class OGLTextureView final { public: + 
YUZU_NON_COPYABLE(OGLTextureView); + OGLTextureView() = default; OGLTextureView(OGLTextureView&& o) noexcept : handle(std::exchange(o.handle, 0)) {} @@ -86,8 +92,10 @@ public: GLuint handle = 0; }; -class OGLSampler : private NonCopyable { +class OGLSampler final { public: + YUZU_NON_COPYABLE(OGLSampler); + OGLSampler() = default; OGLSampler(OGLSampler&& o) noexcept : handle(std::exchange(o.handle, 0)) {} @@ -111,8 +119,10 @@ public: GLuint handle = 0; }; -class OGLShader : private NonCopyable { +class OGLShader final { public: + YUZU_NON_COPYABLE(OGLShader); + OGLShader() = default; OGLShader(OGLShader&& o) noexcept : handle(std::exchange(o.handle, 0)) {} @@ -132,8 +142,10 @@ public: GLuint handle = 0; }; -class OGLProgram : private NonCopyable { +class OGLProgram final { public: + YUZU_NON_COPYABLE(OGLProgram); + OGLProgram() = default; OGLProgram(OGLProgram&& o) noexcept : handle(std::exchange(o.handle, 0)) {} @@ -154,8 +166,10 @@ public: GLuint handle = 0; }; -class OGLAssemblyProgram : private NonCopyable { +class OGLAssemblyProgram final { public: + YUZU_NON_COPYABLE(OGLAssemblyProgram); + OGLAssemblyProgram() = default; OGLAssemblyProgram(OGLAssemblyProgram&& o) noexcept : handle(std::exchange(o.handle, 0)) {} @@ -176,8 +190,10 @@ public: GLuint handle = 0; }; -class OGLPipeline : private NonCopyable { +class OGLPipeline final { public: + YUZU_NON_COPYABLE(OGLPipeline); + OGLPipeline() = default; OGLPipeline(OGLPipeline&& o) noexcept : handle{std::exchange<GLuint>(o.handle, 0)} {} @@ -198,8 +214,10 @@ public: GLuint handle = 0; }; -class OGLBuffer : private NonCopyable { +class OGLBuffer final { public: + YUZU_NON_COPYABLE(OGLBuffer); + OGLBuffer() = default; OGLBuffer(OGLBuffer&& o) noexcept : handle(std::exchange(o.handle, 0)) {} @@ -223,8 +241,10 @@ public: GLuint handle = 0; }; -class OGLSync : private NonCopyable { +class OGLSync final { public: + YUZU_NON_COPYABLE(OGLSync); + OGLSync() = default; OGLSync(OGLSync&& o) noexcept : handle(std::exchange(o.handle, nullptr)) {} @@ -247,8 +267,10 @@ public: GLsync handle = 0; }; -class OGLFramebuffer : private NonCopyable { +class OGLFramebuffer final { public: + YUZU_NON_COPYABLE(OGLFramebuffer); + OGLFramebuffer() = default; OGLFramebuffer(OGLFramebuffer&& o) noexcept : handle(std::exchange(o.handle, 0)) {} @@ -272,8 +294,10 @@ public: GLuint handle = 0; }; -class OGLQuery : private NonCopyable { +class OGLQuery final { public: + YUZU_NON_COPYABLE(OGLQuery); + OGLQuery() = default; OGLQuery(OGLQuery&& o) noexcept : handle(std::exchange(o.handle, 0)) {} diff --git a/src/video_core/renderer_opengl/gl_state_tracker.h b/src/video_core/renderer_opengl/gl_state_tracker.h index 5864c7c07..550ed6d36 100644 --- a/src/video_core/renderer_opengl/gl_state_tracker.h +++ b/src/video_core/renderer_opengl/gl_state_tracker.h @@ -9,7 +9,6 @@ #include <glad/glad.h> #include "common/common_types.h" -#include "core/core.h" #include "video_core/dirty_flags.h" #include "video_core/engines/maxwell_3d.h" diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h index daba42ed9..db5bf1d30 100644 --- a/src/video_core/renderer_opengl/maxwell_to_gl.h +++ b/src/video_core/renderer_opengl/maxwell_to_gl.h @@ -184,6 +184,8 @@ inline GLenum VertexFormat(Maxwell::VertexAttribute attrib) { case Maxwell::VertexAttribute::Size::Size_32_32_32: case Maxwell::VertexAttribute::Size::Size_32_32_32_32: return GL_FLOAT; + case Maxwell::VertexAttribute::Size::Size_11_11_10: + return GL_UNSIGNED_INT_10F_11F_11F_REV; default: 
break; } diff --git a/src/video_core/renderer_vulkan/blit_image.cpp b/src/video_core/renderer_vulkan/blit_image.cpp index 2c3914459..ec03cca38 100644 --- a/src/video_core/renderer_vulkan/blit_image.cpp +++ b/src/video_core/renderer_vulkan/blit_image.cpp @@ -9,6 +9,7 @@ #include "video_core/host_shaders/convert_d24s8_to_abgr8_frag_spv.h" #include "video_core/host_shaders/convert_depth_to_float_frag_spv.h" #include "video_core/host_shaders/convert_float_to_depth_frag_spv.h" +#include "video_core/host_shaders/convert_s8d24_to_abgr8_frag_spv.h" #include "video_core/host_shaders/full_screen_triangle_vert_spv.h" #include "video_core/host_shaders/vulkan_blit_color_float_frag_spv.h" #include "video_core/host_shaders/vulkan_blit_depth_stencil_frag_spv.h" @@ -370,6 +371,7 @@ BlitImageHelper::BlitImageHelper(const Device& device_, VKScheduler& scheduler_, convert_float_to_depth_frag(BuildShader(device, CONVERT_FLOAT_TO_DEPTH_FRAG_SPV)), convert_abgr8_to_d24s8_frag(BuildShader(device, CONVERT_ABGR8_TO_D24S8_FRAG_SPV)), convert_d24s8_to_abgr8_frag(BuildShader(device, CONVERT_D24S8_TO_ABGR8_FRAG_SPV)), + convert_s8d24_to_abgr8_frag(BuildShader(device, CONVERT_S8D24_TO_ABGR8_FRAG_SPV)), linear_sampler(device.GetLogical().CreateSampler(SAMPLER_CREATE_INFO<VK_FILTER_LINEAR>)), nearest_sampler(device.GetLogical().CreateSampler(SAMPLER_CREATE_INFO<VK_FILTER_NEAREST>)) { if (device.IsExtShaderStencilExportSupported()) { @@ -474,6 +476,13 @@ void BlitImageHelper::ConvertD24S8ToABGR8(const Framebuffer* dst_framebuffer, ConvertDepthStencil(*convert_d24s8_to_abgr8_pipeline, dst_framebuffer, src_image_view); } +void BlitImageHelper::ConvertS8D24ToABGR8(const Framebuffer* dst_framebuffer, + ImageView& src_image_view) { + ConvertPipelineColorTargetEx(convert_s8d24_to_abgr8_pipeline, dst_framebuffer->RenderPass(), + convert_s8d24_to_abgr8_frag); + ConvertDepthStencil(*convert_s8d24_to_abgr8_pipeline, dst_framebuffer, src_image_view); +} + void BlitImageHelper::Convert(VkPipeline pipeline, const Framebuffer* dst_framebuffer, const ImageView& src_image_view) { const VkPipelineLayout layout = *one_texture_pipeline_layout; diff --git a/src/video_core/renderer_vulkan/blit_image.h b/src/video_core/renderer_vulkan/blit_image.h index 85e7dca5b..1a3944179 100644 --- a/src/video_core/renderer_vulkan/blit_image.h +++ b/src/video_core/renderer_vulkan/blit_image.h @@ -56,6 +56,8 @@ public: void ConvertD24S8ToABGR8(const Framebuffer* dst_framebuffer, ImageView& src_image_view); + void ConvertS8D24ToABGR8(const Framebuffer* dst_framebuffer, ImageView& src_image_view); + private: void Convert(VkPipeline pipeline, const Framebuffer* dst_framebuffer, const ImageView& src_image_view); @@ -99,6 +101,7 @@ private: vk::ShaderModule convert_float_to_depth_frag; vk::ShaderModule convert_abgr8_to_d24s8_frag; vk::ShaderModule convert_d24s8_to_abgr8_frag; + vk::ShaderModule convert_s8d24_to_abgr8_frag; vk::Sampler linear_sampler; vk::Sampler nearest_sampler; @@ -112,6 +115,7 @@ private: vk::Pipeline convert_r16_to_d16_pipeline; vk::Pipeline convert_abgr8_to_d24s8_pipeline; vk::Pipeline convert_d24s8_to_abgr8_pipeline; + vk::Pipeline convert_s8d24_to_abgr8_pipeline; }; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp index 751e4792b..1c136c410 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp @@ -495,6 +495,8 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib 
return VK_FORMAT_R32G32B32_SFLOAT; case Maxwell::VertexAttribute::Size::Size_32_32_32_32: return VK_FORMAT_R32G32B32A32_SFLOAT; + case Maxwell::VertexAttribute::Size::Size_11_11_10: + return VK_FORMAT_B10G11R11_UFLOAT_PACK32; default: break; } diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp index c71a1f44d..621a6a071 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp +++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp @@ -100,6 +100,8 @@ VkFormat GetFormat(const Tegra::FramebufferConfig& framebuffer) { return VK_FORMAT_A8B8G8R8_UNORM_PACK32; case Tegra::FramebufferConfig::PixelFormat::RGB565_UNORM: return VK_FORMAT_R5G6B5_UNORM_PACK16; + case Tegra::FramebufferConfig::PixelFormat::B8G8R8A8_UNORM: + return VK_FORMAT_B8G8R8A8_UNORM; default: UNIMPLEMENTED_MSG("Unknown framebuffer pixel format: {}", static_cast<u32>(framebuffer.pixel_format)); diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index 3e96c0f60..4d73427b4 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -2,6 +2,7 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include <array> #include <cstring> #include <memory> #include <optional> @@ -292,7 +293,7 @@ std::pair<VkBuffer, VkDeviceSize> QuadIndexedPass::Assemble( .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, .dstAccessMask = VK_ACCESS_INDEX_READ_BIT, }; - const std::array push_constants{base_vertex, index_shift}; + const std::array<u32, 2> push_constants{base_vertex, index_shift}; const VkDescriptorSet set = descriptor_allocator.Commit(); device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data); cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline); diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index fd334a146..2227d9197 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -548,6 +548,28 @@ Tegra::Engines::AccelerateDMAInterface& RasterizerVulkan::AccessAccelerateDMA() return accelerate_dma; } +void RasterizerVulkan::AccelerateInlineToMemory(GPUVAddr address, size_t copy_size, + std::span<u8> memory) { + auto cpu_addr = gpu_memory.GpuToCpuAddress(address); + if (!cpu_addr) [[unlikely]] { + gpu_memory.WriteBlock(address, memory.data(), copy_size); + return; + } + gpu_memory.WriteBlockUnsafe(address, memory.data(), copy_size); + { + std::unique_lock<std::mutex> lock{buffer_cache.mutex}; + if (!buffer_cache.InlineMemory(*cpu_addr, copy_size, memory)) { + buffer_cache.WriteMemory(*cpu_addr, copy_size); + } + } + { + std::scoped_lock lock_texture{texture_cache.mutex}; + texture_cache.WriteMemory(*cpu_addr, copy_size); + } + pipeline_cache.InvalidateRegion(*cpu_addr, copy_size); + query_cache.InvalidateRegion(*cpu_addr, copy_size); +} + bool RasterizerVulkan::AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, u32 pixel_stride) { if (!framebuffer_addr) { diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index 866827247..5af2e275b 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -99,6 +99,8 @@ public: const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Config& copy_config) 
override; Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() override; + void AccelerateInlineToMemory(GPUVAddr address, size_t copy_size, + std::span<u8> memory) override; bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, u32 pixel_stride) override; void LoadDiskResources(u64 title_id, std::stop_token stop_loading, diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index 3bfdf41ba..7d9d4f7ba 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -140,12 +140,12 @@ bool VKScheduler::UpdateRescaling(bool is_rescaling) { void VKScheduler::WorkerThread(std::stop_token stop_token) { Common::SetCurrentThreadName("yuzu:VulkanWorker"); do { - if (work_queue.empty()) { - wait_cv.notify_all(); - } std::unique_ptr<CommandChunk> work; { std::unique_lock lock{work_mutex}; + if (work_queue.empty()) { + wait_cv.notify_all(); + } work_cv.wait(lock, stop_token, [this] { return !work_queue.empty(); }); if (stop_token.stop_requested()) { continue; diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index 1b06c9296..e69aa136b 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -146,6 +146,7 @@ private: using FuncType = TypedCommand<T>; static_assert(sizeof(FuncType) < sizeof(data), "Lambda is too large"); + recorded_counts++; command_offset = Common::AlignUp(command_offset, alignof(FuncType)); if (command_offset > sizeof(data) - sizeof(FuncType)) { return false; @@ -167,7 +168,7 @@ private: } bool Empty() const { - return command_offset == 0; + return recorded_counts == 0; } bool HasSubmit() const { @@ -178,6 +179,7 @@ private: Command* first = nullptr; Command* last = nullptr; + size_t recorded_counts = 0; size_t command_offset = 0; bool submit = false; alignas(std::max_align_t) std::array<u8, 0x8000> data{}; diff --git a/src/video_core/renderer_vulkan/vk_state_tracker.h b/src/video_core/renderer_vulkan/vk_state_tracker.h index 40a149832..8240c83e1 100644 --- a/src/video_core/renderer_vulkan/vk_state_tracker.h +++ b/src/video_core/renderer_vulkan/vk_state_tracker.h @@ -8,7 +8,6 @@ #include <limits> #include "common/common_types.h" -#include "core/core.h" #include "video_core/dirty_flags.h" #include "video_core/engines/maxwell_3d.h" diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 0ba56ff1e..83a23b66a 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -554,10 +554,12 @@ void CopyBufferToImage(vk::CommandBuffer cmdbuf, VkBuffer src_buffer, VkImage im }; } -[[nodiscard]] bool IsFormatFlipped(PixelFormat format) { +[[nodiscard]] bool IsFormatFlipped(PixelFormat format, bool emulate_bgr565) { switch (format) { case PixelFormat::A1B5G5R5_UNORM: return true; + case PixelFormat::B5G6R5_UNORM: + return emulate_bgr565; default: return false; } @@ -779,11 +781,6 @@ bool TextureCacheRuntime::ShouldReinterpret(Image& dst, Image& src) { !device.IsExtShaderStencilExportSupported()) { return true; } - if (VideoCore::Surface::GetFormatType(src.info.format) == - VideoCore::Surface::SurfaceType::DepthStencil && - !device.IsExtShaderStencilExportSupported()) { - return true; - } if (dst.info.format == PixelFormat::D32_FLOAT_S8_UINT || src.info.format == PixelFormat::D32_FLOAT_S8_UINT) { return 
true; @@ -1068,6 +1065,9 @@ void TextureCacheRuntime::ConvertImage(Framebuffer* dst, ImageView& dst_view, Im if (src_view.format == PixelFormat::S8_UINT_D24_UNORM) { return blit_image_helper.ConvertD24S8ToABGR8(dst, src_view); } + if (src_view.format == PixelFormat::D24_UNORM_S8_UINT) { + return blit_image_helper.ConvertS8D24ToABGR8(dst, src_view); + } break; case PixelFormat::R32_FLOAT: if (src_view.format == PixelFormat::D32_FLOAT) { @@ -1488,7 +1488,7 @@ ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewI }; if (!info.IsRenderTarget()) { swizzle = info.Swizzle(); - if (IsFormatFlipped(format)) { + if (IsFormatFlipped(format, device->MustEmulateBGR565())) { std::ranges::transform(swizzle, swizzle.begin(), SwapBlueRed); } if ((aspect_mask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) != 0) { diff --git a/src/video_core/shader_cache.cpp b/src/video_core/shader_cache.cpp index 78bf90c48..87636857d 100644 --- a/src/video_core/shader_cache.cpp +++ b/src/video_core/shader_cache.cpp @@ -170,7 +170,7 @@ void ShaderCache::RemovePendingShaders() { marked_for_removal.clear(); if (!removed_shaders.empty()) { - RemoveShadersFromStorage(std::move(removed_shaders)); + RemoveShadersFromStorage(removed_shaders); } } @@ -213,7 +213,7 @@ void ShaderCache::UnmarkMemory(Entry* entry) { rasterizer.UpdatePagesCachedCount(addr, size, -1); } -void ShaderCache::RemoveShadersFromStorage(std::vector<ShaderInfo*> removed_shaders) { +void ShaderCache::RemoveShadersFromStorage(std::span<ShaderInfo*> removed_shaders) { // Remove them from the cache std::erase_if(storage, [&removed_shaders](const std::unique_ptr<ShaderInfo>& shader) { return std::ranges::find(removed_shaders, shader.get()) != removed_shaders.end(); diff --git a/src/video_core/shader_cache.h b/src/video_core/shader_cache.h index 136fe294c..8836bc8c6 100644 --- a/src/video_core/shader_cache.h +++ b/src/video_core/shader_cache.h @@ -4,7 +4,6 @@ #pragma once -#include <algorithm> #include <array> #include <memory> #include <mutex> @@ -138,7 +137,7 @@ private: /// @param removed_shaders Shaders to be removed from the storage /// @pre invalidation_mutex is locked /// @pre lookup_mutex is locked - void RemoveShadersFromStorage(std::vector<ShaderInfo*> removed_shaders); + void RemoveShadersFromStorage(std::span<ShaderInfo*> removed_shaders); /// @brief Creates a new entry in the lookup cache and returns its pointer /// @pre lookup_mutex is locked diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 198bb0cfb..72eeb8bbd 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -343,7 +343,7 @@ template <bool has_blacklists> void TextureCache<P>::FillImageViews(DescriptorTable<TICEntry>& table, std::span<ImageViewId> cached_image_view_ids, std::span<ImageViewInOut> views) { - bool has_blacklisted; + bool has_blacklisted = false; do { has_deleted_images = false; if constexpr (has_blacklists) { @@ -1725,7 +1725,7 @@ void TextureCache<P>::SynchronizeAliases(ImageId image_id) { }); const auto& resolution = Settings::values.resolution_info; for (const AliasedImage* const aliased : aliased_images) { - if (!resolution.active | !any_rescaled) { + if (!resolution.active || !any_rescaled) { CopyImage(image_id, aliased->id, aliased->copies); continue; } @@ -1736,19 +1736,7 @@ void TextureCache<P>::SynchronizeAliases(ImageId image_id) { continue; } ScaleUp(aliased_image); - - const bool both_2d{image.info.type == 
ImageType::e2D && - aliased_image.info.type == ImageType::e2D}; - auto copies = aliased->copies; - for (auto copy : copies) { - copy.extent.width = std::max<u32>( - (copy.extent.width * resolution.up_scale) >> resolution.down_shift, 1); - if (both_2d) { - copy.extent.height = std::max<u32>( - (copy.extent.height * resolution.up_scale) >> resolution.down_shift, 1); - } - } - CopyImage(image_id, aliased->id, copies); + CopyImage(image_id, aliased->id, aliased->copies); } } diff --git a/src/video_core/video_core.cpp b/src/video_core/video_core.cpp index 329bf4def..2f2594585 100644 --- a/src/video_core/video_core.cpp +++ b/src/video_core/video_core.cpp @@ -50,6 +50,7 @@ std::unique_ptr<Tegra::GPU> CreateGPU(Core::Frontend::EmuWindow& emu_window, Cor gpu->BindRenderer(std::move(renderer)); return gpu; } catch (const std::runtime_error& exception) { + scope.Cancel(); LOG_ERROR(HW_GPU, "Failed to initialize GPU: {}", exception.what()); return nullptr; } diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp index 153702c0b..effde73c9 100644 --- a/src/video_core/vulkan_common/vulkan_device.cpp +++ b/src/video_core/vulkan_common/vulkan_device.cpp @@ -39,6 +39,11 @@ constexpr std::array DEPTH16_UNORM_STENCIL8_UINT{ VK_FORMAT_D32_SFLOAT_S8_UINT, VK_FORMAT_UNDEFINED, }; + +constexpr std::array B5G6R5_UNORM_PACK16{ + VK_FORMAT_R5G6B5_UNORM_PACK16, + VK_FORMAT_UNDEFINED, +}; } // namespace Alternatives enum class NvidiaArchitecture { @@ -87,6 +92,8 @@ constexpr const VkFormat* GetFormatAlternatives(VkFormat format) { return Alternatives::DEPTH24_UNORM_STENCIL8_UINT.data(); case VK_FORMAT_D16_UNORM_S8_UINT: return Alternatives::DEPTH16_UNORM_STENCIL8_UINT.data(); + case VK_FORMAT_B5G6R5_UNORM_PACK16: + return Alternatives::B5G6R5_UNORM_PACK16.data(); default: return nullptr; } @@ -224,9 +231,14 @@ std::vector<std::string> GetSupportedExtensions(vk::PhysicalDevice physical) { return supported_extensions; } +bool IsExtensionSupported(std::span<const std::string> supported_extensions, + std::string_view extension) { + return std::ranges::find(supported_extensions, extension) != supported_extensions.end(); +} + NvidiaArchitecture GetNvidiaArchitecture(vk::PhysicalDevice physical, std::span<const std::string> exts) { - if (std::ranges::find(exts, VK_KHR_FRAGMENT_SHADING_RATE_EXTENSION_NAME) != exts.end()) { + if (IsExtensionSupported(exts, VK_KHR_FRAGMENT_SHADING_RATE_EXTENSION_NAME)) { VkPhysicalDeviceFragmentShadingRatePropertiesKHR shading_rate_props{}; shading_rate_props.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FRAGMENT_SHADING_RATE_PROPERTIES_KHR; @@ -239,7 +251,7 @@ NvidiaArchitecture GetNvidiaArchitecture(vk::PhysicalDevice physical, return NvidiaArchitecture::AmpereOrNewer; } } - if (std::ranges::find(exts, VK_NV_SHADING_RATE_IMAGE_EXTENSION_NAME) != exts.end()) { + if (IsExtensionSupported(exts, VK_NV_SHADING_RATE_IMAGE_EXTENSION_NAME)) { return NvidiaArchitecture::Turing; } return NvidiaArchitecture::VoltaOrOlder; @@ -604,7 +616,8 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR break; } } - if (ext_extended_dynamic_state && driver_id == VK_DRIVER_ID_MESA_RADV) { + const bool is_radv = driver_id == VK_DRIVER_ID_MESA_RADV; + if (ext_extended_dynamic_state && is_radv) { // Mask driver version variant const u32 version = (properties.driverVersion << 3) >> 3; if (version < VK_MAKE_API_VERSION(0, 21, 2, 0)) { @@ -613,6 +626,17 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR 
ext_extended_dynamic_state = false; } } + if (ext_vertex_input_dynamic_state && is_radv) { + // TODO(ameerj): Blacklist only offending driver versions + // TODO(ameerj): Confirm if RDNA1 is affected + const bool is_rdna2 = + IsExtensionSupported(supported_extensions, VK_KHR_FRAGMENT_SHADING_RATE_EXTENSION_NAME); + if (is_rdna2) { + LOG_WARNING(Render_Vulkan, + "RADV has broken VK_EXT_vertex_input_dynamic_state on RDNA2 hardware"); + ext_vertex_input_dynamic_state = false; + } + } sets_per_pool = 64; const bool is_amd = @@ -628,7 +652,7 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR has_broken_cube_compatibility = true; } } - const bool is_amd_or_radv = is_amd || driver_id == VK_DRIVER_ID_MESA_RADV; + const bool is_amd_or_radv = is_amd || is_radv; if (ext_sampler_filter_minmax && is_amd_or_radv) { // Disable ext_sampler_filter_minmax on AMD GCN4 and lower as it is broken. if (!is_float16_supported) { @@ -639,6 +663,7 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR } const bool is_intel_windows = driver_id == VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS; + const bool is_intel_anv = driver_id == VK_DRIVER_ID_INTEL_OPEN_SOURCE_MESA; if (ext_vertex_input_dynamic_state && is_intel_windows) { LOG_WARNING(Render_Vulkan, "Blacklisting Intel for VK_EXT_vertex_input_dynamic_state"); ext_vertex_input_dynamic_state = false; @@ -652,6 +677,10 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR LOG_WARNING(Render_Vulkan, "Intel proprietary drivers do not support MSAA image blits"); cant_blit_msaa = true; } + if (is_intel_anv) { + LOG_WARNING(Render_Vulkan, "ANV driver does not support native BGR format"); + must_emulate_bgr565 = true; + } supports_d24_depth = IsFormatSupported(VK_FORMAT_D24_UNORM_S8_UINT, diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h index 37d140ebd..34b1add16 100644 --- a/src/video_core/vulkan_common/vulkan_device.h +++ b/src/video_core/vulkan_common/vulkan_device.h @@ -354,6 +354,10 @@ public: return cant_blit_msaa; } + bool MustEmulateBGR565() const { + return must_emulate_bgr565; + } + private: /// Checks if the physical device is suitable. void CheckSuitability(bool requires_swapchain) const; @@ -448,6 +452,7 @@ private: bool has_nsight_graphics{}; ///< Has Nsight Graphics attached bool supports_d24_depth{}; ///< Supports D24 depth buffers. bool cant_blit_msaa{}; ///< Does not support MSAA<->MSAA blitting. + bool must_emulate_bgr565{}; ///< Emulates BGR565 by swizzling RGB565 format. // Telemetry parameters std::string vendor_name; ///< Device's driver name. |
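
Note on the inline-to-memory path (engine_upload.cpp, gl_rasterizer.cpp, vk_rasterizer.cpp): linear inline uploads now go through RasterizerInterface::AccelerateInlineToMemory instead of a raw MemoryManager::WriteBlock, so a destination the GPU has already modified can be patched in place by the buffer cache. The sketch below only models that decision tree; GpuMemorySystem and BufferCacheLike are invented stand-ins rather than yuzu types, and the texture/shader/query cache invalidation is reduced to a comment.

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <optional>
#include <span>
#include <vector>

// Invented stand-in for the GPU MemoryManager used in the diff.
struct GpuMemorySystem {
    std::vector<uint8_t> backing = std::vector<uint8_t>(0x1000);
    std::optional<size_t> GpuToCpuAddress(uint64_t gpu_addr) const {
        return gpu_addr < backing.size() ? std::optional<size_t>{gpu_addr} : std::nullopt;
    }
    void WriteBlock(uint64_t gpu_addr, const uint8_t* data, size_t size) {
        std::memcpy(backing.data() + gpu_addr, data, size);
    }
    void WriteBlockUnsafe(uint64_t gpu_addr, const uint8_t* data, size_t size) {
        std::memcpy(backing.data() + gpu_addr, data, size);
    }
};

// Invented stand-in for the buffer cache: InlineMemory() returns true only when
// the destination range is already tracked as GPU-modified and was patched.
struct BufferCacheLike {
    bool InlineMemory(size_t, size_t, std::span<const uint8_t>) { return false; }
    void WriteMemory(size_t cpu_addr, size_t size) {
        std::printf("mark 0x%zx..0x%zx CPU-dirty\n", cpu_addr, cpu_addr + size);
    }
};

void AccelerateInlineToMemory(GpuMemorySystem& gpu_memory, BufferCacheLike& buffer_cache,
                              uint64_t address, std::span<const uint8_t> memory) {
    const auto cpu_addr = gpu_memory.GpuToCpuAddress(address);
    if (!cpu_addr) {
        // No CPU mapping: fall back to the fully checked write.
        gpu_memory.WriteBlock(address, memory.data(), memory.size());
        return;
    }
    gpu_memory.WriteBlockUnsafe(address, memory.data(), memory.size());
    if (!buffer_cache.InlineMemory(*cpu_addr, memory.size(), memory)) {
        // Not GPU-modified: just flag the CPU range so later users re-upload it.
        buffer_cache.WriteMemory(*cpu_addr, memory.size());
    }
    // The real implementations also invalidate the texture, shader/pipeline and
    // query caches over the same CPU range.
}

int main() {
    GpuMemorySystem gpu_memory;
    BufferCacheLike buffer_cache;
    const std::vector<uint8_t> payload(16, 0xAB);
    AccelerateInlineToMemory(gpu_memory, buffer_cache, 0x100, payload);
}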
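
Note on the constant-buffer upload rewrite in maxwell_3d.cpp: the CB_DATA staging buffer is gone and ProcessCBMultiData now writes each batch of words straight to buffer_address + cb_pos, then advances cb_pos by the byte count. A minimal sketch of that bookkeeping against a plain byte vector standing in for GPU memory (the struct and function names here are made up for the example):

#include <cstdint>
#include <cstring>
#include <vector>

struct ConstBufferState {
    uint32_t cb_pos = 0;  // current write offset in bytes
    uint32_t cb_size = 0; // size of the bound const buffer in bytes
};

// Mirrors the simplified ProcessCBMultiData: write `amount` 32-bit words at
// cb_pos and advance cb_pos. `memory` stands in for the GPU address space
// starting at the bound buffer address.
bool WriteConstBuffer(ConstBufferState& state, std::vector<uint8_t>& memory,
                      const uint32_t* words, uint32_t amount) {
    const size_t copy_size = amount * sizeof(uint32_t);
    if (state.cb_pos + copy_size > state.cb_size || state.cb_pos + copy_size > memory.size()) {
        return false; // the real code asserts on overflow instead of failing gracefully
    }
    std::memcpy(memory.data() + state.cb_pos, words, copy_size);
    state.cb_pos += static_cast<uint32_t>(copy_size);
    return true;
}

int main() {
    ConstBufferState state{.cb_pos = 0, .cb_size = 64};
    std::vector<uint8_t> memory(64);
    const uint32_t words[4] = {1, 2, 3, 4};
    return WriteConstBuffer(state, memory, words, 4) ? 0 : 1; // cb_pos is now 16
}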
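
Note on the new small_index register in maxwell_3d.h/.cpp: it packs a draw's first index and index count into one 32-bit word, which the handler copies into index_array before drawing. A standalone sketch of the decoding, using shifts and masks instead of yuzu's BitField:

#include <cstdint>
#include <cstdio>

struct SmallIndexDraw {
    uint32_t first;
    uint32_t count;
};

// Decode the packed small_index word: first index in bits 0-15, index count in
// bits 16-31 (the diff expresses the same split with BitField<0,16>/<16,16>).
SmallIndexDraw DecodeSmallIndex(uint32_t packed) {
    return SmallIndexDraw{
        .first = packed & 0xFFFFu,
        .count = packed >> 16,
    };
}

int main() {
    // Example: a draw of 6 indices starting at index 4 is encoded as 0x00060004.
    const SmallIndexDraw draw = DecodeSmallIndex(0x00060004u);
    std::printf("first=%u count=%u\n", draw.first, draw.count);
}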
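
Note on topology_override: ProcessTopologyOverride only replaces draw.topology after the override register has been written at least once, maps the common override values onto the regular topology enum, and passes everything else through by value. A sketch of that mapping with locally defined stand-in enums (the override values are taken from the diff; the topology enum here is illustrative only):

#include <cstdint>

enum class PrimitiveTopology : uint32_t { Points, Lines, LineStrip, Other };
enum class PrimitiveTopologyOverride : uint32_t {
    None = 0x0,
    Points = 0x1,
    Lines = 0x2,
    LineStrip = 0x3,
};

// Mirrors the shape of Maxwell3D::ProcessTopologyOverride: the override is only
// honoured once the register has been written (use_topology_override), None keeps
// the topology from draw.topology, and unlisted values pass through by value.
PrimitiveTopology ApplyTopologyOverride(PrimitiveTopology current,
                                        PrimitiveTopologyOverride override_reg,
                                        bool use_topology_override) {
    if (!use_topology_override) {
        return current;
    }
    switch (override_reg) {
    case PrimitiveTopologyOverride::None:
        return current;
    case PrimitiveTopologyOverride::Points:
        return PrimitiveTopology::Points;
    case PrimitiveTopologyOverride::Lines:
        return PrimitiveTopology::Lines;
    case PrimitiveTopologyOverride::LineStrip:
        return PrimitiveTopology::LineStrip;
    default:
        // The real handler casts the remaining override values directly into the
        // topology enum; this sketch buckets them instead.
        return PrimitiveTopology::Other;
    }
}

int main() {
    const auto topology =
        ApplyTopologyOverride(PrimitiveTopology::Lines, PrimitiveTopologyOverride::Points, true);
    return topology == PrimitiveTopology::Points ? 0 : 1;
}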
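
Note on MaxwellDMA::ReleaseSemaphore: every Launch() now ends with a semaphore release whose behaviour depends on launch_dma.semaphore_type. A sketch of the dispatch against a hypothetical typed-write memory interface (GpuMemory here is invented for the example; the enum names follow the diff):

#include <cstdint>
#include <map>

enum class SemaphoreType { None, ReleaseOneWordSemaphore, ReleaseFourWordSemaphore };

// Hypothetical stand-in for the GPU memory manager's typed writes.
struct GpuMemory {
    std::map<uint64_t, uint64_t> backing;
    void WriteU32(uint64_t addr, uint32_t value) { backing[addr] = value; }
    void WriteU64(uint64_t addr, uint64_t value) { backing[addr] = value; }
};

// Mirrors MaxwellDMA::ReleaseSemaphore: nothing for NONE, a single 32-bit payload
// for the one-word release, and payload plus GPU timestamp for the 16-byte form.
void ReleaseSemaphore(GpuMemory& memory, SemaphoreType type, uint64_t address,
                      uint32_t payload, uint64_t gpu_ticks) {
    switch (type) {
    case SemaphoreType::None:
        break;
    case SemaphoreType::ReleaseOneWordSemaphore:
        memory.WriteU32(address, payload);
        break;
    case SemaphoreType::ReleaseFourWordSemaphore:
        memory.WriteU64(address, payload);
        memory.WriteU64(address + 8, gpu_ticks);
        break;
    }
}

int main() {
    GpuMemory memory;
    ReleaseSemaphore(memory, SemaphoreType::ReleaseFourWordSemaphore, 0x1000, 42, 123456);
}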
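
Note on convert_s8d24_to_abgr8.frag: the shader rescales the normalized depth to a 32-bit integer, emits its top three bytes as R/G/B and the stencil byte as A, then renormalizes to 0-1. The same packing can be sanity-checked on the CPU; this sketch mirrors the shader arithmetic (the function name is made up, and the explicit clamp replaces GLSL's implementation-defined float-to-uint conversion):

#include <algorithm>
#include <array>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdio>

// CPU mirror of convert_s8d24_to_abgr8.frag: depth is the normalized value read
// from the depth view, stencil the raw 8-bit stencil value.
std::array<float, 4> PackS8D24ToAbgr8(float depth, uint8_t stencil) {
    const double scaled = static_cast<double>(depth) * (std::exp2(32.0) - 1.0);
    const uint32_t depth_val = static_cast<uint32_t>(std::clamp(scaled, 0.0, 4294967295.0));
    const std::array<uint32_t, 4> components{
        (depth_val >> 24) & 0xFFu, // R = most significant byte of the 24-bit depth
        (depth_val >> 16) & 0xFFu, // G
        (depth_val >> 8) & 0xFFu,  // B
        stencil,                   // A = stencil
    };
    std::array<float, 4> color{};
    for (std::size_t i = 0; i < color.size(); ++i) {
        color[i] = static_cast<float>(components[i]) / 255.0f;
    }
    return color;
}

int main() {
    const auto color = PackS8D24ToAbgr8(0.5f, 0x80);
    std::printf("%.3f %.3f %.3f %.3f\n", color[0], color[1], color[2], color[3]);
}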
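
Note on the ANV BGR565 fallback (vulkan_device.cpp, vk_texture_cache.cpp): when must_emulate_bgr565 is set, B5G6R5 content is stored as R5G6B5 and the red/blue channels are exchanged through the image-view swizzle. The sketch below only illustrates the channel swap on a packed 16-bit texel; it is not how the texture cache applies it:

#include <cstdint>
#include <cstdio>

// Swapping the 5-bit red and blue fields shows that R5G6B5 plus a red/blue
// swizzle carries the same information as native B5G6R5.
uint16_t SwapRedBlue565(uint16_t texel) {
    const uint16_t red = (texel >> 11) & 0x1F;
    const uint16_t green = (texel >> 5) & 0x3F;
    const uint16_t blue = texel & 0x1F;
    return static_cast<uint16_t>((blue << 11) | (green << 5) | red);
}

int main() {
    std::printf("0x%04X\n", SwapRedBlue565(0xF800)); // a pure-red texel becomes pure blue
}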