summary refs log tree commit diff stats
path: root/src/video_core
diff options
context:
space:
mode:
Diffstat (limited to 'src/video_core')
-rw-r--r-- src/video_core/buffer_cache/buffer_cache.h 7
-rw-r--r-- src/video_core/cdma_pusher.cpp 1
-rw-r--r-- src/video_core/cdma_pusher.h 2
-rw-r--r-- src/video_core/command_classes/codecs/h264.cpp 7
-rw-r--r-- src/video_core/command_classes/vic.cpp 259
-rw-r--r-- src/video_core/command_classes/vic.h 20
-rw-r--r-- src/video_core/engines/maxwell_3d.h 1
-rw-r--r-- src/video_core/engines/maxwell_dma.cpp 64
-rw-r--r-- src/video_core/engines/maxwell_dma.h 2
-rw-r--r-- src/video_core/framebuffer_config.h 20
-rw-r--r-- src/video_core/gpu.cpp 1223
-rw-r--r-- src/video_core/gpu.h 230
-rw-r--r-- src/video_core/gpu_thread.cpp 57
-rw-r--r-- src/video_core/gpu_thread.h 16
-rw-r--r-- src/video_core/host_shaders/CMakeLists.txt 1
-rw-r--r-- src/video_core/host_shaders/opengl_copy_bgra.comp 15
-rw-r--r-- src/video_core/query_cache.h 1
-rw-r--r-- src/video_core/renderer_opengl/gl_buffer_cache.cpp 3
-rw-r--r-- src/video_core/renderer_opengl/gl_texture_cache.cpp 38
-rw-r--r-- src/video_core/renderer_opengl/gl_texture_cache.h 23
-rw-r--r-- src/video_core/renderer_opengl/maxwell_to_gl.h 4
-rw-r--r-- src/video_core/renderer_opengl/util_shaders.cpp 76
-rw-r--r-- src/video_core/renderer_opengl/util_shaders.h 22
-rw-r--r-- src/video_core/renderer_vulkan/renderer_vulkan.cpp 15
-rw-r--r-- src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp 16
-rw-r--r-- src/video_core/renderer_vulkan/vk_scheduler.cpp 19
-rw-r--r-- src/video_core/renderer_vulkan/vk_scheduler.h 7
-rw-r--r-- src/video_core/renderer_vulkan/vk_swapchain.cpp 29
-rw-r--r-- src/video_core/renderer_vulkan/vk_swapchain.h 11
-rw-r--r-- src/video_core/renderer_vulkan/vk_texture_cache.cpp 3
-rw-r--r-- src/video_core/renderer_vulkan/vk_texture_cache.h 2
-rw-r--r-- src/video_core/shader_environment.cpp 1
-rw-r--r-- src/video_core/shader_environment.h 4
-rw-r--r-- src/video_core/texture_cache/image_view_info.cpp 1
-rw-r--r-- src/video_core/texture_cache/slot_vector.h 4
-rw-r--r-- src/video_core/texture_cache/texture_cache.h 5
-rw-r--r-- src/video_core/texture_cache/texture_cache_base.h 8
-rw-r--r-- src/video_core/vulkan_common/vulkan_device.cpp 25
-rw-r--r-- src/video_core/vulkan_common/vulkan_device.h 6
39 files changed, 1216 insertions, 1032 deletions
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 7bfd57369..d350c9b36 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -570,13 +570,12 @@ bool BufferCache<P>::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am
ForEachWrittenRange(*cpu_src_address, amount, mirror);
// This subtraction in this order is important for overlapping copies.
common_ranges.subtract(subtract_interval);
- bool atleast_1_download = tmp_intervals.size() != 0;
- for (const IntervalType add_interval : tmp_intervals) {
+ const bool has_new_downloads = tmp_intervals.size() != 0;
+ for (const IntervalType& add_interval : tmp_intervals) {
common_ranges.add(add_interval);
}
-
runtime.CopyBuffer(dest_buffer, src_buffer, copies);
- if (atleast_1_download) {
+ if (has_new_downloads) {
dest_buffer.MarkRegionAsGpuModified(*cpu_dest_address, amount);
}
std::vector<u8> tmp_buffer(amount);
diff --git a/src/video_core/cdma_pusher.cpp b/src/video_core/cdma_pusher.cpp
index 8b86ad050..a8c4b4415 100644
--- a/src/video_core/cdma_pusher.cpp
+++ b/src/video_core/cdma_pusher.cpp
@@ -24,6 +24,7 @@
#include "command_classes/vic.h"
#include "video_core/cdma_pusher.h"
#include "video_core/command_classes/nvdec_common.h"
+#include "video_core/command_classes/sync_manager.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
diff --git a/src/video_core/cdma_pusher.h b/src/video_core/cdma_pusher.h
index 1bada44dd..87b49d6ea 100644
--- a/src/video_core/cdma_pusher.h
+++ b/src/video_core/cdma_pusher.h
@@ -9,13 +9,13 @@
#include "common/bit_field.h"
#include "common/common_types.h"
-#include "video_core/command_classes/sync_manager.h"
namespace Tegra {
class GPU;
class Host1x;
class Nvdec;
+class SyncptIncrManager;
class Vic;
enum class ChSubmissionMode : u32 {
diff --git a/src/video_core/command_classes/codecs/h264.cpp b/src/video_core/command_classes/codecs/h264.cpp
index 51ee14c13..5519c4705 100644
--- a/src/video_core/command_classes/codecs/h264.cpp
+++ b/src/video_core/command_classes/codecs/h264.cpp
@@ -20,6 +20,8 @@
#include <array>
#include <bit>
+
+#include "common/settings.h"
#include "video_core/command_classes/codecs/h264.h"
#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
@@ -96,7 +98,10 @@ const std::vector<u8>& H264::ComposeFrameHeader(const NvdecCommon::NvdecRegister
(context.h264_parameter_set.frame_mbs_only_flag ? 1 : 2);
// TODO (ameerj): Where do we get this number, it seems to be particular for each stream
- writer.WriteUe(6); // Max number of reference frames
+ const auto nvdec_decoding = Settings::values.nvdec_emulation.GetValue();
+ const bool uses_gpu_decoding = nvdec_decoding == Settings::NvdecEmulation::GPU;
+ const u32 max_num_ref_frames = uses_gpu_decoding ? 6u : 16u;
+ writer.WriteUe(max_num_ref_frames);
writer.WriteBit(false);
writer.WriteUe(context.h264_parameter_set.pic_width_in_mbs - 1);
writer.WriteUe(pic_height - 1);
diff --git a/src/video_core/command_classes/vic.cpp b/src/video_core/command_classes/vic.cpp
index 0ee07f398..051616124 100644
--- a/src/video_core/command_classes/vic.cpp
+++ b/src/video_core/command_classes/vic.cpp
@@ -16,6 +16,7 @@ extern "C" {
}
#include "common/assert.h"
+#include "common/bit_field.h"
#include "common/logging/log.h"
#include "video_core/command_classes/nvdec.h"
@@ -26,6 +27,25 @@ extern "C" {
#include "video_core/textures/decoders.h"
namespace Tegra {
+namespace {
+enum class VideoPixelFormat : u64_le {
+ RGBA8 = 0x1f,
+ BGRA8 = 0x20,
+ RGBX8 = 0x23,
+ YUV420 = 0x44,
+};
+} // Anonymous namespace
+
+union VicConfig {
+ u64_le raw{};
+ BitField<0, 7, VideoPixelFormat> pixel_format;
+ BitField<7, 2, u64_le> chroma_loc_horiz;
+ BitField<9, 2, u64_le> chroma_loc_vert;
+ BitField<11, 4, u64_le> block_linear_kind;
+ BitField<15, 4, u64_le> block_linear_height_log2;
+ BitField<32, 14, u64_le> surface_width_minus1;
+ BitField<46, 14, u64_le> surface_height_minus1;
+};
Vic::Vic(GPU& gpu_, std::shared_ptr<Nvdec> nvdec_processor_)
: gpu(gpu_),
@@ -65,134 +85,155 @@ void Vic::Execute() {
if (!frame) {
return;
}
- const auto pixel_format = static_cast<VideoPixelFormat>(config.pixel_format.Value());
- switch (pixel_format) {
+ const u64 surface_width = config.surface_width_minus1 + 1;
+ const u64 surface_height = config.surface_height_minus1 + 1;
+ if (static_cast<u64>(frame->width) != surface_width ||
+ static_cast<u64>(frame->height) != surface_height) {
+ // TODO: Properly support multiple video streams with differing frame dimensions
+ LOG_WARNING(Service_NVDRV, "Frame dimensions {}x{} don't match surface dimensions {}x{}",
+ frame->width, frame->height, surface_width, surface_height);
+ }
+ switch (config.pixel_format) {
+ case VideoPixelFormat::RGBA8:
case VideoPixelFormat::BGRA8:
- case VideoPixelFormat::RGBA8: {
- LOG_TRACE(Service_NVDRV, "Writing RGB Frame");
+ case VideoPixelFormat::RGBX8:
+ WriteRGBFrame(frame, config);
+ break;
+ case VideoPixelFormat::YUV420:
+ WriteYUVFrame(frame, config);
+ break;
+ default:
+ UNIMPLEMENTED_MSG("Unknown video pixel format {:X}", config.pixel_format.Value());
+ break;
+ }
+}
- if (scaler_ctx == nullptr || frame->width != scaler_width ||
- frame->height != scaler_height) {
- const AVPixelFormat target_format =
- (pixel_format == VideoPixelFormat::RGBA8) ? AV_PIX_FMT_RGBA : AV_PIX_FMT_BGRA;
+void Vic::WriteRGBFrame(const AVFrame* frame, const VicConfig& config) {
+ LOG_TRACE(Service_NVDRV, "Writing RGB Frame");
+
+ if (!scaler_ctx || frame->width != scaler_width || frame->height != scaler_height) {
+ const AVPixelFormat target_format = [pixel_format = config.pixel_format]() {
+ switch (pixel_format) {
+ case VideoPixelFormat::RGBA8:
+ return AV_PIX_FMT_RGBA;
+ case VideoPixelFormat::BGRA8:
+ return AV_PIX_FMT_BGRA;
+ case VideoPixelFormat::RGBX8:
+ return AV_PIX_FMT_RGB0;
+ default:
+ return AV_PIX_FMT_RGBA;
+ }
+ }();
+
+ sws_freeContext(scaler_ctx);
+ // Frames are decoded into either YUV420 or NV12 formats. Convert to desired RGB format
+ scaler_ctx = sws_getContext(frame->width, frame->height,
+ static_cast<AVPixelFormat>(frame->format), frame->width,
+ frame->height, target_format, 0, nullptr, nullptr, nullptr);
+ scaler_width = frame->width;
+ scaler_height = frame->height;
+ converted_frame_buffer.reset();
+ }
+ if (!converted_frame_buffer) {
+ const size_t frame_size = frame->width * frame->height * 4;
+ converted_frame_buffer = AVMallocPtr{static_cast<u8*>(av_malloc(frame_size)), av_free};
+ }
+ const std::array<int, 4> converted_stride{frame->width * 4, frame->height * 4, 0, 0};
+ u8* const converted_frame_buf_addr{converted_frame_buffer.get()};
+ sws_scale(scaler_ctx, frame->data, frame->linesize, 0, frame->height, &converted_frame_buf_addr,
+ converted_stride.data());
+
+ // Use the minimum of surface/frame dimensions to avoid buffer overflow.
+ const u32 surface_width = static_cast<u32>(config.surface_width_minus1) + 1;
+ const u32 surface_height = static_cast<u32>(config.surface_height_minus1) + 1;
+ const u32 width = std::min(surface_width, static_cast<u32>(frame->width));
+ const u32 height = std::min(surface_height, static_cast<u32>(frame->height));
+ const u32 blk_kind = static_cast<u32>(config.block_linear_kind);
+ if (blk_kind != 0) {
+ // swizzle pitch linear to block linear
+ const u32 block_height = static_cast<u32>(config.block_linear_height_log2);
+ const auto size = Texture::CalculateSize(true, 4, width, height, 1, block_height, 0);
+ luma_buffer.resize(size);
+ Texture::SwizzleSubrect(width, height, width * 4, width, 4, luma_buffer.data(),
+ converted_frame_buf_addr, block_height, 0, 0);
+
+ gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(), size);
+ } else {
+ // send pitch linear frame
+ const size_t linear_size = width * height * 4;
+ gpu.MemoryManager().WriteBlock(output_surface_luma_address, converted_frame_buf_addr,
+ linear_size);
+ }
+}
- sws_freeContext(scaler_ctx);
- scaler_ctx = nullptr;
+void Vic::WriteYUVFrame(const AVFrame* frame, const VicConfig& config) {
+ LOG_TRACE(Service_NVDRV, "Writing YUV420 Frame");
- // Frames are decoded into either YUV420 or NV12 formats. Convert to desired format
- scaler_ctx = sws_getContext(frame->width, frame->height,
- static_cast<AVPixelFormat>(frame->format), frame->width,
- frame->height, target_format, 0, nullptr, nullptr, nullptr);
+ const std::size_t surface_width = config.surface_width_minus1 + 1;
+ const std::size_t surface_height = config.surface_height_minus1 + 1;
+ const std::size_t aligned_width = (surface_width + 0xff) & ~0xffUL;
+ // Use the minimum of surface/frame dimensions to avoid buffer overflow.
+ const auto frame_width = std::min(surface_width, static_cast<size_t>(frame->width));
+ const auto frame_height = std::min(surface_height, static_cast<size_t>(frame->height));
- scaler_width = frame->width;
- scaler_height = frame->height;
- }
- // Get Converted frame
- const u32 width = static_cast<u32>(frame->width);
- const u32 height = static_cast<u32>(frame->height);
- const std::size_t linear_size = width * height * 4;
-
- // Only allocate frame_buffer once per stream, as the size is not expected to change
- if (!converted_frame_buffer) {
- converted_frame_buffer = AVMallocPtr{static_cast<u8*>(av_malloc(linear_size)), av_free};
+ const auto stride = static_cast<size_t>(frame->linesize[0]);
+
+ luma_buffer.resize(aligned_width * surface_height);
+ chroma_buffer.resize(aligned_width * surface_height / 2);
+
+ // Populate luma buffer
+ const u8* luma_src = frame->data[0];
+ for (std::size_t y = 0; y < frame_height; ++y) {
+ const std::size_t src = y * stride;
+ const std::size_t dst = y * aligned_width;
+ for (std::size_t x = 0; x < frame_width; ++x) {
+ luma_buffer[dst + x] = luma_src[src + x];
}
- const std::array<int, 4> converted_stride{frame->width * 4, frame->height * 4, 0, 0};
- u8* const converted_frame_buf_addr{converted_frame_buffer.get()};
-
- sws_scale(scaler_ctx, frame->data, frame->linesize, 0, frame->height,
- &converted_frame_buf_addr, converted_stride.data());
-
- const u32 blk_kind = static_cast<u32>(config.block_linear_kind);
- if (blk_kind != 0) {
- // swizzle pitch linear to block linear
- const u32 block_height = static_cast<u32>(config.block_linear_height_log2);
- const auto size =
- Tegra::Texture::CalculateSize(true, 4, width, height, 1, block_height, 0);
- luma_buffer.resize(size);
- Tegra::Texture::SwizzleSubrect(width, height, width * 4, width, 4, luma_buffer.data(),
- converted_frame_buffer.get(), block_height, 0, 0);
-
- gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(), size);
- } else {
- // send pitch linear frame
- gpu.MemoryManager().WriteBlock(output_surface_luma_address, converted_frame_buf_addr,
- linear_size);
+ }
+ gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(),
+ luma_buffer.size());
+
+ // Chroma
+ const std::size_t half_height = frame_height / 2;
+ const auto half_stride = static_cast<size_t>(frame->linesize[1]);
+
+ switch (frame->format) {
+ case AV_PIX_FMT_YUV420P: {
+ // Frame from FFmpeg software
+ // Populate chroma buffer from both channels with interleaving.
+ const std::size_t half_width = frame_width / 2;
+ const u8* chroma_b_src = frame->data[1];
+ const u8* chroma_r_src = frame->data[2];
+ for (std::size_t y = 0; y < half_height; ++y) {
+ const std::size_t src = y * half_stride;
+ const std::size_t dst = y * aligned_width;
+
+ for (std::size_t x = 0; x < half_width; ++x) {
+ chroma_buffer[dst + x * 2] = chroma_b_src[src + x];
+ chroma_buffer[dst + x * 2 + 1] = chroma_r_src[src + x];
+ }
}
break;
}
- case VideoPixelFormat::Yuv420: {
- LOG_TRACE(Service_NVDRV, "Writing YUV420 Frame");
-
- const std::size_t surface_width = config.surface_width_minus1 + 1;
- const std::size_t surface_height = config.surface_height_minus1 + 1;
- const auto frame_width = std::min(surface_width, static_cast<size_t>(frame->width));
- const auto frame_height = std::min(surface_height, static_cast<size_t>(frame->height));
- const std::size_t aligned_width = (surface_width + 0xff) & ~0xffUL;
-
- const auto stride = static_cast<size_t>(frame->linesize[0]);
-
- luma_buffer.resize(aligned_width * surface_height);
- chroma_buffer.resize(aligned_width * surface_height / 2);
-
- // Populate luma buffer
- const u8* luma_src = frame->data[0];
- for (std::size_t y = 0; y < frame_height; ++y) {
+ case AV_PIX_FMT_NV12: {
+ // Frame from VA-API hardware
+ // This is already interleaved so just copy
+ const u8* chroma_src = frame->data[1];
+ for (std::size_t y = 0; y < half_height; ++y) {
const std::size_t src = y * stride;
const std::size_t dst = y * aligned_width;
for (std::size_t x = 0; x < frame_width; ++x) {
- luma_buffer[dst + x] = luma_src[src + x];
- }
- }
- gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(),
- luma_buffer.size());
-
- // Chroma
- const std::size_t half_height = frame_height / 2;
- const auto half_stride = static_cast<size_t>(frame->linesize[1]);
-
- switch (frame->format) {
- case AV_PIX_FMT_YUV420P: {
- // Frame from FFmpeg software
- // Populate chroma buffer from both channels with interleaving.
- const std::size_t half_width = frame_width / 2;
- const u8* chroma_b_src = frame->data[1];
- const u8* chroma_r_src = frame->data[2];
- for (std::size_t y = 0; y < half_height; ++y) {
- const std::size_t src = y * half_stride;
- const std::size_t dst = y * aligned_width;
-
- for (std::size_t x = 0; x < half_width; ++x) {
- chroma_buffer[dst + x * 2] = chroma_b_src[src + x];
- chroma_buffer[dst + x * 2 + 1] = chroma_r_src[src + x];
- }
+ chroma_buffer[dst + x] = chroma_src[src + x];
}
- break;
- }
- case AV_PIX_FMT_NV12: {
- // Frame from VA-API hardware
- // This is already interleaved so just copy
- const u8* chroma_src = frame->data[1];
- for (std::size_t y = 0; y < half_height; ++y) {
- const std::size_t src = y * stride;
- const std::size_t dst = y * aligned_width;
- for (std::size_t x = 0; x < frame_width; ++x) {
- chroma_buffer[dst + x] = chroma_src[src + x];
- }
- }
- break;
- }
- default:
- UNREACHABLE();
- break;
}
- gpu.MemoryManager().WriteBlock(output_surface_chroma_address, chroma_buffer.data(),
- chroma_buffer.size());
break;
}
default:
- UNIMPLEMENTED_MSG("Unknown video pixel format {}", config.pixel_format.Value());
+ UNREACHABLE();
break;
}
+ gpu.MemoryManager().WriteBlock(output_surface_chroma_address, chroma_buffer.data(),
+ chroma_buffer.size());
}
} // namespace Tegra
diff --git a/src/video_core/command_classes/vic.h b/src/video_core/command_classes/vic.h
index 74246e08c..6d4cdfd57 100644
--- a/src/video_core/command_classes/vic.h
+++ b/src/video_core/command_classes/vic.h
@@ -6,7 +6,6 @@
#include <memory>
#include <vector>
-#include "common/bit_field.h"
#include "common/common_types.h"
struct SwsContext;
@@ -14,6 +13,7 @@ struct SwsContext;
namespace Tegra {
class GPU;
class Nvdec;
+union VicConfig;
class Vic {
public:
@@ -27,6 +27,7 @@ public:
};
explicit Vic(GPU& gpu, std::shared_ptr<Nvdec> nvdec_processor);
+
~Vic();
/// Write to the device state.
@@ -35,22 +36,9 @@ public:
private:
void Execute();
- enum class VideoPixelFormat : u64_le {
- RGBA8 = 0x1f,
- BGRA8 = 0x20,
- Yuv420 = 0x44,
- };
+ void WriteRGBFrame(const AVFrame* frame, const VicConfig& config);
- union VicConfig {
- u64_le raw{};
- BitField<0, 7, u64_le> pixel_format;
- BitField<7, 2, u64_le> chroma_loc_horiz;
- BitField<9, 2, u64_le> chroma_loc_vert;
- BitField<11, 4, u64_le> block_linear_kind;
- BitField<15, 4, u64_le> block_linear_height_log2;
- BitField<32, 14, u64_le> surface_width_minus1;
- BitField<46, 14, u64_le> surface_height_minus1;
- };
+ void WriteYUVFrame(const AVFrame* frame, const VicConfig& config);
GPU& gpu;
std::shared_ptr<Tegra::Nvdec> nvdec_processor;
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 7f4ca6282..f22342dfb 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -6,6 +6,7 @@
#include <array>
#include <bitset>
+#include <cmath>
#include <limits>
#include <optional>
#include <type_traits>
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index c7ec1eac9..67388d980 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -82,41 +82,41 @@ void MaxwellDMA::Launch() {
}
void MaxwellDMA::CopyPitchToPitch() {
- // When `multi_line_enable` bit is disabled the copy is performed as if we were copying a 1D
- // buffer of length `line_length_in`.
- // Otherwise we copy a 2D image of dimensions (line_length_in, line_count).
- auto& accelerate = rasterizer->AccessAccelerateDMA();
- if (!regs.launch_dma.multi_line_enable) {
- const bool is_buffer_clear = regs.launch_dma.remap_enable != 0 &&
- regs.remap_const.dst_x == RemapConst::Swizzle::CONST_A;
- // TODO: allow multisized components.
- if (is_buffer_clear) {
- ASSERT(regs.remap_const.component_size_minus_one == 3);
- accelerate.BufferClear(regs.offset_out, regs.line_length_in, regs.remap_consta_value);
- std::vector<u32> tmp_buffer(regs.line_length_in, regs.remap_consta_value);
- memory_manager.WriteBlockUnsafe(regs.offset_out,
- reinterpret_cast<u8*>(tmp_buffer.data()),
- regs.line_length_in * sizeof(u32));
- return;
- }
- UNIMPLEMENTED_IF(regs.launch_dma.remap_enable != 0);
- if (!accelerate.BufferCopy(regs.offset_in, regs.offset_out, regs.line_length_in)) {
- std::vector<u8> tmp_buffer(regs.line_length_in);
- memory_manager.ReadBlockUnsafe(regs.offset_in, tmp_buffer.data(), regs.line_length_in);
- memory_manager.WriteBlock(regs.offset_out, tmp_buffer.data(), regs.line_length_in);
+ // When `multi_line_enable` bit is enabled we copy a 2D image of dimensions
+ // (line_length_in, line_count).
+ // Otherwise the copy is performed as if we were copying a 1D buffer of length line_length_in.
+ const bool remap_enabled = regs.launch_dma.remap_enable != 0;
+ if (regs.launch_dma.multi_line_enable) {
+ UNIMPLEMENTED_IF(remap_enabled);
+
+ // Perform a line-by-line copy.
+ // We're going to take a subrect of size (line_length_in, line_count) from the source
+ // rectangle. There is no need to manually flush/invalidate the regions because CopyBlock
+ // does that for us.
+ for (u32 line = 0; line < regs.line_count; ++line) {
+ const GPUVAddr source_line = regs.offset_in + static_cast<size_t>(line) * regs.pitch_in;
+ const GPUVAddr dest_line = regs.offset_out + static_cast<size_t>(line) * regs.pitch_out;
+ memory_manager.CopyBlock(dest_line, source_line, regs.line_length_in);
}
return;
}
-
- UNIMPLEMENTED_IF(regs.launch_dma.remap_enable != 0);
-
- // Perform a line-by-line copy.
- // We're going to take a subrect of size (line_length_in, line_count) from the source rectangle.
- // There is no need to manually flush/invalidate the regions because CopyBlock does that for us.
- for (u32 line = 0; line < regs.line_count; ++line) {
- const GPUVAddr source_line = regs.offset_in + static_cast<size_t>(line) * regs.pitch_in;
- const GPUVAddr dest_line = regs.offset_out + static_cast<size_t>(line) * regs.pitch_out;
- memory_manager.CopyBlock(dest_line, source_line, regs.line_length_in);
+ // TODO: allow multisized components.
+ auto& accelerate = rasterizer->AccessAccelerateDMA();
+ const bool is_const_a_dst = regs.remap_const.dst_x == RemapConst::Swizzle::CONST_A;
+ const bool is_buffer_clear = remap_enabled && is_const_a_dst;
+ if (is_buffer_clear) {
+ ASSERT(regs.remap_const.component_size_minus_one == 3);
+ accelerate.BufferClear(regs.offset_out, regs.line_length_in, regs.remap_consta_value);
+ std::vector<u32> tmp_buffer(regs.line_length_in, regs.remap_consta_value);
+ memory_manager.WriteBlockUnsafe(regs.offset_out, reinterpret_cast<u8*>(tmp_buffer.data()),
+ regs.line_length_in * sizeof(u32));
+ return;
+ }
+ UNIMPLEMENTED_IF(remap_enabled);
+ if (!accelerate.BufferCopy(regs.offset_in, regs.offset_out, regs.line_length_in)) {
+ std::vector<u8> tmp_buffer(regs.line_length_in);
+ memory_manager.ReadBlockUnsafe(regs.offset_in, tmp_buffer.data(), regs.line_length_in);
+ memory_manager.WriteBlock(regs.offset_out, tmp_buffer.data(), regs.line_length_in);
}
}
diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h
index 9e457ae16..a04514425 100644
--- a/src/video_core/engines/maxwell_dma.h
+++ b/src/video_core/engines/maxwell_dma.h
@@ -175,7 +175,7 @@ public:
static_assert(sizeof(LaunchDMA) == 4);
struct RemapConst {
- enum Swizzle : u32 {
+ enum class Swizzle : u32 {
SRC_X = 0,
SRC_Y = 1,
SRC_Z = 2,
diff --git a/src/video_core/framebuffer_config.h b/src/video_core/framebuffer_config.h
index b86c3a757..b1d455e30 100644
--- a/src/video_core/framebuffer_config.h
+++ b/src/video_core/framebuffer_config.h
@@ -4,8 +4,10 @@
#pragma once
-namespace Tegra {
+#include "common/common_types.h"
+#include "common/math_util.h"
+namespace Tegra {
/**
* Struct describing framebuffer configuration
*/
@@ -16,6 +18,21 @@ struct FramebufferConfig {
B8G8R8A8_UNORM = 5,
};
+ enum class TransformFlags : u32 {
+ /// No transform flags are set
+ Unset = 0x00,
+ /// Flip source image horizontally (around the vertical axis)
+ FlipH = 0x01,
+ /// Flip source image vertically (around the horizontal axis)
+ FlipV = 0x02,
+ /// Rotate source image 90 degrees clockwise
+ Rotate90 = 0x04,
+ /// Rotate source image 180 degrees
+ Rotate180 = 0x03,
+ /// Rotate source image 270 degrees clockwise
+ Rotate270 = 0x07,
+ };
+
VAddr address{};
u32 offset{};
u32 width{};
@@ -23,7 +40,6 @@ struct FramebufferConfig {
u32 stride{};
PixelFormat pixel_format{};
- using TransformFlags = Service::NVFlinger::BufferQueue::BufferTransformFlags;
TransformFlags transform_flags{};
Common::Rectangle<int> crop_rect;
};
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index ff024f530..ab7c21a49 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -2,548 +2,913 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
+#include <array>
+#include <atomic>
#include <chrono>
+#include <condition_variable>
+#include <list>
+#include <memory>
#include "common/assert.h"
#include "common/microprofile.h"
#include "common/settings.h"
#include "core/core.h"
#include "core/core_timing.h"
-#include "core/core_timing_util.h"
#include "core/frontend/emu_window.h"
#include "core/hardware_interrupt_manager.h"
-#include "core/memory.h"
+#include "core/hle/service/nvdrv/nvdata.h"
+#include "core/hle/service/nvflinger/buffer_queue.h"
#include "core/perf_stats.h"
+#include "video_core/cdma_pusher.h"
+#include "video_core/dma_pusher.h"
#include "video_core/engines/fermi_2d.h"
#include "video_core/engines/kepler_compute.h"
#include "video_core/engines/kepler_memory.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/engines/maxwell_dma.h"
#include "video_core/gpu.h"
+#include "video_core/gpu_thread.h"
#include "video_core/memory_manager.h"
#include "video_core/renderer_base.h"
#include "video_core/shader_notify.h"
-#include "video_core/video_core.h"
namespace Tegra {
MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192));
-GPU::GPU(Core::System& system_, bool is_async_, bool use_nvdec_)
- : system{system_}, memory_manager{std::make_unique<Tegra::MemoryManager>(system)},
- dma_pusher{std::make_unique<Tegra::DmaPusher>(system, *this)}, use_nvdec{use_nvdec_},
- maxwell_3d{std::make_unique<Engines::Maxwell3D>(system, *memory_manager)},
- fermi_2d{std::make_unique<Engines::Fermi2D>()},
- kepler_compute{std::make_unique<Engines::KeplerCompute>(system, *memory_manager)},
- maxwell_dma{std::make_unique<Engines::MaxwellDMA>(system, *memory_manager)},
- kepler_memory{std::make_unique<Engines::KeplerMemory>(system, *memory_manager)},
- shader_notify{std::make_unique<VideoCore::ShaderNotify>()}, is_async{is_async_},
- gpu_thread{system_, is_async_} {}
+struct GPU::Impl {
+ explicit Impl(GPU& gpu_, Core::System& system_, bool is_async_, bool use_nvdec_)
+ : gpu{gpu_}, system{system_}, memory_manager{std::make_unique<Tegra::MemoryManager>(
+ system)},
+ dma_pusher{std::make_unique<Tegra::DmaPusher>(system, gpu)}, use_nvdec{use_nvdec_},
+ maxwell_3d{std::make_unique<Engines::Maxwell3D>(system, *memory_manager)},
+ fermi_2d{std::make_unique<Engines::Fermi2D>()},
+ kepler_compute{std::make_unique<Engines::KeplerCompute>(system, *memory_manager)},
+ maxwell_dma{std::make_unique<Engines::MaxwellDMA>(system, *memory_manager)},
+ kepler_memory{std::make_unique<Engines::KeplerMemory>(system, *memory_manager)},
+ shader_notify{std::make_unique<VideoCore::ShaderNotify>()}, is_async{is_async_},
+ gpu_thread{system_, is_async_} {}
+
+ ~Impl() = default;
+
+ /// Binds a renderer to the GPU.
+ void BindRenderer(std::unique_ptr<VideoCore::RendererBase> renderer_) {
+ renderer = std::move(renderer_);
+ rasterizer = renderer->ReadRasterizer();
+
+ memory_manager->BindRasterizer(rasterizer);
+ maxwell_3d->BindRasterizer(rasterizer);
+ fermi_2d->BindRasterizer(rasterizer);
+ kepler_compute->BindRasterizer(rasterizer);
+ maxwell_dma->BindRasterizer(rasterizer);
+ }
-GPU::~GPU() = default;
+ /// Calls a GPU method.
+ void CallMethod(const GPU::MethodCall& method_call) {
+ LOG_TRACE(HW_GPU, "Processing method {:08X} on subchannel {}", method_call.method,
+ method_call.subchannel);
+
+ ASSERT(method_call.subchannel < bound_engines.size());
+
+ if (ExecuteMethodOnEngine(method_call.method)) {
+ CallEngineMethod(method_call);
+ } else {
+ CallPullerMethod(method_call);
+ }
+ }
+
+ /// Calls a GPU multivalue method.
+ void CallMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount,
+ u32 methods_pending) {
+ LOG_TRACE(HW_GPU, "Processing method {:08X} on subchannel {}", method, subchannel);
+
+ ASSERT(subchannel < bound_engines.size());
+
+ if (ExecuteMethodOnEngine(method)) {
+ CallEngineMultiMethod(method, subchannel, base_start, amount, methods_pending);
+ } else {
+ for (std::size_t i = 0; i < amount; i++) {
+ CallPullerMethod(GPU::MethodCall{
+ method,
+ base_start[i],
+ subchannel,
+ methods_pending - static_cast<u32>(i),
+ });
+ }
+ }
+ }
+
+ /// Flush all currently written commands into the host GPU for execution.
+ void FlushCommands() {
+ rasterizer->FlushCommands();
+ }
+
+ /// Synchronizes CPU writes with Host GPU memory.
+ void SyncGuestHost() {
+ rasterizer->SyncGuestHost();
+ }
+
+ /// Signal the end of the command list.
+ void OnCommandListEnd() {
+ if (is_async) {
+ // This command only applies to asynchronous GPU mode
+ gpu_thread.OnCommandListEnd();
+ }
+ }
+
+ /// Request a host GPU memory flush from the CPU.
+ [[nodiscard]] u64 RequestFlush(VAddr addr, std::size_t size) {
+ std::unique_lock lck{flush_request_mutex};
+ const u64 fence = ++last_flush_fence;
+ flush_requests.emplace_back(fence, addr, size);
+ return fence;
+ }
+
+ /// Obtains current flush request fence id.
+ [[nodiscard]] u64 CurrentFlushRequestFence() const {
+ return current_flush_fence.load(std::memory_order_relaxed);
+ }
+
+ /// Tick pending requests within the GPU.
+ void TickWork() {
+ std::unique_lock lck{flush_request_mutex};
+ while (!flush_requests.empty()) {
+ auto& request = flush_requests.front();
+ const u64 fence = request.fence;
+ const VAddr addr = request.addr;
+ const std::size_t size = request.size;
+ flush_requests.pop_front();
+ flush_request_mutex.unlock();
+ rasterizer->FlushRegion(addr, size);
+ current_flush_fence.store(fence);
+ flush_request_mutex.lock();
+ }
+ }
+
+ /// Returns a reference to the Maxwell3D GPU engine.
+ [[nodiscard]] Engines::Maxwell3D& Maxwell3D() {
+ return *maxwell_3d;
+ }
+
+ /// Returns a const reference to the Maxwell3D GPU engine.
+ [[nodiscard]] const Engines::Maxwell3D& Maxwell3D() const {
+ return *maxwell_3d;
+ }
+
+ /// Returns a reference to the KeplerCompute GPU engine.
+ [[nodiscard]] Engines::KeplerCompute& KeplerCompute() {
+ return *kepler_compute;
+ }
+
+ /// Returns a const reference to the KeplerCompute GPU engine.
+ [[nodiscard]] const Engines::KeplerCompute& KeplerCompute() const {
+ return *kepler_compute;
+ }
+
+ /// Returns a reference to the GPU memory manager.
+ [[nodiscard]] Tegra::MemoryManager& MemoryManager() {
+ return *memory_manager;
+ }
+
+ /// Returns a const reference to the GPU memory manager.
+ [[nodiscard]] const Tegra::MemoryManager& MemoryManager() const {
+ return *memory_manager;
+ }
+
+ /// Returns a reference to the GPU DMA pusher.
+ [[nodiscard]] Tegra::DmaPusher& DmaPusher() {
+ return *dma_pusher;
+ }
+
+ /// Returns a const reference to the GPU DMA pusher.
+ [[nodiscard]] const Tegra::DmaPusher& DmaPusher() const {
+ return *dma_pusher;
+ }
+
+ /// Returns a reference to the GPU CDMA pusher.
+ [[nodiscard]] Tegra::CDmaPusher& CDmaPusher() {
+ return *cdma_pusher;
+ }
+
+ /// Returns a const reference to the GPU CDMA pusher.
+ [[nodiscard]] const Tegra::CDmaPusher& CDmaPusher() const {
+ return *cdma_pusher;
+ }
+
+ /// Returns a reference to the underlying renderer.
+ [[nodiscard]] VideoCore::RendererBase& Renderer() {
+ return *renderer;
+ }
+
+ /// Returns a const reference to the underlying renderer.
+ [[nodiscard]] const VideoCore::RendererBase& Renderer() const {
+ return *renderer;
+ }
+
+ /// Returns a reference to the shader notifier.
+ [[nodiscard]] VideoCore::ShaderNotify& ShaderNotify() {
+ return *shader_notify;
+ }
+
+ /// Returns a const reference to the shader notifier.
+ [[nodiscard]] const VideoCore::ShaderNotify& ShaderNotify() const {
+ return *shader_notify;
+ }
+
+ /// Allows the CPU/NvFlinger to wait on the GPU before presenting a frame.
+ void WaitFence(u32 syncpoint_id, u32 value) {
+ // A synchronous GPU is always in sync
+ if (!is_async) {
+ return;
+ }
+ if (syncpoint_id == UINT32_MAX) {
+ // TODO: Research what this does.
+ LOG_ERROR(HW_GPU, "Waiting for syncpoint -1 not implemented");
+ return;
+ }
+ MICROPROFILE_SCOPE(GPU_wait);
+ std::unique_lock lock{sync_mutex};
+ sync_cv.wait(lock, [=, this] {
+ if (shutting_down.load(std::memory_order_relaxed)) {
+ // We're shutting down, ensure no threads continue to wait for the next syncpoint
+ return true;
+ }
+ return syncpoints.at(syncpoint_id).load() >= value;
+ });
+ }
+
+ /// Increments the given syncpoint, wakes every thread blocked on sync_cv, and fires
+ /// (then removes) each registered interrupt whose threshold has now been reached.
+ void IncrementSyncPoint(u32 syncpoint_id) {
+     auto& counter = syncpoints.at(syncpoint_id);
+     counter.fetch_add(1);
+     std::lock_guard guard{sync_mutex};
+     sync_cv.notify_all();
+     auto& pending = syncpt_interrupts.at(syncpoint_id);
+     if (pending.empty()) {
+         return;
+     }
+     const u32 reached = counter.load();
+     for (auto it = pending.begin(); it != pending.end();) {
+         if (reached >= *it) {
+             TriggerCpuInterrupt(syncpoint_id, *it);
+             // erase() returns the iterator following the removed threshold
+             it = pending.erase(it);
+         } else {
+             ++it;
+         }
+     }
+ }
+
+ [[nodiscard]] u32 GetSyncpointValue(u32 syncpoint_id) const {
+ return syncpoints.at(syncpoint_id).load(); // atomic read of the counter; std::array::at throws on an out-of-range id
+ }
+
+ /// Registers `value` as an interrupt threshold for the given syncpoint.
+ /// Does nothing if that threshold is already registered. Holds sync_mutex throughout.
+ void RegisterSyncptInterrupt(u32 syncpoint_id, u32 value) {
+     std::lock_guard guard{sync_mutex};
+     auto& thresholds = syncpt_interrupts.at(syncpoint_id);
+     const bool already_registered =
+         std::find(thresholds.begin(), thresholds.end(), value) != thresholds.end();
+     if (!already_registered) {
+         thresholds.emplace_back(value);
+     }
+ }
+
+ /// Removes the first registered interrupt threshold equal to `value` for the syncpoint.
+ /// Returns true if one was found and removed, false otherwise. Holds sync_mutex throughout.
+ [[nodiscard]] bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value) {
+     std::lock_guard guard{sync_mutex};
+     auto& thresholds = syncpt_interrupts.at(syncpoint_id);
+     const auto match = std::find(thresholds.begin(), thresholds.end(), value);
+     const bool found = match != thresholds.end();
+     if (found) {
+         thresholds.erase(match);
+     }
+     return found;
+ }
+
+ [[nodiscard]] u64 GetTicks() const {
+ // These values were reverse engineered by fincs from NVN
+ // GPU ticks are computed as nanoseconds * 384 / 625 (the constants below)
+ constexpr u64 gpu_ticks_num = 384;
+ constexpr u64 gpu_ticks_den = 625;
+
+ u64 nanoseconds = system.CoreTiming().GetGlobalTimeNs().count();
+ if (Settings::values.use_fast_gpu_time.GetValue()) {
+ nanoseconds /= 256; // "fast GPU time" setting: scale the elapsed time down by 256
+ }
+ const u64 nanoseconds_num = nanoseconds / gpu_ticks_den; // whole multiples of the denominator
+ const u64 nanoseconds_rem = nanoseconds % gpu_ticks_den; // remainder below the denominator
+ return nanoseconds_num * gpu_ticks_num + (nanoseconds_rem * gpu_ticks_num) / gpu_ticks_den; // quotient and remainder are scaled separately so the intermediate products stay small
+ }
+
+ [[nodiscard]] bool IsAsync() const {
+ return is_async; // const member, so the value is fixed at construction
+ }
+
+ [[nodiscard]] bool UseNvdec() const {
+ return use_nvdec; // const member; PushCommandBuffer is a no-op when this is false
+ }
+
+ void RendererFrameEndNotify() {
+ system.GetPerfStats().EndGameFrame(); // reports the end of a game frame to the system's perf stats
+ }
+
+ /// Performs any additional setup necessary in order to begin GPU emulation.
+ /// This can be used to launch any necessary threads and register any necessary
+ /// core timing events.
+ void Start() {
+ gpu_thread.StartThread(*renderer, renderer->Context(), *dma_pusher); // hands the renderer, its context and the DMA pusher to the GPU thread manager
+ cpu_context = renderer->GetRenderWindow().CreateSharedContext(); // shared graphics context obtained from the render window
+ cpu_context->MakeCurrent(); // the new shared context is made current right away
+ }
+
+ /// Obtain the CPU Context (makes cpu_context, created in Start, current)
+ void ObtainContext() {
+ cpu_context->MakeCurrent();
+ }
+
+ /// Release the CPU Context (calls DoneCurrent on cpu_context, created in Start)
+ void ReleaseContext() {
+ cpu_context->DoneCurrent();
+ }
+
+ /// Push GPU command entries to be processed; ownership of the list is moved to the GPU thread manager
+ void PushGPUEntries(Tegra::CommandList&& entries) {
+ gpu_thread.SubmitList(std::move(entries));
+ }
+
+ /// Push GPU command buffer entries to be processed (ignored entirely when nvdec is disabled)
+ void PushCommandBuffer(Tegra::ChCommandHeaderList& entries) {
+ if (!use_nvdec) {
+ return;
+ }
+
+ if (!cdma_pusher) {
+ cdma_pusher = std::make_unique<Tegra::CDmaPusher>(gpu); // created lazily on first use; released again by ClearCdmaInstance
+ }
+
+ // SubmitCommandBuffer would make the nvdec operations async, this is not currently working
+ // TODO(ameerj): RE proper async nvdec operation
+ // gpu_thread.SubmitCommandBuffer(std::move(entries));
+
+ cdma_pusher->ProcessEntries(std::move(entries)); // NOTE(review): entries is an lvalue reference, so the caller's list may be left moved-from - confirm callers do not reuse it
+ }
+
+ /// Destroys the CDmaPusher instance to free up resources; PushCommandBuffer creates a new one on demand
+ void ClearCdmaInstance() {
+ cdma_pusher.reset();
+ }
+
+ /// Swap buffers (render frame); the request is handed to the GPU thread manager
+ void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
+ gpu_thread.SwapBuffers(framebuffer);
+ }
+
+ /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
+ void FlushRegion(VAddr addr, u64 size) {
+ gpu_thread.FlushRegion(addr, size); // routed through the GPU thread manager rather than calling the rasterizer here
+ }
+
+ /// Notify rasterizer that any caches of the specified region should be invalidated
+ void InvalidateRegion(VAddr addr, u64 size) {
+ gpu_thread.InvalidateRegion(addr, size); // routed through the GPU thread manager rather than calling the rasterizer here
+ }
+
+ /// Notify rasterizer that any caches of the specified region should be flushed and invalidated
+ void FlushAndInvalidateRegion(VAddr addr, u64 size) {
+ gpu_thread.FlushAndInvalidateRegion(addr, size); // routed through the GPU thread manager rather than calling the rasterizer here
+ }
+
+ /// Raises a GPU syncpoint interrupt for (syncpoint_id, value) through the system's
+ /// interrupt manager.
+ void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const {
+     system.InterruptManager().GPUInterruptSyncpt(syncpoint_id, value);
+ }
+
+ /// Binds the subchannel named in `method_call` to the engine id carried in its argument:
+ /// the id is recorded in bound_engines and the matching engine object is registered
+ /// with the DMA pusher. Unknown ids hit UNIMPLEMENTED_MSG.
+ void ProcessBindMethod(const GPU::MethodCall& method_call) {
+     // Bind the current subchannel to the desired engine id.
+     LOG_DEBUG(HW_GPU, "Binding subchannel {} to engine {}", method_call.subchannel,
+               method_call.argument);
+     const auto engine_id = static_cast<EngineID>(method_call.argument);
+     bound_engines[method_call.subchannel] = engine_id;
+
+     // Registers one engine object with the DMA pusher for this subchannel.
+     const auto bind = [this, &method_call](auto* engine) {
+         dma_pusher->BindSubchannel(engine, method_call.subchannel);
+     };
+
+     switch (engine_id) {
+     case EngineID::FERMI_TWOD_A:
+         bind(fermi_2d.get());
+         break;
+     case EngineID::MAXWELL_B:
+         bind(maxwell_3d.get());
+         break;
+     case EngineID::KEPLER_COMPUTE_B:
+         bind(kepler_compute.get());
+         break;
+     case EngineID::MAXWELL_DMA_COPY_A:
+         bind(maxwell_dma.get());
+         break;
+     case EngineID::KEPLER_INLINE_TO_MEMORY_B:
+         bind(kepler_memory.get());
+         break;
+     default:
+         UNIMPLEMENTED_MSG("Unimplemented engine {:04X}", engine_id);
+     }
+ }
-void GPU::BindRenderer(std::unique_ptr<VideoCore::RendererBase> renderer_) {
- renderer = std::move(renderer_);
- rasterizer = renderer->ReadRasterizer();
+ void ProcessFenceActionMethod() {
+ switch (regs.fence_action.op) { // operation comes from the fence_action register written by CallPullerMethod
+ case GPU::FenceOperation::Acquire:
+ WaitFence(regs.fence_action.syncpoint_id, regs.fence_value); // waits until the syncpoint reaches the value in the fence_value register
+ break;
+ case GPU::FenceOperation::Increment:
+ IncrementSyncPoint(regs.fence_action.syncpoint_id);
+ break;
+ default:
+ UNIMPLEMENTED_MSG("Unimplemented operation {}", regs.fence_action.op.Value());
+ }
+ }
+
+ void ProcessWaitForInterruptMethod() {
+ // TODO(bunnei) ImplementMe
+ LOG_WARNING(HW_GPU, "(STUBBED) called"); // stub: only logs, no waiting is performed
+ }
+
+ /// Handles the SemaphoreTrigger puller method.
+ /// The low four bits of regs.semaphore_trigger select the operation:
+ ///  - WriteLong writes a {sequence, 0, timestamp} block to the semaphore address.
+ ///  - The Acquire* operations read the 32-bit word at the semaphore address; when the
+ ///    acquire condition is not met yet, the puller acquire_* registers are armed.
+ void ProcessSemaphoreTriggerMethod() {
+     const auto semaphoreOperationMask = 0xF;
+     const auto op =
+         static_cast<GpuSemaphoreOperation>(regs.semaphore_trigger & semaphoreOperationMask);
+     if (op == GpuSemaphoreOperation::WriteLong) {
+         struct Block {
+             u32 sequence;
+             u32 zeros = 0;
+             u64 timestamp;
+         };
+
+         Block block{};
+         block.sequence = regs.semaphore_sequence;
+         // TODO(Kmather73): Generate a real GPU timestamp and write it here instead of
+         // CoreTiming
+         block.timestamp = GetTicks();
+         memory_manager->WriteBlock(regs.semaphore_address.SemaphoreAddress(), &block,
+                                    sizeof(block));
+     } else {
+         const u32 word{memory_manager->Read<u32>(regs.semaphore_address.SemaphoreAddress())};
+         // The signed difference keeps the ordering test meaningful across u32 wrap-around.
+         // "Greater or equal" must accept the equal case, hence >= 0 rather than > 0.
+         if ((op == GpuSemaphoreOperation::AcquireEqual && word == regs.semaphore_sequence) ||
+             (op == GpuSemaphoreOperation::AcquireGequal &&
+              static_cast<s32>(word - regs.semaphore_sequence) >= 0) ||
+             (op == GpuSemaphoreOperation::AcquireMask && (word & regs.semaphore_sequence))) {
+             // Nothing to do in this case
+         } else {
+             regs.acquire_source = true;
+             regs.acquire_value = regs.semaphore_sequence;
+             if (op == GpuSemaphoreOperation::AcquireEqual) {
+                 regs.acquire_active = true;
+                 regs.acquire_mode = false;
+             } else if (op == GpuSemaphoreOperation::AcquireGequal) {
+                 regs.acquire_active = true;
+                 regs.acquire_mode = true;
+             } else if (op == GpuSemaphoreOperation::AcquireMask) {
+                 // TODO(kemathe) The acquire mask operation waits for a value that, ANDed with
+                 // semaphore_sequence, gives a non-0 result
+                 LOG_ERROR(HW_GPU, "Invalid semaphore operation AcquireMask not implemented");
+             } else {
+                 LOG_ERROR(HW_GPU, "Invalid semaphore operation");
+             }
+         }
+     }
+ }
+
+ void ProcessSemaphoreRelease() {
+ memory_manager->Write<u32>(regs.semaphore_address.SemaphoreAddress(), // 64-bit GPU address assembled from the high/low address registers
+ regs.semaphore_release); // value of the semaphore_release register is stored as a 32-bit word
+ }
+
+ /// Handles the SemaphoreAcquire puller method: compares the 32-bit word at the
+ /// semaphore address with regs.semaphore_acquire and, when they differ, arms the
+ /// puller acquire state with that value.
+ void ProcessSemaphoreAcquire() {
+     const u32 current = memory_manager->Read<u32>(regs.semaphore_address.SemaphoreAddress());
+     const auto expected = regs.semaphore_acquire;
+     if (current == expected) {
+         return;
+     }
+     regs.acquire_active = true;
+     regs.acquire_value = expected;
+     // TODO(kemathe73) figure out how to do the acquire_timeout
+     regs.acquire_mode = false;
+     regs.acquire_source = false;
+ }
- memory_manager->BindRasterizer(rasterizer);
- maxwell_3d->BindRasterizer(rasterizer);
- fermi_2d->BindRasterizer(rasterizer);
- kepler_compute->BindRasterizer(rasterizer);
- maxwell_dma->BindRasterizer(rasterizer);
+ /// Calls a GPU puller method: stores the argument in the puller register file, then dispatches on the method id.
+ void CallPullerMethod(const GPU::MethodCall& method_call) {
+ regs.reg_array[method_call.method] = method_call.argument; // written first: the Process* handlers below read their inputs back from regs
+ const auto method = static_cast<BufferMethods>(method_call.method);
+
+ switch (method) {
+ case BufferMethods::BindObject: {
+ ProcessBindMethod(method_call);
+ break;
+ }
+ case BufferMethods::Nop:
+ case BufferMethods::SemaphoreAddressHigh:
+ case BufferMethods::SemaphoreAddressLow:
+ case BufferMethods::SemaphoreSequence:
+ case BufferMethods::UnkCacheFlush:
+ case BufferMethods::WrcacheFlush:
+ case BufferMethods::FenceValue:
+ break; // these only latch their argument into regs (done above)
+ case BufferMethods::RefCnt:
+ rasterizer->SignalReference();
+ break;
+ case BufferMethods::FenceAction:
+ ProcessFenceActionMethod();
+ break;
+ case BufferMethods::WaitForInterrupt:
+ ProcessWaitForInterruptMethod();
+ break;
+ case BufferMethods::SemaphoreTrigger: {
+ ProcessSemaphoreTriggerMethod();
+ break;
+ }
+ case BufferMethods::NotifyIntr: {
+ // TODO(Kmather73): Research and implement this method.
+ LOG_ERROR(HW_GPU, "Special puller engine method NotifyIntr not implemented");
+ break;
+ }
+ case BufferMethods::Unk28: {
+ // TODO(Kmather73): Research and implement this method.
+ LOG_ERROR(HW_GPU, "Special puller engine method Unk28 not implemented");
+ break;
+ }
+ case BufferMethods::SemaphoreAcquire: {
+ ProcessSemaphoreAcquire();
+ break;
+ }
+ case BufferMethods::SemaphoreRelease: {
+ ProcessSemaphoreRelease();
+ break;
+ }
+ case BufferMethods::Yield: {
+ // TODO(Kmather73): Research and implement this method.
+ LOG_ERROR(HW_GPU, "Special puller engine method Yield not implemented");
+ break;
+ }
+ default:
+ LOG_ERROR(HW_GPU, "Special puller engine method {:X} not implemented", method);
+ break;
+ }
+ }
+
+ /// Calls a GPU engine method.
+ /// Looks up the engine bound to the call's subchannel and forwards the method id,
+ /// argument and last-call flag to it. Unbound/unknown engines hit UNIMPLEMENTED_MSG.
+ void CallEngineMethod(const GPU::MethodCall& method_call) {
+     // Forwards the call to whichever concrete engine is selected below.
+     const auto forward = [&method_call](auto& engine) {
+         engine.CallMethod(method_call.method, method_call.argument, method_call.IsLastCall());
+     };
+
+     switch (bound_engines[method_call.subchannel]) {
+     case EngineID::FERMI_TWOD_A:
+         forward(*fermi_2d);
+         break;
+     case EngineID::MAXWELL_B:
+         forward(*maxwell_3d);
+         break;
+     case EngineID::KEPLER_COMPUTE_B:
+         forward(*kepler_compute);
+         break;
+     case EngineID::MAXWELL_DMA_COPY_A:
+         forward(*maxwell_dma);
+         break;
+     case EngineID::KEPLER_INLINE_TO_MEMORY_B:
+         forward(*kepler_memory);
+         break;
+     default:
+         UNIMPLEMENTED_MSG("Unimplemented engine");
+     }
+ }
+
+ /// Calls a GPU engine multivalue method.
+ /// Looks up the engine bound to `subchannel` and forwards the whole argument run
+ /// (`amount` words starting at `base_start`) to it in one call.
+ void CallEngineMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount,
+                            u32 methods_pending) {
+     // Forwards the argument run to whichever concrete engine is selected below.
+     const auto forward = [method, base_start, amount, methods_pending](auto& engine) {
+         engine.CallMultiMethod(method, base_start, amount, methods_pending);
+     };
+
+     switch (bound_engines[subchannel]) {
+     case EngineID::FERMI_TWOD_A:
+         forward(*fermi_2d);
+         break;
+     case EngineID::MAXWELL_B:
+         forward(*maxwell_3d);
+         break;
+     case EngineID::KEPLER_COMPUTE_B:
+         forward(*kepler_compute);
+         break;
+     case EngineID::MAXWELL_DMA_COPY_A:
+         forward(*maxwell_dma);
+         break;
+     case EngineID::KEPLER_INLINE_TO_MEMORY_B:
+         forward(*kepler_memory);
+         break;
+     default:
+         UNIMPLEMENTED_MSG("Unimplemented engine");
+     }
+ }
+
+ /// Determines where the method should be executed.
+ /// Returns true when `method` is at or above BufferMethods::NonPullerMethods (the call
+ /// goes to a bound engine) and false when the puller handles it.
+ [[nodiscard]] bool ExecuteMethodOnEngine(u32 method) {
+     return static_cast<BufferMethods>(method) >= BufferMethods::NonPullerMethods;
+ }
+
+ struct Regs { // puller register file; word offsets are pinned by the ASSERT_REG_POSITION checks later in the class
+ static constexpr size_t NUM_REGS = 0x40; // number of words exposed through reg_array
+
+ union {
+ struct {
+ INSERT_PADDING_WORDS_NOINIT(0x4);
+ struct {
+ u32 address_high;
+ u32 address_low;
+
+ [[nodiscard]] GPUVAddr SemaphoreAddress() const { // combines high:low into one 64-bit GPU virtual address
+ return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
+ address_low);
+ }
+ } semaphore_address; // word 0x4
+
+ u32 semaphore_sequence; // word 0x6
+ u32 semaphore_trigger; // word 0x7; low 4 bits select a GpuSemaphoreOperation
+ INSERT_PADDING_WORDS_NOINIT(0xC);
+
+ // The pusher and the puller share the reference counter, the pusher only has read
+ // access
+ u32 reference_count; // word 0x14
+ INSERT_PADDING_WORDS_NOINIT(0x5);
+
+ u32 semaphore_acquire; // word 0x1A
+ u32 semaphore_release; // word 0x1B
+ u32 fence_value; // word 0x1C
+ GPU::FenceAction fence_action; // word 0x1D
+ INSERT_PADDING_WORDS_NOINIT(0xE2);
+
+ // Puller state
+ u32 acquire_mode; // word 0x100
+ u32 acquire_source; // word 0x101
+ u32 acquire_active; // word 0x102
+ u32 acquire_timeout; // word 0x103
+ u32 acquire_value; // word 0x104
+ };
+ std::array<u32, NUM_REGS> reg_array; // aliases only the first NUM_REGS words of the struct above
+ };
+ } regs{};
+
+ GPU& gpu;
+ Core::System& system;
+ std::unique_ptr<Tegra::MemoryManager> memory_manager;
+ std::unique_ptr<Tegra::DmaPusher> dma_pusher;
+ std::unique_ptr<Tegra::CDmaPusher> cdma_pusher;
+ std::unique_ptr<VideoCore::RendererBase> renderer;
+ VideoCore::RasterizerInterface* rasterizer = nullptr;
+ const bool use_nvdec;
+
+ /// Mapping of command subchannels to their bound engine ids
+ std::array<EngineID, 8> bound_engines{};
+ /// 3D engine
+ std::unique_ptr<Engines::Maxwell3D> maxwell_3d;
+ /// 2D engine
+ std::unique_ptr<Engines::Fermi2D> fermi_2d;
+ /// Compute engine
+ std::unique_ptr<Engines::KeplerCompute> kepler_compute;
+ /// DMA engine
+ std::unique_ptr<Engines::MaxwellDMA> maxwell_dma;
+ /// Inline memory engine
+ std::unique_ptr<Engines::KeplerMemory> kepler_memory;
+ /// Shader build notifier
+ std::unique_ptr<VideoCore::ShaderNotify> shader_notify;
+ /// When true, we are about to shut down emulation session, so terminate outstanding tasks
+ std::atomic_bool shutting_down{};
+
+ std::array<std::atomic<u32>, Service::Nvidia::MaxSyncPoints> syncpoints{};
+
+ std::array<std::list<u32>, Service::Nvidia::MaxSyncPoints> syncpt_interrupts;
+
+ std::mutex sync_mutex;
+ std::mutex device_mutex;
+
+ std::condition_variable sync_cv;
+
+ /// One queued flush: a fence value together with the address range to flush.
+ struct FlushRequest {
+     u64 fence;
+     VAddr addr;
+     std::size_t size;
+
+     explicit FlushRequest(u64 fence_, VAddr addr_, std::size_t size_)
+         : fence{fence_}, addr{addr_}, size{size_} {}
+ };
+
+ std::list<FlushRequest> flush_requests;
+ std::atomic<u64> current_flush_fence{};
+ u64 last_flush_fence{};
+ std::mutex flush_request_mutex;
+
+ const bool is_async;
+
+ VideoCommon::GPUThread::ThreadManager gpu_thread;
+ std::unique_ptr<Core::Frontend::GraphicsContext> cpu_context;
+
+#define ASSERT_REG_POSITION(field_name, position) \
+ static_assert(offsetof(Regs, field_name) == position * 4, \
+ "Field " #field_name " has invalid position")
+
+ ASSERT_REG_POSITION(semaphore_address, 0x4);
+ ASSERT_REG_POSITION(semaphore_sequence, 0x6);
+ ASSERT_REG_POSITION(semaphore_trigger, 0x7);
+ ASSERT_REG_POSITION(reference_count, 0x14);
+ ASSERT_REG_POSITION(semaphore_acquire, 0x1A);
+ ASSERT_REG_POSITION(semaphore_release, 0x1B);
+ ASSERT_REG_POSITION(fence_value, 0x1C);
+ ASSERT_REG_POSITION(fence_action, 0x1D);
+
+ ASSERT_REG_POSITION(acquire_mode, 0x100);
+ ASSERT_REG_POSITION(acquire_source, 0x101);
+ ASSERT_REG_POSITION(acquire_active, 0x102);
+ ASSERT_REG_POSITION(acquire_timeout, 0x103);
+ ASSERT_REG_POSITION(acquire_value, 0x104);
+
+#undef ASSERT_REG_POSITION
+
+ enum class GpuSemaphoreOperation { // decoded from the low 4 bits of regs.semaphore_trigger
+ AcquireEqual = 0x1, // satisfied when the semaphore word equals semaphore_sequence
+ WriteLong = 0x2, // writes a sequence + timestamp block to the semaphore address
+ AcquireGequal = 0x4, // compared via the signed difference word - semaphore_sequence
+ AcquireMask = 0x8, // satisfied when (word & semaphore_sequence) is non-zero
+ };
+};
+
+GPU::GPU(Core::System& system, bool is_async, bool use_nvdec)
+ : impl{std::make_unique<Impl>(*this, system, is_async, use_nvdec)} {} // Impl is heap-allocated and receives *this so it can refer back to the owning GPU
+
+GPU::~GPU() = default; // defaulted here, after the Impl definition that ends just above
+
+void GPU::BindRenderer(std::unique_ptr<VideoCore::RendererBase> renderer) {
+ impl->BindRenderer(std::move(renderer)); // ownership of the renderer is passed on to the Impl object
}
-Engines::Maxwell3D& GPU::Maxwell3D() {
- return *maxwell_3d;
+void GPU::CallMethod(const MethodCall& method_call) {
+ impl->CallMethod(method_call); // forwards to the Impl object
}
-const Engines::Maxwell3D& GPU::Maxwell3D() const {
- return *maxwell_3d;
+void GPU::CallMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount,
+ u32 methods_pending) {
+ impl->CallMultiMethod(method, subchannel, base_start, amount, methods_pending); // forwards to the Impl object
}
-Engines::KeplerCompute& GPU::KeplerCompute() {
- return *kepler_compute;
+void GPU::FlushCommands() {
+ impl->FlushCommands(); // forwards to the Impl object
}
-const Engines::KeplerCompute& GPU::KeplerCompute() const {
- return *kepler_compute;
+void GPU::SyncGuestHost() {
+ impl->SyncGuestHost(); // forwards to the Impl object
}
-MemoryManager& GPU::MemoryManager() {
- return *memory_manager;
+void GPU::OnCommandListEnd() {
+ impl->OnCommandListEnd(); // forwards to the Impl object
}
-const MemoryManager& GPU::MemoryManager() const {
- return *memory_manager;
+u64 GPU::RequestFlush(VAddr addr, std::size_t size) {
+ return impl->RequestFlush(addr, size); // forwards to the Impl object and returns its result unchanged
}
-DmaPusher& GPU::DmaPusher() {
- return *dma_pusher;
+u64 GPU::CurrentFlushRequestFence() const {
+ return impl->CurrentFlushRequestFence(); // forwards to the Impl object
}
-Tegra::CDmaPusher& GPU::CDmaPusher() {
- return *cdma_pusher;
+void GPU::TickWork() {
+ impl->TickWork(); // forwards to the Impl object
}
-const DmaPusher& GPU::DmaPusher() const {
- return *dma_pusher;
+Engines::Maxwell3D& GPU::Maxwell3D() {
+ return impl->Maxwell3D(); // forwards to the Impl object
}
-const Tegra::CDmaPusher& GPU::CDmaPusher() const {
- return *cdma_pusher;
+const Engines::Maxwell3D& GPU::Maxwell3D() const {
+ return impl->Maxwell3D(); // forwards to the const accessor on the Impl object
}
-void GPU::WaitFence(u32 syncpoint_id, u32 value) {
- // Synced GPU, is always in sync
- if (!is_async) {
- return;
- }
- if (syncpoint_id == UINT32_MAX) {
- // TODO: Research what this does.
- LOG_ERROR(HW_GPU, "Waiting for syncpoint -1 not implemented");
- return;
- }
- MICROPROFILE_SCOPE(GPU_wait);
- std::unique_lock lock{sync_mutex};
- sync_cv.wait(lock, [=, this] {
- if (shutting_down.load(std::memory_order_relaxed)) {
- // We're shutting down, ensure no threads continue to wait for the next syncpoint
- return true;
- }
- return syncpoints.at(syncpoint_id).load() >= value;
- });
-}
-
-void GPU::IncrementSyncPoint(const u32 syncpoint_id) {
- auto& syncpoint = syncpoints.at(syncpoint_id);
- syncpoint++;
- std::lock_guard lock{sync_mutex};
- sync_cv.notify_all();
- auto& interrupt = syncpt_interrupts.at(syncpoint_id);
- if (!interrupt.empty()) {
- u32 value = syncpoint.load();
- auto it = interrupt.begin();
- while (it != interrupt.end()) {
- if (value >= *it) {
- TriggerCpuInterrupt(syncpoint_id, *it);
- it = interrupt.erase(it);
- continue;
- }
- it++;
- }
- }
+Engines::KeplerCompute& GPU::KeplerCompute() {
+ return impl->KeplerCompute(); // forwards to the Impl object
}
-u32 GPU::GetSyncpointValue(const u32 syncpoint_id) const {
- return syncpoints.at(syncpoint_id).load();
+const Engines::KeplerCompute& GPU::KeplerCompute() const {
+ return impl->KeplerCompute(); // forwards to the const accessor on the Impl object
}
-void GPU::RegisterSyncptInterrupt(const u32 syncpoint_id, const u32 value) {
- auto& interrupt = syncpt_interrupts.at(syncpoint_id);
- bool contains = std::any_of(interrupt.begin(), interrupt.end(),
- [value](u32 in_value) { return in_value == value; });
- if (contains) {
- return;
- }
- interrupt.emplace_back(value);
+Tegra::MemoryManager& GPU::MemoryManager() {
+ return impl->MemoryManager(); // forwards to the Impl object
}
-bool GPU::CancelSyncptInterrupt(const u32 syncpoint_id, const u32 value) {
- std::lock_guard lock{sync_mutex};
- auto& interrupt = syncpt_interrupts.at(syncpoint_id);
- const auto iter =
- std::find_if(interrupt.begin(), interrupt.end(),
- [value](u32 interrupt_value) { return value == interrupt_value; });
+const Tegra::MemoryManager& GPU::MemoryManager() const {
+ return impl->MemoryManager(); // forwards to the const accessor on the Impl object
+}
- if (iter == interrupt.end()) {
- return false;
- }
- interrupt.erase(iter);
- return true;
+Tegra::DmaPusher& GPU::DmaPusher() {
+ return impl->DmaPusher(); // forwards to the Impl object
}
-u64 GPU::RequestFlush(VAddr addr, std::size_t size) {
- std::unique_lock lck{flush_request_mutex};
- const u64 fence = ++last_flush_fence;
- flush_requests.emplace_back(fence, addr, size);
- return fence;
+const Tegra::DmaPusher& GPU::DmaPusher() const {
+ return impl->DmaPusher(); // forwards to the const accessor on the Impl object
}
-void GPU::TickWork() {
- std::unique_lock lck{flush_request_mutex};
- while (!flush_requests.empty()) {
- auto& request = flush_requests.front();
- const u64 fence = request.fence;
- const VAddr addr = request.addr;
- const std::size_t size = request.size;
- flush_requests.pop_front();
- flush_request_mutex.unlock();
- rasterizer->FlushRegion(addr, size);
- current_flush_fence.store(fence);
- flush_request_mutex.lock();
- }
+Tegra::CDmaPusher& GPU::CDmaPusher() {
+ return impl->CDmaPusher(); // forwards to the Impl object
}
-u64 GPU::GetTicks() const {
- // This values were reversed engineered by fincs from NVN
- // The gpu clock is reported in units of 385/625 nanoseconds
- constexpr u64 gpu_ticks_num = 384;
- constexpr u64 gpu_ticks_den = 625;
+const Tegra::CDmaPusher& GPU::CDmaPusher() const {
+ return impl->CDmaPusher(); // forwards to the const accessor on the Impl object
+}
- u64 nanoseconds = system.CoreTiming().GetGlobalTimeNs().count();
- if (Settings::values.use_fast_gpu_time.GetValue()) {
- nanoseconds /= 256;
- }
- const u64 nanoseconds_num = nanoseconds / gpu_ticks_den;
- const u64 nanoseconds_rem = nanoseconds % gpu_ticks_den;
- return nanoseconds_num * gpu_ticks_num + (nanoseconds_rem * gpu_ticks_num) / gpu_ticks_den;
+VideoCore::RendererBase& GPU::Renderer() {
+ return impl->Renderer(); // forwards to the Impl object
}
-void GPU::RendererFrameEndNotify() {
- system.GetPerfStats().EndGameFrame();
+const VideoCore::RendererBase& GPU::Renderer() const {
+ return impl->Renderer(); // forwards to the const accessor on the Impl object
}
-void GPU::FlushCommands() {
- rasterizer->FlushCommands();
+VideoCore::ShaderNotify& GPU::ShaderNotify() {
+ return impl->ShaderNotify(); // forwards to the Impl object
}
-void GPU::SyncGuestHost() {
- rasterizer->SyncGuestHost();
+const VideoCore::ShaderNotify& GPU::ShaderNotify() const {
+ return impl->ShaderNotify(); // forwards to the const accessor on the Impl object
}
-enum class GpuSemaphoreOperation {
- AcquireEqual = 0x1,
- WriteLong = 0x2,
- AcquireGequal = 0x4,
- AcquireMask = 0x8,
-};
+void GPU::WaitFence(u32 syncpoint_id, u32 value) {
+ impl->WaitFence(syncpoint_id, value); // forwards to Impl::WaitFence, which does the actual waiting
+}
-void GPU::CallMethod(const MethodCall& method_call) {
- LOG_TRACE(HW_GPU, "Processing method {:08X} on subchannel {}", method_call.method,
- method_call.subchannel);
+void GPU::IncrementSyncPoint(u32 syncpoint_id) {
+ impl->IncrementSyncPoint(syncpoint_id); // forwards to Impl::IncrementSyncPoint
+}
- ASSERT(method_call.subchannel < bound_engines.size());
+u32 GPU::GetSyncpointValue(u32 syncpoint_id) const {
+ return impl->GetSyncpointValue(syncpoint_id); // forwards to Impl::GetSyncpointValue
+}
- if (ExecuteMethodOnEngine(method_call.method)) {
- CallEngineMethod(method_call);
- } else {
- CallPullerMethod(method_call);
- }
+void GPU::RegisterSyncptInterrupt(u32 syncpoint_id, u32 value) {
+ impl->RegisterSyncptInterrupt(syncpoint_id, value); // forwards to Impl::RegisterSyncptInterrupt, which takes sync_mutex
}
-void GPU::CallMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount,
- u32 methods_pending) {
- LOG_TRACE(HW_GPU, "Processing method {:08X} on subchannel {}", method, subchannel);
-
- ASSERT(subchannel < bound_engines.size());
-
- if (ExecuteMethodOnEngine(method)) {
- CallEngineMultiMethod(method, subchannel, base_start, amount, methods_pending);
- } else {
- for (std::size_t i = 0; i < amount; i++) {
- CallPullerMethod(MethodCall{
- method,
- base_start[i],
- subchannel,
- methods_pending - static_cast<u32>(i),
- });
- }
- }
+bool GPU::CancelSyncptInterrupt(u32 syncpoint_id, u32 value) {
+ return impl->CancelSyncptInterrupt(syncpoint_id, value); // forwards to Impl::CancelSyncptInterrupt; true if a threshold was removed
}
-bool GPU::ExecuteMethodOnEngine(u32 method) {
- const auto buffer_method = static_cast<BufferMethods>(method);
- return buffer_method >= BufferMethods::NonPullerMethods;
-}
-
-void GPU::CallPullerMethod(const MethodCall& method_call) {
- regs.reg_array[method_call.method] = method_call.argument;
- const auto method = static_cast<BufferMethods>(method_call.method);
-
- switch (method) {
- case BufferMethods::BindObject: {
- ProcessBindMethod(method_call);
- break;
- }
- case BufferMethods::Nop:
- case BufferMethods::SemaphoreAddressHigh:
- case BufferMethods::SemaphoreAddressLow:
- case BufferMethods::SemaphoreSequence:
- case BufferMethods::UnkCacheFlush:
- case BufferMethods::WrcacheFlush:
- case BufferMethods::FenceValue:
- break;
- case BufferMethods::RefCnt:
- rasterizer->SignalReference();
- break;
- case BufferMethods::FenceAction:
- ProcessFenceActionMethod();
- break;
- case BufferMethods::WaitForInterrupt:
- ProcessWaitForInterruptMethod();
- break;
- case BufferMethods::SemaphoreTrigger: {
- ProcessSemaphoreTriggerMethod();
- break;
- }
- case BufferMethods::NotifyIntr: {
- // TODO(Kmather73): Research and implement this method.
- LOG_ERROR(HW_GPU, "Special puller engine method NotifyIntr not implemented");
- break;
- }
- case BufferMethods::Unk28: {
- // TODO(Kmather73): Research and implement this method.
- LOG_ERROR(HW_GPU, "Special puller engine method Unk28 not implemented");
- break;
- }
- case BufferMethods::SemaphoreAcquire: {
- ProcessSemaphoreAcquire();
- break;
- }
- case BufferMethods::SemaphoreRelease: {
- ProcessSemaphoreRelease();
- break;
- }
- case BufferMethods::Yield: {
- // TODO(Kmather73): Research and implement this method.
- LOG_ERROR(HW_GPU, "Special puller engine method Yield not implemented");
- break;
- }
- default:
- LOG_ERROR(HW_GPU, "Special puller engine method {:X} not implemented", method);
- break;
- }
-}
-
-void GPU::CallEngineMethod(const MethodCall& method_call) {
- const EngineID engine = bound_engines[method_call.subchannel];
-
- switch (engine) {
- case EngineID::FERMI_TWOD_A:
- fermi_2d->CallMethod(method_call.method, method_call.argument, method_call.IsLastCall());
- break;
- case EngineID::MAXWELL_B:
- maxwell_3d->CallMethod(method_call.method, method_call.argument, method_call.IsLastCall());
- break;
- case EngineID::KEPLER_COMPUTE_B:
- kepler_compute->CallMethod(method_call.method, method_call.argument,
- method_call.IsLastCall());
- break;
- case EngineID::MAXWELL_DMA_COPY_A:
- maxwell_dma->CallMethod(method_call.method, method_call.argument, method_call.IsLastCall());
- break;
- case EngineID::KEPLER_INLINE_TO_MEMORY_B:
- kepler_memory->CallMethod(method_call.method, method_call.argument,
- method_call.IsLastCall());
- break;
- default:
- UNIMPLEMENTED_MSG("Unimplemented engine");
- }
-}
-
-void GPU::CallEngineMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount,
- u32 methods_pending) {
- const EngineID engine = bound_engines[subchannel];
-
- switch (engine) {
- case EngineID::FERMI_TWOD_A:
- fermi_2d->CallMultiMethod(method, base_start, amount, methods_pending);
- break;
- case EngineID::MAXWELL_B:
- maxwell_3d->CallMultiMethod(method, base_start, amount, methods_pending);
- break;
- case EngineID::KEPLER_COMPUTE_B:
- kepler_compute->CallMultiMethod(method, base_start, amount, methods_pending);
- break;
- case EngineID::MAXWELL_DMA_COPY_A:
- maxwell_dma->CallMultiMethod(method, base_start, amount, methods_pending);
- break;
- case EngineID::KEPLER_INLINE_TO_MEMORY_B:
- kepler_memory->CallMultiMethod(method, base_start, amount, methods_pending);
- break;
- default:
- UNIMPLEMENTED_MSG("Unimplemented engine");
- }
-}
-
-void GPU::ProcessBindMethod(const MethodCall& method_call) {
- // Bind the current subchannel to the desired engine id.
- LOG_DEBUG(HW_GPU, "Binding subchannel {} to engine {}", method_call.subchannel,
- method_call.argument);
- const auto engine_id = static_cast<EngineID>(method_call.argument);
- bound_engines[method_call.subchannel] = static_cast<EngineID>(engine_id);
- switch (engine_id) {
- case EngineID::FERMI_TWOD_A:
- dma_pusher->BindSubchannel(fermi_2d.get(), method_call.subchannel);
- break;
- case EngineID::MAXWELL_B:
- dma_pusher->BindSubchannel(maxwell_3d.get(), method_call.subchannel);
- break;
- case EngineID::KEPLER_COMPUTE_B:
- dma_pusher->BindSubchannel(kepler_compute.get(), method_call.subchannel);
- break;
- case EngineID::MAXWELL_DMA_COPY_A:
- dma_pusher->BindSubchannel(maxwell_dma.get(), method_call.subchannel);
- break;
- case EngineID::KEPLER_INLINE_TO_MEMORY_B:
- dma_pusher->BindSubchannel(kepler_memory.get(), method_call.subchannel);
- break;
- default:
- UNIMPLEMENTED_MSG("Unimplemented engine {:04X}", engine_id);
- }
-}
-
-void GPU::ProcessFenceActionMethod() {
- switch (regs.fence_action.op) {
- case FenceOperation::Acquire:
- WaitFence(regs.fence_action.syncpoint_id, regs.fence_value);
- break;
- case FenceOperation::Increment:
- IncrementSyncPoint(regs.fence_action.syncpoint_id);
- break;
- default:
- UNIMPLEMENTED_MSG("Unimplemented operation {}", regs.fence_action.op.Value());
- }
-}
-
-void GPU::ProcessWaitForInterruptMethod() {
- // TODO(bunnei) ImplementMe
- LOG_WARNING(HW_GPU, "(STUBBED) called");
-}
-
-void GPU::ProcessSemaphoreTriggerMethod() {
- const auto semaphoreOperationMask = 0xF;
- const auto op =
- static_cast<GpuSemaphoreOperation>(regs.semaphore_trigger & semaphoreOperationMask);
- if (op == GpuSemaphoreOperation::WriteLong) {
- struct Block {
- u32 sequence;
- u32 zeros = 0;
- u64 timestamp;
- };
+u64 GPU::GetTicks() const {
+ return impl->GetTicks(); // forwards to Impl::GetTicks
+}
- Block block{};
- block.sequence = regs.semaphore_sequence;
- // TODO(Kmather73): Generate a real GPU timestamp and write it here instead of
- // CoreTiming
- block.timestamp = GetTicks();
- memory_manager->WriteBlock(regs.semaphore_address.SemaphoreAddress(), &block,
- sizeof(block));
- } else {
- const u32 word{memory_manager->Read<u32>(regs.semaphore_address.SemaphoreAddress())};
- if ((op == GpuSemaphoreOperation::AcquireEqual && word == regs.semaphore_sequence) ||
- (op == GpuSemaphoreOperation::AcquireGequal &&
- static_cast<s32>(word - regs.semaphore_sequence) > 0) ||
- (op == GpuSemaphoreOperation::AcquireMask && (word & regs.semaphore_sequence))) {
- // Nothing to do in this case
- } else {
- regs.acquire_source = true;
- regs.acquire_value = regs.semaphore_sequence;
- if (op == GpuSemaphoreOperation::AcquireEqual) {
- regs.acquire_active = true;
- regs.acquire_mode = false;
- } else if (op == GpuSemaphoreOperation::AcquireGequal) {
- regs.acquire_active = true;
- regs.acquire_mode = true;
- } else if (op == GpuSemaphoreOperation::AcquireMask) {
- // TODO(kemathe) The acquire mask operation waits for a value that, ANDed with
- // semaphore_sequence, gives a non-0 result
- LOG_ERROR(HW_GPU, "Invalid semaphore operation AcquireMask not implemented");
- } else {
- LOG_ERROR(HW_GPU, "Invalid semaphore operation");
- }
- }
- }
+bool GPU::IsAsync() const {
+ return impl->IsAsync(); // forwards to Impl::IsAsync
}
-void GPU::ProcessSemaphoreRelease() {
- memory_manager->Write<u32>(regs.semaphore_address.SemaphoreAddress(), regs.semaphore_release);
+bool GPU::UseNvdec() const {
+ return impl->UseNvdec(); // forwards to Impl::UseNvdec
}
-void GPU::ProcessSemaphoreAcquire() {
- const u32 word = memory_manager->Read<u32>(regs.semaphore_address.SemaphoreAddress());
- const auto value = regs.semaphore_acquire;
- if (word != value) {
- regs.acquire_active = true;
- regs.acquire_value = value;
- // TODO(kemathe73) figure out how to do the acquire_timeout
- regs.acquire_mode = false;
- regs.acquire_source = false;
- }
+void GPU::RendererFrameEndNotify() {
+ impl->RendererFrameEndNotify(); // forwards to Impl::RendererFrameEndNotify
}
void GPU::Start() {
- gpu_thread.StartThread(*renderer, renderer->Context(), *dma_pusher);
- cpu_context = renderer->GetRenderWindow().CreateSharedContext();
- cpu_context->MakeCurrent();
+ impl->Start();
}
void GPU::ObtainContext() {
- cpu_context->MakeCurrent();
+ impl->ObtainContext();
}
void GPU::ReleaseContext() {
- cpu_context->DoneCurrent();
+ impl->ReleaseContext();
}
void GPU::PushGPUEntries(Tegra::CommandList&& entries) {
- gpu_thread.SubmitList(std::move(entries));
+ impl->PushGPUEntries(std::move(entries));
}
void GPU::PushCommandBuffer(Tegra::ChCommandHeaderList& entries) {
- if (!use_nvdec) {
- return;
- }
-
- if (!cdma_pusher) {
- cdma_pusher = std::make_unique<Tegra::CDmaPusher>(*this);
- }
-
- // SubmitCommandBuffer would make the nvdec operations async, this is not currently working
- // TODO(ameerj): RE proper async nvdec operation
- // gpu_thread.SubmitCommandBuffer(std::move(entries));
-
- cdma_pusher->ProcessEntries(std::move(entries));
+ impl->PushCommandBuffer(entries);
}
void GPU::ClearCdmaInstance() {
- cdma_pusher.reset();
+ impl->ClearCdmaInstance();
}
void GPU::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
- gpu_thread.SwapBuffers(framebuffer);
+ impl->SwapBuffers(framebuffer);
}
void GPU::FlushRegion(VAddr addr, u64 size) {
- gpu_thread.FlushRegion(addr, size);
+ impl->FlushRegion(addr, size);
}
void GPU::InvalidateRegion(VAddr addr, u64 size) {
- gpu_thread.InvalidateRegion(addr, size);
+ impl->InvalidateRegion(addr, size);
}
void GPU::FlushAndInvalidateRegion(VAddr addr, u64 size) {
- gpu_thread.FlushAndInvalidateRegion(addr, size);
-}
-
-void GPU::TriggerCpuInterrupt(const u32 syncpoint_id, const u32 value) const {
- auto& interrupt_manager = system.InterruptManager();
- interrupt_manager.GPUInterruptSyncpt(syncpoint_id, value);
-}
-
-void GPU::ShutDown() {
- // Signal that threads should no longer block on syncpoint fences
- shutting_down.store(true, std::memory_order_relaxed);
- sync_cv.notify_all();
-
- gpu_thread.ShutDown();
-}
-
-void GPU::OnCommandListEnd() {
- if (is_async) {
- // This command only applies to asynchronous GPU mode
- gpu_thread.OnCommandListEnd();
- }
+ impl->FlushAndInvalidateRegion(addr, size);
}
} // namespace Tegra
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index a8e98e51b..05e5c94f3 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -4,28 +4,12 @@
#pragma once
-#include <array>
-#include <atomic>
-#include <condition_variable>
-#include <list>
#include <memory>
-#include <mutex>
+
+#include "common/bit_field.h"
#include "common/common_types.h"
-#include "core/hle/service/nvdrv/nvdata.h"
-#include "core/hle/service/nvflinger/buffer_queue.h"
#include "video_core/cdma_pusher.h"
-#include "video_core/dma_pusher.h"
#include "video_core/framebuffer_config.h"
-#include "video_core/gpu_thread.h"
-
-using CacheAddr = std::uintptr_t;
-[[nodiscard]] inline CacheAddr ToCacheAddr(const void* host_ptr) {
- return reinterpret_cast<CacheAddr>(host_ptr);
-}
-
-[[nodiscard]] inline u8* FromCacheAddr(CacheAddr cache_addr) {
- return reinterpret_cast<u8*>(cache_addr);
-}
namespace Core {
namespace Frontend {
@@ -40,6 +24,9 @@ class ShaderNotify;
} // namespace VideoCore
namespace Tegra {
+class DmaPusher;
+class CDmaPusher;
+struct CommandList;
enum class RenderTargetFormat : u32 {
NONE = 0x0,
@@ -138,7 +125,18 @@ public:
}
};
- explicit GPU(Core::System& system_, bool is_async_, bool use_nvdec_);
+ enum class FenceOperation : u32 {
+ Acquire = 0,
+ Increment = 1,
+ };
+
+ union FenceAction {
+ u32 raw;
+ BitField<0, 1, FenceOperation> op;
+ BitField<8, 24, u32> syncpoint_id;
+ };
+
+ explicit GPU(Core::System& system, bool is_async, bool use_nvdec);
~GPU();
/// Binds a renderer to the GPU.
@@ -162,9 +160,7 @@ public:
[[nodiscard]] u64 RequestFlush(VAddr addr, std::size_t size);
/// Obtains current flush request fence id.
- [[nodiscard]] u64 CurrentFlushRequestFence() const {
- return current_flush_fence.load(std::memory_order_relaxed);
- }
+ [[nodiscard]] u64 CurrentFlushRequestFence() const;
/// Tick pending requests within the GPU.
void TickWork();
@@ -200,27 +196,16 @@ public:
[[nodiscard]] const Tegra::CDmaPusher& CDmaPusher() const;
/// Returns a reference to the underlying renderer.
- [[nodiscard]] VideoCore::RendererBase& Renderer() {
- return *renderer;
- }
+ [[nodiscard]] VideoCore::RendererBase& Renderer();
/// Returns a const reference to the underlying renderer.
- [[nodiscard]] const VideoCore::RendererBase& Renderer() const {
- return *renderer;
- }
+ [[nodiscard]] const VideoCore::RendererBase& Renderer() const;
/// Returns a reference to the shader notifier.
- [[nodiscard]] VideoCore::ShaderNotify& ShaderNotify() {
- return *shader_notify;
- }
+ [[nodiscard]] VideoCore::ShaderNotify& ShaderNotify();
/// Returns a const reference to the shader notifier.
- [[nodiscard]] const VideoCore::ShaderNotify& ShaderNotify() const {
- return *shader_notify;
- }
-
- // Stops the GPU execution and waits for the GPU to finish working
- void ShutDown();
+ [[nodiscard]] const VideoCore::ShaderNotify& ShaderNotify() const;
/// Allows the CPU/NvFlinger to wait on the GPU before presenting a frame.
void WaitFence(u32 syncpoint_id, u32 value);
@@ -235,80 +220,12 @@ public:
[[nodiscard]] u64 GetTicks() const;
- [[nodiscard]] std::unique_lock<std::mutex> LockSync() {
- return std::unique_lock{sync_mutex};
- }
-
- [[nodiscard]] bool IsAsync() const {
- return is_async;
- }
+ [[nodiscard]] bool IsAsync() const;
- [[nodiscard]] bool UseNvdec() const {
- return use_nvdec;
- }
+ [[nodiscard]] bool UseNvdec() const;
void RendererFrameEndNotify();
- enum class FenceOperation : u32 {
- Acquire = 0,
- Increment = 1,
- };
-
- union FenceAction {
- u32 raw;
- BitField<0, 1, FenceOperation> op;
- BitField<8, 24, u32> syncpoint_id;
-
- [[nodiscard]] static CommandHeader Build(FenceOperation op, u32 syncpoint_id) {
- FenceAction result{};
- result.op.Assign(op);
- result.syncpoint_id.Assign(syncpoint_id);
- return {result.raw};
- }
- };
-
- struct Regs {
- static constexpr size_t NUM_REGS = 0x40;
-
- union {
- struct {
- INSERT_PADDING_WORDS_NOINIT(0x4);
- struct {
- u32 address_high;
- u32 address_low;
-
- [[nodiscard]] GPUVAddr SemaphoreAddress() const {
- return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
- address_low);
- }
- } semaphore_address;
-
- u32 semaphore_sequence;
- u32 semaphore_trigger;
- INSERT_PADDING_WORDS_NOINIT(0xC);
-
- // The pusher and the puller share the reference counter, the pusher only has read
- // access
- u32 reference_count;
- INSERT_PADDING_WORDS_NOINIT(0x5);
-
- u32 semaphore_acquire;
- u32 semaphore_release;
- u32 fence_value;
- FenceAction fence_action;
- INSERT_PADDING_WORDS_NOINIT(0xE2);
-
- // Puller state
- u32 acquire_mode;
- u32 acquire_source;
- u32 acquire_active;
- u32 acquire_timeout;
- u32 acquire_value;
- };
- std::array<u32, NUM_REGS> reg_array;
- };
- } regs{};
-
/// Performs any additional setup necessary in order to begin GPU emulation.
/// This can be used to launch any necessary threads and register any necessary
/// core timing events.
@@ -341,104 +258,9 @@ public:
/// Notify rasterizer that any caches of the specified region should be flushed and invalidated
void FlushAndInvalidateRegion(VAddr addr, u64 size);
-protected:
- void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const;
-
-private:
- void ProcessBindMethod(const MethodCall& method_call);
- void ProcessFenceActionMethod();
- void ProcessWaitForInterruptMethod();
- void ProcessSemaphoreTriggerMethod();
- void ProcessSemaphoreRelease();
- void ProcessSemaphoreAcquire();
-
- /// Calls a GPU puller method.
- void CallPullerMethod(const MethodCall& method_call);
-
- /// Calls a GPU engine method.
- void CallEngineMethod(const MethodCall& method_call);
-
- /// Calls a GPU engine multivalue method.
- void CallEngineMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount,
- u32 methods_pending);
-
- /// Determines where the method should be executed.
- [[nodiscard]] bool ExecuteMethodOnEngine(u32 method);
-
-protected:
- Core::System& system;
- std::unique_ptr<Tegra::MemoryManager> memory_manager;
- std::unique_ptr<Tegra::DmaPusher> dma_pusher;
- std::unique_ptr<Tegra::CDmaPusher> cdma_pusher;
- std::unique_ptr<VideoCore::RendererBase> renderer;
- VideoCore::RasterizerInterface* rasterizer = nullptr;
- const bool use_nvdec;
-
private:
- /// Mapping of command subchannels to their bound engine ids
- std::array<EngineID, 8> bound_engines = {};
- /// 3D engine
- std::unique_ptr<Engines::Maxwell3D> maxwell_3d;
- /// 2D engine
- std::unique_ptr<Engines::Fermi2D> fermi_2d;
- /// Compute engine
- std::unique_ptr<Engines::KeplerCompute> kepler_compute;
- /// DMA engine
- std::unique_ptr<Engines::MaxwellDMA> maxwell_dma;
- /// Inline memory engine
- std::unique_ptr<Engines::KeplerMemory> kepler_memory;
- /// Shader build notifier
- std::unique_ptr<VideoCore::ShaderNotify> shader_notify;
- /// When true, we are about to shut down emulation session, so terminate outstanding tasks
- std::atomic_bool shutting_down{};
-
- std::array<std::atomic<u32>, Service::Nvidia::MaxSyncPoints> syncpoints{};
-
- std::array<std::list<u32>, Service::Nvidia::MaxSyncPoints> syncpt_interrupts;
-
- std::mutex sync_mutex;
- std::mutex device_mutex;
-
- std::condition_variable sync_cv;
-
- struct FlushRequest {
- explicit FlushRequest(u64 fence_, VAddr addr_, std::size_t size_)
- : fence{fence_}, addr{addr_}, size{size_} {}
- u64 fence;
- VAddr addr;
- std::size_t size;
- };
-
- std::list<FlushRequest> flush_requests;
- std::atomic<u64> current_flush_fence{};
- u64 last_flush_fence{};
- std::mutex flush_request_mutex;
-
- const bool is_async;
-
- VideoCommon::GPUThread::ThreadManager gpu_thread;
- std::unique_ptr<Core::Frontend::GraphicsContext> cpu_context;
+ struct Impl;
+ std::unique_ptr<Impl> impl;
};
-#define ASSERT_REG_POSITION(field_name, position) \
- static_assert(offsetof(GPU::Regs, field_name) == position * 4, \
- "Field " #field_name " has invalid position")
-
-ASSERT_REG_POSITION(semaphore_address, 0x4);
-ASSERT_REG_POSITION(semaphore_sequence, 0x6);
-ASSERT_REG_POSITION(semaphore_trigger, 0x7);
-ASSERT_REG_POSITION(reference_count, 0x14);
-ASSERT_REG_POSITION(semaphore_acquire, 0x1A);
-ASSERT_REG_POSITION(semaphore_release, 0x1B);
-ASSERT_REG_POSITION(fence_value, 0x1C);
-ASSERT_REG_POSITION(fence_action, 0x1D);
-
-ASSERT_REG_POSITION(acquire_mode, 0x100);
-ASSERT_REG_POSITION(acquire_source, 0x101);
-ASSERT_REG_POSITION(acquire_active, 0x102);
-ASSERT_REG_POSITION(acquire_timeout, 0x103);
-ASSERT_REG_POSITION(acquire_value, 0x104);
-
-#undef ASSERT_REG_POSITION
-
} // namespace Tegra
diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index 46f642b19..9547f277a 100644
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -17,9 +17,9 @@
namespace VideoCommon::GPUThread {
/// Runs the GPU thread
-static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
- Core::Frontend::GraphicsContext& context, Tegra::DmaPusher& dma_pusher,
- SynchState& state) {
+static void RunThread(std::stop_token stop_token, Core::System& system,
+ VideoCore::RendererBase& renderer, Core::Frontend::GraphicsContext& context,
+ Tegra::DmaPusher& dma_pusher, SynchState& state) {
std::string name = "yuzu:GPU";
MicroProfileOnThreadCreate(name.c_str());
SCOPE_EXIT({ MicroProfileOnThreadExit(); });
@@ -28,20 +28,14 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
Common::SetCurrentThreadPriority(Common::ThreadPriority::High);
system.RegisterHostThread();
- // Wait for first GPU command before acquiring the window context
- state.queue.Wait();
-
- // If emulation was stopped during disk shader loading, abort before trying to acquire context
- if (!state.is_running) {
- return;
- }
-
auto current_context = context.Acquire();
VideoCore::RasterizerInterface* const rasterizer = renderer.ReadRasterizer();
- CommandDataContainer next;
- while (state.is_running) {
- next = state.queue.PopWait();
+ while (!stop_token.stop_requested()) {
+ CommandDataContainer next = state.queue.PopWait(stop_token);
+ if (stop_token.stop_requested()) {
+ break;
+ }
if (auto* submit_list = std::get_if<SubmitListCommand>(&next.data)) {
dma_pusher.Push(std::move(submit_list->entries));
dma_pusher.DispatchCalls();
@@ -55,8 +49,6 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
rasterizer->FlushRegion(flush->addr, flush->size);
} else if (const auto* invalidate = std::get_if<InvalidateRegionCommand>(&next.data)) {
rasterizer->OnCPUWrite(invalidate->addr, invalidate->size);
- } else if (std::holds_alternative<EndProcessingCommand>(next.data)) {
- ASSERT(state.is_running == false);
} else {
UNREACHABLE();
}
@@ -73,16 +65,14 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
ThreadManager::ThreadManager(Core::System& system_, bool is_async_)
: system{system_}, is_async{is_async_} {}
-ThreadManager::~ThreadManager() {
- ShutDown();
-}
+ThreadManager::~ThreadManager() = default;
void ThreadManager::StartThread(VideoCore::RendererBase& renderer,
Core::Frontend::GraphicsContext& context,
Tegra::DmaPusher& dma_pusher) { // Caches the renderer's rasterizer and launches the GPU worker thread running RunThread
rasterizer = renderer.ReadRasterizer();
- thread = std::thread(RunThread, std::ref(system), std::ref(renderer), std::ref(context),
- std::ref(dma_pusher), std::ref(state));
+ thread = std::jthread(RunThread, std::ref(system), std::ref(renderer), std::ref(context), // no stop_token is passed here on purpose:
+ std::ref(dma_pusher), std::ref(state)); // std::jthread supplies its own token as RunThread's leading std::stop_token argument
}
void ThreadManager::SubmitList(Tegra::CommandList&& entries) {
@@ -117,26 +107,6 @@ void ThreadManager::FlushAndInvalidateRegion(VAddr addr, u64 size) {
rasterizer->OnCPUWrite(addr, size);
}
-void ThreadManager::ShutDown() {
- if (!state.is_running) {
- return;
- }
-
- {
- std::lock_guard lk(state.write_lock);
- state.is_running = false;
- state.cv.notify_all();
- }
-
- if (!thread.joinable()) {
- return;
- }
-
- // Notify GPU thread that a shutdown is pending
- PushCommand(EndProcessingCommand());
- thread.join();
-}
-
void ThreadManager::OnCommandListEnd() {
PushCommand(OnCommandListEndCommand());
}
@@ -152,9 +122,8 @@ u64 ThreadManager::PushCommand(CommandData&& command_data, bool block) {
state.queue.Push(CommandDataContainer(std::move(command_data), fence, block));
if (block) {
- state.cv.wait(lk, [this, fence] {
- return fence <= state.signaled_fence.load(std::memory_order_relaxed) ||
- !state.is_running;
+ state.cv.wait(lk, thread.get_stop_token(), [this, fence] {
+ return fence <= state.signaled_fence.load(std::memory_order_relaxed);
});
}
diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h
index 11a648f38..00984188e 100644
--- a/src/video_core/gpu_thread.h
+++ b/src/video_core/gpu_thread.h
@@ -33,9 +33,6 @@ class RendererBase;
namespace VideoCommon::GPUThread {
-/// Command to signal to the GPU thread that processing has ended
-struct EndProcessingCommand final {};
-
/// Command to signal to the GPU thread that a command list is ready for processing
struct SubmitListCommand final {
explicit SubmitListCommand(Tegra::CommandList&& entries_) : entries{std::move(entries_)} {}
@@ -83,7 +80,7 @@ struct OnCommandListEndCommand final {};
struct GPUTickCommand final {};
using CommandData =
- std::variant<EndProcessingCommand, SubmitListCommand, SwapBuffersCommand, FlushRegionCommand,
+ std::variant<std::monostate, SubmitListCommand, SwapBuffersCommand, FlushRegionCommand,
InvalidateRegionCommand, FlushAndInvalidateRegionCommand, OnCommandListEndCommand,
GPUTickCommand>;
@@ -100,14 +97,12 @@ struct CommandDataContainer {
/// Struct used to synchronize the GPU thread
struct SynchState final {
- std::atomic_bool is_running{true};
-
- using CommandQueue = Common::SPSCQueue<CommandDataContainer>;
+ using CommandQueue = Common::SPSCQueue<CommandDataContainer, true>; // NOTE(review): meaning of the extra `true` template argument is defined by Common::SPSCQueue — confirm there
std::mutex write_lock;
CommandQueue queue;
u64 last_fence{};
std::atomic<u64> signaled_fence{};
- std::condition_variable cv;
+ std::condition_variable_any cv; // the _any variant is needed for the wait() overload that takes a std::stop_token
};
/// Class used to manage the GPU thread
@@ -135,9 +130,6 @@ public:
/// Notify rasterizer that any caches of the specified region should be flushed and invalidated
void FlushAndInvalidateRegion(VAddr addr, u64 size);
- // Stops the GPU execution and waits for the GPU to finish working
- void ShutDown();
-
void OnCommandListEnd();
private:
@@ -149,7 +141,7 @@ private:
VideoCore::RasterizerInterface* rasterizer = nullptr;
SynchState state;
- std::thread thread;
+ std::jthread thread;
};
} // namespace VideoCommon::GPUThread
diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt
index c9cff7450..20d748c12 100644
--- a/src/video_core/host_shaders/CMakeLists.txt
+++ b/src/video_core/host_shaders/CMakeLists.txt
@@ -6,7 +6,6 @@ set(SHADER_FILES
convert_float_to_depth.frag
full_screen_triangle.vert
opengl_copy_bc4.comp
- opengl_copy_bgra.comp
opengl_present.frag
opengl_present.vert
pitch_unswizzle.comp
diff --git a/src/video_core/host_shaders/opengl_copy_bgra.comp b/src/video_core/host_shaders/opengl_copy_bgra.comp
deleted file mode 100644
index 2571a4abf..000000000
--- a/src/video_core/host_shaders/opengl_copy_bgra.comp
+++ /dev/null
@@ -1,15 +0,0 @@
-// Copyright 2021 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#version 430 core
-
-layout (local_size_x = 4, local_size_y = 4) in;
-
-layout(binding = 0, rgba8) readonly uniform image2DArray bgr_input;
-layout(binding = 1, rgba8) writeonly uniform image2DArray bgr_output;
-
-void main() {
- vec4 color = imageLoad(bgr_input, ivec3(gl_GlobalInvocationID));
- imageStore(bgr_output, ivec3(gl_GlobalInvocationID), color.bgra);
-}
diff --git a/src/video_core/query_cache.h b/src/video_core/query_cache.h
index aac851253..73231061a 100644
--- a/src/video_core/query_cache.h
+++ b/src/video_core/query_cache.h
@@ -8,6 +8,7 @@
#include <array>
#include <cstring>
#include <iterator>
+#include <list>
#include <memory>
#include <mutex>
#include <optional>
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index 07a995f7d..187a28e4d 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -147,8 +147,7 @@ void BufferCacheRuntime::CopyBuffer(Buffer& dst_buffer, Buffer& src_buffer,
void BufferCacheRuntime::ClearBuffer(Buffer& dest_buffer, u32 offset, size_t size, u32 value) { // Fills a range of dest_buffer with the 32-bit pattern `value`
glClearNamedBufferSubData(dest_buffer.Handle(), GL_R32UI, static_cast<GLintptr>(offset),
- static_cast<GLsizeiptr>(size / sizeof(u32)), GL_RED, GL_UNSIGNED_INT,
- &value);
+ static_cast<GLsizeiptr>(size), GL_RED, GL_UNSIGNED_INT, &value); // GL takes the range length in bytes, so size is forwarded unscaled
}
void BufferCacheRuntime::BindIndexBuffer(Buffer& buffer, u32 offset, u32 size) {
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index b0aee6cc1..8c3ca3d82 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -20,6 +20,7 @@
#include "video_core/surface.h"
#include "video_core/texture_cache/formatter.h"
#include "video_core/texture_cache/samples_helper.h"
+#include "video_core/texture_cache/util.h"
namespace OpenGL {
namespace {
@@ -461,7 +462,7 @@ bool TextureCacheRuntime::CanImageBeCopied(const Image& dst, const Image& src) {
if (dst.info.type == ImageType::e3D && dst.info.format == PixelFormat::BC4_UNORM) {
return false;
}
- if (IsPixelFormatBGR(dst.info.format) || IsPixelFormatBGR(src.info.format)) {
+ if (IsPixelFormatBGR(dst.info.format) != IsPixelFormatBGR(src.info.format)) {
return false;
}
return true;
@@ -473,7 +474,7 @@ void TextureCacheRuntime::EmulateCopyImage(Image& dst, Image& src,
ASSERT(src.info.type == ImageType::e3D);
util_shaders.CopyBC4(dst, src, copies);
} else if (IsPixelFormatBGR(dst.info.format) || IsPixelFormatBGR(src.info.format)) {
- util_shaders.CopyBGR(dst, src, copies);
+ bgr_copy_pass.CopyBGR(dst, src, copies);
} else {
UNREACHABLE();
}
@@ -1112,4 +1113,37 @@ Framebuffer::Framebuffer(TextureCacheRuntime& runtime, std::span<ImageView*, NUM
framebuffer.handle = handle;
}
+void BGRCopyPass::CopyBGR(Image& dst_image, Image& src_image,
+ std::span<const VideoCommon::ImageCopy> copies) { // Copies each entry of `copies` from src_image to dst_image through a staging pixel buffer object
+ static constexpr VideoCommon::Offset3D zero_offset{0, 0, 0};
+ const u32 requested_pbo_size =
+ std::max(src_image.unswizzled_size_bytes, dst_image.unswizzled_size_bytes); // staging size: the larger of the two images
+ // Reallocate the staging buffer only when it is too small; it is never shrunk
+ if (bgr_pbo_size < requested_pbo_size) {
+ bgr_pbo.Create();
+ bgr_pbo_size = requested_pbo_size;
+ glNamedBufferData(bgr_pbo.handle, bgr_pbo_size, nullptr, GL_STREAM_COPY); // nullptr: allocate storage without uploading any data
+ }
+ for (const ImageCopy& copy : copies) {
+ ASSERT(copy.src_offset == zero_offset); // the GL calls below always start at texel (0, 0, 0)
+ ASSERT(copy.dst_offset == zero_offset);
+ // NOTE(review): mip level 0 is hard-coded below; the subresources' base level/layer are not consulted
+ // Copy from source to PBO, reading back with the source image's own GL format/type
+ glPixelStorei(GL_PACK_ALIGNMENT, 1);
+ glPixelStorei(GL_PACK_ROW_LENGTH, copy.extent.width);
+ glBindBuffer(GL_PIXEL_PACK_BUFFER, bgr_pbo.handle);
+ glGetTextureSubImage(src_image.Handle(), 0, 0, 0, 0, copy.extent.width, copy.extent.height,
+ copy.src_subresource.num_layers, src_image.GlFormat(),
+ src_image.GlType(), static_cast<GLsizei>(bgr_pbo_size), nullptr); // with a pack buffer bound, nullptr means offset 0 into the PBO
+
+ // Copy from PBO to destination, interpreting the same bytes with the destination image's GL format/type
+ glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
+ glPixelStorei(GL_UNPACK_ROW_LENGTH, copy.extent.width);
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, bgr_pbo.handle);
+ glTextureSubImage3D(dst_image.Handle(), 0, 0, 0, 0, copy.extent.width, copy.extent.height,
+ copy.dst_subresource.num_layers, dst_image.GlFormat(),
+ dst_image.GlType(), nullptr); // with an unpack buffer bound, nullptr means offset 0 into the PBO
+ } // NOTE(review): the pack/unpack buffer bindings and pixel-store state set above are not restored here
+}
+
} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h
index 4a4f6301c..1ca2c90be 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.h
+++ b/src/video_core/renderer_opengl/gl_texture_cache.h
@@ -12,6 +12,7 @@
#include "shader_recompiler/shader_info.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/renderer_opengl/util_shaders.h"
+#include "video_core/texture_cache/image_view_base.h"
#include "video_core/texture_cache/texture_cache_base.h"
namespace OpenGL {
@@ -47,6 +48,19 @@ struct FormatProperties {
bool is_compressed;
};
+class BGRCopyPass { // Helper that owns the buffer object CopyBGR stages texel data in
+public:
+ BGRCopyPass() = default;
+ ~BGRCopyPass() = default;
+
+ void CopyBGR(Image& dst_image, Image& src_image, // copies src_image into dst_image for every entry of `copies`
+ std::span<const VideoCommon::ImageCopy> copies);
+
+private:
+ OGLBuffer bgr_pbo; // staging buffer, bound as both pixel pack and pixel unpack buffer by CopyBGR
+ size_t bgr_pbo_size{}; // current allocated size of bgr_pbo in bytes; zero until the first copy
+};
+
class TextureCacheRuntime {
friend Framebuffer;
friend Image;
@@ -118,6 +132,7 @@ private:
const Device& device;
StateTracker& state_tracker;
UtilShaders util_shaders;
+ BGRCopyPass bgr_copy_pass;
std::array<std::unordered_map<GLenum, FormatProperties>, 3> format_properties;
bool has_broken_texture_view_formats = false;
@@ -162,6 +177,14 @@ public:
return texture.handle;
}
+ GLuint GlFormat() const noexcept { // Returns the gl_format member; BGRCopyPass::CopyBGR passes it as the pixel `format` of its GL transfers
+ return gl_format;
+ }
+
+ GLuint GlType() const noexcept { // Returns the gl_type member; BGRCopyPass::CopyBGR passes it as the pixel `type` of its GL transfers
+ return gl_type;
+ }
+
private:
void CopyBufferToImage(const VideoCommon::BufferImageCopy& copy, size_t buffer_offset);
diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index 672f94bfc..39158aa3e 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -52,7 +52,7 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> FORMAT_TAB
{GL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT}, // BC6H_UFLOAT
{GL_COMPRESSED_RGB_BPTC_SIGNED_FLOAT}, // BC6H_SFLOAT
{GL_COMPRESSED_RGBA_ASTC_4x4_KHR}, // ASTC_2D_4X4_UNORM
- {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}, // B8G8R8A8_UNORM
+ {GL_RGBA8, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV}, // B8G8R8A8_UNORM
{GL_RGBA32F, GL_RGBA, GL_FLOAT}, // R32G32B32A32_FLOAT
{GL_RGBA32I, GL_RGBA_INTEGER, GL_INT}, // R32G32B32A32_SINT
{GL_RG32F, GL_RG, GL_FLOAT}, // R32G32_FLOAT
@@ -81,7 +81,7 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> FORMAT_TAB
{GL_COMPRESSED_RGBA_ASTC_8x8_KHR}, // ASTC_2D_8X8_UNORM
{GL_COMPRESSED_RGBA_ASTC_8x5_KHR}, // ASTC_2D_8X5_UNORM
{GL_COMPRESSED_RGBA_ASTC_5x4_KHR}, // ASTC_2D_5X4_UNORM
- {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_BYTE}, // B8G8R8A8_SRGB
+ {GL_SRGB8_ALPHA8, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV}, // B8G8R8A8_SRGB
{GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT}, // BC1_RGBA_SRGB
{GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT}, // BC2_SRGB
{GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT}, // BC3_SRGB
diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp
index 333f35a1c..897c380b3 100644
--- a/src/video_core/renderer_opengl/util_shaders.cpp
+++ b/src/video_core/renderer_opengl/util_shaders.cpp
@@ -14,7 +14,6 @@
#include "video_core/host_shaders/block_linear_unswizzle_2d_comp.h"
#include "video_core/host_shaders/block_linear_unswizzle_3d_comp.h"
#include "video_core/host_shaders/opengl_copy_bc4_comp.h"
-#include "video_core/host_shaders/opengl_copy_bgra_comp.h"
#include "video_core/host_shaders/pitch_unswizzle_comp.h"
#include "video_core/renderer_opengl/gl_shader_manager.h"
#include "video_core/renderer_opengl/gl_shader_util.h"
@@ -44,11 +43,6 @@ namespace {
OGLProgram MakeProgram(std::string_view source) {
return CreateProgram(source, GL_COMPUTE_SHADER);
}
-
-size_t NumPixelsInCopy(const VideoCommon::ImageCopy& copy) {
- return static_cast<size_t>(copy.extent.width * copy.extent.height *
- copy.src_subresource.num_layers);
-}
} // Anonymous namespace
UtilShaders::UtilShaders(ProgramManager& program_manager_)
@@ -56,7 +50,6 @@ UtilShaders::UtilShaders(ProgramManager& program_manager_)
block_linear_unswizzle_2d_program(MakeProgram(BLOCK_LINEAR_UNSWIZZLE_2D_COMP)),
block_linear_unswizzle_3d_program(MakeProgram(BLOCK_LINEAR_UNSWIZZLE_3D_COMP)),
pitch_unswizzle_program(MakeProgram(PITCH_UNSWIZZLE_COMP)),
- copy_bgra_program(MakeProgram(OPENGL_COPY_BGRA_COMP)),
copy_bc4_program(MakeProgram(OPENGL_COPY_BC4_COMP)) {
const auto swizzle_table = Tegra::Texture::MakeSwizzleTable();
swizzle_table_buffer.Create();
@@ -255,43 +248,6 @@ void UtilShaders::CopyBC4(Image& dst_image, Image& src_image, std::span<const Im
program_manager.RestoreGuestCompute();
}
-void UtilShaders::CopyBGR(Image& dst_image, Image& src_image,
- std::span<const VideoCommon::ImageCopy> copies) {
- static constexpr GLuint BINDING_INPUT_IMAGE = 0;
- static constexpr GLuint BINDING_OUTPUT_IMAGE = 1;
- static constexpr VideoCommon::Offset3D zero_offset{0, 0, 0};
- const u32 bytes_per_block = BytesPerBlock(dst_image.info.format);
- switch (bytes_per_block) {
- case 2:
- // BGR565 copy
- for (const ImageCopy& copy : copies) {
- ASSERT(copy.src_offset == zero_offset);
- ASSERT(copy.dst_offset == zero_offset);
- bgr_copy_pass.Execute(dst_image, src_image, copy);
- }
- break;
- case 4: {
- // BGRA8 copy
- program_manager.BindComputeProgram(copy_bgra_program.handle);
- constexpr GLenum FORMAT = GL_RGBA8;
- for (const ImageCopy& copy : copies) {
- ASSERT(copy.src_offset == zero_offset);
- ASSERT(copy.dst_offset == zero_offset);
- glBindImageTexture(BINDING_INPUT_IMAGE, src_image.StorageHandle(),
- copy.src_subresource.base_level, GL_FALSE, 0, GL_READ_ONLY, FORMAT);
- glBindImageTexture(BINDING_OUTPUT_IMAGE, dst_image.StorageHandle(),
- copy.dst_subresource.base_level, GL_FALSE, 0, GL_WRITE_ONLY, FORMAT);
- glDispatchCompute(copy.extent.width, copy.extent.height, copy.extent.depth);
- }
- program_manager.RestoreGuestCompute();
- break;
- }
- default:
- UNREACHABLE();
- break;
- }
-}
-
GLenum StoreFormat(u32 bytes_per_block) {
switch (bytes_per_block) {
case 1:
@@ -309,36 +265,4 @@ GLenum StoreFormat(u32 bytes_per_block) {
return GL_R8UI;
}
-void Bgr565CopyPass::Execute(const Image& dst_image, const Image& src_image,
- const ImageCopy& copy) {
- if (CopyBufferCreationNeeded(copy)) {
- CreateNewCopyBuffer(copy, GL_TEXTURE_2D_ARRAY, GL_RGB565);
- }
- // Copy from source to PBO
- glPixelStorei(GL_PACK_ALIGNMENT, 1);
- glPixelStorei(GL_PACK_ROW_LENGTH, copy.extent.width);
- glBindBuffer(GL_PIXEL_PACK_BUFFER, bgr16_pbo.handle);
- glGetTextureSubImage(src_image.Handle(), 0, 0, 0, 0, copy.extent.width, copy.extent.height,
- copy.src_subresource.num_layers, GL_RGB, GL_UNSIGNED_SHORT_5_6_5,
- static_cast<GLsizei>(bgr16_pbo_size), nullptr);
-
- // Copy from PBO to destination in reverse order
- glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
- glPixelStorei(GL_UNPACK_ROW_LENGTH, copy.extent.width);
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, bgr16_pbo.handle);
- glTextureSubImage3D(dst_image.Handle(), 0, 0, 0, 0, copy.extent.width, copy.extent.height,
- copy.dst_subresource.num_layers, GL_RGB, GL_UNSIGNED_SHORT_5_6_5_REV,
- nullptr);
-}
-
-bool Bgr565CopyPass::CopyBufferCreationNeeded(const ImageCopy& copy) {
- return bgr16_pbo_size < NumPixelsInCopy(copy) * sizeof(u16);
-}
-
-void Bgr565CopyPass::CreateNewCopyBuffer(const ImageCopy& copy, GLenum target, GLuint format) {
- bgr16_pbo.Create();
- bgr16_pbo_size = NumPixelsInCopy(copy) * sizeof(u16);
- glNamedBufferData(bgr16_pbo.handle, bgr16_pbo_size, nullptr, GL_STREAM_COPY);
-}
-
} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/util_shaders.h b/src/video_core/renderer_opengl/util_shaders.h
index ef881e35f..5de95ea7a 100644
--- a/src/video_core/renderer_opengl/util_shaders.h
+++ b/src/video_core/renderer_opengl/util_shaders.h
@@ -19,22 +19,6 @@ class ProgramManager;
struct ImageBufferMap;
-class Bgr565CopyPass {
-public:
- Bgr565CopyPass() = default;
- ~Bgr565CopyPass() = default;
-
- void Execute(const Image& dst_image, const Image& src_image,
- const VideoCommon::ImageCopy& copy);
-
-private:
- [[nodiscard]] bool CopyBufferCreationNeeded(const VideoCommon::ImageCopy& copy);
- void CreateNewCopyBuffer(const VideoCommon::ImageCopy& copy, GLenum target, GLuint format);
-
- OGLBuffer bgr16_pbo;
- size_t bgr16_pbo_size{};
-};
-
class UtilShaders {
public:
explicit UtilShaders(ProgramManager& program_manager);
@@ -55,9 +39,6 @@ public:
void CopyBC4(Image& dst_image, Image& src_image,
std::span<const VideoCommon::ImageCopy> copies);
- void CopyBGR(Image& dst_image, Image& src_image,
- std::span<const VideoCommon::ImageCopy> copies);
-
private:
ProgramManager& program_manager;
@@ -67,10 +48,7 @@ private:
OGLProgram block_linear_unswizzle_2d_program;
OGLProgram block_linear_unswizzle_3d_program;
OGLProgram pitch_unswizzle_program;
- OGLProgram copy_bgra_program;
OGLProgram copy_bc4_program;
-
- Bgr565CopyPass bgr_copy_pass;
};
GLenum StoreFormat(u32 bytes_per_block);
diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp
index 9ff0a28cd..74822814d 100644
--- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp
+++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp
@@ -97,19 +97,14 @@ RendererVulkan::RendererVulkan(Core::TelemetrySession& telemetry_session_,
Core::Frontend::EmuWindow& emu_window,
Core::Memory::Memory& cpu_memory_, Tegra::GPU& gpu_,
std::unique_ptr<Core::Frontend::GraphicsContext> context_) try
- : RendererBase(emu_window, std::move(context_)),
- telemetry_session(telemetry_session_),
- cpu_memory(cpu_memory_),
- gpu(gpu_),
- library(OpenLibrary()),
+ : RendererBase(emu_window, std::move(context_)), telemetry_session(telemetry_session_),
+ cpu_memory(cpu_memory_), gpu(gpu_), library(OpenLibrary()),
instance(CreateInstance(library, dld, VK_API_VERSION_1_1, render_window.GetWindowInfo().type,
true, Settings::values.renderer_debug.GetValue())),
debug_callback(Settings::values.renderer_debug ? CreateDebugCallback(instance) : nullptr),
surface(CreateSurface(instance, render_window)),
- device(CreateDevice(instance, dld, *surface)),
- memory_allocator(device, false),
- state_tracker(gpu),
- scheduler(device, state_tracker),
+ device(CreateDevice(instance, dld, *surface)), memory_allocator(device, false),
+ state_tracker(gpu), scheduler(device, state_tracker),
swapchain(*surface, device, scheduler, render_window.GetFramebufferLayout().width,
render_window.GetFramebufferLayout().height, false),
blit_screen(cpu_memory, render_window, device, memory_allocator, swapchain, scheduler,
@@ -149,7 +144,7 @@ void RendererVulkan::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
const Layout::FramebufferLayout layout = render_window.GetFramebufferLayout();
swapchain.Create(layout.width, layout.height, is_srgb);
};
- if (swapchain.IsSubOptimal() || swapchain.HasColorSpaceChanged(is_srgb)) {
+ if (swapchain.NeedsRecreation(is_srgb)) {
recreate_swapchain();
}
bool is_outdated;
diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
index 7c0f91007..8634c3316 100644
--- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
@@ -507,8 +507,9 @@ void GraphicsPipeline::MakePipeline(VkRenderPass render_pass) {
vertex_attributes.push_back({
.location = static_cast<u32>(index),
.binding = 0,
- .format = type == 1 ? VK_FORMAT_R32_SFLOAT
- : type == 2 ? VK_FORMAT_R32_SINT : VK_FORMAT_R32_UINT,
+ .format = type == 1 ? VK_FORMAT_R32_SFLOAT
+ : type == 2 ? VK_FORMAT_R32_SINT
+ : VK_FORMAT_R32_UINT,
.offset = 0,
});
}
@@ -567,12 +568,21 @@ void GraphicsPipeline::MakePipeline(VkRenderPass render_pass) {
if (!vertex_binding_divisors.empty()) {
vertex_input_ci.pNext = &input_divisor_ci;
}
+ const bool has_tess_stages = spv_modules[1] || spv_modules[2];
auto input_assembly_topology = MaxwellToVK::PrimitiveTopology(device, key.state.topology);
if (input_assembly_topology == VK_PRIMITIVE_TOPOLOGY_PATCH_LIST) {
- if (!spv_modules[1] && !spv_modules[2]) {
+ if (!has_tess_stages) {
LOG_WARNING(Render_Vulkan, "Patch topology used without tessellation, using points");
input_assembly_topology = VK_PRIMITIVE_TOPOLOGY_POINT_LIST;
}
+ } else {
+ if (has_tess_stages) {
+ // The Vulkan spec requires patch list IA topology be used with tessellation
+ // shader stages. Forcing it fixes a crash on some drivers
+ LOG_WARNING(Render_Vulkan,
+ "Patch topology not used with tessellation, using patch list");
+ input_assembly_topology = VK_PRIMITIVE_TOPOLOGY_PATCH_LIST;
+ }
}
const VkPipelineInputAssemblyStateCreateInfo input_assembly_ci{
.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp
index 1d438787a..0c11c814f 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.cpp
+++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp
@@ -43,17 +43,10 @@ VKScheduler::VKScheduler(const Device& device_, StateTracker& state_tracker_)
command_pool{std::make_unique<CommandPool>(*master_semaphore, device)} {
AcquireNewChunk();
AllocateWorkerCommandBuffer();
- worker_thread = std::thread(&VKScheduler::WorkerThread, this);
+ worker_thread = std::jthread([this](std::stop_token token) { WorkerThread(token); });
}
-VKScheduler::~VKScheduler() {
- {
- std::lock_guard lock{work_mutex};
- quit = true;
- }
- work_cv.notify_all();
- worker_thread.join();
-}
+VKScheduler::~VKScheduler() = default;
void VKScheduler::Flush(VkSemaphore signal_semaphore, VkSemaphore wait_semaphore) {
SubmitExecution(signal_semaphore, wait_semaphore);
@@ -135,7 +128,7 @@ bool VKScheduler::UpdateGraphicsPipeline(GraphicsPipeline* pipeline) {
return true;
}
-void VKScheduler::WorkerThread() {
+void VKScheduler::WorkerThread(std::stop_token stop_token) {
Common::SetCurrentThreadName("yuzu:VulkanWorker");
do {
if (work_queue.empty()) {
@@ -144,8 +137,8 @@ void VKScheduler::WorkerThread() {
std::unique_ptr<CommandChunk> work;
{
std::unique_lock lock{work_mutex};
- work_cv.wait(lock, [this] { return !work_queue.empty() || quit; });
- if (quit) {
+ work_cv.wait(lock, stop_token, [this] { return !work_queue.empty(); });
+ if (stop_token.stop_requested()) {
continue;
}
work = std::move(work_queue.front());
@@ -158,7 +151,7 @@ void VKScheduler::WorkerThread() {
}
std::lock_guard reserve_lock{reserve_mutex};
chunk_reserve.push_back(std::move(work));
- } while (!quit);
+ } while (!stop_token.stop_requested());
}
void VKScheduler::AllocateWorkerCommandBuffer() {
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h
index 759ed5a48..85fc1712f 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.h
+++ b/src/video_core/renderer_vulkan/vk_scheduler.h
@@ -187,7 +187,7 @@ private:
GraphicsPipeline* graphics_pipeline = nullptr;
};
- void WorkerThread();
+ void WorkerThread(std::stop_token stop_token);
void AllocateWorkerCommandBuffer();
@@ -212,7 +212,6 @@ private:
vk::CommandBuffer current_cmdbuf;
std::unique_ptr<CommandChunk> chunk;
- std::thread worker_thread;
State state;
@@ -224,9 +223,9 @@ private:
std::vector<std::unique_ptr<CommandChunk>> chunk_reserve;
std::mutex reserve_mutex;
std::mutex work_mutex;
- std::condition_variable work_cv;
+ std::condition_variable_any work_cv;
std::condition_variable wait_cv;
- std::atomic_bool quit{};
+ std::jthread worker_thread;
};
} // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_swapchain.cpp b/src/video_core/renderer_vulkan/vk_swapchain.cpp
index aadf03cb0..8972a6921 100644
--- a/src/video_core/renderer_vulkan/vk_swapchain.cpp
+++ b/src/video_core/renderer_vulkan/vk_swapchain.cpp
@@ -9,6 +9,7 @@
#include "common/assert.h"
#include "common/logging/log.h"
+#include "common/settings.h"
#include "core/core.h"
#include "core/frontend/framebuffer_layout.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
@@ -36,8 +37,19 @@ VkSurfaceFormatKHR ChooseSwapSurfaceFormat(vk::Span<VkSurfaceFormatKHR> formats)
VkPresentModeKHR ChooseSwapPresentMode(vk::Span<VkPresentModeKHR> modes) {
// Mailbox doesn't lock the application like fifo (vsync), prefer it
- const auto found = std::find(modes.begin(), modes.end(), VK_PRESENT_MODE_MAILBOX_KHR);
- return found != modes.end() ? *found : VK_PRESENT_MODE_FIFO_KHR;
+ const auto found_mailbox = std::find(modes.begin(), modes.end(), VK_PRESENT_MODE_MAILBOX_KHR);
+ if (found_mailbox != modes.end()) {
+ return VK_PRESENT_MODE_MAILBOX_KHR;
+ }
+ if (Settings::values.disable_fps_limit.GetValue()) {
+ // FIFO present mode locks the framerate to the monitor's refresh rate.
+ // Find an alternative to surpass this limitation if FPS is unlocked.
+ const auto found_imm = std::find(modes.begin(), modes.end(), VK_PRESENT_MODE_IMMEDIATE_KHR);
+ if (found_imm != modes.end()) {
+ return VK_PRESENT_MODE_IMMEDIATE_KHR;
+ }
+ }
+ return VK_PRESENT_MODE_FIFO_KHR;
}
VkExtent2D ChooseSwapExtent(const VkSurfaceCapabilitiesKHR& capabilities, u32 width, u32 height) {
@@ -143,7 +155,7 @@ void VKSwapchain::CreateSwapchain(const VkSurfaceCapabilitiesKHR& capabilities,
const auto present_modes{physical_device.GetSurfacePresentModesKHR(surface)};
const VkSurfaceFormatKHR surface_format{ChooseSwapSurfaceFormat(formats)};
- const VkPresentModeKHR present_mode{ChooseSwapPresentMode(present_modes)};
+ present_mode = ChooseSwapPresentMode(present_modes);
u32 requested_image_count{capabilities.minImageCount + 1};
if (capabilities.maxImageCount > 0 && requested_image_count > capabilities.maxImageCount) {
@@ -196,6 +208,7 @@ void VKSwapchain::CreateSwapchain(const VkSurfaceCapabilitiesKHR& capabilities,
extent = swapchain_ci.imageExtent;
current_srgb = srgb;
+ current_fps_unlocked = Settings::values.disable_fps_limit.GetValue();
images = swapchain.GetImages();
image_count = static_cast<u32>(images.size());
@@ -248,4 +261,14 @@ void VKSwapchain::Destroy() {
swapchain.reset();
}
+bool VKSwapchain::HasFpsUnlockChanged() const {
+ return current_fps_unlocked != Settings::values.disable_fps_limit.GetValue();
+}
+
+bool VKSwapchain::NeedsPresentModeUpdate() const {
+ // Mailbox present mode is the ideal for all scenarios. If it is not available,
+ // a different present mode is needed to support unlocked FPS above the monitor's refresh rate.
+ return present_mode != VK_PRESENT_MODE_MAILBOX_KHR && HasFpsUnlockChanged();
+}
+
} // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_swapchain.h b/src/video_core/renderer_vulkan/vk_swapchain.h
index 5bce41e21..61a6d959e 100644
--- a/src/video_core/renderer_vulkan/vk_swapchain.h
+++ b/src/video_core/renderer_vulkan/vk_swapchain.h
@@ -33,6 +33,11 @@ public:
/// Presents the rendered image to the swapchain.
void Present(VkSemaphore render_semaphore);
+ /// Returns true when the swapchain needs to be recreated.
+ bool NeedsRecreation(bool is_srgb) const {
+ return HasColorSpaceChanged(is_srgb) || IsSubOptimal() || NeedsPresentModeUpdate();
+ }
+
/// Returns true when the color space has changed.
bool HasColorSpaceChanged(bool is_srgb) const {
return current_srgb != is_srgb;
@@ -84,6 +89,10 @@ private:
void Destroy();
+ bool HasFpsUnlockChanged() const;
+
+ bool NeedsPresentModeUpdate() const;
+
const VkSurfaceKHR surface;
const Device& device;
VKScheduler& scheduler;
@@ -102,8 +111,10 @@ private:
VkFormat image_view_format{};
VkExtent2D extent{};
+ VkPresentModeKHR present_mode{};
bool current_srgb{};
+ bool current_fps_unlocked{};
bool is_outdated{};
bool is_suboptimal{};
};
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
index ff979a7ac..06c5fb867 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
@@ -21,6 +21,7 @@
#include "video_core/renderer_vulkan/vk_texture_cache.h"
#include "video_core/texture_cache/formatter.h"
#include "video_core/texture_cache/samples_helper.h"
+#include "video_core/texture_cache/util.h"
#include "video_core/vulkan_common/vulkan_device.h"
#include "video_core/vulkan_common/vulkan_memory_allocator.h"
#include "video_core/vulkan_common/vulkan_wrapper.h"
@@ -127,7 +128,7 @@ constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) {
const auto format_info = MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, false, format);
VkImageCreateFlags flags = VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT;
if (info.type == ImageType::e2D && info.resources.layers >= 6 &&
- info.size.width == info.size.height) {
+ info.size.width == info.size.height && !device.HasBrokenCubeImageCompability()) {
flags |= VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT;
}
if (info.type == ImageType::e3D) {
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h
index 6d5a68bfe..b09c468e4 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.h
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.h
@@ -4,11 +4,11 @@
#pragma once
-#include <compare>
#include <span>
#include "shader_recompiler/shader_info.h"
#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
+#include "video_core/texture_cache/image_view_base.h"
#include "video_core/texture_cache/texture_cache_base.h"
#include "video_core/vulkan_common/vulkan_memory_allocator.h"
#include "video_core/vulkan_common/vulkan_wrapper.h"
diff --git a/src/video_core/shader_environment.cpp b/src/video_core/shader_environment.cpp
index 81a878bb2..05850afd0 100644
--- a/src/video_core/shader_environment.cpp
+++ b/src/video_core/shader_environment.cpp
@@ -16,6 +16,7 @@
#include "common/fs/fs.h"
#include "common/logging/log.h"
#include "shader_recompiler/environment.h"
+#include "video_core/engines/kepler_compute.h"
#include "video_core/memory_manager.h"
#include "video_core/shader_environment.h"
#include "video_core/textures/texture.h"
diff --git a/src/video_core/shader_environment.h b/src/video_core/shader_environment.h
index 2079979db..6640e53d0 100644
--- a/src/video_core/shader_environment.h
+++ b/src/video_core/shader_environment.h
@@ -5,13 +5,13 @@
#pragma once
#include <array>
-#include <atomic>
#include <filesystem>
#include <iosfwd>
#include <limits>
#include <memory>
#include <optional>
#include <span>
+#include <stop_token>
#include <type_traits>
#include <unordered_map>
#include <vector>
@@ -19,9 +19,7 @@
#include "common/common_types.h"
#include "common/unique_function.h"
#include "shader_recompiler/environment.h"
-#include "video_core/engines/kepler_compute.h"
#include "video_core/engines/maxwell_3d.h"
-#include "video_core/textures/texture.h"
namespace Tegra {
class Memorymanager;
diff --git a/src/video_core/texture_cache/image_view_info.cpp b/src/video_core/texture_cache/image_view_info.cpp
index 6527e14c8..e751f26c7 100644
--- a/src/video_core/texture_cache/image_view_info.cpp
+++ b/src/video_core/texture_cache/image_view_info.cpp
@@ -8,6 +8,7 @@
#include "video_core/texture_cache/image_view_info.h"
#include "video_core/texture_cache/texture_cache_base.h"
#include "video_core/texture_cache/types.h"
+#include "video_core/texture_cache/util.h"
#include "video_core/textures/texture.h"
namespace VideoCommon {
diff --git a/src/video_core/texture_cache/slot_vector.h b/src/video_core/texture_cache/slot_vector.h
index 74cd3c9d8..50df06409 100644
--- a/src/video_core/texture_cache/slot_vector.h
+++ b/src/video_core/texture_cache/slot_vector.h
@@ -31,8 +31,8 @@ struct SlotId {
};
template <class T>
-requires std::is_nothrow_move_assignable_v<T>&&
- std::is_nothrow_move_constructible_v<T> class SlotVector {
+requires std::is_nothrow_move_assignable_v<T> && std::is_nothrow_move_constructible_v<T>
+class SlotVector {
public:
class Iterator {
friend SlotVector<T>;
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index 24b809242..329df2e49 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -4,10 +4,15 @@
#pragma once
+#include <unordered_set>
+
#include "common/alignment.h"
#include "video_core/dirty_flags.h"
+#include "video_core/engines/kepler_compute.h"
+#include "video_core/texture_cache/image_view_base.h"
#include "video_core/texture_cache/samples_helper.h"
#include "video_core/texture_cache/texture_cache_base.h"
+#include "video_core/texture_cache/util.h"
namespace VideoCommon {
diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h
index d7528ed24..2d1893c1c 100644
--- a/src/video_core/texture_cache/texture_cache_base.h
+++ b/src/video_core/texture_cache/texture_cache_base.h
@@ -4,13 +4,12 @@
#pragma once
-#include <array>
#include <mutex>
#include <span>
#include <type_traits>
#include <unordered_map>
-#include <unordered_set>
#include <vector>
+#include <queue>
#include "common/common_types.h"
#include "common/literals.h"
@@ -18,10 +17,6 @@
#include "video_core/compatible_formats.h"
#include "video_core/delayed_destruction_ring.h"
#include "video_core/engines/fermi_2d.h"
-#include "video_core/engines/kepler_compute.h"
-#include "video_core/engines/maxwell_3d.h"
-#include "video_core/memory_manager.h"
-#include "video_core/rasterizer_interface.h"
#include "video_core/surface.h"
#include "video_core/texture_cache/descriptor_table.h"
#include "video_core/texture_cache/image_base.h"
@@ -30,7 +25,6 @@
#include "video_core/texture_cache/render_targets.h"
#include "video_core/texture_cache/slot_vector.h"
#include "video_core/texture_cache/types.h"
-#include "video_core/texture_cache/util.h"
#include "video_core/textures/texture.h"
namespace VideoCommon {
diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp
index c2ec9f76a..6388ed2eb 100644
--- a/src/video_core/vulkan_common/vulkan_device.cpp
+++ b/src/video_core/vulkan_common/vulkan_device.cpp
@@ -588,22 +588,27 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR
ext_extended_dynamic_state = false;
}
}
-
sets_per_pool = 64;
- if (driver_id == VK_DRIVER_ID_AMD_PROPRIETARY || driver_id == VK_DRIVER_ID_AMD_OPEN_SOURCE) {
+
+ const bool is_amd =
+ driver_id == VK_DRIVER_ID_AMD_PROPRIETARY || driver_id == VK_DRIVER_ID_AMD_OPEN_SOURCE;
+ if (is_amd) {
// AMD drivers need a higher amount of Sets per Pool in certain circunstances like in XC2.
sets_per_pool = 96;
- }
-
- const bool is_amd = driver_id == VK_DRIVER_ID_AMD_PROPRIETARY ||
- driver_id == VK_DRIVER_ID_MESA_RADV ||
- driver_id == VK_DRIVER_ID_AMD_OPEN_SOURCE;
- if (ext_sampler_filter_minmax && is_amd) {
- // Disable ext_sampler_filter_minmax on AMD GCN4 and lower as it is broken.
+ // Disable VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT on AMD GCN4 and lower as it is broken.
if (!is_float16_supported) {
LOG_WARNING(
Render_Vulkan,
- "Blacklisting AMD GCN4 and lower for VK_EXT_SAMPLER_FILTER_MINMAX_EXTENSION_NAME");
+ "AMD GCN4 and earlier do not properly support VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT");
+ has_broken_cube_compatibility = true;
+ }
+ }
+ const bool is_amd_or_radv = is_amd || driver_id == VK_DRIVER_ID_MESA_RADV;
+ if (ext_sampler_filter_minmax && is_amd_or_radv) {
+ // Disable ext_sampler_filter_minmax on AMD GCN4 and lower as it is broken.
+ if (!is_float16_supported) {
+ LOG_WARNING(Render_Vulkan,
+ "Blacklisting AMD GCN4 and earlier for VK_EXT_sampler_filter_minmax");
ext_sampler_filter_minmax = false;
}
}
diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h
index bc180a32a..d9e74f1aa 100644
--- a/src/video_core/vulkan_common/vulkan_device.h
+++ b/src/video_core/vulkan_common/vulkan_device.h
@@ -309,6 +309,11 @@ public:
return has_renderdoc || has_nsight_graphics;
}
+ /// Returns true when the device does not properly support cube compatibility.
+ bool HasBrokenCubeImageCompability() const {
+ return has_broken_cube_compatibility;
+ }
+
/// Returns the vendor name reported from Vulkan.
std::string_view GetVendorName() const {
return vendor_name;
@@ -417,6 +422,7 @@ private:
bool ext_conservative_rasterization{}; ///< Support for VK_EXT_conservative_rasterization.
bool ext_provoking_vertex{}; ///< Support for VK_EXT_provoking_vertex.
bool nv_device_diagnostics_config{}; ///< Support for VK_NV_device_diagnostics_config.
+ bool has_broken_cube_compatibility{}; ///< Has broken cube compatibility bit
bool has_renderdoc{}; ///< Has RenderDoc attached
bool has_nsight_graphics{}; ///< Has Nsight Graphics attached