summaryrefslogtreecommitdiffstats
path: root/src/video_core
diff options
context:
space:
mode:
Diffstat (limited to 'src/video_core')
-rw-r--r--src/video_core/buffer_cache/buffer_base.h19
-rw-r--r--src/video_core/buffer_cache/buffer_cache.h292
-rw-r--r--src/video_core/dma_pusher.cpp10
-rw-r--r--src/video_core/engines/maxwell_dma.cpp36
-rw-r--r--src/video_core/engines/maxwell_dma.h17
-rw-r--r--src/video_core/fence_manager.h7
-rw-r--r--src/video_core/gpu.cpp5
-rw-r--r--src/video_core/rasterizer_interface.h8
-rw-r--r--src/video_core/renderer_opengl/gl_rasterizer.cpp21
-rw-r--r--src/video_core/renderer_opengl/gl_rasterizer.h14
-rw-r--r--src/video_core/renderer_vulkan/vk_rasterizer.cpp21
-rw-r--r--src/video_core/renderer_vulkan/vk_rasterizer.h14
-rw-r--r--src/video_core/texture_cache/types.h4
13 files changed, 377 insertions, 91 deletions
diff --git a/src/video_core/buffer_cache/buffer_base.h b/src/video_core/buffer_cache/buffer_base.h
index b121d36a3..c3318095c 100644
--- a/src/video_core/buffer_cache/buffer_base.h
+++ b/src/video_core/buffer_cache/buffer_base.h
@@ -226,19 +226,24 @@ public:
/// Call 'func' for each CPU modified range and unmark those pages as CPU modified
template <typename Func>
void ForEachUploadRange(VAddr query_cpu_range, u64 size, Func&& func) {
- ForEachModifiedRange<Type::CPU>(query_cpu_range, size, func);
+ ForEachModifiedRange<Type::CPU>(query_cpu_range, size, true, func);
}
/// Call 'func' for each GPU modified range and unmark those pages as GPU modified
template <typename Func>
- void ForEachDownloadRange(VAddr query_cpu_range, u64 size, Func&& func) {
- ForEachModifiedRange<Type::GPU>(query_cpu_range, size, func);
+ void ForEachDownloadRange(VAddr query_cpu_range, u64 size, bool clear, Func&& func) {
+ ForEachModifiedRange<Type::GPU>(query_cpu_range, size, clear, func);
+ }
+
+ template <typename Func>
+ void ForEachDownloadRangeAndClear(VAddr query_cpu_range, u64 size, Func&& func) {
+ ForEachModifiedRange<Type::GPU>(query_cpu_range, size, true, func);
}
/// Call 'func' for each GPU modified range and unmark those pages as GPU modified
template <typename Func>
void ForEachDownloadRange(Func&& func) {
- ForEachModifiedRange<Type::GPU>(cpu_addr, SizeBytes(), func);
+ ForEachModifiedRange<Type::GPU>(cpu_addr, SizeBytes(), true, func);
}
/// Mark buffer as picked
@@ -415,7 +420,7 @@ private:
* @param func Function to call for each turned off region
*/
template <Type type, typename Func>
- void ForEachModifiedRange(VAddr query_cpu_range, s64 size, Func&& func) {
+ void ForEachModifiedRange(VAddr query_cpu_range, s64 size, bool clear, Func&& func) {
static_assert(type != Type::Untracked);
const s64 difference = query_cpu_range - cpu_addr;
@@ -467,7 +472,9 @@ private:
bits = (bits << left_offset) >> left_offset;
const u64 current_word = state_words[word_index] & bits;
- state_words[word_index] &= ~bits;
+ if (clear) {
+ state_words[word_index] &= ~bits;
+ }
if constexpr (type == Type::CPU) {
const u64 current_bits = untracked_words[word_index] & bits;
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index cad7f902d..2871682f6 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -15,6 +15,7 @@
#include <vector>
#include <boost/container/small_vector.hpp>
+#include <boost/icl/interval_set.hpp>
#include "common/common_types.h"
#include "common/div_ceil.h"
@@ -77,6 +78,9 @@ class BufferCache {
using Runtime = typename P::Runtime;
using Buffer = typename P::Buffer;
+ using IntervalSet = boost::icl::interval_set<VAddr>;
+ using IntervalType = typename IntervalSet::interval_type;
+
struct Empty {};
struct OverlapResult {
@@ -148,18 +152,26 @@ public:
/// Return true when there are uncommitted buffers to be downloaded
[[nodiscard]] bool HasUncommittedFlushes() const noexcept;
+ void AccumulateFlushes();
+
/// Return true when the caller should wait for async downloads
[[nodiscard]] bool ShouldWaitAsyncFlushes() const noexcept;
/// Commit asynchronous downloads
void CommitAsyncFlushes();
+ void CommitAsyncFlushesHigh();
/// Pop asynchronous downloads
void PopAsyncFlushes();
+ [[nodiscard]] bool DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount);
+
/// Return true when a CPU region is modified from the GPU
[[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size);
+ /// Return true when a CPU region is modified from the CPU
+ [[nodiscard]] bool IsRegionCpuModified(VAddr addr, size_t size);
+
std::mutex mutex;
private:
@@ -190,6 +202,36 @@ private:
}
}
+ template <typename Func>
+ void ForEachWrittenRange(VAddr cpu_addr, u64 size, Func&& func) {
+ const VAddr start_address = cpu_addr;
+ const VAddr end_address = start_address + size;
+ const VAddr search_base =
+ static_cast<VAddr>(std::min<s64>(0LL, static_cast<s64>(start_address - size)));
+ const IntervalType search_interval{search_base, search_base + 1};
+ auto it = common_ranges.lower_bound(search_interval);
+ if (it == common_ranges.end()) {
+ it = common_ranges.begin();
+ }
+ for (; it != common_ranges.end(); it++) {
+ VAddr inter_addr_end = it->upper();
+ VAddr inter_addr = it->lower();
+ if (inter_addr >= end_address) {
+ break;
+ }
+ if (inter_addr_end <= start_address) {
+ continue;
+ }
+ if (inter_addr_end > end_address) {
+ inter_addr_end = end_address;
+ }
+ if (inter_addr < start_address) {
+ inter_addr = start_address;
+ }
+ func(inter_addr, inter_addr_end);
+ }
+ }
+
static bool IsRangeGranular(VAddr cpu_addr, size_t size) {
return (cpu_addr & ~Core::Memory::PAGE_MASK) ==
((cpu_addr + size) & ~Core::Memory::PAGE_MASK);
@@ -272,8 +314,6 @@ private:
void DeleteBuffer(BufferId buffer_id);
- void ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id);
-
void NotifyBufferDeletion();
[[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr) const;
@@ -327,9 +367,9 @@ private:
std::vector<BufferId> cached_write_buffer_ids;
- // TODO: This data structure is not optimal and it should be reworked
- std::vector<BufferId> uncommitted_downloads;
- std::deque<std::vector<BufferId>> committed_downloads;
+ IntervalSet uncommitted_ranges;
+ IntervalSet common_ranges;
+ std::deque<IntervalSet> committed_ranges;
size_t immediate_buffer_capacity = 0;
std::unique_ptr<u8[]> immediate_buffer_alloc;
@@ -352,6 +392,7 @@ BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_,
// Ensure the first slot is used for the null buffer
void(slot_buffers.insert(runtime, NullBufferParams{}));
deletion_iterator = slot_buffers.end();
+ common_ranges.clear();
}
template <class P>
@@ -422,6 +463,68 @@ void BufferCache<P>::DownloadMemory(VAddr cpu_addr, u64 size) {
}
template <class P>
+bool BufferCache<P>::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount) {
+ const std::optional<VAddr> cpu_src_address = gpu_memory.GpuToCpuAddress(src_address);
+ const std::optional<VAddr> cpu_dest_address = gpu_memory.GpuToCpuAddress(dest_address);
+ if (!cpu_src_address || !cpu_dest_address) {
+ return false;
+ }
+ const bool source_dirty = IsRegionGpuModified(*cpu_src_address, amount);
+ const bool dest_dirty = IsRegionGpuModified(*cpu_dest_address, amount);
+ if (!source_dirty && !dest_dirty) {
+ return false;
+ }
+
+ const IntervalType subtract_interval{*cpu_dest_address, *cpu_dest_address + amount};
+ uncommitted_ranges.subtract(subtract_interval);
+ for (auto& interval_set : committed_ranges) {
+ interval_set.subtract(subtract_interval);
+ }
+
+ BufferId buffer_a;
+ BufferId buffer_b;
+ do {
+ has_deleted_buffers = false;
+ buffer_a = FindBuffer(*cpu_src_address, static_cast<u32>(amount));
+ buffer_b = FindBuffer(*cpu_dest_address, static_cast<u32>(amount));
+ } while (has_deleted_buffers);
+ auto& src_buffer = slot_buffers[buffer_a];
+ auto& dest_buffer = slot_buffers[buffer_b];
+ SynchronizeBuffer(src_buffer, *cpu_src_address, static_cast<u32>(amount));
+ SynchronizeBuffer(dest_buffer, *cpu_dest_address, static_cast<u32>(amount));
+ std::array copies{BufferCopy{
+ .src_offset = src_buffer.Offset(*cpu_src_address),
+ .dst_offset = dest_buffer.Offset(*cpu_dest_address),
+ .size = amount,
+ }};
+
+ boost::container::small_vector<IntervalType, 4> tmp_intervals;
+ auto mirror = [&](VAddr base_address, VAddr base_address_end) {
+ const u64 size = base_address_end - base_address;
+ const VAddr diff = base_address - *cpu_src_address;
+ const VAddr new_base_address = *cpu_dest_address + diff;
+ const IntervalType add_interval{new_base_address, new_base_address + size};
+ uncommitted_ranges.add(add_interval);
+ tmp_intervals.push_back(add_interval);
+ };
+ ForEachWrittenRange(*cpu_src_address, amount, mirror);
+ // This subtraction in this order is important for overlapping copies.
+ common_ranges.subtract(subtract_interval);
+ for (const IntervalType add_interval : tmp_intervals) {
+ common_ranges.add(add_interval);
+ }
+
+ runtime.CopyBuffer(dest_buffer, src_buffer, copies);
+ if (source_dirty) {
+ dest_buffer.MarkRegionAsGpuModified(*cpu_dest_address, amount);
+ }
+ std::vector<u8> tmp_buffer(amount);
+ cpu_memory.ReadBlockUnsafe(*cpu_src_address, tmp_buffer.data(), amount);
+ cpu_memory.WriteBlockUnsafe(*cpu_dest_address, tmp_buffer.data(), amount);
+ return true;
+}
+
+template <class P>
void BufferCache<P>::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr,
u32 size) {
const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
@@ -547,29 +650,30 @@ void BufferCache<P>::FlushCachedWrites() {
template <class P>
bool BufferCache<P>::HasUncommittedFlushes() const noexcept {
- return !uncommitted_downloads.empty();
+ return !uncommitted_ranges.empty() || !committed_ranges.empty();
}
template <class P>
-bool BufferCache<P>::ShouldWaitAsyncFlushes() const noexcept {
- return !committed_downloads.empty() && !committed_downloads.front().empty();
+void BufferCache<P>::AccumulateFlushes() {
+ if (Settings::values.gpu_accuracy.GetValue() != Settings::GPUAccuracy::High) {
+ uncommitted_ranges.clear();
+ return;
+ }
+ if (uncommitted_ranges.empty()) {
+ return;
+ }
+ committed_ranges.emplace_back(std::move(uncommitted_ranges));
}
template <class P>
-void BufferCache<P>::CommitAsyncFlushes() {
- // This is intentionally passing the value by copy
- committed_downloads.push_front(uncommitted_downloads);
- uncommitted_downloads.clear();
+bool BufferCache<P>::ShouldWaitAsyncFlushes() const noexcept {
+ return false;
}
template <class P>
-void BufferCache<P>::PopAsyncFlushes() {
- if (committed_downloads.empty()) {
- return;
- }
- auto scope_exit_pop_download = detail::ScopeExit([this] { committed_downloads.pop_back(); });
- const std::span<const BufferId> download_ids = committed_downloads.back();
- if (download_ids.empty()) {
+void BufferCache<P>::CommitAsyncFlushesHigh() {
+ AccumulateFlushes();
+ if (committed_ranges.empty()) {
return;
}
MICROPROFILE_SCOPE(GPU_DownloadMemory);
@@ -577,20 +681,43 @@ void BufferCache<P>::PopAsyncFlushes() {
boost::container::small_vector<std::pair<BufferCopy, BufferId>, 1> downloads;
u64 total_size_bytes = 0;
u64 largest_copy = 0;
- for (const BufferId buffer_id : download_ids) {
- slot_buffers[buffer_id].ForEachDownloadRange([&](u64 range_offset, u64 range_size) {
- downloads.push_back({
- BufferCopy{
- .src_offset = range_offset,
- .dst_offset = total_size_bytes,
- .size = range_size,
- },
- buffer_id,
+ for (const IntervalSet& intervals : committed_ranges) {
+ for (auto& interval : intervals) {
+ const std::size_t size = interval.upper() - interval.lower();
+ const VAddr cpu_addr = interval.lower();
+ ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) {
+ boost::container::small_vector<BufferCopy, 1> copies;
+ buffer.ForEachDownloadRangeAndClear(
+ cpu_addr, size, [&](u64 range_offset, u64 range_size) {
+ const VAddr buffer_addr = buffer.CpuAddr();
+ const auto add_download = [&](VAddr start, VAddr end) {
+ const u64 new_offset = start - buffer_addr;
+ const u64 new_size = end - start;
+ downloads.push_back({
+ BufferCopy{
+ .src_offset = new_offset,
+ .dst_offset = total_size_bytes,
+ .size = new_size,
+ },
+ buffer_id,
+ });
+ // Align up to avoid cache conflicts
+ constexpr u64 align = 256ULL;
+ constexpr u64 mask = ~(align - 1ULL);
+ total_size_bytes += (new_size + align - 1) & mask;
+ largest_copy = std::max(largest_copy, new_size);
+ };
+
+ const VAddr start_address = buffer_addr + range_offset;
+ const VAddr end_address = start_address + range_size;
+ ForEachWrittenRange(start_address, range_size, add_download);
+ const IntervalType subtract_interval{start_address, end_address};
+ common_ranges.subtract(subtract_interval);
+ });
});
- total_size_bytes += range_size;
- largest_copy = std::max(largest_copy, range_size);
- });
+ }
}
+ committed_ranges.clear();
if (downloads.empty()) {
return;
}
@@ -623,6 +750,19 @@ void BufferCache<P>::PopAsyncFlushes() {
}
template <class P>
+void BufferCache<P>::CommitAsyncFlushes() {
+ if (Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High) {
+ CommitAsyncFlushesHigh();
+ } else {
+ uncommitted_ranges.clear();
+ committed_ranges.clear();
+ }
+}
+
+template <class P>
+void BufferCache<P>::PopAsyncFlushes() {}
+
+template <class P>
bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) {
const u64 page_end = Common::DivCeil(addr + size, PAGE_SIZE);
for (u64 page = addr >> PAGE_BITS; page < page_end;) {
@@ -642,6 +782,25 @@ bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) {
}
template <class P>
+bool BufferCache<P>::IsRegionCpuModified(VAddr addr, size_t size) {
+ const u64 page_end = Common::DivCeil(addr + size, PAGE_SIZE);
+ for (u64 page = addr >> PAGE_BITS; page < page_end;) {
+ const BufferId image_id = page_table[page];
+ if (!image_id) {
+ ++page;
+ continue;
+ }
+ Buffer& buffer = slot_buffers[image_id];
+ if (buffer.IsRegionCpuModified(addr, size)) {
+ return true;
+ }
+ const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes();
+ page = Common::DivCeil(end_addr, PAGE_SIZE);
+ }
+ return false;
+}
+
+template <class P>
void BufferCache<P>::BindHostIndexBuffer() {
Buffer& buffer = slot_buffers[index_buffer.buffer_id];
TouchBuffer(buffer);
@@ -649,7 +808,9 @@ void BufferCache<P>::BindHostIndexBuffer() {
const u32 size = index_buffer.size;
SynchronizeBuffer(buffer, index_buffer.cpu_addr, size);
if constexpr (HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT) {
- runtime.BindIndexBuffer(buffer, offset, size);
+ const u32 new_offset = offset + maxwell3d.regs.index_array.first *
+ maxwell3d.regs.index_array.FormatSizeInBytes();
+ runtime.BindIndexBuffer(buffer, new_offset, size);
} else {
runtime.BindIndexBuffer(maxwell3d.regs.draw.topology, maxwell3d.regs.index_array.format,
maxwell3d.regs.index_array.first, maxwell3d.regs.index_array.count,
@@ -863,7 +1024,7 @@ void BufferCache<P>::UpdateIndexBuffer() {
const GPUVAddr gpu_addr_end = index_array.EndAddress();
const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr_begin);
const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin);
- const u32 draw_size = index_array.count * index_array.FormatSizeInBytes();
+ const u32 draw_size = (index_array.count + index_array.first) * index_array.FormatSizeInBytes();
const u32 size = std::min(address_size, draw_size);
if (size == 0 || !cpu_addr) {
index_buffer = NULL_BINDING;
@@ -1010,16 +1171,16 @@ void BufferCache<P>::MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 s
Buffer& buffer = slot_buffers[buffer_id];
buffer.MarkRegionAsGpuModified(cpu_addr, size);
- const bool is_accuracy_high = Settings::IsGPULevelHigh();
+ const IntervalType base_interval{cpu_addr, cpu_addr + size};
+ common_ranges.add(base_interval);
+
+ const bool is_accuracy_high =
+ Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High;
const bool is_async = Settings::values.use_asynchronous_gpu_emulation.GetValue();
- if (!is_accuracy_high || !is_async) {
- return;
- }
- if (std::ranges::find(uncommitted_downloads, buffer_id) != uncommitted_downloads.end()) {
- // Already inserted
+ if (!is_async && !is_accuracy_high) {
return;
}
- uncommitted_downloads.push_back(buffer_id);
+ uncommitted_ranges.add(base_interval);
}
template <class P>
@@ -1103,7 +1264,6 @@ void BufferCache<P>::JoinOverlap(BufferId new_buffer_id, BufferId overlap_id,
if (!copies.empty()) {
runtime.CopyBuffer(slot_buffers[new_buffer_id], overlap, copies);
}
- ReplaceBufferDownloads(overlap_id, new_buffer_id);
DeleteBuffer(overlap_id);
}
@@ -1244,14 +1404,28 @@ void BufferCache<P>::DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 si
boost::container::small_vector<BufferCopy, 1> copies;
u64 total_size_bytes = 0;
u64 largest_copy = 0;
- buffer.ForEachDownloadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) {
- copies.push_back(BufferCopy{
- .src_offset = range_offset,
- .dst_offset = total_size_bytes,
- .size = range_size,
- });
- total_size_bytes += range_size;
- largest_copy = std::max(largest_copy, range_size);
+ buffer.ForEachDownloadRangeAndClear(cpu_addr, size, [&](u64 range_offset, u64 range_size) {
+ const VAddr buffer_addr = buffer.CpuAddr();
+ const auto add_download = [&](VAddr start, VAddr end) {
+ const u64 new_offset = start - buffer_addr;
+ const u64 new_size = end - start;
+ copies.push_back(BufferCopy{
+ .src_offset = new_offset,
+ .dst_offset = total_size_bytes,
+ .size = new_size,
+ });
+ // Align up to avoid cache conflicts
+ constexpr u64 align = 256ULL;
+ constexpr u64 mask = ~(align - 1ULL);
+ total_size_bytes += (new_size + align - 1) & mask;
+ largest_copy = std::max(largest_copy, new_size);
+ };
+
+ const VAddr start_address = buffer_addr + range_offset;
+ const VAddr end_address = start_address + range_size;
+ ForEachWrittenRange(start_address, range_size, add_download);
+ const IntervalType subtract_interval{start_address, end_address};
+ common_ranges.subtract(subtract_interval);
});
if (total_size_bytes == 0) {
return;
@@ -1316,18 +1490,6 @@ void BufferCache<P>::DeleteBuffer(BufferId buffer_id) {
}
template <class P>
-void BufferCache<P>::ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id) {
- const auto replace = [old_buffer_id, new_buffer_id](std::vector<BufferId>& buffers) {
- std::ranges::replace(buffers, old_buffer_id, new_buffer_id);
- if (auto it = std::ranges::find(buffers, new_buffer_id); it != buffers.end()) {
- buffers.erase(std::remove(it + 1, buffers.end(), new_buffer_id), buffers.end());
- }
- };
- replace(uncommitted_downloads);
- std::ranges::for_each(committed_downloads, replace);
-}
-
-template <class P>
void BufferCache<P>::NotifyBufferDeletion() {
if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
dirty_uniform_buffers.fill(~u32{0});
@@ -1349,15 +1511,9 @@ typename BufferCache<P>::Binding BufferCache<P>::StorageBufferBinding(GPUVAddr s
if (!cpu_addr || size == 0) {
return NULL_BINDING;
}
- // HACK(Rodrigo): This is the number of bytes bound in host beyond the guest API's range.
- // It exists due to some games like Astral Chain operate out of bounds.
- // Binding the whole map range would be technically correct, but games have large maps that make
- // this approach unaffordable for now.
- static constexpr u32 arbitrary_extra_bytes = 0xc000;
- const u32 bytes_to_map_end = static_cast<u32>(gpu_memory.BytesToMapEnd(gpu_addr));
const Binding binding{
.cpu_addr = *cpu_addr,
- .size = std::min(size + arbitrary_extra_bytes, bytes_to_map_end),
+ .size = size,
.buffer_id = BufferId{},
};
return binding;
diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp
index 8b33c04ab..8d28bd884 100644
--- a/src/video_core/dma_pusher.cpp
+++ b/src/video_core/dma_pusher.cpp
@@ -4,6 +4,7 @@
#include "common/cityhash.h"
#include "common/microprofile.h"
+#include "common/settings.h"
#include "core/core.h"
#include "core/memory.h"
#include "video_core/dma_pusher.h"
@@ -76,8 +77,13 @@ bool DmaPusher::Step() {
// Push buffer non-empty, read a word
command_headers.resize(command_list_header.size);
- gpu.MemoryManager().ReadBlockUnsafe(dma_get, command_headers.data(),
- command_list_header.size * sizeof(u32));
+ if (Settings::IsGPULevelHigh()) {
+ gpu.MemoryManager().ReadBlock(dma_get, command_headers.data(),
+ command_list_header.size * sizeof(u32));
+ } else {
+ gpu.MemoryManager().ReadBlockUnsafe(dma_get, command_headers.data(),
+ command_list_header.size * sizeof(u32));
+ }
}
for (std::size_t index = 0; index < command_headers.size();) {
const CommandHeader& command_header = command_headers[index];
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index 2ee980bab..24481952b 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -21,6 +21,10 @@ MaxwellDMA::MaxwellDMA(Core::System& system_, MemoryManager& memory_manager_)
MaxwellDMA::~MaxwellDMA() = default;
+void MaxwellDMA::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) {
+ rasterizer = rasterizer_;
+}
+
void MaxwellDMA::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
ASSERT_MSG(method < NUM_REGS, "Invalid MaxwellDMA register");
@@ -44,7 +48,6 @@ void MaxwellDMA::Launch() {
// TODO(Subv): Perform more research and implement all features of this engine.
const LaunchDMA& launch = regs.launch_dma;
- ASSERT(launch.remap_enable == 0);
ASSERT(launch.semaphore_type == LaunchDMA::SemaphoreType::NONE);
ASSERT(launch.interrupt_type == LaunchDMA::InterruptType::NONE);
ASSERT(launch.data_transfer_type == LaunchDMA::DataTransferType::NON_PIPELINED);
@@ -77,11 +80,29 @@ void MaxwellDMA::CopyPitchToPitch() {
// When `multi_line_enable` bit is disabled the copy is performed as if we were copying a 1D
// buffer of length `line_length_in`.
// Otherwise we copy a 2D image of dimensions (line_length_in, line_count).
+ auto& accelerate = rasterizer->AccessAccelerateDMA();
if (!regs.launch_dma.multi_line_enable) {
- memory_manager.CopyBlock(regs.offset_out, regs.offset_in, regs.line_length_in);
+ const bool is_buffer_clear = regs.launch_dma.remap_enable != 0 &&
+ regs.remap_const.dst_x == RemapConst::Swizzle::CONST_A;
+ // TODO: allow multisized components.
+ if (is_buffer_clear) {
+ ASSERT(regs.remap_const.component_size_minus_one == 3);
+ std::vector<u32> tmp_buffer(regs.line_length_in, regs.remap_consta_value);
+ memory_manager.WriteBlock(regs.offset_out, reinterpret_cast<u8*>(tmp_buffer.data()),
+ regs.line_length_in * sizeof(u32));
+ return;
+ }
+ UNIMPLEMENTED_IF(regs.launch_dma.remap_enable != 0);
+ if (!accelerate.BufferCopy(regs.offset_in, regs.offset_out, regs.line_length_in)) {
+ std::vector<u8> tmp_buffer(regs.line_length_in);
+ memory_manager.ReadBlockUnsafe(regs.offset_in, tmp_buffer.data(), regs.line_length_in);
+ memory_manager.WriteBlock(regs.offset_out, tmp_buffer.data(), regs.line_length_in);
+ }
return;
}
+ UNIMPLEMENTED_IF(regs.launch_dma.remap_enable != 0);
+
// Perform a line-by-line copy.
// We're going to take a subrect of size (line_length_in, line_count) from the source rectangle.
// There is no need to manually flush/invalidate the regions because CopyBlock does that for us.
@@ -105,6 +126,7 @@ void MaxwellDMA::CopyBlockLinearToPitch() {
}
// Deswizzle the input and copy it over.
+ UNIMPLEMENTED_IF(regs.launch_dma.remap_enable != 0);
const u32 bytes_per_pixel = regs.pitch_out / regs.line_length_in;
const Parameters& src_params = regs.src_params;
const u32 width = src_params.width;
@@ -134,6 +156,7 @@ void MaxwellDMA::CopyBlockLinearToPitch() {
void MaxwellDMA::CopyPitchToBlockLinear() {
UNIMPLEMENTED_IF_MSG(regs.dst_params.block_size.width != 0, "Block width is not one");
+ UNIMPLEMENTED_IF(regs.launch_dma.remap_enable != 0);
const auto& dst_params = regs.dst_params;
const u32 bytes_per_pixel = regs.pitch_in / regs.line_length_in;
@@ -156,13 +179,8 @@ void MaxwellDMA::CopyPitchToBlockLinear() {
write_buffer.resize(dst_size);
}
- if (Settings::IsGPULevelExtreme()) {
- memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size);
- memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size);
- } else {
- memory_manager.ReadBlockUnsafe(regs.offset_in, read_buffer.data(), src_size);
- memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size);
- }
+ memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size);
+ memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size);
// If the input is linear and the output is tiled, swizzle the input and copy it over.
if (regs.dst_params.block_size.depth > 0) {
diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h
index c77f02a22..4ed0d0996 100644
--- a/src/video_core/engines/maxwell_dma.h
+++ b/src/video_core/engines/maxwell_dma.h
@@ -21,8 +21,18 @@ namespace Tegra {
class MemoryManager;
}
+namespace VideoCore {
+class RasterizerInterface;
+}
+
namespace Tegra::Engines {
+class AccelerateDMAInterface {
+public:
+ /// Write the value to the register identified by method.
+ virtual bool BufferCopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount) = 0;
+};
+
/**
* This engine is known as gk104_copy. Documentation can be found in:
* https://github.com/NVIDIA/open-gpu-doc/blob/master/classes/dma-copy/clb0b5.h
@@ -187,6 +197,8 @@ public:
};
static_assert(sizeof(RemapConst) == 12);
+ void BindRasterizer(VideoCore::RasterizerInterface* rasterizer);
+
explicit MaxwellDMA(Core::System& system_, MemoryManager& memory_manager_);
~MaxwellDMA() override;
@@ -213,6 +225,7 @@ private:
Core::System& system;
MemoryManager& memory_manager;
+ VideoCore::RasterizerInterface* rasterizer;
std::vector<u8> read_buffer;
std::vector<u8> write_buffer;
@@ -240,7 +253,9 @@ private:
u32 pitch_out;
u32 line_length_in;
u32 line_count;
- u32 reserved06[0xb8];
+ u32 reserved06[0xb6];
+ u32 remap_consta_value;
+ u32 remap_constb_value;
RemapConst remap_const;
Parameters dst_params;
u32 reserved07[0x1];
diff --git a/src/video_core/fence_manager.h b/src/video_core/fence_manager.h
index f055b61e9..34dc6c596 100644
--- a/src/video_core/fence_manager.h
+++ b/src/video_core/fence_manager.h
@@ -8,6 +8,7 @@
#include <queue>
#include "common/common_types.h"
+#include "common/settings.h"
#include "core/core.h"
#include "video_core/delayed_destruction_ring.h"
#include "video_core/gpu.h"
@@ -53,6 +54,12 @@ public:
delayed_destruction_ring.Tick();
}
+ // Unlike other fences, this one doesn't
+ void SignalOrdering() {
+ std::scoped_lock lock{buffer_cache.mutex};
+ buffer_cache.AccumulateFlushes();
+ }
+
void SignalSemaphore(GPUVAddr addr, u32 value) {
TryReleasePendingFences();
const bool should_flush = ShouldFlush();
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 35cc561be..ff024f530 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -50,6 +50,7 @@ void GPU::BindRenderer(std::unique_ptr<VideoCore::RendererBase> renderer_) {
maxwell_3d->BindRasterizer(rasterizer);
fermi_2d->BindRasterizer(rasterizer);
kepler_compute->BindRasterizer(rasterizer);
+ maxwell_dma->BindRasterizer(rasterizer);
}
Engines::Maxwell3D& GPU::Maxwell3D() {
@@ -268,11 +269,13 @@ void GPU::CallPullerMethod(const MethodCall& method_call) {
case BufferMethods::SemaphoreAddressHigh:
case BufferMethods::SemaphoreAddressLow:
case BufferMethods::SemaphoreSequence:
- case BufferMethods::RefCnt:
case BufferMethods::UnkCacheFlush:
case BufferMethods::WrcacheFlush:
case BufferMethods::FenceValue:
break;
+ case BufferMethods::RefCnt:
+ rasterizer->SignalReference();
+ break;
case BufferMethods::FenceAction:
ProcessFenceActionMethod();
break;
diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h
index 0cec4225b..58014c1c3 100644
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -15,7 +15,10 @@
namespace Tegra {
class MemoryManager;
+namespace Engines {
+class AccelerateDMAInterface;
}
+} // namespace Tegra
namespace VideoCore {
@@ -63,6 +66,9 @@ public:
/// Signal a GPU based syncpoint as a fence
virtual void SignalSyncPoint(u32 value) = 0;
+ /// Signal a GPU based reference as point
+ virtual void SignalReference() = 0;
+
/// Release all pending fences.
virtual void ReleaseFences() = 0;
@@ -116,6 +122,8 @@ public:
return false;
}
+ [[nodiscard]] virtual Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() = 0;
+
/// Attempt to use a faster method to display the framebuffer to screen
[[nodiscard]] virtual bool AccelerateDisplay(const Tegra::FramebufferConfig& config,
VAddr framebuffer_addr, u32 pixel_stride) {
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 07ad0e205..82c84127a 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -171,7 +171,7 @@ RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window_, Tegra
buffer_cache_runtime(device),
buffer_cache(*this, maxwell3d, kepler_compute, gpu_memory, cpu_memory_, buffer_cache_runtime),
shader_cache(*this, emu_window_, gpu, maxwell3d, kepler_compute, gpu_memory, device),
- query_cache(*this, maxwell3d, gpu_memory),
+ query_cache(*this, maxwell3d, gpu_memory), accelerate_dma(buffer_cache),
fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache),
async_shaders(emu_window_) {
if (device.UseAsynchronousShaders()) {
@@ -634,6 +634,13 @@ void RasterizerOpenGL::SignalSyncPoint(u32 value) {
fence_manager.SignalSyncPoint(value);
}
+void RasterizerOpenGL::SignalReference() {
+ if (!gpu.IsAsync()) {
+ return;
+ }
+ fence_manager.SignalOrdering();
+}
+
void RasterizerOpenGL::ReleaseFences() {
if (!gpu.IsAsync()) {
return;
@@ -650,6 +657,7 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size) {
void RasterizerOpenGL::WaitForIdle() {
glMemoryBarrier(GL_ALL_BARRIER_BITS);
+ SignalReference();
}
void RasterizerOpenGL::FragmentBarrier() {
@@ -693,6 +701,10 @@ bool RasterizerOpenGL::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surf
return true;
}
+Tegra::Engines::AccelerateDMAInterface& RasterizerOpenGL::AccessAccelerateDMA() {
+ return accelerate_dma;
+}
+
bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
VAddr framebuffer_addr, u32 pixel_stride) {
if (framebuffer_addr == 0) {
@@ -1388,4 +1400,11 @@ void RasterizerOpenGL::EndTransformFeedback() {
glEndTransformFeedback();
}
+AccelerateDMA::AccelerateDMA(BufferCache& buffer_cache_) : buffer_cache{buffer_cache_} {}
+
+bool AccelerateDMA::BufferCopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount) {
+ std::scoped_lock lock{buffer_cache.mutex};
+ return buffer_cache.DMACopy(src_address, dest_address, amount);
+}
+
} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 482efed7a..ccee9ba33 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -19,6 +19,7 @@
#include "common/common_types.h"
#include "video_core/engines/const_buffer_info.h"
#include "video_core/engines/maxwell_3d.h"
+#include "video_core/engines/maxwell_dma.h"
#include "video_core/rasterizer_accelerated.h"
#include "video_core/rasterizer_interface.h"
#include "video_core/renderer_opengl/gl_buffer_cache.h"
@@ -58,6 +59,16 @@ struct BindlessSSBO {
};
static_assert(sizeof(BindlessSSBO) * CHAR_BIT == 128);
+class AccelerateDMA : public Tegra::Engines::AccelerateDMAInterface {
+public:
+ explicit AccelerateDMA(BufferCache& buffer_cache);
+
+ bool BufferCopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount) override;
+
+private:
+ BufferCache& buffer_cache;
+};
+
class RasterizerOpenGL : public VideoCore::RasterizerAccelerated {
public:
explicit RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_,
@@ -83,6 +94,7 @@ public:
void ModifyGPUMemory(GPUVAddr addr, u64 size) override;
void SignalSemaphore(GPUVAddr addr, u32 value) override;
void SignalSyncPoint(u32 value) override;
+ void SignalReference() override;
void ReleaseFences() override;
void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
void WaitForIdle() override;
@@ -93,6 +105,7 @@ public:
bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src,
const Tegra::Engines::Fermi2D::Surface& dst,
const Tegra::Engines::Fermi2D::Config& copy_config) override;
+ Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() override;
bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr,
u32 pixel_stride) override;
void LoadDiskResources(u64 title_id, std::stop_token stop_loading,
@@ -233,6 +246,7 @@ private:
BufferCache buffer_cache;
ShaderCacheOpenGL shader_cache;
QueryCache query_cache;
+ AccelerateDMA accelerate_dma;
FenceManagerOpenGL fence_manager;
VideoCommon::Shader::AsyncShaders async_shaders;
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index bd4d649cc..e378a5679 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -251,7 +251,7 @@ RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra
buffer_cache(*this, maxwell3d, kepler_compute, gpu_memory, cpu_memory_, buffer_cache_runtime),
pipeline_cache(*this, gpu, maxwell3d, kepler_compute, gpu_memory, device, scheduler,
descriptor_pool, update_descriptor_queue),
- query_cache{*this, maxwell3d, gpu_memory, device, scheduler},
+ query_cache{*this, maxwell3d, gpu_memory, device, scheduler}, accelerate_dma{buffer_cache},
fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache, device, scheduler),
wfi_event(device.GetLogical().CreateEvent()), async_shaders(emu_window_) {
scheduler.SetQueryCache(query_cache);
@@ -580,6 +580,13 @@ void RasterizerVulkan::SignalSyncPoint(u32 value) {
fence_manager.SignalSyncPoint(value);
}
+void RasterizerVulkan::SignalReference() {
+ if (!gpu.IsAsync()) {
+ return;
+ }
+ fence_manager.SignalOrdering();
+}
+
void RasterizerVulkan::ReleaseFences() {
if (!gpu.IsAsync()) {
return;
@@ -612,6 +619,7 @@ void RasterizerVulkan::WaitForIdle() {
cmdbuf.SetEvent(event, flags);
cmdbuf.WaitEvents(event, flags, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, {}, {}, {});
});
+ SignalReference();
}
void RasterizerVulkan::FragmentBarrier() {
@@ -652,6 +660,10 @@ bool RasterizerVulkan::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surf
return true;
}
+Tegra::Engines::AccelerateDMAInterface& RasterizerVulkan::AccessAccelerateDMA() {
+ return accelerate_dma;
+}
+
bool RasterizerVulkan::AccelerateDisplay(const Tegra::FramebufferConfig& config,
VAddr framebuffer_addr, u32 pixel_stride) {
if (!framebuffer_addr) {
@@ -690,6 +702,13 @@ void RasterizerVulkan::FlushWork() {
draw_counter = 0;
}
+AccelerateDMA::AccelerateDMA(BufferCache& buffer_cache_) : buffer_cache{buffer_cache_} {}
+
+bool AccelerateDMA::BufferCopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount) {
+ std::scoped_lock lock{buffer_cache.mutex};
+ return buffer_cache.DMACopy(src_address, dest_address, amount);
+}
+
void RasterizerVulkan::SetupShaderDescriptors(
const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders, bool is_indexed) {
image_view_indices.clear();
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index 41459c5c5..3a78de258 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -13,6 +13,7 @@
#include <boost/container/static_vector.hpp>
#include "common/common_types.h"
+#include "video_core/engines/maxwell_dma.h"
#include "video_core/rasterizer_accelerated.h"
#include "video_core/rasterizer_interface.h"
#include "video_core/renderer_vulkan/blit_image.h"
@@ -49,6 +50,16 @@ struct VKScreenInfo;
class StateTracker;
+class AccelerateDMA : public Tegra::Engines::AccelerateDMAInterface {
+public:
+ explicit AccelerateDMA(BufferCache& buffer_cache);
+
+ bool BufferCopy(GPUVAddr start_address, GPUVAddr end_address, u64 amount) override;
+
+private:
+ BufferCache& buffer_cache;
+};
+
class RasterizerVulkan final : public VideoCore::RasterizerAccelerated {
public:
explicit RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_,
@@ -75,6 +86,7 @@ public:
void ModifyGPUMemory(GPUVAddr addr, u64 size) override;
void SignalSemaphore(GPUVAddr addr, u32 value) override;
void SignalSyncPoint(u32 value) override;
+ void SignalReference() override;
void ReleaseFences() override;
void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
void WaitForIdle() override;
@@ -85,6 +97,7 @@ public:
bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src,
const Tegra::Engines::Fermi2D::Surface& dst,
const Tegra::Engines::Fermi2D::Config& copy_config) override;
+ Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() override;
bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr,
u32 pixel_stride) override;
@@ -185,6 +198,7 @@ private:
BufferCache buffer_cache;
VKPipelineCache pipeline_cache;
VKQueryCache query_cache;
+ AccelerateDMA accelerate_dma;
VKFenceManager fence_manager;
vk::Event wfi_event;
diff --git a/src/video_core/texture_cache/types.h b/src/video_core/texture_cache/types.h
index 9fbdc1ac6..47a11cb2f 100644
--- a/src/video_core/texture_cache/types.h
+++ b/src/video_core/texture_cache/types.h
@@ -133,8 +133,8 @@ struct BufferImageCopy {
};
struct BufferCopy {
- size_t src_offset;
- size_t dst_offset;
+ u64 src_offset;
+ u64 dst_offset;
size_t size;
};