diff options
Diffstat (limited to 'src/video_core')
278 files changed, 21565 insertions, 14923 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index d85f1e9d1..9b931976a 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -1,12 +1,30 @@ add_subdirectory(host_shaders) add_library(video_core STATIC - buffer_cache/buffer_block.h + buffer_cache/buffer_base.h + buffer_cache/buffer_cache.cpp buffer_cache/buffer_cache.h - buffer_cache/map_interval.cpp - buffer_cache/map_interval.h + cdma_pusher.cpp + cdma_pusher.h + command_classes/codecs/codec.cpp + command_classes/codecs/codec.h + command_classes/codecs/h264.cpp + command_classes/codecs/h264.h + command_classes/codecs/vp9.cpp + command_classes/codecs/vp9.h + command_classes/codecs/vp9_types.h + command_classes/host1x.cpp + command_classes/host1x.h + command_classes/nvdec.cpp + command_classes/nvdec.h + command_classes/nvdec_common.h + command_classes/sync_manager.cpp + command_classes/sync_manager.h + command_classes/vic.cpp + command_classes/vic.h compatible_formats.cpp compatible_formats.h + delayed_destruction_ring.h dirty_flags.cpp dirty_flags.h dma_pusher.cpp @@ -29,6 +47,7 @@ add_library(video_core STATIC engines/shader_bytecode.h engines/shader_header.h engines/shader_type.h + framebuffer_config.h macro/macro.cpp macro/macro.h macro/macro_hle.cpp @@ -40,18 +59,12 @@ add_library(video_core STATIC fence_manager.h gpu.cpp gpu.h - gpu_asynch.cpp - gpu_asynch.h - gpu_synch.cpp - gpu_synch.h gpu_thread.cpp gpu_thread.h guest_driver.cpp guest_driver.h memory_manager.cpp memory_manager.h - morton.cpp - morton.h query_cache.h rasterizer_accelerated.cpp rasterizer_accelerated.h @@ -66,14 +79,10 @@ add_library(video_core STATIC renderer_opengl/gl_device.h renderer_opengl/gl_fence_manager.cpp renderer_opengl/gl_fence_manager.h - renderer_opengl/gl_framebuffer_cache.cpp - renderer_opengl/gl_framebuffer_cache.h renderer_opengl/gl_rasterizer.cpp renderer_opengl/gl_rasterizer.h renderer_opengl/gl_resource_manager.cpp renderer_opengl/gl_resource_manager.h - renderer_opengl/gl_sampler_cache.cpp - renderer_opengl/gl_sampler_cache.h renderer_opengl/gl_shader_cache.cpp renderer_opengl/gl_shader_cache.h renderer_opengl/gl_shader_decompiler.cpp @@ -95,10 +104,58 @@ add_library(video_core STATIC renderer_opengl/maxwell_to_gl.h renderer_opengl/renderer_opengl.cpp renderer_opengl/renderer_opengl.h - renderer_opengl/utils.cpp - renderer_opengl/utils.h - sampler_cache.cpp - sampler_cache.h + renderer_opengl/util_shaders.cpp + renderer_opengl/util_shaders.h + renderer_vulkan/blit_image.cpp + renderer_vulkan/blit_image.h + renderer_vulkan/fixed_pipeline_state.cpp + renderer_vulkan/fixed_pipeline_state.h + renderer_vulkan/maxwell_to_vk.cpp + renderer_vulkan/maxwell_to_vk.h + renderer_vulkan/renderer_vulkan.h + renderer_vulkan/renderer_vulkan.cpp + renderer_vulkan/vk_blit_screen.cpp + renderer_vulkan/vk_blit_screen.h + renderer_vulkan/vk_buffer_cache.cpp + renderer_vulkan/vk_buffer_cache.h + renderer_vulkan/vk_command_pool.cpp + renderer_vulkan/vk_command_pool.h + renderer_vulkan/vk_compute_pass.cpp + renderer_vulkan/vk_compute_pass.h + renderer_vulkan/vk_compute_pipeline.cpp + renderer_vulkan/vk_compute_pipeline.h + renderer_vulkan/vk_descriptor_pool.cpp + renderer_vulkan/vk_descriptor_pool.h + renderer_vulkan/vk_fence_manager.cpp + renderer_vulkan/vk_fence_manager.h + renderer_vulkan/vk_graphics_pipeline.cpp + renderer_vulkan/vk_graphics_pipeline.h + renderer_vulkan/vk_master_semaphore.cpp + renderer_vulkan/vk_master_semaphore.h + renderer_vulkan/vk_pipeline_cache.cpp + renderer_vulkan/vk_pipeline_cache.h + renderer_vulkan/vk_query_cache.cpp + renderer_vulkan/vk_query_cache.h + renderer_vulkan/vk_rasterizer.cpp + renderer_vulkan/vk_rasterizer.h + renderer_vulkan/vk_resource_pool.cpp + renderer_vulkan/vk_resource_pool.h + renderer_vulkan/vk_scheduler.cpp + renderer_vulkan/vk_scheduler.h + renderer_vulkan/vk_shader_decompiler.cpp + renderer_vulkan/vk_shader_decompiler.h + renderer_vulkan/vk_shader_util.cpp + renderer_vulkan/vk_shader_util.h + renderer_vulkan/vk_staging_buffer_pool.cpp + renderer_vulkan/vk_staging_buffer_pool.h + renderer_vulkan/vk_state_tracker.cpp + renderer_vulkan/vk_state_tracker.h + renderer_vulkan/vk_swapchain.cpp + renderer_vulkan/vk_swapchain.h + renderer_vulkan/vk_texture_cache.cpp + renderer_vulkan/vk_texture_cache.h + renderer_vulkan/vk_update_descriptor.cpp + renderer_vulkan/vk_update_descriptor.h shader_cache.h shader_notify.cpp shader_notify.h @@ -155,119 +212,104 @@ add_library(video_core STATIC shader/transform_feedback.h surface.cpp surface.h + texture_cache/accelerated_swizzle.cpp + texture_cache/accelerated_swizzle.h + texture_cache/decode_bc4.cpp + texture_cache/decode_bc4.h + texture_cache/descriptor_table.h + texture_cache/formatter.cpp + texture_cache/formatter.h texture_cache/format_lookup_table.cpp texture_cache/format_lookup_table.h - texture_cache/surface_base.cpp - texture_cache/surface_base.h - texture_cache/surface_params.cpp - texture_cache/surface_params.h - texture_cache/surface_view.cpp - texture_cache/surface_view.h + texture_cache/image_base.cpp + texture_cache/image_base.h + texture_cache/image_info.cpp + texture_cache/image_info.h + texture_cache/image_view_base.cpp + texture_cache/image_view_base.h + texture_cache/image_view_info.cpp + texture_cache/image_view_info.h + texture_cache/render_targets.h + texture_cache/samples_helper.h + texture_cache/slot_vector.h texture_cache/texture_cache.h + texture_cache/types.h + texture_cache/util.cpp + texture_cache/util.h textures/astc.cpp textures/astc.h - textures/convert.cpp - textures/convert.h textures/decoders.cpp textures/decoders.h textures/texture.cpp textures/texture.h video_core.cpp video_core.h + vulkan_common/vulkan_debug_callback.cpp + vulkan_common/vulkan_debug_callback.h + vulkan_common/vulkan_device.cpp + vulkan_common/vulkan_device.h + vulkan_common/vulkan_instance.cpp + vulkan_common/vulkan_instance.h + vulkan_common/vulkan_library.cpp + vulkan_common/vulkan_library.h + vulkan_common/vulkan_memory_allocator.cpp + vulkan_common/vulkan_memory_allocator.h + vulkan_common/vulkan_surface.cpp + vulkan_common/vulkan_surface.h + vulkan_common/vulkan_wrapper.cpp + vulkan_common/vulkan_wrapper.h + vulkan_common/nsight_aftermath_tracker.cpp + vulkan_common/nsight_aftermath_tracker.h ) -if (ENABLE_VULKAN) - target_sources(video_core PRIVATE - renderer_vulkan/fixed_pipeline_state.cpp - renderer_vulkan/fixed_pipeline_state.h - renderer_vulkan/maxwell_to_vk.cpp - renderer_vulkan/maxwell_to_vk.h - renderer_vulkan/nsight_aftermath_tracker.cpp - renderer_vulkan/nsight_aftermath_tracker.h - renderer_vulkan/renderer_vulkan.h - renderer_vulkan/renderer_vulkan.cpp - renderer_vulkan/vk_blit_screen.cpp - renderer_vulkan/vk_blit_screen.h - renderer_vulkan/vk_buffer_cache.cpp - renderer_vulkan/vk_buffer_cache.h - renderer_vulkan/vk_compute_pass.cpp - renderer_vulkan/vk_compute_pass.h - renderer_vulkan/vk_compute_pipeline.cpp - renderer_vulkan/vk_compute_pipeline.h - renderer_vulkan/vk_descriptor_pool.cpp - renderer_vulkan/vk_descriptor_pool.h - renderer_vulkan/vk_device.cpp - renderer_vulkan/vk_device.h - renderer_vulkan/vk_fence_manager.cpp - renderer_vulkan/vk_fence_manager.h - renderer_vulkan/vk_graphics_pipeline.cpp - renderer_vulkan/vk_graphics_pipeline.h - renderer_vulkan/vk_image.cpp - renderer_vulkan/vk_image.h - renderer_vulkan/vk_memory_manager.cpp - renderer_vulkan/vk_memory_manager.h - renderer_vulkan/vk_pipeline_cache.cpp - renderer_vulkan/vk_pipeline_cache.h - renderer_vulkan/vk_query_cache.cpp - renderer_vulkan/vk_query_cache.h - renderer_vulkan/vk_rasterizer.cpp - renderer_vulkan/vk_rasterizer.h - renderer_vulkan/vk_renderpass_cache.cpp - renderer_vulkan/vk_renderpass_cache.h - renderer_vulkan/vk_resource_manager.cpp - renderer_vulkan/vk_resource_manager.h - renderer_vulkan/vk_sampler_cache.cpp - renderer_vulkan/vk_sampler_cache.h - renderer_vulkan/vk_scheduler.cpp - renderer_vulkan/vk_scheduler.h - renderer_vulkan/vk_shader_decompiler.cpp - renderer_vulkan/vk_shader_decompiler.h - renderer_vulkan/vk_shader_util.cpp - renderer_vulkan/vk_shader_util.h - renderer_vulkan/vk_staging_buffer_pool.cpp - renderer_vulkan/vk_staging_buffer_pool.h - renderer_vulkan/vk_state_tracker.cpp - renderer_vulkan/vk_state_tracker.h - renderer_vulkan/vk_stream_buffer.cpp - renderer_vulkan/vk_stream_buffer.h - renderer_vulkan/vk_swapchain.cpp - renderer_vulkan/vk_swapchain.h - renderer_vulkan/vk_texture_cache.cpp - renderer_vulkan/vk_texture_cache.h - renderer_vulkan/vk_update_descriptor.cpp - renderer_vulkan/vk_update_descriptor.h - renderer_vulkan/wrapper.cpp - renderer_vulkan/wrapper.h - ) -endif() - create_target_directory_groups(video_core) target_link_libraries(video_core PUBLIC common core) target_link_libraries(video_core PRIVATE glad xbyak) +if (YUZU_USE_BUNDLED_FFMPEG AND NOT WIN32) + add_dependencies(video_core ffmpeg-build) +endif() + +target_include_directories(video_core PRIVATE ${FFmpeg_INCLUDE_DIR}) +target_link_libraries(video_core PRIVATE ${FFmpeg_LIBRARIES}) + add_dependencies(video_core host_shaders) target_include_directories(video_core PRIVATE ${HOST_SHADERS_INCLUDE}) - -if (ENABLE_VULKAN) - target_include_directories(video_core PRIVATE sirit ../../externals/Vulkan-Headers/include) - target_compile_definitions(video_core PRIVATE HAS_VULKAN) - target_link_libraries(video_core PRIVATE sirit) -endif() +target_include_directories(video_core PRIVATE sirit ../../externals/Vulkan-Headers/include) +target_link_libraries(video_core PRIVATE sirit) if (ENABLE_NSIGHT_AFTERMATH) if (NOT DEFINED ENV{NSIGHT_AFTERMATH_SDK}) - message(ERROR "Environment variable NSIGHT_AFTERMATH_SDK has to be provided") + message(FATAL_ERROR "Environment variable NSIGHT_AFTERMATH_SDK has to be provided") endif() if (NOT WIN32) - message(ERROR "Nsight Aftermath doesn't support non-Windows platforms") + message(FATAL_ERROR "Nsight Aftermath doesn't support non-Windows platforms") endif() target_compile_definitions(video_core PRIVATE HAS_NSIGHT_AFTERMATH) target_include_directories(video_core PRIVATE "$ENV{NSIGHT_AFTERMATH_SDK}/include") endif() if (MSVC) - target_compile_options(video_core PRIVATE /we4267) + target_compile_options(video_core PRIVATE + /we4267 # 'var' : conversion from 'size_t' to 'type', possible loss of data + /we4456 # Declaration of 'identifier' hides previous local declaration + /we4457 # Declaration of 'identifier' hides function parameter + /we4458 # Declaration of 'identifier' hides class member + /we4459 # Declaration of 'identifier' hides global declaration + /we4715 # 'function' : not all control paths return a value + ) else() - target_compile_options(video_core PRIVATE -Werror=conversion -Wno-error=sign-conversion) + target_compile_options(video_core PRIVATE + -Werror=conversion + -Wno-error=sign-conversion + -Werror=pessimizing-move + -Werror=redundant-move + -Werror=shadow + -Werror=type-limits + + $<$<CXX_COMPILER_ID:GNU>:-Werror=class-memaccess> + $<$<CXX_COMPILER_ID:GNU>:-Werror=unused-but-set-parameter> + $<$<CXX_COMPILER_ID:GNU>:-Werror=unused-but-set-variable> + ) endif() diff --git a/src/video_core/buffer_cache/buffer_base.h b/src/video_core/buffer_cache/buffer_base.h new file mode 100644 index 000000000..0c00ae280 --- /dev/null +++ b/src/video_core/buffer_cache/buffer_base.h @@ -0,0 +1,590 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <algorithm> +#include <bit> +#include <limits> +#include <utility> + +#include "common/alignment.h" +#include "common/common_funcs.h" +#include "common/common_types.h" +#include "common/div_ceil.h" +#include "core/memory.h" + +namespace VideoCommon { + +enum class BufferFlagBits { + Picked = 1 << 0, + CachedWrites = 1 << 1, +}; +DECLARE_ENUM_FLAG_OPERATORS(BufferFlagBits) + +/// Tag for creating null buffers with no storage or size +struct NullBufferParams {}; + +/** + * Range tracking buffer container. + * + * It keeps track of the modified CPU and GPU ranges on a CPU page granularity, notifying the given + * rasterizer about state changes in the tracking behavior of the buffer. + * + * The buffer size and address is forcefully aligned to CPU page boundaries. + */ +template <class RasterizerInterface> +class BufferBase { + static constexpr u64 PAGES_PER_WORD = 64; + static constexpr u64 BYTES_PER_PAGE = Core::Memory::PAGE_SIZE; + static constexpr u64 BYTES_PER_WORD = PAGES_PER_WORD * BYTES_PER_PAGE; + + /// Vector tracking modified pages tightly packed with small vector optimization + union WordsArray { + /// Returns the pointer to the words state + [[nodiscard]] const u64* Pointer(bool is_short) const noexcept { + return is_short ? &stack : heap; + } + + /// Returns the pointer to the words state + [[nodiscard]] u64* Pointer(bool is_short) noexcept { + return is_short ? &stack : heap; + } + + u64 stack = 0; ///< Small buffers storage + u64* heap; ///< Not-small buffers pointer to the storage + }; + + struct Words { + explicit Words() = default; + explicit Words(u64 size_bytes_) : size_bytes{size_bytes_} { + if (IsShort()) { + cpu.stack = ~u64{0}; + gpu.stack = 0; + cached_cpu.stack = 0; + untracked.stack = ~u64{0}; + } else { + // Share allocation between CPU and GPU pages and set their default values + const size_t num_words = NumWords(); + u64* const alloc = new u64[num_words * 4]; + cpu.heap = alloc; + gpu.heap = alloc + num_words; + cached_cpu.heap = alloc + num_words * 2; + untracked.heap = alloc + num_words * 3; + std::fill_n(cpu.heap, num_words, ~u64{0}); + std::fill_n(gpu.heap, num_words, 0); + std::fill_n(cached_cpu.heap, num_words, 0); + std::fill_n(untracked.heap, num_words, ~u64{0}); + } + // Clean up tailing bits + const u64 last_word_size = size_bytes % BYTES_PER_WORD; + const u64 last_local_page = Common::DivCeil(last_word_size, BYTES_PER_PAGE); + const u64 shift = (PAGES_PER_WORD - last_local_page) % PAGES_PER_WORD; + const u64 last_word = (~u64{0} << shift) >> shift; + cpu.Pointer(IsShort())[NumWords() - 1] = last_word; + untracked.Pointer(IsShort())[NumWords() - 1] = last_word; + } + + ~Words() { + Release(); + } + + Words& operator=(Words&& rhs) noexcept { + Release(); + size_bytes = rhs.size_bytes; + cpu = rhs.cpu; + gpu = rhs.gpu; + cached_cpu = rhs.cached_cpu; + untracked = rhs.untracked; + rhs.cpu.heap = nullptr; + return *this; + } + + Words(Words&& rhs) noexcept + : size_bytes{rhs.size_bytes}, cpu{rhs.cpu}, gpu{rhs.gpu}, + cached_cpu{rhs.cached_cpu}, untracked{rhs.untracked} { + rhs.cpu.heap = nullptr; + } + + Words& operator=(const Words&) = delete; + Words(const Words&) = delete; + + /// Returns true when the buffer fits in the small vector optimization + [[nodiscard]] bool IsShort() const noexcept { + return size_bytes <= BYTES_PER_WORD; + } + + /// Returns the number of words of the buffer + [[nodiscard]] size_t NumWords() const noexcept { + return Common::DivCeil(size_bytes, BYTES_PER_WORD); + } + + /// Release buffer resources + void Release() { + if (!IsShort()) { + // CPU written words is the base for the heap allocation + delete[] cpu.heap; + } + } + + u64 size_bytes = 0; + WordsArray cpu; + WordsArray gpu; + WordsArray cached_cpu; + WordsArray untracked; + }; + + enum class Type { + CPU, + GPU, + CachedCPU, + Untracked, + }; + +public: + explicit BufferBase(RasterizerInterface& rasterizer_, VAddr cpu_addr_, u64 size_bytes) + : rasterizer{&rasterizer_}, cpu_addr{Common::AlignDown(cpu_addr_, BYTES_PER_PAGE)}, + words(Common::AlignUp(size_bytes + (cpu_addr_ - cpu_addr), BYTES_PER_PAGE)) {} + + explicit BufferBase(NullBufferParams) {} + + BufferBase& operator=(const BufferBase&) = delete; + BufferBase(const BufferBase&) = delete; + + BufferBase& operator=(BufferBase&&) = default; + BufferBase(BufferBase&&) = default; + + /// Returns the inclusive CPU modified range in a begin end pair + [[nodiscard]] std::pair<u64, u64> ModifiedCpuRegion(VAddr query_cpu_addr, + u64 query_size) const noexcept { + const u64 offset = query_cpu_addr - cpu_addr; + return ModifiedRegion<Type::CPU>(offset, query_size); + } + + /// Returns the inclusive GPU modified range in a begin end pair + [[nodiscard]] std::pair<u64, u64> ModifiedGpuRegion(VAddr query_cpu_addr, + u64 query_size) const noexcept { + const u64 offset = query_cpu_addr - cpu_addr; + return ModifiedRegion<Type::GPU>(offset, query_size); + } + + /// Returns true if a region has been modified from the CPU + [[nodiscard]] bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept { + const u64 offset = query_cpu_addr - cpu_addr; + return IsRegionModified<Type::CPU>(offset, query_size); + } + + /// Returns true if a region has been modified from the GPU + [[nodiscard]] bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept { + const u64 offset = query_cpu_addr - cpu_addr; + return IsRegionModified<Type::GPU>(offset, query_size); + } + + /// Mark region as CPU modified, notifying the rasterizer about this change + void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) { + ChangeRegionState<Type::CPU, true>(dirty_cpu_addr, size); + } + + /// Unmark region as CPU modified, notifying the rasterizer about this change + void UnmarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) { + ChangeRegionState<Type::CPU, false>(dirty_cpu_addr, size); + } + + /// Mark region as modified from the host GPU + void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept { + ChangeRegionState<Type::GPU, true>(dirty_cpu_addr, size); + } + + /// Unmark region as modified from the host GPU + void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept { + ChangeRegionState<Type::GPU, false>(dirty_cpu_addr, size); + } + + /// Mark region as modified from the CPU + /// but don't mark it as modified until FlusHCachedWrites is called. + void CachedCpuWrite(VAddr dirty_cpu_addr, u64 size) { + flags |= BufferFlagBits::CachedWrites; + ChangeRegionState<Type::CachedCPU, true>(dirty_cpu_addr, size); + } + + /// Flushes cached CPU writes, and notify the rasterizer about the deltas + void FlushCachedWrites() noexcept { + flags &= ~BufferFlagBits::CachedWrites; + const u64 num_words = NumWords(); + const u64* const cached_words = Array<Type::CachedCPU>(); + u64* const untracked_words = Array<Type::Untracked>(); + u64* const cpu_words = Array<Type::CPU>(); + for (u64 word_index = 0; word_index < num_words; ++word_index) { + const u64 cached_bits = cached_words[word_index]; + NotifyRasterizer<false>(word_index, untracked_words[word_index], cached_bits); + untracked_words[word_index] |= cached_bits; + cpu_words[word_index] |= cached_bits; + } + } + + /// Call 'func' for each CPU modified range and unmark those pages as CPU modified + template <typename Func> + void ForEachUploadRange(VAddr query_cpu_range, u64 size, Func&& func) { + ForEachModifiedRange<Type::CPU>(query_cpu_range, size, func); + } + + /// Call 'func' for each GPU modified range and unmark those pages as GPU modified + template <typename Func> + void ForEachDownloadRange(VAddr query_cpu_range, u64 size, Func&& func) { + ForEachModifiedRange<Type::GPU>(query_cpu_range, size, func); + } + + /// Call 'func' for each GPU modified range and unmark those pages as GPU modified + template <typename Func> + void ForEachDownloadRange(Func&& func) { + ForEachModifiedRange<Type::GPU>(cpu_addr, SizeBytes(), func); + } + + /// Mark buffer as picked + void Pick() noexcept { + flags |= BufferFlagBits::Picked; + } + + /// Unmark buffer as picked + void Unpick() noexcept { + flags &= ~BufferFlagBits::Picked; + } + + /// Increases the likeliness of this being a stream buffer + void IncreaseStreamScore(int score) noexcept { + stream_score += score; + } + + /// Returns the likeliness of this being a stream buffer + [[nodiscard]] int StreamScore() const noexcept { + return stream_score; + } + + /// Returns true when vaddr -> vaddr+size is fully contained in the buffer + [[nodiscard]] bool IsInBounds(VAddr addr, u64 size) const noexcept { + return addr >= cpu_addr && addr + size <= cpu_addr + SizeBytes(); + } + + /// Returns true if the buffer has been marked as picked + [[nodiscard]] bool IsPicked() const noexcept { + return True(flags & BufferFlagBits::Picked); + } + + /// Returns true when the buffer has pending cached writes + [[nodiscard]] bool HasCachedWrites() const noexcept { + return True(flags & BufferFlagBits::CachedWrites); + } + + /// Returns the base CPU address of the buffer + [[nodiscard]] VAddr CpuAddr() const noexcept { + return cpu_addr; + } + + /// Returns the offset relative to the given CPU address + /// @pre IsInBounds returns true + [[nodiscard]] u32 Offset(VAddr other_cpu_addr) const noexcept { + return static_cast<u32>(other_cpu_addr - cpu_addr); + } + + /// Returns the size in bytes of the buffer + [[nodiscard]] u64 SizeBytes() const noexcept { + return words.size_bytes; + } + +private: + template <Type type> + u64* Array() noexcept { + if constexpr (type == Type::CPU) { + return words.cpu.Pointer(IsShort()); + } else if constexpr (type == Type::GPU) { + return words.gpu.Pointer(IsShort()); + } else if constexpr (type == Type::CachedCPU) { + return words.cached_cpu.Pointer(IsShort()); + } else if constexpr (type == Type::Untracked) { + return words.untracked.Pointer(IsShort()); + } + } + + template <Type type> + const u64* Array() const noexcept { + if constexpr (type == Type::CPU) { + return words.cpu.Pointer(IsShort()); + } else if constexpr (type == Type::GPU) { + return words.gpu.Pointer(IsShort()); + } else if constexpr (type == Type::CachedCPU) { + return words.cached_cpu.Pointer(IsShort()); + } else if constexpr (type == Type::Untracked) { + return words.untracked.Pointer(IsShort()); + } + } + + /** + * Change the state of a range of pages + * + * @param dirty_addr Base address to mark or unmark as modified + * @param size Size in bytes to mark or unmark as modified + */ + template <Type type, bool enable> + void ChangeRegionState(u64 dirty_addr, s64 size) noexcept(type == Type::GPU) { + const s64 difference = dirty_addr - cpu_addr; + const u64 offset = std::max<s64>(difference, 0); + size += std::min<s64>(difference, 0); + if (offset >= SizeBytes() || size < 0) { + return; + } + u64* const untracked_words = Array<Type::Untracked>(); + u64* const state_words = Array<type>(); + const u64 offset_end = std::min(offset + size, SizeBytes()); + const u64 begin_page_index = offset / BYTES_PER_PAGE; + const u64 begin_word_index = begin_page_index / PAGES_PER_WORD; + const u64 end_page_index = Common::DivCeil(offset_end, BYTES_PER_PAGE); + const u64 end_word_index = Common::DivCeil(end_page_index, PAGES_PER_WORD); + u64 page_index = begin_page_index % PAGES_PER_WORD; + u64 word_index = begin_word_index; + while (word_index < end_word_index) { + const u64 next_word_first_page = (word_index + 1) * PAGES_PER_WORD; + const u64 left_offset = + std::min(next_word_first_page - end_page_index, PAGES_PER_WORD) % PAGES_PER_WORD; + const u64 right_offset = page_index; + u64 bits = ~u64{0}; + bits = (bits >> right_offset) << right_offset; + bits = (bits << left_offset) >> left_offset; + if constexpr (type == Type::CPU || type == Type::CachedCPU) { + NotifyRasterizer<!enable>(word_index, untracked_words[word_index], bits); + } + if constexpr (enable) { + state_words[word_index] |= bits; + if constexpr (type == Type::CPU || type == Type::CachedCPU) { + untracked_words[word_index] |= bits; + } + } else { + state_words[word_index] &= ~bits; + if constexpr (type == Type::CPU || type == Type::CachedCPU) { + untracked_words[word_index] &= ~bits; + } + } + page_index = 0; + ++word_index; + } + } + + /** + * Notify rasterizer about changes in the CPU tracking state of a word in the buffer + * + * @param word_index Index to the word to notify to the rasterizer + * @param current_bits Current state of the word + * @param new_bits New state of the word + * + * @tparam add_to_rasterizer True when the rasterizer should start tracking the new pages + */ + template <bool add_to_rasterizer> + void NotifyRasterizer(u64 word_index, u64 current_bits, u64 new_bits) const { + u64 changed_bits = (add_to_rasterizer ? current_bits : ~current_bits) & new_bits; + VAddr addr = cpu_addr + word_index * BYTES_PER_WORD; + while (changed_bits != 0) { + const int empty_bits = std::countr_zero(changed_bits); + addr += empty_bits * BYTES_PER_PAGE; + changed_bits >>= empty_bits; + + const u32 continuous_bits = std::countr_one(changed_bits); + const u64 size = continuous_bits * BYTES_PER_PAGE; + const VAddr begin_addr = addr; + addr += size; + changed_bits = continuous_bits < PAGES_PER_WORD ? (changed_bits >> continuous_bits) : 0; + rasterizer->UpdatePagesCachedCount(begin_addr, size, add_to_rasterizer ? 1 : -1); + } + } + + /** + * Loop over each page in the given range, turn off those bits and notify the rasterizer if + * needed. Call the given function on each turned off range. + * + * @param query_cpu_range Base CPU address to loop over + * @param size Size in bytes of the CPU range to loop over + * @param func Function to call for each turned off region + */ + template <Type type, typename Func> + void ForEachModifiedRange(VAddr query_cpu_range, s64 size, Func&& func) { + static_assert(type != Type::Untracked); + + const s64 difference = query_cpu_range - cpu_addr; + const u64 query_begin = std::max<s64>(difference, 0); + size += std::min<s64>(difference, 0); + if (query_begin >= SizeBytes() || size < 0) { + return; + } + u64* const untracked_words = Array<Type::Untracked>(); + u64* const state_words = Array<type>(); + const u64 query_end = query_begin + std::min(static_cast<u64>(size), SizeBytes()); + u64* const words_begin = state_words + query_begin / BYTES_PER_WORD; + u64* const words_end = state_words + Common::DivCeil(query_end, BYTES_PER_WORD); + + const auto modified = [](u64 word) { return word != 0; }; + const auto first_modified_word = std::find_if(words_begin, words_end, modified); + if (first_modified_word == words_end) { + // Exit early when the buffer is not modified + return; + } + const auto last_modified_word = std::find_if_not(first_modified_word, words_end, modified); + + const u64 word_index_begin = std::distance(state_words, first_modified_word); + const u64 word_index_end = std::distance(state_words, last_modified_word); + + const unsigned local_page_begin = std::countr_zero(*first_modified_word); + const unsigned local_page_end = + static_cast<unsigned>(PAGES_PER_WORD) - std::countl_zero(last_modified_word[-1]); + const u64 word_page_begin = word_index_begin * PAGES_PER_WORD; + const u64 word_page_end = (word_index_end - 1) * PAGES_PER_WORD; + const u64 query_page_begin = query_begin / BYTES_PER_PAGE; + const u64 query_page_end = Common::DivCeil(query_end, BYTES_PER_PAGE); + const u64 page_index_begin = std::max(word_page_begin + local_page_begin, query_page_begin); + const u64 page_index_end = std::min(word_page_end + local_page_end, query_page_end); + const u64 first_word_page_begin = page_index_begin % PAGES_PER_WORD; + const u64 last_word_page_end = (page_index_end - 1) % PAGES_PER_WORD + 1; + + u64 page_begin = first_word_page_begin; + u64 current_base = 0; + u64 current_size = 0; + bool on_going = false; + for (u64 word_index = word_index_begin; word_index < word_index_end; ++word_index) { + const bool is_last_word = word_index + 1 == word_index_end; + const u64 page_end = is_last_word ? last_word_page_end : PAGES_PER_WORD; + const u64 right_offset = page_begin; + const u64 left_offset = PAGES_PER_WORD - page_end; + u64 bits = ~u64{0}; + bits = (bits >> right_offset) << right_offset; + bits = (bits << left_offset) >> left_offset; + + const u64 current_word = state_words[word_index] & bits; + state_words[word_index] &= ~bits; + + if constexpr (type == Type::CPU) { + const u64 current_bits = untracked_words[word_index] & bits; + untracked_words[word_index] &= ~bits; + NotifyRasterizer<true>(word_index, current_bits, ~u64{0}); + } + // Exclude CPU modified pages when visiting GPU pages + const u64 word = current_word & ~(type == Type::GPU ? untracked_words[word_index] : 0); + u64 page = page_begin; + page_begin = 0; + + while (page < page_end) { + const int empty_bits = std::countr_zero(word >> page); + if (on_going && empty_bits != 0) { + InvokeModifiedRange(func, current_size, current_base); + current_size = 0; + on_going = false; + } + page += empty_bits; + + const int continuous_bits = std::countr_one(word >> page); + if (!on_going && continuous_bits != 0) { + current_base = word_index * PAGES_PER_WORD + page; + on_going = true; + } + current_size += continuous_bits; + page += continuous_bits; + } + } + if (on_going && current_size > 0) { + InvokeModifiedRange(func, current_size, current_base); + } + } + + template <typename Func> + void InvokeModifiedRange(Func&& func, u64 current_size, u64 current_base) { + const u64 current_size_bytes = current_size * BYTES_PER_PAGE; + const u64 offset_begin = current_base * BYTES_PER_PAGE; + const u64 offset_end = std::min(offset_begin + current_size_bytes, SizeBytes()); + func(offset_begin, offset_end - offset_begin); + } + + /** + * Returns true when a region has been modified + * + * @param offset Offset in bytes from the start of the buffer + * @param size Size in bytes of the region to query for modifications + */ + template <Type type> + [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept { + static_assert(type != Type::Untracked); + + const u64* const untracked_words = Array<Type::Untracked>(); + const u64* const state_words = Array<type>(); + const u64 num_query_words = size / BYTES_PER_WORD + 1; + const u64 word_begin = offset / BYTES_PER_WORD; + const u64 word_end = std::min(word_begin + num_query_words, NumWords()); + const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE); + u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD; + for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) { + const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0; + const u64 word = state_words[word_index] & ~off_word; + if (word == 0) { + continue; + } + const u64 page_end = std::min((word_index + 1) * PAGES_PER_WORD, page_limit); + const u64 local_page_end = page_end % PAGES_PER_WORD; + const u64 page_end_shift = (PAGES_PER_WORD - local_page_end) % PAGES_PER_WORD; + if (((word >> page_index) << page_index) << page_end_shift != 0) { + return true; + } + } + return false; + } + + /** + * Returns a begin end pair with the inclusive modified region + * + * @param offset Offset in bytes from the start of the buffer + * @param size Size in bytes of the region to query for modifications + */ + template <Type type> + [[nodiscard]] std::pair<u64, u64> ModifiedRegion(u64 offset, u64 size) const noexcept { + static_assert(type != Type::Untracked); + + const u64* const untracked_words = Array<Type::Untracked>(); + const u64* const state_words = Array<type>(); + const u64 num_query_words = size / BYTES_PER_WORD + 1; + const u64 word_begin = offset / BYTES_PER_WORD; + const u64 word_end = std::min(word_begin + num_query_words, NumWords()); + const u64 page_base = offset / BYTES_PER_PAGE; + const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE); + u64 begin = std::numeric_limits<u64>::max(); + u64 end = 0; + for (u64 word_index = word_begin; word_index < word_end; ++word_index) { + const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0; + const u64 word = state_words[word_index] & ~off_word; + if (word == 0) { + continue; + } + const u64 local_page_begin = std::countr_zero(word); + const u64 local_page_end = PAGES_PER_WORD - std::countl_zero(word); + const u64 page_index = word_index * PAGES_PER_WORD; + const u64 page_begin = std::max(page_index + local_page_begin, page_base); + const u64 page_end = std::min(page_index + local_page_end, page_limit); + begin = std::min(begin, page_begin); + end = std::max(end, page_end); + } + static constexpr std::pair<u64, u64> EMPTY{0, 0}; + return begin < end ? std::make_pair(begin * BYTES_PER_PAGE, end * BYTES_PER_PAGE) : EMPTY; + } + + /// Returns the number of words of the buffer + [[nodiscard]] size_t NumWords() const noexcept { + return words.NumWords(); + } + + /// Returns true when the buffer fits in the small vector optimization + [[nodiscard]] bool IsShort() const noexcept { + return words.IsShort(); + } + + RasterizerInterface* rasterizer = nullptr; + VAddr cpu_addr = 0; + Words words; + BufferFlagBits flags{}; + int stream_score = 0; +}; + +} // namespace VideoCommon diff --git a/src/video_core/buffer_cache/buffer_block.h b/src/video_core/buffer_cache/buffer_block.h deleted file mode 100644 index e64170e66..000000000 --- a/src/video_core/buffer_cache/buffer_block.h +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <unordered_set> -#include <utility> - -#include "common/alignment.h" -#include "common/common_types.h" -#include "video_core/gpu.h" - -namespace VideoCommon { - -class BufferBlock { -public: - bool Overlaps(VAddr start, VAddr end) const { - return (cpu_addr < end) && (cpu_addr_end > start); - } - - bool IsInside(VAddr other_start, VAddr other_end) const { - return cpu_addr <= other_start && other_end <= cpu_addr_end; - } - - std::size_t Offset(VAddr in_addr) const { - return static_cast<std::size_t>(in_addr - cpu_addr); - } - - VAddr CpuAddr() const { - return cpu_addr; - } - - VAddr CpuAddrEnd() const { - return cpu_addr_end; - } - - void SetCpuAddr(VAddr new_addr) { - cpu_addr = new_addr; - cpu_addr_end = new_addr + size; - } - - std::size_t Size() const { - return size; - } - - u64 Epoch() const { - return epoch; - } - - void SetEpoch(u64 new_epoch) { - epoch = new_epoch; - } - -protected: - explicit BufferBlock(VAddr cpu_addr_, std::size_t size_) : size{size_} { - SetCpuAddr(cpu_addr_); - } - -private: - VAddr cpu_addr{}; - VAddr cpu_addr_end{}; - std::size_t size{}; - u64 epoch{}; -}; - -} // namespace VideoCommon diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp new file mode 100644 index 000000000..ab32294c8 --- /dev/null +++ b/src/video_core/buffer_cache/buffer_cache.cpp @@ -0,0 +1,13 @@ +// Copyright 2021 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "common/microprofile.h" + +namespace VideoCommon { + +MICROPROFILE_DEFINE(GPU_PrepareBuffers, "GPU", "Prepare buffers", MP_RGB(224, 128, 128)); +MICROPROFILE_DEFINE(GPU_BindUploadBuffers, "GPU", "Bind and upload buffers", MP_RGB(224, 128, 128)); +MICROPROFILE_DEFINE(GPU_DownloadMemory, "GPU", "Download buffers", MP_RGB(224, 128, 128)); + +} // namespace VideoCommon diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index b5dc68902..2a6844ab1 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -4,597 +4,1289 @@ #pragma once -#include <list> +#include <algorithm> +#include <array> +#include <deque> #include <memory> #include <mutex> +#include <span> #include <unordered_map> -#include <unordered_set> -#include <utility> #include <vector> #include <boost/container/small_vector.hpp> -#include <boost/icl/interval_set.hpp> -#include <boost/intrusive/set.hpp> -#include "common/alignment.h" -#include "common/assert.h" #include "common/common_types.h" -#include "common/logging/log.h" -#include "core/core.h" +#include "common/div_ceil.h" +#include "common/microprofile.h" +#include "common/scope_exit.h" #include "core/memory.h" #include "core/settings.h" -#include "video_core/buffer_cache/buffer_block.h" -#include "video_core/buffer_cache/map_interval.h" +#include "video_core/buffer_cache/buffer_base.h" +#include "video_core/delayed_destruction_ring.h" +#include "video_core/dirty_flags.h" +#include "video_core/engines/kepler_compute.h" +#include "video_core/engines/maxwell_3d.h" #include "video_core/memory_manager.h" #include "video_core/rasterizer_interface.h" +#include "video_core/texture_cache/slot_vector.h" +#include "video_core/texture_cache/types.h" namespace VideoCommon { -template <typename Buffer, typename BufferType, typename StreamBuffer> +MICROPROFILE_DECLARE(GPU_PrepareBuffers); +MICROPROFILE_DECLARE(GPU_BindUploadBuffers); +MICROPROFILE_DECLARE(GPU_DownloadMemory); + +using BufferId = SlotId; + +constexpr u32 NUM_VERTEX_BUFFERS = 32; +constexpr u32 NUM_TRANSFORM_FEEDBACK_BUFFERS = 4; +constexpr u32 NUM_GRAPHICS_UNIFORM_BUFFERS = 18; +constexpr u32 NUM_COMPUTE_UNIFORM_BUFFERS = 8; +constexpr u32 NUM_STORAGE_BUFFERS = 16; +constexpr u32 NUM_STAGES = 5; + +template <typename P> class BufferCache { - using IntervalSet = boost::icl::interval_set<VAddr>; - using IntervalType = typename IntervalSet::interval_type; - using VectorMapInterval = boost::container::small_vector<MapInterval*, 1>; + // Page size for caching purposes. + // This is unrelated to the CPU page size and it can be changed as it seems optimal. + static constexpr u32 PAGE_BITS = 16; + static constexpr u64 PAGE_SIZE = u64{1} << PAGE_BITS; + + static constexpr bool IS_OPENGL = P::IS_OPENGL; + static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = + P::HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS; + static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT = + P::HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT; + static constexpr bool NEEDS_BIND_UNIFORM_INDEX = P::NEEDS_BIND_UNIFORM_INDEX; + static constexpr bool NEEDS_BIND_STORAGE_INDEX = P::NEEDS_BIND_STORAGE_INDEX; + static constexpr bool USE_MEMORY_MAPS = P::USE_MEMORY_MAPS; + + static constexpr BufferId NULL_BUFFER_ID{0}; + + using Maxwell = Tegra::Engines::Maxwell3D::Regs; + + using Runtime = typename P::Runtime; + using Buffer = typename P::Buffer; + + struct Empty {}; + + struct OverlapResult { + std::vector<BufferId> ids; + VAddr begin; + VAddr end; + bool has_stream_leap = false; + }; - static constexpr u64 WRITE_PAGE_BIT = 11; - static constexpr u64 BLOCK_PAGE_BITS = 21; - static constexpr u64 BLOCK_PAGE_SIZE = 1ULL << BLOCK_PAGE_BITS; + struct Binding { + VAddr cpu_addr{}; + u32 size{}; + BufferId buffer_id; + }; -public: - struct BufferInfo { - BufferType handle; - u64 offset; - u64 address; + static constexpr Binding NULL_BINDING{ + .cpu_addr = 0, + .size = 0, + .buffer_id = NULL_BUFFER_ID, }; - BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4, - bool is_written = false, bool use_fast_cbuf = false) { - std::lock_guard lock{mutex}; +public: + static constexpr u32 SKIP_CACHE_SIZE = 4096; - auto& memory_manager = system.GPU().MemoryManager(); - const std::optional<VAddr> cpu_addr_opt = memory_manager.GpuToCpuAddress(gpu_addr); - if (!cpu_addr_opt) { - return GetEmptyBuffer(size); - } - const VAddr cpu_addr = *cpu_addr_opt; - - // Cache management is a big overhead, so only cache entries with a given size. - // TODO: Figure out which size is the best for given games. - constexpr std::size_t max_stream_size = 0x800; - if (use_fast_cbuf || size < max_stream_size) { - if (!is_written && !IsRegionWritten(cpu_addr, cpu_addr + size - 1)) { - const bool is_granular = memory_manager.IsGranularRange(gpu_addr, size); - if (use_fast_cbuf) { - u8* dest; - if (is_granular) { - dest = memory_manager.GetPointer(gpu_addr); - } else { - staging_buffer.resize(size); - dest = staging_buffer.data(); - memory_manager.ReadBlockUnsafe(gpu_addr, dest, size); - } - return ConstBufferUpload(dest, size); - } - if (is_granular) { - u8* const host_ptr = memory_manager.GetPointer(gpu_addr); - return StreamBufferUpload(size, alignment, [host_ptr, size](u8* dest) { - std::memcpy(dest, host_ptr, size); - }); - } else { - return StreamBufferUpload( - size, alignment, [&memory_manager, gpu_addr, size](u8* dest) { - memory_manager.ReadBlockUnsafe(gpu_addr, dest, size); - }); - } - } - } + explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_, + Tegra::Engines::Maxwell3D& maxwell3d_, + Tegra::Engines::KeplerCompute& kepler_compute_, + Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_, + Runtime& runtime_); - Buffer* const block = GetBlock(cpu_addr, size); - MapInterval* const map = MapAddress(block, gpu_addr, cpu_addr, size); - if (!map) { - return GetEmptyBuffer(size); - } - if (is_written) { - map->MarkAsModified(true, GetModifiedTicks()); - if (Settings::IsGPULevelHigh() && - Settings::values.use_asynchronous_gpu_emulation.GetValue()) { - MarkForAsyncFlush(map); - } - if (!map->is_written) { - map->is_written = true; - MarkRegionAsWritten(map->start, map->end - 1); - } - } + void TickFrame(); - return BufferInfo{block->Handle(), block->Offset(cpu_addr), block->Address()}; - } + void WriteMemory(VAddr cpu_addr, u64 size); - /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset. - BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size, - std::size_t alignment = 4) { - std::lock_guard lock{mutex}; - return StreamBufferUpload(size, alignment, [raw_pointer, size](u8* dest) { - std::memcpy(dest, raw_pointer, size); - }); - } + void CachedWriteMemory(VAddr cpu_addr, u64 size); - /// Prepares the buffer cache for data uploading - /// @param max_size Maximum number of bytes that will be uploaded - /// @return True when a stream buffer invalidation was required, false otherwise - bool Map(std::size_t max_size) { - std::lock_guard lock{mutex}; + void DownloadMemory(VAddr cpu_addr, u64 size); - bool invalidated; - std::tie(buffer_ptr, buffer_offset_base, invalidated) = stream_buffer->Map(max_size, 4); - buffer_offset = buffer_offset_base; + void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size); - return invalidated; - } + void UpdateGraphicsBuffers(bool is_indexed); - /// Finishes the upload stream - void Unmap() { - std::lock_guard lock{mutex}; - stream_buffer->Unmap(buffer_offset - buffer_offset_base); - } + void UpdateComputeBuffers(); - /// Function called at the end of each frame, inteded for deferred operations - void TickFrame() { - ++epoch; + void BindHostGeometryBuffers(bool is_indexed); - while (!pending_destruction.empty()) { - // Delay at least 4 frames before destruction. - // This is due to triple buffering happening on some drivers. - static constexpr u64 epochs_to_destroy = 5; - if (pending_destruction.front()->Epoch() + epochs_to_destroy > epoch) { - break; - } - pending_destruction.pop(); - } - } + void BindHostStageBuffers(size_t stage); - /// Write any cached resources overlapping the specified region back to memory - void FlushRegion(VAddr addr, std::size_t size) { - std::lock_guard lock{mutex}; + void BindHostComputeBuffers(); - VectorMapInterval objects = GetMapsInRange(addr, size); - std::sort(objects.begin(), objects.end(), - [](MapInterval* lhs, MapInterval* rhs) { return lhs->ticks < rhs->ticks; }); - for (MapInterval* object : objects) { - if (object->is_modified && object->is_registered) { - mutex.unlock(); - FlushMap(object); - mutex.lock(); - } - } - } + void SetEnabledUniformBuffers(size_t stage, u32 enabled); - bool MustFlushRegion(VAddr addr, std::size_t size) { - std::lock_guard lock{mutex}; + void SetEnabledComputeUniformBuffers(u32 enabled); - const VectorMapInterval objects = GetMapsInRange(addr, size); - return std::any_of(objects.cbegin(), objects.cend(), [](const MapInterval* map) { - return map->is_modified && map->is_registered; - }); - } + void UnbindGraphicsStorageBuffers(size_t stage); - /// Mark the specified region as being invalidated - void InvalidateRegion(VAddr addr, u64 size) { - std::lock_guard lock{mutex}; + void BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset, + bool is_written); - for (auto& object : GetMapsInRange(addr, size)) { - if (object->is_registered) { - Unregister(object); - } - } - } + void UnbindComputeStorageBuffers(); - void OnCPUWrite(VAddr addr, std::size_t size) { - std::lock_guard lock{mutex}; + void BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset, + bool is_written); - for (MapInterval* object : GetMapsInRange(addr, size)) { - if (object->is_memory_marked && object->is_registered) { - UnmarkMemory(object); - object->is_sync_pending = true; - marked_for_unregister.emplace_back(object); - } - } - } + void FlushCachedWrites(); - void SyncGuestHost() { - std::lock_guard lock{mutex}; + /// Return true when there are uncommitted buffers to be downloaded + [[nodiscard]] bool HasUncommittedFlushes() const noexcept; - for (auto& object : marked_for_unregister) { - if (object->is_registered) { - object->is_sync_pending = false; - Unregister(object); - } + /// Return true when the caller should wait for async downloads + [[nodiscard]] bool ShouldWaitAsyncFlushes() const noexcept; + + /// Commit asynchronous downloads + void CommitAsyncFlushes(); + + /// Pop asynchronous downloads + void PopAsyncFlushes(); + + /// Return true when a CPU region is modified from the GPU + [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); + + std::mutex mutex; + +private: + template <typename Func> + static void ForEachEnabledBit(u32 enabled_mask, Func&& func) { + for (u32 index = 0; enabled_mask != 0; ++index, enabled_mask >>= 1) { + const int disabled_bits = std::countr_zero(enabled_mask); + index += disabled_bits; + enabled_mask >>= disabled_bits; + func(index); } - marked_for_unregister.clear(); } - void CommitAsyncFlushes() { - if (uncommitted_flushes) { - auto commit_list = std::make_shared<std::list<MapInterval*>>(); - for (MapInterval* map : *uncommitted_flushes) { - if (map->is_registered && map->is_modified) { - // TODO(Blinkhawk): Implement backend asynchronous flushing - // AsyncFlushMap(map) - commit_list->push_back(map); - } - } - if (!commit_list->empty()) { - committed_flushes.push_back(commit_list); - } else { - committed_flushes.emplace_back(); + template <typename Func> + void ForEachBufferInRange(VAddr cpu_addr, u64 size, Func&& func) { + const u64 page_end = Common::DivCeil(cpu_addr + size, PAGE_SIZE); + for (u64 page = cpu_addr >> PAGE_BITS; page < page_end;) { + const BufferId buffer_id = page_table[page]; + if (!buffer_id) { + ++page; + continue; } - } else { - committed_flushes.emplace_back(); + Buffer& buffer = slot_buffers[buffer_id]; + func(buffer_id, buffer); + + const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); + page = Common::DivCeil(end_addr, PAGE_SIZE); } - uncommitted_flushes.reset(); } - bool ShouldWaitAsyncFlushes() const { - return !committed_flushes.empty() && committed_flushes.front() != nullptr; + static bool IsRangeGranular(VAddr cpu_addr, size_t size) { + return (cpu_addr & ~Core::Memory::PAGE_MASK) == + ((cpu_addr + size) & ~Core::Memory::PAGE_MASK); } - bool HasUncommittedFlushes() const { - return uncommitted_flushes != nullptr; - } + void BindHostIndexBuffer(); - void PopAsyncFlushes() { - if (committed_flushes.empty()) { - return; - } - auto& flush_list = committed_flushes.front(); - if (!flush_list) { - committed_flushes.pop_front(); - return; - } - for (MapInterval* map : *flush_list) { - if (map->is_registered) { - // TODO(Blinkhawk): Replace this for reading the asynchronous flush - FlushMap(map); - } - } - committed_flushes.pop_front(); - } + void BindHostVertexBuffers(); - virtual BufferInfo GetEmptyBuffer(std::size_t size) = 0; + void BindHostGraphicsUniformBuffers(size_t stage); -protected: - explicit BufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system, - std::unique_ptr<StreamBuffer> stream_buffer) - : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer)} {} + void BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index, bool needs_bind); - ~BufferCache() = default; + void BindHostGraphicsStorageBuffers(size_t stage); - virtual std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) = 0; + void BindHostTransformFeedbackBuffers(); - virtual BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) { - return {}; - } + void BindHostComputeUniformBuffers(); - /// Register an object into the cache - MapInterval* Register(MapInterval new_map, bool inherit_written = false) { - const VAddr cpu_addr = new_map.start; - if (!cpu_addr) { - LOG_CRITICAL(HW_GPU, "Failed to register buffer with unmapped gpu_address 0x{:016x}", - new_map.gpu_addr); - return nullptr; - } - const std::size_t size = new_map.end - new_map.start; - new_map.is_registered = true; - rasterizer.UpdatePagesCachedCount(cpu_addr, size, 1); - new_map.is_memory_marked = true; - if (inherit_written) { - MarkRegionAsWritten(new_map.start, new_map.end - 1); - new_map.is_written = true; - } - MapInterval* const storage = mapped_addresses_allocator.Allocate(); - *storage = new_map; - mapped_addresses.insert(*storage); - return storage; - } + void BindHostComputeStorageBuffers(); - void UnmarkMemory(MapInterval* map) { - if (!map->is_memory_marked) { - return; - } - const std::size_t size = map->end - map->start; - rasterizer.UpdatePagesCachedCount(map->start, size, -1); - map->is_memory_marked = false; - } - - /// Unregisters an object from the cache - void Unregister(MapInterval* map) { - UnmarkMemory(map); - map->is_registered = false; - if (map->is_sync_pending) { - map->is_sync_pending = false; - marked_for_unregister.remove(map); + void DoUpdateGraphicsBuffers(bool is_indexed); + + void DoUpdateComputeBuffers(); + + void UpdateIndexBuffer(); + + void UpdateVertexBuffers(); + + void UpdateVertexBuffer(u32 index); + + void UpdateUniformBuffers(size_t stage); + + void UpdateStorageBuffers(size_t stage); + + void UpdateTransformFeedbackBuffers(); + + void UpdateTransformFeedbackBuffer(u32 index); + + void UpdateComputeUniformBuffers(); + + void UpdateComputeStorageBuffers(); + + void MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size); + + [[nodiscard]] BufferId FindBuffer(VAddr cpu_addr, u32 size); + + [[nodiscard]] OverlapResult ResolveOverlaps(VAddr cpu_addr, u32 wanted_size); + + void JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, bool accumulate_stream_score); + + [[nodiscard]] BufferId CreateBuffer(VAddr cpu_addr, u32 wanted_size); + + void Register(BufferId buffer_id); + + void Unregister(BufferId buffer_id); + + template <bool insert> + void ChangeRegister(BufferId buffer_id); + + void SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size); + + void SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size); + + void UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy, + std::span<BufferCopy> copies); + + void ImmediateUploadMemory(Buffer& buffer, u64 largest_copy, + std::span<const BufferCopy> copies); + + void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, std::span<BufferCopy> copies); + + void DeleteBuffer(BufferId buffer_id); + + void ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id); + + void NotifyBufferDeletion(); + + [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr) const; + + [[nodiscard]] std::span<const u8> ImmediateBufferWithData(VAddr cpu_addr, size_t size); + + [[nodiscard]] std::span<u8> ImmediateBuffer(size_t wanted_capacity); + + [[nodiscard]] bool HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept; + + VideoCore::RasterizerInterface& rasterizer; + Tegra::Engines::Maxwell3D& maxwell3d; + Tegra::Engines::KeplerCompute& kepler_compute; + Tegra::MemoryManager& gpu_memory; + Core::Memory::Memory& cpu_memory; + Runtime& runtime; + + SlotVector<Buffer> slot_buffers; + DelayedDestructionRing<Buffer, 8> delayed_destruction_ring; + + u32 last_index_count = 0; + + Binding index_buffer; + std::array<Binding, NUM_VERTEX_BUFFERS> vertex_buffers; + std::array<std::array<Binding, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES> uniform_buffers; + std::array<std::array<Binding, NUM_STORAGE_BUFFERS>, NUM_STAGES> storage_buffers; + std::array<Binding, NUM_TRANSFORM_FEEDBACK_BUFFERS> transform_feedback_buffers; + + std::array<Binding, NUM_COMPUTE_UNIFORM_BUFFERS> compute_uniform_buffers; + std::array<Binding, NUM_STORAGE_BUFFERS> compute_storage_buffers; + + std::array<u32, NUM_STAGES> enabled_uniform_buffers{}; + u32 enabled_compute_uniform_buffers = 0; + + std::array<u32, NUM_STAGES> enabled_storage_buffers{}; + std::array<u32, NUM_STAGES> written_storage_buffers{}; + u32 enabled_compute_storage_buffers = 0; + u32 written_compute_storage_buffers = 0; + + std::array<u32, NUM_STAGES> fast_bound_uniform_buffers{}; + + bool has_deleted_buffers = false; + + std::conditional_t<HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS, std::array<u32, NUM_STAGES>, Empty> + dirty_uniform_buffers{}; + + std::vector<BufferId> cached_write_buffer_ids; + + // TODO: This data structure is not optimal and it should be reworked + std::vector<BufferId> uncommitted_downloads; + std::deque<std::vector<BufferId>> committed_downloads; + + size_t immediate_buffer_capacity = 0; + std::unique_ptr<u8[]> immediate_buffer_alloc; + + std::array<BufferId, ((1ULL << 39) >> PAGE_BITS)> page_table; +}; + +template <class P> +BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_, + Tegra::Engines::Maxwell3D& maxwell3d_, + Tegra::Engines::KeplerCompute& kepler_compute_, + Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_, + Runtime& runtime_) + : rasterizer{rasterizer_}, maxwell3d{maxwell3d_}, kepler_compute{kepler_compute_}, + gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_}, runtime{runtime_} { + // Ensure the first slot is used for the null buffer + void(slot_buffers.insert(runtime, NullBufferParams{})); +} + +template <class P> +void BufferCache<P>::TickFrame() { + delayed_destruction_ring.Tick(); +} + +template <class P> +void BufferCache<P>::WriteMemory(VAddr cpu_addr, u64 size) { + ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) { + buffer.MarkRegionAsCpuModified(cpu_addr, size); + }); +} + +template <class P> +void BufferCache<P>::CachedWriteMemory(VAddr cpu_addr, u64 size) { + ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) { + if (!buffer.HasCachedWrites()) { + cached_write_buffer_ids.push_back(buffer_id); } - if (map->is_written) { - UnmarkRegionAsWritten(map->start, map->end - 1); + buffer.CachedCpuWrite(cpu_addr, size); + }); +} + +template <class P> +void BufferCache<P>::DownloadMemory(VAddr cpu_addr, u64 size) { + ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) { + boost::container::small_vector<BufferCopy, 1> copies; + u64 total_size_bytes = 0; + u64 largest_copy = 0; + buffer.ForEachDownloadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) { + copies.push_back(BufferCopy{ + .src_offset = range_offset, + .dst_offset = total_size_bytes, + .size = range_size, + }); + total_size_bytes += range_size; + largest_copy = std::max(largest_copy, range_size); + }); + if (total_size_bytes == 0) { + return; } - const auto it = mapped_addresses.find(*map); - ASSERT(it != mapped_addresses.end()); - mapped_addresses.erase(it); - mapped_addresses_allocator.Release(map); - } - -private: - MapInterval* MapAddress(Buffer* block, GPUVAddr gpu_addr, VAddr cpu_addr, std::size_t size) { - const VectorMapInterval overlaps = GetMapsInRange(cpu_addr, size); - if (overlaps.empty()) { - auto& memory_manager = system.GPU().MemoryManager(); - const VAddr cpu_addr_end = cpu_addr + size; - if (memory_manager.IsGranularRange(gpu_addr, size)) { - u8* host_ptr = memory_manager.GetPointer(gpu_addr); - block->Upload(block->Offset(cpu_addr), size, host_ptr); - } else { - staging_buffer.resize(size); - memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size); - block->Upload(block->Offset(cpu_addr), size, staging_buffer.data()); + MICROPROFILE_SCOPE(GPU_DownloadMemory); + + if constexpr (USE_MEMORY_MAPS) { + auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); + const u8* const mapped_memory = download_staging.mapped_span.data(); + const std::span<BufferCopy> copies_span(copies.data(), copies.data() + copies.size()); + for (BufferCopy& copy : copies) { + // Modify copies to have the staging offset in mind + copy.dst_offset += download_staging.offset; } - return Register(MapInterval(cpu_addr, cpu_addr_end, gpu_addr)); - } - - const VAddr cpu_addr_end = cpu_addr + size; - if (overlaps.size() == 1) { - MapInterval* const current_map = overlaps[0]; - if (current_map->IsInside(cpu_addr, cpu_addr_end)) { - return current_map; + runtime.CopyBuffer(download_staging.buffer, buffer, copies_span); + runtime.Finish(); + for (const BufferCopy& copy : copies) { + const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset; + // Undo the modified offset + const u64 dst_offset = copy.dst_offset - download_staging.offset; + const u8* copy_mapped_memory = mapped_memory + dst_offset; + cpu_memory.WriteBlockUnsafe(copy_cpu_addr, copy_mapped_memory, copy.size); + } + } else { + const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy); + for (const BufferCopy& copy : copies) { + buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size)); + const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset; + cpu_memory.WriteBlockUnsafe(copy_cpu_addr, immediate_buffer.data(), copy.size); } } - VAddr new_start = cpu_addr; - VAddr new_end = cpu_addr_end; - bool write_inheritance = false; - bool modified_inheritance = false; - // Calculate new buffer parameters - for (MapInterval* overlap : overlaps) { - new_start = std::min(overlap->start, new_start); - new_end = std::max(overlap->end, new_end); - write_inheritance |= overlap->is_written; - modified_inheritance |= overlap->is_modified; + }); +} + +template <class P> +void BufferCache<P>::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, + u32 size) { + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); + if (!cpu_addr) { + uniform_buffers[stage][index] = NULL_BINDING; + return; + } + const Binding binding{ + .cpu_addr = *cpu_addr, + .size = size, + .buffer_id = BufferId{}, + }; + uniform_buffers[stage][index] = binding; +} + +template <class P> +void BufferCache<P>::UpdateGraphicsBuffers(bool is_indexed) { + MICROPROFILE_SCOPE(GPU_PrepareBuffers); + do { + has_deleted_buffers = false; + DoUpdateGraphicsBuffers(is_indexed); + } while (has_deleted_buffers); +} + +template <class P> +void BufferCache<P>::UpdateComputeBuffers() { + MICROPROFILE_SCOPE(GPU_PrepareBuffers); + do { + has_deleted_buffers = false; + DoUpdateComputeBuffers(); + } while (has_deleted_buffers); +} + +template <class P> +void BufferCache<P>::BindHostGeometryBuffers(bool is_indexed) { + MICROPROFILE_SCOPE(GPU_BindUploadBuffers); + if (is_indexed) { + BindHostIndexBuffer(); + } else if constexpr (!HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT) { + const auto& regs = maxwell3d.regs; + if (regs.draw.topology == Maxwell::PrimitiveTopology::Quads) { + runtime.BindQuadArrayIndexBuffer(regs.vertex_buffer.first, regs.vertex_buffer.count); } - GPUVAddr new_gpu_addr = gpu_addr + new_start - cpu_addr; - for (auto& overlap : overlaps) { - Unregister(overlap); + } + BindHostVertexBuffers(); + BindHostTransformFeedbackBuffers(); +} + +template <class P> +void BufferCache<P>::BindHostStageBuffers(size_t stage) { + MICROPROFILE_SCOPE(GPU_BindUploadBuffers); + BindHostGraphicsUniformBuffers(stage); + BindHostGraphicsStorageBuffers(stage); +} + +template <class P> +void BufferCache<P>::BindHostComputeBuffers() { + MICROPROFILE_SCOPE(GPU_BindUploadBuffers); + BindHostComputeUniformBuffers(); + BindHostComputeStorageBuffers(); +} + +template <class P> +void BufferCache<P>::SetEnabledUniformBuffers(size_t stage, u32 enabled) { + if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) { + if (enabled_uniform_buffers[stage] != enabled) { + dirty_uniform_buffers[stage] = ~u32{0}; } - UpdateBlock(block, new_start, new_end, overlaps); - - const MapInterval new_map{new_start, new_end, new_gpu_addr}; - MapInterval* const map = Register(new_map, write_inheritance); - if (!map) { - return nullptr; + } + enabled_uniform_buffers[stage] = enabled; +} + +template <class P> +void BufferCache<P>::SetEnabledComputeUniformBuffers(u32 enabled) { + enabled_compute_uniform_buffers = enabled; +} + +template <class P> +void BufferCache<P>::UnbindGraphicsStorageBuffers(size_t stage) { + enabled_storage_buffers[stage] = 0; + written_storage_buffers[stage] = 0; +} + +template <class P> +void BufferCache<P>::BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index, + u32 cbuf_offset, bool is_written) { + enabled_storage_buffers[stage] |= 1U << ssbo_index; + written_storage_buffers[stage] |= (is_written ? 1U : 0U) << ssbo_index; + + const auto& cbufs = maxwell3d.state.shader_stages[stage]; + const GPUVAddr ssbo_addr = cbufs.const_buffers[cbuf_index].address + cbuf_offset; + storage_buffers[stage][ssbo_index] = StorageBufferBinding(ssbo_addr); +} + +template <class P> +void BufferCache<P>::UnbindComputeStorageBuffers() { + enabled_compute_storage_buffers = 0; + written_compute_storage_buffers = 0; +} + +template <class P> +void BufferCache<P>::BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset, + bool is_written) { + enabled_compute_storage_buffers |= 1U << ssbo_index; + written_compute_storage_buffers |= (is_written ? 1U : 0U) << ssbo_index; + + const auto& launch_desc = kepler_compute.launch_description; + ASSERT(((launch_desc.const_buffer_enable_mask >> cbuf_index) & 1) != 0); + + const auto& cbufs = launch_desc.const_buffer_config; + const GPUVAddr ssbo_addr = cbufs[cbuf_index].Address() + cbuf_offset; + compute_storage_buffers[ssbo_index] = StorageBufferBinding(ssbo_addr); +} + +template <class P> +void BufferCache<P>::FlushCachedWrites() { + for (const BufferId buffer_id : cached_write_buffer_ids) { + slot_buffers[buffer_id].FlushCachedWrites(); + } + cached_write_buffer_ids.clear(); +} + +template <class P> +bool BufferCache<P>::HasUncommittedFlushes() const noexcept { + return !uncommitted_downloads.empty(); +} + +template <class P> +bool BufferCache<P>::ShouldWaitAsyncFlushes() const noexcept { + return !committed_downloads.empty() && !committed_downloads.front().empty(); +} + +template <class P> +void BufferCache<P>::CommitAsyncFlushes() { + // This is intentionally passing the value by copy + committed_downloads.push_front(uncommitted_downloads); + uncommitted_downloads.clear(); +} + +template <class P> +void BufferCache<P>::PopAsyncFlushes() { + if (committed_downloads.empty()) { + return; + } + auto scope_exit_pop_download = detail::ScopeExit([this] { committed_downloads.pop_back(); }); + const std::span<const BufferId> download_ids = committed_downloads.back(); + if (download_ids.empty()) { + return; + } + MICROPROFILE_SCOPE(GPU_DownloadMemory); + + boost::container::small_vector<std::pair<BufferCopy, BufferId>, 1> downloads; + u64 total_size_bytes = 0; + u64 largest_copy = 0; + for (const BufferId buffer_id : download_ids) { + slot_buffers[buffer_id].ForEachDownloadRange([&](u64 range_offset, u64 range_size) { + downloads.push_back({ + BufferCopy{ + .src_offset = range_offset, + .dst_offset = total_size_bytes, + .size = range_size, + }, + buffer_id, + }); + total_size_bytes += range_size; + largest_copy = std::max(largest_copy, range_size); + }); + } + if (downloads.empty()) { + return; + } + if constexpr (USE_MEMORY_MAPS) { + auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); + for (auto& [copy, buffer_id] : downloads) { + // Have in mind the staging buffer offset for the copy + copy.dst_offset += download_staging.offset; + const std::array copies{copy}; + runtime.CopyBuffer(download_staging.buffer, slot_buffers[buffer_id], copies); } - if (modified_inheritance) { - map->MarkAsModified(true, GetModifiedTicks()); - if (Settings::IsGPULevelHigh() && - Settings::values.use_asynchronous_gpu_emulation.GetValue()) { - MarkForAsyncFlush(map); - } + runtime.Finish(); + for (const auto [copy, buffer_id] : downloads) { + const Buffer& buffer = slot_buffers[buffer_id]; + const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset; + // Undo the modified offset + const u64 dst_offset = copy.dst_offset - download_staging.offset; + const u8* read_mapped_memory = download_staging.mapped_span.data() + dst_offset; + cpu_memory.WriteBlockUnsafe(cpu_addr, read_mapped_memory, copy.size); + } + } else { + const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy); + for (const auto [copy, buffer_id] : downloads) { + Buffer& buffer = slot_buffers[buffer_id]; + buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size)); + const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset; + cpu_memory.WriteBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size); } - return map; } - - void UpdateBlock(Buffer* block, VAddr start, VAddr end, const VectorMapInterval& overlaps) { - const IntervalType base_interval{start, end}; - IntervalSet interval_set{}; - interval_set.add(base_interval); - for (auto& overlap : overlaps) { - const IntervalType subtract{overlap->start, overlap->end}; - interval_set.subtract(subtract); +} + +template <class P> +bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) { + const u64 page_end = Common::DivCeil(addr + size, PAGE_SIZE); + for (u64 page = addr >> PAGE_BITS; page < page_end;) { + const BufferId image_id = page_table[page]; + if (!image_id) { + ++page; + continue; } - for (auto& interval : interval_set) { - const std::size_t size = interval.upper() - interval.lower(); - if (size == 0) { - continue; - } - staging_buffer.resize(size); - system.Memory().ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size); - block->Upload(block->Offset(interval.lower()), size, staging_buffer.data()); + Buffer& buffer = slot_buffers[image_id]; + if (buffer.IsRegionGpuModified(addr, size)) { + return true; } + const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); + page = Common::DivCeil(end_addr, PAGE_SIZE); } - - VectorMapInterval GetMapsInRange(VAddr addr, std::size_t size) { - VectorMapInterval result; - if (size == 0) { - return result; + return false; +} + +template <class P> +void BufferCache<P>::BindHostIndexBuffer() { + Buffer& buffer = slot_buffers[index_buffer.buffer_id]; + const u32 offset = buffer.Offset(index_buffer.cpu_addr); + const u32 size = index_buffer.size; + SynchronizeBuffer(buffer, index_buffer.cpu_addr, size); + if constexpr (HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT) { + runtime.BindIndexBuffer(buffer, offset, size); + } else { + runtime.BindIndexBuffer(maxwell3d.regs.draw.topology, maxwell3d.regs.index_array.format, + maxwell3d.regs.index_array.first, maxwell3d.regs.index_array.count, + buffer, offset, size); + } +} + +template <class P> +void BufferCache<P>::BindHostVertexBuffers() { + auto& flags = maxwell3d.dirty.flags; + for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) { + const Binding& binding = vertex_buffers[index]; + Buffer& buffer = slot_buffers[binding.buffer_id]; + SynchronizeBuffer(buffer, binding.cpu_addr, binding.size); + if (!flags[Dirty::VertexBuffer0 + index]) { + continue; } + flags[Dirty::VertexBuffer0 + index] = false; + + const u32 stride = maxwell3d.regs.vertex_array[index].stride; + const u32 offset = buffer.Offset(binding.cpu_addr); + runtime.BindVertexBuffer(index, buffer, offset, binding.size, stride); + } +} - const VAddr addr_end = addr + size; - auto it = mapped_addresses.lower_bound(addr); - if (it != mapped_addresses.begin()) { - --it; +template <class P> +void BufferCache<P>::BindHostGraphicsUniformBuffers(size_t stage) { + u32 dirty = ~0U; + if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) { + dirty = std::exchange(dirty_uniform_buffers[stage], 0); + } + u32 binding_index = 0; + ForEachEnabledBit(enabled_uniform_buffers[stage], [&](u32 index) { + const bool needs_bind = ((dirty >> index) & 1) != 0; + BindHostGraphicsUniformBuffer(stage, index, binding_index, needs_bind); + if constexpr (NEEDS_BIND_UNIFORM_INDEX) { + ++binding_index; } - while (it != mapped_addresses.end() && it->start < addr_end) { - if (it->Overlaps(addr, addr_end)) { - result.push_back(&*it); + }); +} + +template <class P> +void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index, + bool needs_bind) { + const Binding& binding = uniform_buffers[stage][index]; + const VAddr cpu_addr = binding.cpu_addr; + const u32 size = binding.size; + Buffer& buffer = slot_buffers[binding.buffer_id]; + if (size <= SKIP_CACHE_SIZE && !buffer.IsRegionGpuModified(cpu_addr, size)) { + if constexpr (IS_OPENGL) { + if (runtime.HasFastBufferSubData()) { + // Fast path for Nvidia + if (!HasFastUniformBufferBound(stage, binding_index)) { + // We only have to bind when the currently bound buffer is not the fast version + runtime.BindFastUniformBuffer(stage, binding_index, size); + } + const auto span = ImmediateBufferWithData(cpu_addr, size); + runtime.PushFastUniformBuffer(stage, binding_index, span); + return; } - ++it; } - return result; - } + fast_bound_uniform_buffers[stage] |= 1U << binding_index; - /// Returns a ticks counter used for tracking when cached objects were last modified - u64 GetModifiedTicks() { - return ++modified_ticks; + // Stream buffer path to avoid stalling on non-Nvidia drivers or Vulkan + const std::span<u8> span = runtime.BindMappedUniformBuffer(stage, binding_index, size); + cpu_memory.ReadBlockUnsafe(cpu_addr, span.data(), size); + return; } - - void FlushMap(MapInterval* map) { - const auto it = blocks.find(map->start >> BLOCK_PAGE_BITS); - ASSERT_OR_EXECUTE(it != blocks.end(), return;); - - std::shared_ptr<Buffer> block = it->second; - - const std::size_t size = map->end - map->start; - staging_buffer.resize(size); - block->Download(block->Offset(map->start), size, staging_buffer.data()); - system.Memory().WriteBlockUnsafe(map->start, staging_buffer.data(), size); - map->MarkAsModified(false, 0); + // Classic cached path + SynchronizeBuffer(buffer, cpu_addr, size); + if (!needs_bind && !HasFastUniformBufferBound(stage, binding_index)) { + // Skip binding if it's not needed and if the bound buffer is not the fast version + // This exists to avoid instances where the fast buffer is bound and a GPU write happens + return; } + fast_bound_uniform_buffers[stage] &= ~(1U << binding_index); - template <typename Callable> - BufferInfo StreamBufferUpload(std::size_t size, std::size_t alignment, Callable&& callable) { - AlignBuffer(alignment); - const std::size_t uploaded_offset = buffer_offset; - callable(buffer_ptr); - - buffer_ptr += size; - buffer_offset += size; - return BufferInfo{stream_buffer->Handle(), uploaded_offset, stream_buffer->Address()}; + const u32 offset = buffer.Offset(cpu_addr); + if constexpr (NEEDS_BIND_UNIFORM_INDEX) { + runtime.BindUniformBuffer(stage, binding_index, buffer, offset, size); + } else { + runtime.BindUniformBuffer(buffer, offset, size); } +} + +template <class P> +void BufferCache<P>::BindHostGraphicsStorageBuffers(size_t stage) { + u32 binding_index = 0; + ForEachEnabledBit(enabled_storage_buffers[stage], [&](u32 index) { + const Binding& binding = storage_buffers[stage][index]; + Buffer& buffer = slot_buffers[binding.buffer_id]; + const u32 size = binding.size; + SynchronizeBuffer(buffer, binding.cpu_addr, size); + + const u32 offset = buffer.Offset(binding.cpu_addr); + const bool is_written = ((written_storage_buffers[stage] >> index) & 1) != 0; + if constexpr (NEEDS_BIND_STORAGE_INDEX) { + runtime.BindStorageBuffer(stage, binding_index, buffer, offset, size, is_written); + ++binding_index; + } else { + runtime.BindStorageBuffer(buffer, offset, size, is_written); + } + }); +} - void AlignBuffer(std::size_t alignment) { - // Align the offset, not the mapped pointer - const std::size_t offset_aligned = Common::AlignUp(buffer_offset, alignment); - buffer_ptr += offset_aligned - buffer_offset; - buffer_offset = offset_aligned; +template <class P> +void BufferCache<P>::BindHostTransformFeedbackBuffers() { + if (maxwell3d.regs.tfb_enabled == 0) { + return; } + for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) { + const Binding& binding = transform_feedback_buffers[index]; + Buffer& buffer = slot_buffers[binding.buffer_id]; + const u32 size = binding.size; + SynchronizeBuffer(buffer, binding.cpu_addr, size); + + const u32 offset = buffer.Offset(binding.cpu_addr); + runtime.BindTransformFeedbackBuffer(index, buffer, offset, size); + } +} - std::shared_ptr<Buffer> EnlargeBlock(std::shared_ptr<Buffer> buffer) { - const std::size_t old_size = buffer->Size(); - const std::size_t new_size = old_size + BLOCK_PAGE_SIZE; - const VAddr cpu_addr = buffer->CpuAddr(); - std::shared_ptr<Buffer> new_buffer = CreateBlock(cpu_addr, new_size); - new_buffer->CopyFrom(*buffer, 0, 0, old_size); - QueueDestruction(std::move(buffer)); - - const VAddr cpu_addr_end = cpu_addr + new_size - 1; - const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; - for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { - blocks.insert_or_assign(page_start, new_buffer); +template <class P> +void BufferCache<P>::BindHostComputeUniformBuffers() { + if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) { + // Mark all uniform buffers as dirty + dirty_uniform_buffers.fill(~u32{0}); + } + u32 binding_index = 0; + ForEachEnabledBit(enabled_compute_uniform_buffers, [&](u32 index) { + const Binding& binding = compute_uniform_buffers[index]; + Buffer& buffer = slot_buffers[binding.buffer_id]; + const u32 size = binding.size; + SynchronizeBuffer(buffer, binding.cpu_addr, size); + + const u32 offset = buffer.Offset(binding.cpu_addr); + if constexpr (NEEDS_BIND_UNIFORM_INDEX) { + runtime.BindComputeUniformBuffer(binding_index, buffer, offset, size); + ++binding_index; + } else { + runtime.BindUniformBuffer(buffer, offset, size); } + }); +} + +template <class P> +void BufferCache<P>::BindHostComputeStorageBuffers() { + u32 binding_index = 0; + ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) { + const Binding& binding = compute_storage_buffers[index]; + Buffer& buffer = slot_buffers[binding.buffer_id]; + const u32 size = binding.size; + SynchronizeBuffer(buffer, binding.cpu_addr, size); + + const u32 offset = buffer.Offset(binding.cpu_addr); + const bool is_written = ((written_compute_storage_buffers >> index) & 1) != 0; + if constexpr (NEEDS_BIND_STORAGE_INDEX) { + runtime.BindComputeStorageBuffer(binding_index, buffer, offset, size, is_written); + ++binding_index; + } else { + runtime.BindStorageBuffer(buffer, offset, size, is_written); + } + }); +} - return new_buffer; +template <class P> +void BufferCache<P>::DoUpdateGraphicsBuffers(bool is_indexed) { + if (is_indexed) { + UpdateIndexBuffer(); } + UpdateVertexBuffers(); + UpdateTransformFeedbackBuffers(); + for (size_t stage = 0; stage < NUM_STAGES; ++stage) { + UpdateUniformBuffers(stage); + UpdateStorageBuffers(stage); + } +} + +template <class P> +void BufferCache<P>::DoUpdateComputeBuffers() { + UpdateComputeUniformBuffers(); + UpdateComputeStorageBuffers(); +} + +template <class P> +void BufferCache<P>::UpdateIndexBuffer() { + // We have to check for the dirty flags and index count + // The index count is currently changed without updating the dirty flags + const auto& index_array = maxwell3d.regs.index_array; + auto& flags = maxwell3d.dirty.flags; + if (!flags[Dirty::IndexBuffer] && last_index_count == index_array.count) { + return; + } + flags[Dirty::IndexBuffer] = false; + last_index_count = index_array.count; + + const GPUVAddr gpu_addr_begin = index_array.StartAddress(); + const GPUVAddr gpu_addr_end = index_array.EndAddress(); + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr_begin); + const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin); + const u32 draw_size = index_array.count * index_array.FormatSizeInBytes(); + const u32 size = std::min(address_size, draw_size); + if (size == 0 || !cpu_addr) { + index_buffer = NULL_BINDING; + return; + } + index_buffer = Binding{ + .cpu_addr = *cpu_addr, + .size = size, + .buffer_id = FindBuffer(*cpu_addr, size), + }; +} - std::shared_ptr<Buffer> MergeBlocks(std::shared_ptr<Buffer> first, - std::shared_ptr<Buffer> second) { - const std::size_t size_1 = first->Size(); - const std::size_t size_2 = second->Size(); - const VAddr first_addr = first->CpuAddr(); - const VAddr second_addr = second->CpuAddr(); - const VAddr new_addr = std::min(first_addr, second_addr); - const std::size_t new_size = size_1 + size_2; - - std::shared_ptr<Buffer> new_buffer = CreateBlock(new_addr, new_size); - new_buffer->CopyFrom(*first, 0, new_buffer->Offset(first_addr), size_1); - new_buffer->CopyFrom(*second, 0, new_buffer->Offset(second_addr), size_2); - QueueDestruction(std::move(first)); - QueueDestruction(std::move(second)); +template <class P> +void BufferCache<P>::UpdateVertexBuffers() { + auto& flags = maxwell3d.dirty.flags; + if (!maxwell3d.dirty.flags[Dirty::VertexBuffers]) { + return; + } + flags[Dirty::VertexBuffers] = false; - const VAddr cpu_addr_end = new_addr + new_size - 1; - const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; - for (u64 page_start = new_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { - blocks.insert_or_assign(page_start, new_buffer); - } - return new_buffer; + for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) { + UpdateVertexBuffer(index); } +} - Buffer* GetBlock(VAddr cpu_addr, std::size_t size) { - std::shared_ptr<Buffer> found; +template <class P> +void BufferCache<P>::UpdateVertexBuffer(u32 index) { + if (!maxwell3d.dirty.flags[Dirty::VertexBuffer0 + index]) { + return; + } + const auto& array = maxwell3d.regs.vertex_array[index]; + const auto& limit = maxwell3d.regs.vertex_array_limit[index]; + const GPUVAddr gpu_addr_begin = array.StartAddress(); + const GPUVAddr gpu_addr_end = limit.LimitAddress() + 1; + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr_begin); + const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin); + const u32 size = address_size; // TODO: Analyze stride and number of vertices + if (array.enable == 0 || size == 0 || !cpu_addr) { + vertex_buffers[index] = NULL_BINDING; + return; + } + vertex_buffers[index] = Binding{ + .cpu_addr = *cpu_addr, + .size = size, + .buffer_id = FindBuffer(*cpu_addr, size), + }; +} + +template <class P> +void BufferCache<P>::UpdateUniformBuffers(size_t stage) { + ForEachEnabledBit(enabled_uniform_buffers[stage], [&](u32 index) { + Binding& binding = uniform_buffers[stage][index]; + if (binding.buffer_id) { + // Already updated + return; + } + // Mark as dirty + if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) { + dirty_uniform_buffers[stage] |= 1U << index; + } + // Resolve buffer + binding.buffer_id = FindBuffer(binding.cpu_addr, binding.size); + }); +} + +template <class P> +void BufferCache<P>::UpdateStorageBuffers(size_t stage) { + const u32 written_mask = written_storage_buffers[stage]; + ForEachEnabledBit(enabled_storage_buffers[stage], [&](u32 index) { + // Resolve buffer + Binding& binding = storage_buffers[stage][index]; + const BufferId buffer_id = FindBuffer(binding.cpu_addr, binding.size); + binding.buffer_id = buffer_id; + // Mark buffer as written if needed + if (((written_mask >> index) & 1) != 0) { + MarkWrittenBuffer(buffer_id, binding.cpu_addr, binding.size); + } + }); +} - const VAddr cpu_addr_end = cpu_addr + size - 1; - const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; - for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { - auto it = blocks.find(page_start); - if (it == blocks.end()) { - if (found) { - found = EnlargeBlock(found); - continue; - } - const VAddr start_addr = page_start << BLOCK_PAGE_BITS; - found = CreateBlock(start_addr, BLOCK_PAGE_SIZE); - blocks.insert_or_assign(page_start, found); - continue; - } - if (!found) { - found = it->second; - continue; - } - if (found != it->second) { - found = MergeBlocks(std::move(found), it->second); +template <class P> +void BufferCache<P>::UpdateTransformFeedbackBuffers() { + if (maxwell3d.regs.tfb_enabled == 0) { + return; + } + for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) { + UpdateTransformFeedbackBuffer(index); + } +} + +template <class P> +void BufferCache<P>::UpdateTransformFeedbackBuffer(u32 index) { + const auto& binding = maxwell3d.regs.tfb_bindings[index]; + const GPUVAddr gpu_addr = binding.Address() + binding.buffer_offset; + const u32 size = binding.buffer_size; + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); + if (binding.buffer_enable == 0 || size == 0 || !cpu_addr) { + transform_feedback_buffers[index] = NULL_BINDING; + return; + } + const BufferId buffer_id = FindBuffer(*cpu_addr, size); + transform_feedback_buffers[index] = Binding{ + .cpu_addr = *cpu_addr, + .size = size, + .buffer_id = buffer_id, + }; + MarkWrittenBuffer(buffer_id, *cpu_addr, size); +} + +template <class P> +void BufferCache<P>::UpdateComputeUniformBuffers() { + ForEachEnabledBit(enabled_compute_uniform_buffers, [&](u32 index) { + Binding& binding = compute_uniform_buffers[index]; + binding = NULL_BINDING; + const auto& launch_desc = kepler_compute.launch_description; + if (((launch_desc.const_buffer_enable_mask >> index) & 1) != 0) { + const auto& cbuf = launch_desc.const_buffer_config[index]; + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(cbuf.Address()); + if (cpu_addr) { + binding.cpu_addr = *cpu_addr; + binding.size = cbuf.size; } } - return found.get(); + binding.buffer_id = FindBuffer(binding.cpu_addr, binding.size); + }); +} + +template <class P> +void BufferCache<P>::UpdateComputeStorageBuffers() { + ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) { + // Resolve buffer + Binding& binding = compute_storage_buffers[index]; + const BufferId buffer_id = FindBuffer(binding.cpu_addr, binding.size); + binding.buffer_id = buffer_id; + // Mark as written if needed + if (((written_compute_storage_buffers >> index) & 1) != 0) { + MarkWrittenBuffer(buffer_id, binding.cpu_addr, binding.size); + } + }); +} + +template <class P> +void BufferCache<P>::MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size) { + Buffer& buffer = slot_buffers[buffer_id]; + buffer.MarkRegionAsGpuModified(cpu_addr, size); + + const bool is_accuracy_high = Settings::IsGPULevelHigh(); + const bool is_async = Settings::values.use_asynchronous_gpu_emulation.GetValue(); + if (!is_accuracy_high || !is_async) { + return; + } + if (std::ranges::find(uncommitted_downloads, buffer_id) != uncommitted_downloads.end()) { + // Already inserted + return; } + uncommitted_downloads.push_back(buffer_id); +} - void MarkRegionAsWritten(VAddr start, VAddr end) { - const u64 page_end = end >> WRITE_PAGE_BIT; - for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) { - if (const auto [it, inserted] = written_pages.emplace(page_start, 1); !inserted) { - ++it->second; - } +template <class P> +BufferId BufferCache<P>::FindBuffer(VAddr cpu_addr, u32 size) { + if (cpu_addr == 0) { + return NULL_BUFFER_ID; + } + const u64 page = cpu_addr >> PAGE_BITS; + const BufferId buffer_id = page_table[page]; + if (!buffer_id) { + return CreateBuffer(cpu_addr, size); + } + const Buffer& buffer = slot_buffers[buffer_id]; + if (buffer.IsInBounds(cpu_addr, size)) { + return buffer_id; + } + return CreateBuffer(cpu_addr, size); +} + +template <class P> +typename BufferCache<P>::OverlapResult BufferCache<P>::ResolveOverlaps(VAddr cpu_addr, + u32 wanted_size) { + static constexpr int STREAM_LEAP_THRESHOLD = 16; + std::vector<BufferId> overlap_ids; + VAddr begin = cpu_addr; + VAddr end = cpu_addr + wanted_size; + int stream_score = 0; + bool has_stream_leap = false; + for (; cpu_addr >> PAGE_BITS < Common::DivCeil(end, PAGE_SIZE); cpu_addr += PAGE_SIZE) { + const BufferId overlap_id = page_table[cpu_addr >> PAGE_BITS]; + if (!overlap_id) { + continue; + } + Buffer& overlap = slot_buffers[overlap_id]; + if (overlap.IsPicked()) { + continue; + } + overlap_ids.push_back(overlap_id); + overlap.Pick(); + const VAddr overlap_cpu_addr = overlap.CpuAddr(); + if (overlap_cpu_addr < begin) { + cpu_addr = begin = overlap_cpu_addr; + } + end = std::max(end, overlap_cpu_addr + overlap.SizeBytes()); + + stream_score += overlap.StreamScore(); + if (stream_score > STREAM_LEAP_THRESHOLD && !has_stream_leap) { + // When this memory region has been joined a bunch of times, we assume it's being used + // as a stream buffer. Increase the size to skip constantly recreating buffers. + has_stream_leap = true; + end += PAGE_SIZE * 256; } } - - void UnmarkRegionAsWritten(VAddr start, VAddr end) { - const u64 page_end = end >> WRITE_PAGE_BIT; - for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) { - auto it = written_pages.find(page_start); - if (it != written_pages.end()) { - if (it->second > 1) { - --it->second; - } else { - written_pages.erase(it); - } - } + return OverlapResult{ + .ids = std::move(overlap_ids), + .begin = begin, + .end = end, + .has_stream_leap = has_stream_leap, + }; +} + +template <class P> +void BufferCache<P>::JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, + bool accumulate_stream_score) { + Buffer& new_buffer = slot_buffers[new_buffer_id]; + Buffer& overlap = slot_buffers[overlap_id]; + if (accumulate_stream_score) { + new_buffer.IncreaseStreamScore(overlap.StreamScore() + 1); + } + std::vector<BufferCopy> copies; + const size_t dst_base_offset = overlap.CpuAddr() - new_buffer.CpuAddr(); + overlap.ForEachDownloadRange([&](u64 begin, u64 range_size) { + copies.push_back(BufferCopy{ + .src_offset = begin, + .dst_offset = dst_base_offset + begin, + .size = range_size, + }); + new_buffer.UnmarkRegionAsCpuModified(begin, range_size); + new_buffer.MarkRegionAsGpuModified(begin, range_size); + }); + if (!copies.empty()) { + runtime.CopyBuffer(slot_buffers[new_buffer_id], overlap, copies); + } + ReplaceBufferDownloads(overlap_id, new_buffer_id); + DeleteBuffer(overlap_id); +} + +template <class P> +BufferId BufferCache<P>::CreateBuffer(VAddr cpu_addr, u32 wanted_size) { + const OverlapResult overlap = ResolveOverlaps(cpu_addr, wanted_size); + const u32 size = static_cast<u32>(overlap.end - overlap.begin); + const BufferId new_buffer_id = slot_buffers.insert(runtime, rasterizer, overlap.begin, size); + for (const BufferId overlap_id : overlap.ids) { + JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap); + } + Register(new_buffer_id); + return new_buffer_id; +} + +template <class P> +void BufferCache<P>::Register(BufferId buffer_id) { + ChangeRegister<true>(buffer_id); +} + +template <class P> +void BufferCache<P>::Unregister(BufferId buffer_id) { + ChangeRegister<false>(buffer_id); +} + +template <class P> +template <bool insert> +void BufferCache<P>::ChangeRegister(BufferId buffer_id) { + const Buffer& buffer = slot_buffers[buffer_id]; + const VAddr cpu_addr_begin = buffer.CpuAddr(); + const VAddr cpu_addr_end = cpu_addr_begin + buffer.SizeBytes(); + const u64 page_begin = cpu_addr_begin / PAGE_SIZE; + const u64 page_end = Common::DivCeil(cpu_addr_end, PAGE_SIZE); + for (u64 page = page_begin; page != page_end; ++page) { + if constexpr (insert) { + page_table[page] = buffer_id; + } else { + page_table[page] = BufferId{}; } } +} - bool IsRegionWritten(VAddr start, VAddr end) const { - const u64 page_end = end >> WRITE_PAGE_BIT; - for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) { - if (written_pages.count(page_start) > 0) { - return true; +template <class P> +void BufferCache<P>::SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size) { + if (buffer.CpuAddr() == 0) { + return; + } + SynchronizeBufferImpl(buffer, cpu_addr, size); +} + +template <class P> +void BufferCache<P>::SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size) { + boost::container::small_vector<BufferCopy, 4> copies; + u64 total_size_bytes = 0; + u64 largest_copy = 0; + buffer.ForEachUploadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) { + copies.push_back(BufferCopy{ + .src_offset = total_size_bytes, + .dst_offset = range_offset, + .size = range_size, + }); + total_size_bytes += range_size; + largest_copy = std::max(largest_copy, range_size); + }); + if (total_size_bytes == 0) { + return; + } + const std::span<BufferCopy> copies_span(copies.data(), copies.size()); + UploadMemory(buffer, total_size_bytes, largest_copy, copies_span); +} + +template <class P> +void BufferCache<P>::UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy, + std::span<BufferCopy> copies) { + if constexpr (USE_MEMORY_MAPS) { + MappedUploadMemory(buffer, total_size_bytes, copies); + } else { + ImmediateUploadMemory(buffer, largest_copy, copies); + } +} + +template <class P> +void BufferCache<P>::ImmediateUploadMemory(Buffer& buffer, u64 largest_copy, + std::span<const BufferCopy> copies) { + std::span<u8> immediate_buffer; + for (const BufferCopy& copy : copies) { + std::span<const u8> upload_span; + const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset; + if (IsRangeGranular(cpu_addr, copy.size)) { + upload_span = std::span(cpu_memory.GetPointer(cpu_addr), copy.size); + } else { + if (immediate_buffer.empty()) { + immediate_buffer = ImmediateBuffer(largest_copy); } + cpu_memory.ReadBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size); + upload_span = immediate_buffer.subspan(0, copy.size); } - return false; + buffer.ImmediateUpload(copy.dst_offset, upload_span); } - - void QueueDestruction(std::shared_ptr<Buffer> buffer) { - buffer->SetEpoch(epoch); - pending_destruction.push(std::move(buffer)); +} + +template <class P> +void BufferCache<P>::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, + std::span<BufferCopy> copies) { + auto upload_staging = runtime.UploadStagingBuffer(total_size_bytes); + const std::span<u8> staging_pointer = upload_staging.mapped_span; + for (BufferCopy& copy : copies) { + u8* const src_pointer = staging_pointer.data() + copy.src_offset; + const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset; + cpu_memory.ReadBlockUnsafe(cpu_addr, src_pointer, copy.size); + + // Apply the staging offset + copy.src_offset += upload_staging.offset; } - - void MarkForAsyncFlush(MapInterval* map) { - if (!uncommitted_flushes) { - uncommitted_flushes = std::make_shared<std::unordered_set<MapInterval*>>(); + runtime.CopyBuffer(buffer, upload_staging.buffer, copies); +} + +template <class P> +void BufferCache<P>::DeleteBuffer(BufferId buffer_id) { + const auto scalar_replace = [buffer_id](Binding& binding) { + if (binding.buffer_id == buffer_id) { + binding.buffer_id = BufferId{}; + } + }; + const auto replace = [scalar_replace](std::span<Binding> bindings) { + std::ranges::for_each(bindings, scalar_replace); + }; + scalar_replace(index_buffer); + replace(vertex_buffers); + std::ranges::for_each(uniform_buffers, replace); + std::ranges::for_each(storage_buffers, replace); + replace(transform_feedback_buffers); + replace(compute_uniform_buffers); + replace(compute_storage_buffers); + std::erase(cached_write_buffer_ids, buffer_id); + + // Mark the whole buffer as CPU written to stop tracking CPU writes + Buffer& buffer = slot_buffers[buffer_id]; + buffer.MarkRegionAsCpuModified(buffer.CpuAddr(), buffer.SizeBytes()); + + Unregister(buffer_id); + delayed_destruction_ring.Push(std::move(slot_buffers[buffer_id])); + + NotifyBufferDeletion(); +} + +template <class P> +void BufferCache<P>::ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id) { + const auto replace = [old_buffer_id, new_buffer_id](std::vector<BufferId>& buffers) { + std::ranges::replace(buffers, old_buffer_id, new_buffer_id); + if (auto it = std::ranges::find(buffers, new_buffer_id); it != buffers.end()) { + buffers.erase(std::remove(it + 1, buffers.end(), new_buffer_id), buffers.end()); } - uncommitted_flushes->insert(map); + }; + replace(uncommitted_downloads); + std::ranges::for_each(committed_downloads, replace); +} + +template <class P> +void BufferCache<P>::NotifyBufferDeletion() { + if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) { + dirty_uniform_buffers.fill(~u32{0}); } + auto& flags = maxwell3d.dirty.flags; + flags[Dirty::IndexBuffer] = true; + flags[Dirty::VertexBuffers] = true; + for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) { + flags[Dirty::VertexBuffer0 + index] = true; + } + has_deleted_buffers = true; +} + +template <class P> +typename BufferCache<P>::Binding BufferCache<P>::StorageBufferBinding(GPUVAddr ssbo_addr) const { + const GPUVAddr gpu_addr = gpu_memory.Read<u64>(ssbo_addr); + const u32 size = gpu_memory.Read<u32>(ssbo_addr + 8); + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); + if (!cpu_addr || size == 0) { + return NULL_BINDING; + } + // HACK(Rodrigo): This is the number of bytes bound in host beyond the guest API's range. + // It exists due to some games like Astral Chain operate out of bounds. + // Binding the whole map range would be technically correct, but games have large maps that make + // this approach unaffordable for now. + static constexpr u32 arbitrary_extra_bytes = 0xc000; + const u32 bytes_to_map_end = static_cast<u32>(gpu_memory.BytesToMapEnd(gpu_addr)); + const Binding binding{ + .cpu_addr = *cpu_addr, + .size = std::min(size + arbitrary_extra_bytes, bytes_to_map_end), + .buffer_id = BufferId{}, + }; + return binding; +} + +template <class P> +std::span<const u8> BufferCache<P>::ImmediateBufferWithData(VAddr cpu_addr, size_t size) { + u8* const base_pointer = cpu_memory.GetPointer(cpu_addr); + if (IsRangeGranular(cpu_addr, size) || + base_pointer + size == cpu_memory.GetPointer(cpu_addr + size)) { + return std::span(base_pointer, size); + } else { + const std::span<u8> span = ImmediateBuffer(size); + cpu_memory.ReadBlockUnsafe(cpu_addr, span.data(), size); + return span; + } +} - VideoCore::RasterizerInterface& rasterizer; - Core::System& system; - - std::unique_ptr<StreamBuffer> stream_buffer; - BufferType stream_buffer_handle; - - u8* buffer_ptr = nullptr; - u64 buffer_offset = 0; - u64 buffer_offset_base = 0; - - MapIntervalAllocator mapped_addresses_allocator; - boost::intrusive::set<MapInterval, boost::intrusive::compare<MapIntervalCompare>> - mapped_addresses; - - std::unordered_map<u64, u32> written_pages; - std::unordered_map<u64, std::shared_ptr<Buffer>> blocks; - - std::queue<std::shared_ptr<Buffer>> pending_destruction; - u64 epoch = 0; - u64 modified_ticks = 0; - - std::vector<u8> staging_buffer; - - std::list<MapInterval*> marked_for_unregister; - - std::shared_ptr<std::unordered_set<MapInterval*>> uncommitted_flushes; - std::list<std::shared_ptr<std::list<MapInterval*>>> committed_flushes; - - std::recursive_mutex mutex; -}; +template <class P> +std::span<u8> BufferCache<P>::ImmediateBuffer(size_t wanted_capacity) { + if (wanted_capacity > immediate_buffer_capacity) { + immediate_buffer_capacity = wanted_capacity; + immediate_buffer_alloc = std::make_unique<u8[]>(wanted_capacity); + } + return std::span<u8>(immediate_buffer_alloc.get(), wanted_capacity); +} + +template <class P> +bool BufferCache<P>::HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept { + if constexpr (IS_OPENGL) { + return ((fast_bound_uniform_buffers[stage] >> binding_index) & 1) != 0; + } else { + // Only OpenGL has fast uniform buffers + return false; + } +} } // namespace VideoCommon diff --git a/src/video_core/buffer_cache/map_interval.cpp b/src/video_core/buffer_cache/map_interval.cpp deleted file mode 100644 index 62587e18a..000000000 --- a/src/video_core/buffer_cache/map_interval.cpp +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2020 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include <algorithm> -#include <array> -#include <cstddef> -#include <memory> - -#include "video_core/buffer_cache/map_interval.h" - -namespace VideoCommon { - -MapIntervalAllocator::MapIntervalAllocator() { - FillFreeList(first_chunk); -} - -MapIntervalAllocator::~MapIntervalAllocator() = default; - -void MapIntervalAllocator::AllocateNewChunk() { - *new_chunk = std::make_unique<Chunk>(); - FillFreeList(**new_chunk); - new_chunk = &(*new_chunk)->next; -} - -void MapIntervalAllocator::FillFreeList(Chunk& chunk) { - const std::size_t old_size = free_list.size(); - free_list.resize(old_size + chunk.data.size()); - std::transform(chunk.data.rbegin(), chunk.data.rend(), free_list.begin() + old_size, - [](MapInterval& interval) { return &interval; }); -} - -} // namespace VideoCommon diff --git a/src/video_core/buffer_cache/map_interval.h b/src/video_core/buffer_cache/map_interval.h deleted file mode 100644 index fe0bcd1d8..000000000 --- a/src/video_core/buffer_cache/map_interval.h +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <array> -#include <cstddef> -#include <memory> -#include <vector> - -#include <boost/intrusive/set_hook.hpp> - -#include "common/common_types.h" -#include "video_core/gpu.h" - -namespace VideoCommon { - -struct MapInterval : public boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true>> { - MapInterval() = default; - - /*implicit*/ MapInterval(VAddr start_) noexcept : start{start_} {} - - explicit MapInterval(VAddr start_, VAddr end_, GPUVAddr gpu_addr_) noexcept - : start{start_}, end{end_}, gpu_addr{gpu_addr_} {} - - bool IsInside(VAddr other_start, VAddr other_end) const noexcept { - return start <= other_start && other_end <= end; - } - - bool Overlaps(VAddr other_start, VAddr other_end) const noexcept { - return start < other_end && other_start < end; - } - - void MarkAsModified(bool is_modified_, u64 ticks_) noexcept { - is_modified = is_modified_; - ticks = ticks_; - } - - boost::intrusive::set_member_hook<> member_hook_; - VAddr start = 0; - VAddr end = 0; - GPUVAddr gpu_addr = 0; - u64 ticks = 0; - bool is_written = false; - bool is_modified = false; - bool is_registered = false; - bool is_memory_marked = false; - bool is_sync_pending = false; -}; - -struct MapIntervalCompare { - constexpr bool operator()(const MapInterval& lhs, const MapInterval& rhs) const noexcept { - return lhs.start < rhs.start; - } -}; - -class MapIntervalAllocator { -public: - MapIntervalAllocator(); - ~MapIntervalAllocator(); - - MapInterval* Allocate() { - if (free_list.empty()) { - AllocateNewChunk(); - } - MapInterval* const interval = free_list.back(); - free_list.pop_back(); - return interval; - } - - void Release(MapInterval* interval) { - free_list.push_back(interval); - } - -private: - struct Chunk { - std::unique_ptr<Chunk> next; - std::array<MapInterval, 0x8000> data; - }; - - void AllocateNewChunk(); - - void FillFreeList(Chunk& chunk); - - std::vector<MapInterval*> free_list; - std::unique_ptr<Chunk>* new_chunk = &first_chunk.next; - - Chunk first_chunk; -}; - -} // namespace VideoCommon diff --git a/src/video_core/cdma_pusher.cpp b/src/video_core/cdma_pusher.cpp new file mode 100644 index 000000000..33b3c060b --- /dev/null +++ b/src/video_core/cdma_pusher.cpp @@ -0,0 +1,170 @@ +// MIT License +// +// Copyright (c) Ryujinx Team and Contributors +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +// associated documentation files (the "Software"), to deal in the Software without restriction, +// including without limitation the rights to use, copy, modify, merge, publish, distribute, +// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT +// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// + +#include <bit> +#include "command_classes/host1x.h" +#include "command_classes/nvdec.h" +#include "command_classes/vic.h" +#include "video_core/cdma_pusher.h" +#include "video_core/command_classes/nvdec_common.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/gpu.h" +#include "video_core/memory_manager.h" + +namespace Tegra { +CDmaPusher::CDmaPusher(GPU& gpu_) + : gpu{gpu_}, nvdec_processor(std::make_shared<Nvdec>(gpu)), + vic_processor(std::make_unique<Vic>(gpu, nvdec_processor)), + host1x_processor(std::make_unique<Host1x>(gpu)), + sync_manager(std::make_unique<SyncptIncrManager>(gpu)) {} + +CDmaPusher::~CDmaPusher() = default; + +void CDmaPusher::Push(ChCommandHeaderList&& entries) { + cdma_queue.push(std::move(entries)); +} + +void CDmaPusher::DispatchCalls() { + while (!cdma_queue.empty()) { + Step(); + } +} + +void CDmaPusher::Step() { + const auto entries{cdma_queue.front()}; + cdma_queue.pop(); + + std::vector<u32> values(entries.size()); + std::memcpy(values.data(), entries.data(), entries.size() * sizeof(u32)); + + for (const u32 value : values) { + if (mask != 0) { + const auto lbs = static_cast<u32>(std::countr_zero(mask)); + mask &= ~(1U << lbs); + ExecuteCommand(static_cast<u32>(offset + lbs), value); + continue; + } else if (count != 0) { + --count; + ExecuteCommand(static_cast<u32>(offset), value); + if (incrementing) { + ++offset; + } + continue; + } + const auto mode = static_cast<ChSubmissionMode>((value >> 28) & 0xf); + switch (mode) { + case ChSubmissionMode::SetClass: { + mask = value & 0x3f; + offset = (value >> 16) & 0xfff; + current_class = static_cast<ChClassId>((value >> 6) & 0x3ff); + break; + } + case ChSubmissionMode::Incrementing: + case ChSubmissionMode::NonIncrementing: + count = value & 0xffff; + offset = (value >> 16) & 0xfff; + incrementing = mode == ChSubmissionMode::Incrementing; + break; + case ChSubmissionMode::Mask: + mask = value & 0xffff; + offset = (value >> 16) & 0xfff; + break; + case ChSubmissionMode::Immediate: { + const u32 data = value & 0xfff; + offset = (value >> 16) & 0xfff; + ExecuteCommand(static_cast<u32>(offset), data); + break; + } + default: + UNIMPLEMENTED_MSG("ChSubmission mode {} is not implemented!", static_cast<u32>(mode)); + break; + } + } +} + +void CDmaPusher::ExecuteCommand(u32 state_offset, u32 data) { + switch (current_class) { + case ChClassId::NvDec: + ThiStateWrite(nvdec_thi_state, state_offset, {data}); + switch (static_cast<ThiMethod>(state_offset)) { + case ThiMethod::IncSyncpt: { + LOG_DEBUG(Service_NVDRV, "NVDEC Class IncSyncpt Method"); + const auto syncpoint_id = static_cast<u32>(data & 0xFF); + const auto cond = static_cast<u32>((data >> 8) & 0xFF); + if (cond == 0) { + sync_manager->Increment(syncpoint_id); + } else { + sync_manager->SignalDone( + sync_manager->IncrementWhenDone(static_cast<u32>(current_class), syncpoint_id)); + } + break; + } + case ThiMethod::SetMethod1: + LOG_DEBUG(Service_NVDRV, "NVDEC method 0x{:X}", + static_cast<u32>(nvdec_thi_state.method_0)); + nvdec_processor->ProcessMethod(static_cast<Nvdec::Method>(nvdec_thi_state.method_0), + {data}); + break; + default: + break; + } + break; + case ChClassId::GraphicsVic: + ThiStateWrite(vic_thi_state, static_cast<u32>(state_offset), {data}); + switch (static_cast<ThiMethod>(state_offset)) { + case ThiMethod::IncSyncpt: { + LOG_DEBUG(Service_NVDRV, "VIC Class IncSyncpt Method"); + const auto syncpoint_id = static_cast<u32>(data & 0xFF); + const auto cond = static_cast<u32>((data >> 8) & 0xFF); + if (cond == 0) { + sync_manager->Increment(syncpoint_id); + } else { + sync_manager->SignalDone( + sync_manager->IncrementWhenDone(static_cast<u32>(current_class), syncpoint_id)); + } + break; + } + case ThiMethod::SetMethod1: + LOG_DEBUG(Service_NVDRV, "VIC method 0x{:X}, Args=({})", + static_cast<u32>(vic_thi_state.method_0), data); + vic_processor->ProcessMethod(static_cast<Vic::Method>(vic_thi_state.method_0), {data}); + break; + default: + break; + } + break; + case ChClassId::Host1x: + // This device is mainly for syncpoint synchronization + LOG_DEBUG(Service_NVDRV, "Host1X Class Method"); + host1x_processor->ProcessMethod(static_cast<Host1x::Method>(state_offset), {data}); + break; + default: + UNIMPLEMENTED_MSG("Current class not implemented {:X}", static_cast<u32>(current_class)); + break; + } +} + +void CDmaPusher::ThiStateWrite(ThiRegisters& state, u32 state_offset, + const std::vector<u32>& arguments) { + u8* const state_offset_ptr = reinterpret_cast<u8*>(&state) + sizeof(u32) * state_offset; + std::memcpy(state_offset_ptr, arguments.data(), sizeof(u32) * arguments.size()); +} + +} // namespace Tegra diff --git a/src/video_core/cdma_pusher.h b/src/video_core/cdma_pusher.h new file mode 100644 index 000000000..e5f212c1a --- /dev/null +++ b/src/video_core/cdma_pusher.h @@ -0,0 +1,136 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <memory> +#include <unordered_map> +#include <vector> +#include <queue> + +#include "common/bit_field.h" +#include "common/common_types.h" +#include "video_core/command_classes/sync_manager.h" + +namespace Tegra { + +class GPU; +class Nvdec; +class Vic; +class Host1x; + +enum class ChSubmissionMode : u32 { + SetClass = 0, + Incrementing = 1, + NonIncrementing = 2, + Mask = 3, + Immediate = 4, + Restart = 5, + Gather = 6, +}; + +enum class ChClassId : u32 { + NoClass = 0x0, + Host1x = 0x1, + VideoEncodeMpeg = 0x20, + VideoEncodeNvEnc = 0x21, + VideoStreamingVi = 0x30, + VideoStreamingIsp = 0x32, + VideoStreamingIspB = 0x34, + VideoStreamingViI2c = 0x36, + GraphicsVic = 0x5d, + Graphics3D = 0x60, + GraphicsGpu = 0x61, + Tsec = 0xe0, + TsecB = 0xe1, + NvJpg = 0xc0, + NvDec = 0xf0 +}; + +enum class ChMethod : u32 { + Empty = 0, + SetMethod = 0x10, + SetData = 0x11, +}; + +union ChCommandHeader { + u32 raw; + BitField<0, 16, u32> value; + BitField<16, 12, ChMethod> method_offset; + BitField<28, 4, ChSubmissionMode> submission_mode; +}; +static_assert(sizeof(ChCommandHeader) == sizeof(u32), "ChCommand header is an invalid size"); + +struct ChCommand { + ChClassId class_id{}; + int method_offset{}; + std::vector<u32> arguments; +}; + +using ChCommandHeaderList = std::vector<ChCommandHeader>; +using ChCommandList = std::vector<ChCommand>; + +struct ThiRegisters { + u32_le increment_syncpt{}; + INSERT_PADDING_WORDS(1); + u32_le increment_syncpt_error{}; + u32_le ctx_switch_incremement_syncpt{}; + INSERT_PADDING_WORDS(4); + u32_le ctx_switch{}; + INSERT_PADDING_WORDS(1); + u32_le ctx_syncpt_eof{}; + INSERT_PADDING_WORDS(5); + u32_le method_0{}; + u32_le method_1{}; + INSERT_PADDING_WORDS(12); + u32_le int_status{}; + u32_le int_mask{}; +}; + +enum class ThiMethod : u32 { + IncSyncpt = offsetof(ThiRegisters, increment_syncpt) / sizeof(u32), + SetMethod0 = offsetof(ThiRegisters, method_0) / sizeof(u32), + SetMethod1 = offsetof(ThiRegisters, method_1) / sizeof(u32), +}; + +class CDmaPusher { +public: + explicit CDmaPusher(GPU& gpu_); + ~CDmaPusher(); + + /// Push NVDEC command buffer entries into queue + void Push(ChCommandHeaderList&& entries); + + /// Process queued command buffer entries + void DispatchCalls(); + + /// Process one queue element + void Step(); + + /// Invoke command class devices to execute the command based on the current state + void ExecuteCommand(u32 state_offset, u32 data); + +private: + /// Write arguments value to the ThiRegisters member at the specified offset + void ThiStateWrite(ThiRegisters& state, u32 state_offset, const std::vector<u32>& arguments); + + GPU& gpu; + std::shared_ptr<Tegra::Nvdec> nvdec_processor; + std::unique_ptr<Tegra::Vic> vic_processor; + std::unique_ptr<Tegra::Host1x> host1x_processor; + std::unique_ptr<SyncptIncrManager> sync_manager; + ChClassId current_class{}; + ThiRegisters vic_thi_state{}; + ThiRegisters nvdec_thi_state{}; + + s32 count{}; + s32 offset{}; + u32 mask{}; + bool incrementing{}; + + // Queue of command lists to be processed + std::queue<ChCommandHeaderList> cdma_queue; +}; + +} // namespace Tegra diff --git a/src/video_core/command_classes/codecs/codec.cpp b/src/video_core/command_classes/codecs/codec.cpp new file mode 100644 index 000000000..39bc923a5 --- /dev/null +++ b/src/video_core/command_classes/codecs/codec.cpp @@ -0,0 +1,129 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <cstring> +#include <fstream> +#include <vector> +#include "common/assert.h" +#include "video_core/command_classes/codecs/codec.h" +#include "video_core/command_classes/codecs/h264.h" +#include "video_core/command_classes/codecs/vp9.h" +#include "video_core/gpu.h" +#include "video_core/memory_manager.h" + +extern "C" { +#include <libavutil/opt.h> +} + +namespace Tegra { + +void AVFrameDeleter(AVFrame* ptr) { + av_frame_unref(ptr); + av_free(ptr); +} + +Codec::Codec(GPU& gpu_) + : gpu(gpu_), h264_decoder(std::make_unique<Decoder::H264>(gpu)), + vp9_decoder(std::make_unique<Decoder::VP9>(gpu)) {} + +Codec::~Codec() { + if (!initialized) { + return; + } + // Free libav memory + AVFrame* av_frame{nullptr}; + avcodec_send_packet(av_codec_ctx, nullptr); + av_frame = av_frame_alloc(); + avcodec_receive_frame(av_codec_ctx, av_frame); + avcodec_flush_buffers(av_codec_ctx); + + av_frame_unref(av_frame); + av_free(av_frame); + avcodec_close(av_codec_ctx); +} + +void Codec::SetTargetCodec(NvdecCommon::VideoCodec codec) { + LOG_INFO(Service_NVDRV, "NVDEC video codec initialized to {}", codec); + current_codec = codec; +} + +void Codec::StateWrite(u32 offset, u64 arguments) { + u8* const state_offset = reinterpret_cast<u8*>(&state) + offset * sizeof(u64); + std::memcpy(state_offset, &arguments, sizeof(u64)); +} + +void Codec::Decode() { + bool is_first_frame = false; + + if (!initialized) { + if (current_codec == NvdecCommon::VideoCodec::H264) { + av_codec = avcodec_find_decoder(AV_CODEC_ID_H264); + } else if (current_codec == NvdecCommon::VideoCodec::Vp9) { + av_codec = avcodec_find_decoder(AV_CODEC_ID_VP9); + } else { + LOG_ERROR(Service_NVDRV, "Unknown video codec {}", current_codec); + return; + } + + av_codec_ctx = avcodec_alloc_context3(av_codec); + av_opt_set(av_codec_ctx->priv_data, "tune", "zerolatency", 0); + + // TODO(ameerj): libavcodec gpu hw acceleration + + const auto av_error = avcodec_open2(av_codec_ctx, av_codec, nullptr); + if (av_error < 0) { + LOG_ERROR(Service_NVDRV, "avcodec_open2() Failed."); + avcodec_close(av_codec_ctx); + return; + } + initialized = true; + is_first_frame = true; + } + bool vp9_hidden_frame = false; + + AVPacket packet{}; + av_init_packet(&packet); + std::vector<u8> frame_data; + + if (current_codec == NvdecCommon::VideoCodec::H264) { + frame_data = h264_decoder->ComposeFrameHeader(state, is_first_frame); + } else if (current_codec == NvdecCommon::VideoCodec::Vp9) { + frame_data = vp9_decoder->ComposeFrameHeader(state); + vp9_hidden_frame = vp9_decoder->WasFrameHidden(); + } + + packet.data = frame_data.data(); + packet.size = static_cast<int>(frame_data.size()); + + avcodec_send_packet(av_codec_ctx, &packet); + + if (!vp9_hidden_frame) { + // Only receive/store visible frames + AVFramePtr frame = AVFramePtr{av_frame_alloc(), AVFrameDeleter}; + avcodec_receive_frame(av_codec_ctx, frame.get()); + av_frames.push(std::move(frame)); + // Limit queue to 10 frames. Workaround for ZLA decode and queue spam + if (av_frames.size() > 10) { + av_frames.pop(); + } + } +} + +AVFramePtr Codec::GetCurrentFrame() { + // Sometimes VIC will request more frames than have been decoded. + // in this case, return a nullptr and don't overwrite previous frame data + if (av_frames.empty()) { + return AVFramePtr{nullptr, AVFrameDeleter}; + } + + AVFramePtr frame = std::move(av_frames.front()); + av_frames.pop(); + return frame; +} + +NvdecCommon::VideoCodec Codec::GetCurrentCodec() const { + return current_codec; +} + +} // namespace Tegra diff --git a/src/video_core/command_classes/codecs/codec.h b/src/video_core/command_classes/codecs/codec.h new file mode 100644 index 000000000..8a2a6c360 --- /dev/null +++ b/src/video_core/command_classes/codecs/codec.h @@ -0,0 +1,70 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <memory> +#include <queue> +#include "common/common_types.h" +#include "video_core/command_classes/nvdec_common.h" + +extern "C" { +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif +#include <libavcodec/avcodec.h> +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} + +namespace Tegra { +class GPU; +struct VicRegisters; + +void AVFrameDeleter(AVFrame* ptr); +using AVFramePtr = std::unique_ptr<AVFrame, decltype(&AVFrameDeleter)>; + +namespace Decoder { +class H264; +class VP9; +} // namespace Decoder + +class Codec { +public: + explicit Codec(GPU& gpu); + ~Codec(); + + /// Sets NVDEC video stream codec + void SetTargetCodec(NvdecCommon::VideoCodec codec); + + /// Populate NvdecRegisters state with argument value at the provided offset + void StateWrite(u32 offset, u64 arguments); + + /// Call decoders to construct headers, decode AVFrame with ffmpeg + void Decode(); + + /// Returns next decoded frame + [[nodiscard]] AVFramePtr GetCurrentFrame(); + + /// Returns the value of current_codec + [[nodiscard]] NvdecCommon::VideoCodec GetCurrentCodec() const; + +private: + bool initialized{}; + NvdecCommon::VideoCodec current_codec{NvdecCommon::VideoCodec::None}; + + AVCodec* av_codec{nullptr}; + AVCodecContext* av_codec_ctx{nullptr}; + + GPU& gpu; + std::unique_ptr<Decoder::H264> h264_decoder; + std::unique_ptr<Decoder::VP9> vp9_decoder; + + NvdecCommon::NvdecRegisters state{}; + std::queue<AVFramePtr> av_frames{}; +}; + +} // namespace Tegra diff --git a/src/video_core/command_classes/codecs/h264.cpp b/src/video_core/command_classes/codecs/h264.cpp new file mode 100644 index 000000000..fea6aed98 --- /dev/null +++ b/src/video_core/command_classes/codecs/h264.cpp @@ -0,0 +1,293 @@ +// MIT License +// +// Copyright (c) Ryujinx Team and Contributors +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +// associated documentation files (the "Software"), to deal in the Software without restriction, +// including without limitation the rights to use, copy, modify, merge, publish, distribute, +// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT +// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// + +#include <array> +#include <bit> +#include "video_core/command_classes/codecs/h264.h" +#include "video_core/gpu.h" +#include "video_core/memory_manager.h" + +namespace Tegra::Decoder { +namespace { +// ZigZag LUTs from libavcodec. +constexpr std::array<u8, 64> zig_zag_direct{ + 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, 12, 19, 26, 33, 40, 48, + 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, + 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63, +}; + +constexpr std::array<u8, 16> zig_zag_scan{ + 0 + 0 * 4, 1 + 0 * 4, 0 + 1 * 4, 0 + 2 * 4, 1 + 1 * 4, 2 + 0 * 4, 3 + 0 * 4, 2 + 1 * 4, + 1 + 2 * 4, 0 + 3 * 4, 1 + 3 * 4, 2 + 2 * 4, 3 + 1 * 4, 3 + 2 * 4, 2 + 3 * 4, 3 + 3 * 4, +}; +} // Anonymous namespace + +H264::H264(GPU& gpu_) : gpu(gpu_) {} + +H264::~H264() = default; + +const std::vector<u8>& H264::ComposeFrameHeader(const NvdecCommon::NvdecRegisters& state, + bool is_first_frame) { + H264DecoderContext context{}; + gpu.MemoryManager().ReadBlock(state.picture_info_offset, &context, sizeof(H264DecoderContext)); + + const s32 frame_number = static_cast<s32>((context.h264_parameter_set.flags >> 46) & 0x1ffff); + if (!is_first_frame && frame_number != 0) { + frame.resize(context.frame_data_size); + + gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, frame.data(), frame.size()); + } else { + /// Encode header + H264BitWriter writer{}; + writer.WriteU(1, 24); + writer.WriteU(0, 1); + writer.WriteU(3, 2); + writer.WriteU(7, 5); + writer.WriteU(100, 8); + writer.WriteU(0, 8); + writer.WriteU(31, 8); + writer.WriteUe(0); + const auto chroma_format_idc = + static_cast<u32>((context.h264_parameter_set.flags >> 12) & 3); + writer.WriteUe(chroma_format_idc); + if (chroma_format_idc == 3) { + writer.WriteBit(false); + } + + writer.WriteUe(0); + writer.WriteUe(0); + writer.WriteBit(false); // QpprimeYZeroTransformBypassFlag + writer.WriteBit(false); // Scaling matrix present flag + + const auto order_cnt_type = static_cast<u32>((context.h264_parameter_set.flags >> 14) & 3); + writer.WriteUe(static_cast<u32>((context.h264_parameter_set.flags >> 8) & 0xf)); + writer.WriteUe(order_cnt_type); + if (order_cnt_type == 0) { + writer.WriteUe(context.h264_parameter_set.log2_max_pic_order_cnt); + } else if (order_cnt_type == 1) { + writer.WriteBit(context.h264_parameter_set.delta_pic_order_always_zero_flag != 0); + + writer.WriteSe(0); + writer.WriteSe(0); + writer.WriteUe(0); + } + + const s32 pic_height = context.h264_parameter_set.pic_height_in_map_units / + (context.h264_parameter_set.frame_mbs_only_flag ? 1 : 2); + + writer.WriteUe(16); + writer.WriteBit(false); + writer.WriteUe(context.h264_parameter_set.pic_width_in_mbs - 1); + writer.WriteUe(pic_height - 1); + writer.WriteBit(context.h264_parameter_set.frame_mbs_only_flag != 0); + + if (!context.h264_parameter_set.frame_mbs_only_flag) { + writer.WriteBit(((context.h264_parameter_set.flags >> 0) & 1) != 0); + } + + writer.WriteBit(((context.h264_parameter_set.flags >> 1) & 1) != 0); + writer.WriteBit(false); // Frame cropping flag + writer.WriteBit(false); // VUI parameter present flag + + writer.End(); + + // H264 PPS + writer.WriteU(1, 24); + writer.WriteU(0, 1); + writer.WriteU(3, 2); + writer.WriteU(8, 5); + + writer.WriteUe(0); + writer.WriteUe(0); + + writer.WriteBit(context.h264_parameter_set.entropy_coding_mode_flag != 0); + writer.WriteBit(false); + writer.WriteUe(0); + writer.WriteUe(context.h264_parameter_set.num_refidx_l0_default_active); + writer.WriteUe(context.h264_parameter_set.num_refidx_l1_default_active); + writer.WriteBit(((context.h264_parameter_set.flags >> 2) & 1) != 0); + writer.WriteU(static_cast<s32>((context.h264_parameter_set.flags >> 32) & 0x3), 2); + s32 pic_init_qp = static_cast<s32>((context.h264_parameter_set.flags >> 16) & 0x3f); + pic_init_qp = (pic_init_qp << 26) >> 26; + writer.WriteSe(pic_init_qp); + writer.WriteSe(0); + s32 chroma_qp_index_offset = + static_cast<s32>((context.h264_parameter_set.flags >> 22) & 0x1f); + chroma_qp_index_offset = (chroma_qp_index_offset << 27) >> 27; + + writer.WriteSe(chroma_qp_index_offset); + writer.WriteBit(context.h264_parameter_set.deblocking_filter_control_flag != 0); + writer.WriteBit(((context.h264_parameter_set.flags >> 3) & 1) != 0); + writer.WriteBit(context.h264_parameter_set.redundant_pic_count_flag != 0); + writer.WriteBit(context.h264_parameter_set.transform_8x8_mode_flag != 0); + + writer.WriteBit(true); + + for (s32 index = 0; index < 6; index++) { + writer.WriteBit(true); + const auto matrix_x4 = + std::vector<u8>(context.scaling_matrix_4.begin(), context.scaling_matrix_4.end()); + writer.WriteScalingList(matrix_x4, index * 16, 16); + } + + if (context.h264_parameter_set.transform_8x8_mode_flag) { + for (s32 index = 0; index < 2; index++) { + writer.WriteBit(true); + const auto matrix_x8 = std::vector<u8>(context.scaling_matrix_8.begin(), + context.scaling_matrix_8.end()); + + writer.WriteScalingList(matrix_x8, index * 64, 64); + } + } + + s32 chroma_qp_index_offset2 = + static_cast<s32>((context.h264_parameter_set.flags >> 27) & 0x1f); + chroma_qp_index_offset2 = (chroma_qp_index_offset2 << 27) >> 27; + + writer.WriteSe(chroma_qp_index_offset2); + + writer.End(); + + const auto& encoded_header = writer.GetByteArray(); + frame.resize(encoded_header.size() + context.frame_data_size); + std::memcpy(frame.data(), encoded_header.data(), encoded_header.size()); + + gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, + frame.data() + encoded_header.size(), + context.frame_data_size); + } + + return frame; +} + +H264BitWriter::H264BitWriter() = default; + +H264BitWriter::~H264BitWriter() = default; + +void H264BitWriter::WriteU(s32 value, s32 value_sz) { + WriteBits(value, value_sz); +} + +void H264BitWriter::WriteSe(s32 value) { + WriteExpGolombCodedInt(value); +} + +void H264BitWriter::WriteUe(u32 value) { + WriteExpGolombCodedUInt(value); +} + +void H264BitWriter::End() { + WriteBit(true); + Flush(); +} + +void H264BitWriter::WriteBit(bool state) { + WriteBits(state ? 1 : 0, 1); +} + +void H264BitWriter::WriteScalingList(const std::vector<u8>& list, s32 start, s32 count) { + std::vector<u8> scan(count); + if (count == 16) { + std::memcpy(scan.data(), zig_zag_scan.data(), scan.size()); + } else { + std::memcpy(scan.data(), zig_zag_direct.data(), scan.size()); + } + u8 last_scale = 8; + + for (s32 index = 0; index < count; index++) { + const u8 value = list[start + scan[index]]; + const s32 delta_scale = static_cast<s32>(value - last_scale); + + WriteSe(delta_scale); + + last_scale = value; + } +} + +std::vector<u8>& H264BitWriter::GetByteArray() { + return byte_array; +} + +const std::vector<u8>& H264BitWriter::GetByteArray() const { + return byte_array; +} + +void H264BitWriter::WriteBits(s32 value, s32 bit_count) { + s32 value_pos = 0; + + s32 remaining = bit_count; + + while (remaining > 0) { + s32 copy_size = remaining; + + const s32 free_bits = GetFreeBufferBits(); + + if (copy_size > free_bits) { + copy_size = free_bits; + } + + const s32 mask = (1 << copy_size) - 1; + + const s32 src_shift = (bit_count - value_pos) - copy_size; + const s32 dst_shift = (buffer_size - buffer_pos) - copy_size; + + buffer |= ((value >> src_shift) & mask) << dst_shift; + + value_pos += copy_size; + buffer_pos += copy_size; + remaining -= copy_size; + } +} + +void H264BitWriter::WriteExpGolombCodedInt(s32 value) { + const s32 sign = value <= 0 ? 0 : 1; + if (value < 0) { + value = -value; + } + value = (value << 1) - sign; + WriteExpGolombCodedUInt(value); +} + +void H264BitWriter::WriteExpGolombCodedUInt(u32 value) { + const s32 size = 32 - std::countl_zero(value + 1); + WriteBits(1, size); + + value -= (1U << (size - 1)) - 1; + WriteBits(static_cast<s32>(value), size - 1); +} + +s32 H264BitWriter::GetFreeBufferBits() { + if (buffer_pos == buffer_size) { + Flush(); + } + + return buffer_size - buffer_pos; +} + +void H264BitWriter::Flush() { + if (buffer_pos == 0) { + return; + } + byte_array.push_back(static_cast<u8>(buffer)); + + buffer = 0; + buffer_pos = 0; +} +} // namespace Tegra::Decoder diff --git a/src/video_core/command_classes/codecs/h264.h b/src/video_core/command_classes/codecs/h264.h new file mode 100644 index 000000000..0f3a1d9f3 --- /dev/null +++ b/src/video_core/command_classes/codecs/h264.h @@ -0,0 +1,118 @@ +// MIT License +// +// Copyright (c) Ryujinx Team and Contributors +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +// associated documentation files (the "Software"), to deal in the Software without restriction, +// including without limitation the rights to use, copy, modify, merge, publish, distribute, +// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT +// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// + +#pragma once + +#include <vector> +#include "common/common_funcs.h" +#include "common/common_types.h" +#include "video_core/command_classes/nvdec_common.h" + +namespace Tegra { +class GPU; +namespace Decoder { + +class H264BitWriter { +public: + H264BitWriter(); + ~H264BitWriter(); + + /// The following Write methods are based on clause 9.1 in the H.264 specification. + /// WriteSe and WriteUe write in the Exp-Golomb-coded syntax + void WriteU(s32 value, s32 value_sz); + void WriteSe(s32 value); + void WriteUe(u32 value); + + /// Finalize the bitstream + void End(); + + /// append a bit to the stream, equivalent value to the state parameter + void WriteBit(bool state); + + /// Based on section 7.3.2.1.1.1 and Table 7-4 in the H.264 specification + /// Writes the scaling matrices of the sream + void WriteScalingList(const std::vector<u8>& list, s32 start, s32 count); + + /// Return the bitstream as a vector. + [[nodiscard]] std::vector<u8>& GetByteArray(); + [[nodiscard]] const std::vector<u8>& GetByteArray() const; + +private: + void WriteBits(s32 value, s32 bit_count); + void WriteExpGolombCodedInt(s32 value); + void WriteExpGolombCodedUInt(u32 value); + [[nodiscard]] s32 GetFreeBufferBits(); + void Flush(); + + s32 buffer_size{8}; + + s32 buffer{}; + s32 buffer_pos{}; + std::vector<u8> byte_array; +}; + +class H264 { +public: + explicit H264(GPU& gpu); + ~H264(); + + /// Compose the H264 header of the frame for FFmpeg decoding + [[nodiscard]] const std::vector<u8>& ComposeFrameHeader( + const NvdecCommon::NvdecRegisters& state, bool is_first_frame = false); + +private: + struct H264ParameterSet { + u32 log2_max_pic_order_cnt{}; + u32 delta_pic_order_always_zero_flag{}; + u32 frame_mbs_only_flag{}; + u32 pic_width_in_mbs{}; + u32 pic_height_in_map_units{}; + INSERT_PADDING_WORDS(1); + u32 entropy_coding_mode_flag{}; + u32 bottom_field_pic_order_flag{}; + u32 num_refidx_l0_default_active{}; + u32 num_refidx_l1_default_active{}; + u32 deblocking_filter_control_flag{}; + u32 redundant_pic_count_flag{}; + u32 transform_8x8_mode_flag{}; + INSERT_PADDING_WORDS(9); + u64 flags{}; + u32 frame_number{}; + u32 frame_number2{}; + }; + static_assert(sizeof(H264ParameterSet) == 0x68, "H264ParameterSet is an invalid size"); + + struct H264DecoderContext { + INSERT_PADDING_BYTES(0x48); + u32 frame_data_size{}; + INSERT_PADDING_BYTES(0xc); + H264ParameterSet h264_parameter_set{}; + INSERT_PADDING_BYTES(0x100); + std::array<u8, 0x60> scaling_matrix_4; + std::array<u8, 0x80> scaling_matrix_8; + }; + static_assert(sizeof(H264DecoderContext) == 0x2a0, "H264DecoderContext is an invalid size"); + + std::vector<u8> frame; + GPU& gpu; +}; + +} // namespace Decoder +} // namespace Tegra diff --git a/src/video_core/command_classes/codecs/vp9.cpp b/src/video_core/command_classes/codecs/vp9.cpp new file mode 100644 index 000000000..59e586695 --- /dev/null +++ b/src/video_core/command_classes/codecs/vp9.cpp @@ -0,0 +1,989 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <cstring> // for std::memcpy +#include <numeric> +#include "video_core/command_classes/codecs/vp9.h" +#include "video_core/gpu.h" +#include "video_core/memory_manager.h" + +namespace Tegra::Decoder { +namespace { +// Default compressed header probabilities once frame context resets +constexpr Vp9EntropyProbs default_probs{ + .y_mode_prob{ + 65, 32, 18, 144, 162, 194, 41, 51, 98, 132, 68, 18, 165, 217, 196, 45, 40, 78, + 173, 80, 19, 176, 240, 193, 64, 35, 46, 221, 135, 38, 194, 248, 121, 96, 85, 29, + }, + .partition_prob{ + 199, 122, 141, 0, 147, 63, 159, 0, 148, 133, 118, 0, 121, 104, 114, 0, + 174, 73, 87, 0, 92, 41, 83, 0, 82, 99, 50, 0, 53, 39, 39, 0, + 177, 58, 59, 0, 68, 26, 63, 0, 52, 79, 25, 0, 17, 14, 12, 0, + 222, 34, 30, 0, 72, 16, 44, 0, 58, 32, 12, 0, 10, 7, 6, 0, + }, + .coef_probs{ + 195, 29, 183, 84, 49, 136, 8, 42, 71, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 31, 107, 169, 35, 99, 159, 17, 82, 140, 8, 66, 114, 2, 44, 76, 1, 19, 32, + 40, 132, 201, 29, 114, 187, 13, 91, 157, 7, 75, 127, 3, 58, 95, 1, 28, 47, + 69, 142, 221, 42, 122, 201, 15, 91, 159, 6, 67, 121, 1, 42, 77, 1, 17, 31, + 102, 148, 228, 67, 117, 204, 17, 82, 154, 6, 59, 114, 2, 39, 75, 1, 15, 29, + 156, 57, 233, 119, 57, 212, 58, 48, 163, 29, 40, 124, 12, 30, 81, 3, 12, 31, + 191, 107, 226, 124, 117, 204, 25, 99, 155, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 29, 148, 210, 37, 126, 194, 8, 93, 157, 2, 68, 118, 1, 39, 69, 1, 17, 33, + 41, 151, 213, 27, 123, 193, 3, 82, 144, 1, 58, 105, 1, 32, 60, 1, 13, 26, + 59, 159, 220, 23, 126, 198, 4, 88, 151, 1, 66, 114, 1, 38, 71, 1, 18, 34, + 114, 136, 232, 51, 114, 207, 11, 83, 155, 3, 56, 105, 1, 33, 65, 1, 17, 34, + 149, 65, 234, 121, 57, 215, 61, 49, 166, 28, 36, 114, 12, 25, 76, 3, 16, 42, + 214, 49, 220, 132, 63, 188, 42, 65, 137, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 85, 137, 221, 104, 131, 216, 49, 111, 192, 21, 87, 155, 2, 49, 87, 1, 16, 28, + 89, 163, 230, 90, 137, 220, 29, 100, 183, 10, 70, 135, 2, 42, 81, 1, 17, 33, + 108, 167, 237, 55, 133, 222, 15, 97, 179, 4, 72, 135, 1, 45, 85, 1, 19, 38, + 124, 146, 240, 66, 124, 224, 17, 88, 175, 4, 58, 122, 1, 36, 75, 1, 18, 37, + 141, 79, 241, 126, 70, 227, 66, 58, 182, 30, 44, 136, 12, 34, 96, 2, 20, 47, + 229, 99, 249, 143, 111, 235, 46, 109, 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 82, 158, 236, 94, 146, 224, 25, 117, 191, 9, 87, 149, 3, 56, 99, 1, 33, 57, + 83, 167, 237, 68, 145, 222, 10, 103, 177, 2, 72, 131, 1, 41, 79, 1, 20, 39, + 99, 167, 239, 47, 141, 224, 10, 104, 178, 2, 73, 133, 1, 44, 85, 1, 22, 47, + 127, 145, 243, 71, 129, 228, 17, 93, 177, 3, 61, 124, 1, 41, 84, 1, 21, 52, + 157, 78, 244, 140, 72, 231, 69, 58, 184, 31, 44, 137, 14, 38, 105, 8, 23, 61, + 125, 34, 187, 52, 41, 133, 6, 31, 56, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 37, 109, 153, 51, 102, 147, 23, 87, 128, 8, 67, 101, 1, 41, 63, 1, 19, 29, + 31, 154, 185, 17, 127, 175, 6, 96, 145, 2, 73, 114, 1, 51, 82, 1, 28, 45, + 23, 163, 200, 10, 131, 185, 2, 93, 148, 1, 67, 111, 1, 41, 69, 1, 14, 24, + 29, 176, 217, 12, 145, 201, 3, 101, 156, 1, 69, 111, 1, 39, 63, 1, 14, 23, + 57, 192, 233, 25, 154, 215, 6, 109, 167, 3, 78, 118, 1, 48, 69, 1, 21, 29, + 202, 105, 245, 108, 106, 216, 18, 90, 144, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 33, 172, 219, 64, 149, 206, 14, 117, 177, 5, 90, 141, 2, 61, 95, 1, 37, 57, + 33, 179, 220, 11, 140, 198, 1, 89, 148, 1, 60, 104, 1, 33, 57, 1, 12, 21, + 30, 181, 221, 8, 141, 198, 1, 87, 145, 1, 58, 100, 1, 31, 55, 1, 12, 20, + 32, 186, 224, 7, 142, 198, 1, 86, 143, 1, 58, 100, 1, 31, 55, 1, 12, 22, + 57, 192, 227, 20, 143, 204, 3, 96, 154, 1, 68, 112, 1, 42, 69, 1, 19, 32, + 212, 35, 215, 113, 47, 169, 29, 48, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 74, 129, 203, 106, 120, 203, 49, 107, 178, 19, 84, 144, 4, 50, 84, 1, 15, 25, + 71, 172, 217, 44, 141, 209, 15, 102, 173, 6, 76, 133, 2, 51, 89, 1, 24, 42, + 64, 185, 231, 31, 148, 216, 8, 103, 175, 3, 74, 131, 1, 46, 81, 1, 18, 30, + 65, 196, 235, 25, 157, 221, 5, 105, 174, 1, 67, 120, 1, 38, 69, 1, 15, 30, + 65, 204, 238, 30, 156, 224, 7, 107, 177, 2, 70, 124, 1, 42, 73, 1, 18, 34, + 225, 86, 251, 144, 104, 235, 42, 99, 181, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 85, 175, 239, 112, 165, 229, 29, 136, 200, 12, 103, 162, 6, 77, 123, 2, 53, 84, + 75, 183, 239, 30, 155, 221, 3, 106, 171, 1, 74, 128, 1, 44, 76, 1, 17, 28, + 73, 185, 240, 27, 159, 222, 2, 107, 172, 1, 75, 127, 1, 42, 73, 1, 17, 29, + 62, 190, 238, 21, 159, 222, 2, 107, 172, 1, 72, 122, 1, 40, 71, 1, 18, 32, + 61, 199, 240, 27, 161, 226, 4, 113, 180, 1, 76, 129, 1, 46, 80, 1, 23, 41, + 7, 27, 153, 5, 30, 95, 1, 16, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 50, 75, 127, 57, 75, 124, 27, 67, 108, 10, 54, 86, 1, 33, 52, 1, 12, 18, + 43, 125, 151, 26, 108, 148, 7, 83, 122, 2, 59, 89, 1, 38, 60, 1, 17, 27, + 23, 144, 163, 13, 112, 154, 2, 75, 117, 1, 50, 81, 1, 31, 51, 1, 14, 23, + 18, 162, 185, 6, 123, 171, 1, 78, 125, 1, 51, 86, 1, 31, 54, 1, 14, 23, + 15, 199, 227, 3, 150, 204, 1, 91, 146, 1, 55, 95, 1, 30, 53, 1, 11, 20, + 19, 55, 240, 19, 59, 196, 3, 52, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 41, 166, 207, 104, 153, 199, 31, 123, 181, 14, 101, 152, 5, 72, 106, 1, 36, 52, + 35, 176, 211, 12, 131, 190, 2, 88, 144, 1, 60, 101, 1, 36, 60, 1, 16, 28, + 28, 183, 213, 8, 134, 191, 1, 86, 142, 1, 56, 96, 1, 30, 53, 1, 12, 20, + 20, 190, 215, 4, 135, 192, 1, 84, 139, 1, 53, 91, 1, 28, 49, 1, 11, 20, + 13, 196, 216, 2, 137, 192, 1, 86, 143, 1, 57, 99, 1, 32, 56, 1, 13, 24, + 211, 29, 217, 96, 47, 156, 22, 43, 87, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 78, 120, 193, 111, 116, 186, 46, 102, 164, 15, 80, 128, 2, 49, 76, 1, 18, 28, + 71, 161, 203, 42, 132, 192, 10, 98, 150, 3, 69, 109, 1, 44, 70, 1, 18, 29, + 57, 186, 211, 30, 140, 196, 4, 93, 146, 1, 62, 102, 1, 38, 65, 1, 16, 27, + 47, 199, 217, 14, 145, 196, 1, 88, 142, 1, 57, 98, 1, 36, 62, 1, 15, 26, + 26, 219, 229, 5, 155, 207, 1, 94, 151, 1, 60, 104, 1, 36, 62, 1, 16, 28, + 233, 29, 248, 146, 47, 220, 43, 52, 140, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 100, 163, 232, 179, 161, 222, 63, 142, 204, 37, 113, 174, 26, 89, 137, 18, 68, 97, + 85, 181, 230, 32, 146, 209, 7, 100, 164, 3, 71, 121, 1, 45, 77, 1, 18, 30, + 65, 187, 230, 20, 148, 207, 2, 97, 159, 1, 68, 116, 1, 40, 70, 1, 14, 29, + 40, 194, 227, 8, 147, 204, 1, 94, 155, 1, 65, 112, 1, 39, 66, 1, 14, 26, + 16, 208, 228, 3, 151, 207, 1, 98, 160, 1, 67, 117, 1, 41, 74, 1, 17, 31, + 17, 38, 140, 7, 34, 80, 1, 17, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 37, 75, 128, 41, 76, 128, 26, 66, 116, 12, 52, 94, 2, 32, 55, 1, 10, 16, + 50, 127, 154, 37, 109, 152, 16, 82, 121, 5, 59, 85, 1, 35, 54, 1, 13, 20, + 40, 142, 167, 17, 110, 157, 2, 71, 112, 1, 44, 72, 1, 27, 45, 1, 11, 17, + 30, 175, 188, 9, 124, 169, 1, 74, 116, 1, 48, 78, 1, 30, 49, 1, 11, 18, + 10, 222, 223, 2, 150, 194, 1, 83, 128, 1, 48, 79, 1, 27, 45, 1, 11, 17, + 36, 41, 235, 29, 36, 193, 10, 27, 111, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 85, 165, 222, 177, 162, 215, 110, 135, 195, 57, 113, 168, 23, 83, 120, 10, 49, 61, + 85, 190, 223, 36, 139, 200, 5, 90, 146, 1, 60, 103, 1, 38, 65, 1, 18, 30, + 72, 202, 223, 23, 141, 199, 2, 86, 140, 1, 56, 97, 1, 36, 61, 1, 16, 27, + 55, 218, 225, 13, 145, 200, 1, 86, 141, 1, 57, 99, 1, 35, 61, 1, 13, 22, + 15, 235, 212, 1, 132, 184, 1, 84, 139, 1, 57, 97, 1, 34, 56, 1, 14, 23, + 181, 21, 201, 61, 37, 123, 10, 38, 71, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 47, 106, 172, 95, 104, 173, 42, 93, 159, 18, 77, 131, 4, 50, 81, 1, 17, 23, + 62, 147, 199, 44, 130, 189, 28, 102, 154, 18, 75, 115, 2, 44, 65, 1, 12, 19, + 55, 153, 210, 24, 130, 194, 3, 93, 146, 1, 61, 97, 1, 31, 50, 1, 10, 16, + 49, 186, 223, 17, 148, 204, 1, 96, 142, 1, 53, 83, 1, 26, 44, 1, 11, 17, + 13, 217, 212, 2, 136, 180, 1, 78, 124, 1, 50, 83, 1, 29, 49, 1, 14, 23, + 197, 13, 247, 82, 17, 222, 25, 17, 162, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 126, 186, 247, 234, 191, 243, 176, 177, 234, 104, 158, 220, 66, 128, 186, 55, 90, 137, + 111, 197, 242, 46, 158, 219, 9, 104, 171, 2, 65, 125, 1, 44, 80, 1, 17, 91, + 104, 208, 245, 39, 168, 224, 3, 109, 162, 1, 79, 124, 1, 50, 102, 1, 43, 102, + 84, 220, 246, 31, 177, 231, 2, 115, 180, 1, 79, 134, 1, 55, 77, 1, 60, 79, + 43, 243, 240, 8, 180, 217, 1, 115, 166, 1, 84, 121, 1, 51, 67, 1, 16, 6, + }, + .switchable_interp_prob{235, 162, 36, 255, 34, 3, 149, 144}, + .inter_mode_prob{ + 2, 173, 34, 0, 7, 145, 85, 0, 7, 166, 63, 0, 7, 94, + 66, 0, 8, 64, 46, 0, 17, 81, 31, 0, 25, 29, 30, 0, + }, + .intra_inter_prob{9, 102, 187, 225}, + .comp_inter_prob{9, 102, 187, 225, 0}, + .single_ref_prob{33, 16, 77, 74, 142, 142, 172, 170, 238, 247}, + .comp_ref_prob{50, 126, 123, 221, 226}, + .tx_32x32_prob{3, 136, 37, 5, 52, 13}, + .tx_16x16_prob{20, 152, 15, 101}, + .tx_8x8_prob{100, 66}, + .skip_probs{192, 128, 64}, + .joints{32, 64, 96}, + .sign{128, 128}, + .classes{ + 224, 144, 192, 168, 192, 176, 192, 198, 198, 245, + 216, 128, 176, 160, 176, 176, 192, 198, 198, 208, + }, + .class_0{216, 208}, + .prob_bits{ + 136, 140, 148, 160, 176, 192, 224, 234, 234, 240, + 136, 140, 148, 160, 176, 192, 224, 234, 234, 240, + }, + .class_0_fr{128, 128, 64, 96, 112, 64, 128, 128, 64, 96, 112, 64}, + .fr{64, 96, 64, 64, 96, 64}, + .class_0_hp{160, 160}, + .high_precision{128, 128}, +}; + +constexpr std::array<s32, 256> norm_lut{ + 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +constexpr std::array<s32, 254> map_lut{ + 20, 21, 22, 23, 24, 25, 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, + 1, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 2, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 3, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, + 73, 4, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 5, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 6, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, + 108, 109, 7, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 8, 122, 123, 124, + 125, 126, 127, 128, 129, 130, 131, 132, 133, 9, 134, 135, 136, 137, 138, 139, 140, 141, 142, + 143, 144, 145, 10, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 11, 158, 159, + 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 12, 170, 171, 172, 173, 174, 175, 176, 177, + 178, 179, 180, 181, 13, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 14, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 15, 206, 207, 208, 209, 210, 211, 212, + 213, 214, 215, 216, 217, 16, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 17, + 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 18, 242, 243, 244, 245, 246, 247, + 248, 249, 250, 251, 252, 253, 19, +}; + +// 6.2.14 Tile size calculation + +[[nodiscard]] s32 CalcMinLog2TileCols(s32 frame_width) { + const s32 sb64_cols = (frame_width + 63) / 64; + s32 min_log2 = 0; + + while ((64 << min_log2) < sb64_cols) { + min_log2++; + } + + return min_log2; +} + +[[nodiscard]] s32 CalcMaxLog2TileCols(s32 frame_width) { + const s32 sb64_cols = (frame_width + 63) / 64; + s32 max_log2 = 1; + + while ((sb64_cols >> max_log2) >= 4) { + max_log2++; + } + + return max_log2 - 1; +} + +// Recenters probability. Based on section 6.3.6 of VP9 Specification +[[nodiscard]] s32 RecenterNonNeg(s32 new_prob, s32 old_prob) { + if (new_prob > old_prob * 2) { + return new_prob; + } + + if (new_prob >= old_prob) { + return (new_prob - old_prob) * 2; + } + + return (old_prob - new_prob) * 2 - 1; +} + +// Adjusts old_prob depending on new_prob. Based on section 6.3.5 of VP9 Specification +[[nodiscard]] s32 RemapProbability(s32 new_prob, s32 old_prob) { + new_prob--; + old_prob--; + + std::size_t index{}; + + if (old_prob * 2 <= 0xff) { + index = static_cast<std::size_t>(std::max(0, RecenterNonNeg(new_prob, old_prob) - 1)); + } else { + index = static_cast<std::size_t>( + std::max(0, RecenterNonNeg(0xff - 1 - new_prob, 0xff - 1 - old_prob) - 1)); + } + + return map_lut[index]; +} +} // Anonymous namespace + +VP9::VP9(GPU& gpu_) : gpu{gpu_} {} + +VP9::~VP9() = default; + +void VP9::WriteProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) { + const bool update = new_prob != old_prob; + + writer.Write(update, diff_update_probability); + + if (update) { + WriteProbabilityDelta(writer, new_prob, old_prob); + } +} +template <typename T, std::size_t N> +void VP9::WriteProbabilityUpdate(VpxRangeEncoder& writer, const std::array<T, N>& new_prob, + const std::array<T, N>& old_prob) { + for (std::size_t offset = 0; offset < new_prob.size(); ++offset) { + WriteProbabilityUpdate(writer, new_prob[offset], old_prob[offset]); + } +} + +template <typename T, std::size_t N> +void VP9::WriteProbabilityUpdateAligned4(VpxRangeEncoder& writer, const std::array<T, N>& new_prob, + const std::array<T, N>& old_prob) { + for (std::size_t offset = 0; offset < new_prob.size(); offset += 4) { + WriteProbabilityUpdate(writer, new_prob[offset + 0], old_prob[offset + 0]); + WriteProbabilityUpdate(writer, new_prob[offset + 1], old_prob[offset + 1]); + WriteProbabilityUpdate(writer, new_prob[offset + 2], old_prob[offset + 2]); + } +} + +void VP9::WriteProbabilityDelta(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) { + const int delta = RemapProbability(new_prob, old_prob); + + EncodeTermSubExp(writer, delta); +} + +void VP9::EncodeTermSubExp(VpxRangeEncoder& writer, s32 value) { + if (WriteLessThan(writer, value, 16)) { + writer.Write(value, 4); + } else if (WriteLessThan(writer, value, 32)) { + writer.Write(value - 16, 4); + } else if (WriteLessThan(writer, value, 64)) { + writer.Write(value - 32, 5); + } else { + value -= 64; + + constexpr s32 size = 8; + + const s32 mask = (1 << size) - 191; + + const s32 delta = value - mask; + + if (delta < 0) { + writer.Write(value, size - 1); + } else { + writer.Write(delta / 2 + mask, size - 1); + writer.Write(delta & 1, 1); + } + } +} + +bool VP9::WriteLessThan(VpxRangeEncoder& writer, s32 value, s32 test) { + const bool is_lt = value < test; + writer.Write(!is_lt); + return is_lt; +} + +void VP9::WriteCoefProbabilityUpdate(VpxRangeEncoder& writer, s32 tx_mode, + const std::array<u8, 1728>& new_prob, + const std::array<u8, 1728>& old_prob) { + constexpr u32 block_bytes = 2 * 2 * 6 * 6 * 3; + + const auto needs_update = [&](u32 base_index) { + return !std::equal(new_prob.begin() + base_index, + new_prob.begin() + base_index + block_bytes, + old_prob.begin() + base_index); + }; + + for (u32 block_index = 0; block_index < 4; block_index++) { + const u32 base_index = block_index * block_bytes; + const bool update = needs_update(base_index); + writer.Write(update); + + if (update) { + u32 index = base_index; + for (s32 i = 0; i < 2; i++) { + for (s32 j = 0; j < 2; j++) { + for (s32 k = 0; k < 6; k++) { + for (s32 l = 0; l < 6; l++) { + if (k != 0 || l < 3) { + WriteProbabilityUpdate(writer, new_prob[index + 0], + old_prob[index + 0]); + WriteProbabilityUpdate(writer, new_prob[index + 1], + old_prob[index + 1]); + WriteProbabilityUpdate(writer, new_prob[index + 2], + old_prob[index + 2]); + } + index += 3; + } + } + } + } + } + if (block_index == static_cast<u32>(tx_mode)) { + break; + } + } +} + +void VP9::WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) { + const bool update = new_prob != old_prob; + writer.Write(update, diff_update_probability); + + if (update) { + writer.Write(new_prob >> 1, 7); + } +} + +Vp9PictureInfo VP9::GetVp9PictureInfo(const NvdecCommon::NvdecRegisters& state) { + PictureInfo picture_info{}; + gpu.MemoryManager().ReadBlock(state.picture_info_offset, &picture_info, sizeof(PictureInfo)); + Vp9PictureInfo vp9_info = picture_info.Convert(); + + InsertEntropy(state.vp9_entropy_probs_offset, vp9_info.entropy); + + // surface_luma_offset[0:3] contains the address of the reference frame offsets in the following + // order: last, golden, altref, current. It may be worthwhile to track the updates done here + // to avoid buffering frame data needed for reference frame updating in the header composition. + std::memcpy(vp9_info.frame_offsets.data(), state.surface_luma_offset.data(), 4 * sizeof(u64)); + + return vp9_info; +} + +void VP9::InsertEntropy(u64 offset, Vp9EntropyProbs& dst) { + EntropyProbs entropy{}; + gpu.MemoryManager().ReadBlock(offset, &entropy, sizeof(EntropyProbs)); + entropy.Convert(dst); +} + +Vp9FrameContainer VP9::GetCurrentFrame(const NvdecCommon::NvdecRegisters& state) { + Vp9FrameContainer current_frame{}; + { + gpu.SyncGuestHost(); + current_frame.info = GetVp9PictureInfo(state); + current_frame.bit_stream.resize(current_frame.info.bitstream_size); + gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, current_frame.bit_stream.data(), + current_frame.info.bitstream_size); + } + // Buffer two frames, saving the last show frame info + if (!next_next_frame.bit_stream.empty()) { + Vp9FrameContainer temp{ + .info = current_frame.info, + .bit_stream = std::move(current_frame.bit_stream), + }; + next_next_frame.info.show_frame = current_frame.info.last_frame_shown; + current_frame.info = next_next_frame.info; + current_frame.bit_stream = std::move(next_next_frame.bit_stream); + next_next_frame = std::move(temp); + + if (!next_frame.bit_stream.empty()) { + Vp9FrameContainer temp2{ + .info = current_frame.info, + .bit_stream = std::move(current_frame.bit_stream), + }; + next_frame.info.show_frame = current_frame.info.last_frame_shown; + current_frame.info = next_frame.info; + current_frame.bit_stream = std::move(next_frame.bit_stream); + next_frame = std::move(temp2); + } else { + next_frame.info = current_frame.info; + next_frame.bit_stream = std::move(current_frame.bit_stream); + } + } else { + next_next_frame.info = current_frame.info; + next_next_frame.bit_stream = std::move(current_frame.bit_stream); + } + return current_frame; +} + +std::vector<u8> VP9::ComposeCompressedHeader() { + VpxRangeEncoder writer{}; + const bool update_probs = current_frame_info.show_frame && !current_frame_info.is_key_frame; + if (!current_frame_info.lossless) { + if (static_cast<u32>(current_frame_info.transform_mode) >= 3) { + writer.Write(3, 2); + writer.Write(current_frame_info.transform_mode == 4); + } else { + writer.Write(current_frame_info.transform_mode, 2); + } + } + + if (current_frame_info.transform_mode == 4) { + // tx_mode_probs() in the spec + WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_8x8_prob, + prev_frame_probs.tx_8x8_prob); + WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_16x16_prob, + prev_frame_probs.tx_16x16_prob); + WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_32x32_prob, + prev_frame_probs.tx_32x32_prob); + if (update_probs) { + prev_frame_probs.tx_8x8_prob = current_frame_info.entropy.tx_8x8_prob; + prev_frame_probs.tx_16x16_prob = current_frame_info.entropy.tx_16x16_prob; + prev_frame_probs.tx_32x32_prob = current_frame_info.entropy.tx_32x32_prob; + } + } + // read_coef_probs() in the spec + WriteCoefProbabilityUpdate(writer, current_frame_info.transform_mode, + current_frame_info.entropy.coef_probs, prev_frame_probs.coef_probs); + // read_skip_probs() in the spec + WriteProbabilityUpdate(writer, current_frame_info.entropy.skip_probs, + prev_frame_probs.skip_probs); + + if (update_probs) { + prev_frame_probs.coef_probs = current_frame_info.entropy.coef_probs; + prev_frame_probs.skip_probs = current_frame_info.entropy.skip_probs; + } + + if (!current_frame_info.intra_only) { + // read_inter_probs() in the spec + WriteProbabilityUpdateAligned4(writer, current_frame_info.entropy.inter_mode_prob, + prev_frame_probs.inter_mode_prob); + + if (current_frame_info.interp_filter == 4) { + // read_interp_filter_probs() in the spec + WriteProbabilityUpdate(writer, current_frame_info.entropy.switchable_interp_prob, + prev_frame_probs.switchable_interp_prob); + if (update_probs) { + prev_frame_probs.switchable_interp_prob = + current_frame_info.entropy.switchable_interp_prob; + } + } + + // read_is_inter_probs() in the spec + WriteProbabilityUpdate(writer, current_frame_info.entropy.intra_inter_prob, + prev_frame_probs.intra_inter_prob); + + // frame_reference_mode() in the spec + if ((current_frame_info.ref_frame_sign_bias[1] & 1) != + (current_frame_info.ref_frame_sign_bias[2] & 1) || + (current_frame_info.ref_frame_sign_bias[1] & 1) != + (current_frame_info.ref_frame_sign_bias[3] & 1)) { + if (current_frame_info.reference_mode >= 1) { + writer.Write(1, 1); + writer.Write(current_frame_info.reference_mode == 2); + } else { + writer.Write(0, 1); + } + } + + // frame_reference_mode_probs() in the spec + if (current_frame_info.reference_mode == 2) { + WriteProbabilityUpdate(writer, current_frame_info.entropy.comp_inter_prob, + prev_frame_probs.comp_inter_prob); + if (update_probs) { + prev_frame_probs.comp_inter_prob = current_frame_info.entropy.comp_inter_prob; + } + } + + if (current_frame_info.reference_mode != 1) { + WriteProbabilityUpdate(writer, current_frame_info.entropy.single_ref_prob, + prev_frame_probs.single_ref_prob); + if (update_probs) { + prev_frame_probs.single_ref_prob = current_frame_info.entropy.single_ref_prob; + } + } + + if (current_frame_info.reference_mode != 0) { + WriteProbabilityUpdate(writer, current_frame_info.entropy.comp_ref_prob, + prev_frame_probs.comp_ref_prob); + if (update_probs) { + prev_frame_probs.comp_ref_prob = current_frame_info.entropy.comp_ref_prob; + } + } + + // read_y_mode_probs + for (std::size_t index = 0; index < current_frame_info.entropy.y_mode_prob.size(); + ++index) { + WriteProbabilityUpdate(writer, current_frame_info.entropy.y_mode_prob[index], + prev_frame_probs.y_mode_prob[index]); + } + + // read_partition_probs + WriteProbabilityUpdateAligned4(writer, current_frame_info.entropy.partition_prob, + prev_frame_probs.partition_prob); + + // mv_probs + for (s32 i = 0; i < 3; i++) { + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.joints[i], + prev_frame_probs.joints[i]); + } + if (update_probs) { + prev_frame_probs.inter_mode_prob = current_frame_info.entropy.inter_mode_prob; + prev_frame_probs.intra_inter_prob = current_frame_info.entropy.intra_inter_prob; + prev_frame_probs.y_mode_prob = current_frame_info.entropy.y_mode_prob; + prev_frame_probs.partition_prob = current_frame_info.entropy.partition_prob; + prev_frame_probs.joints = current_frame_info.entropy.joints; + } + + for (s32 i = 0; i < 2; i++) { + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.sign[i], + prev_frame_probs.sign[i]); + for (s32 j = 0; j < 10; j++) { + const int index = i * 10 + j; + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.classes[index], + prev_frame_probs.classes[index]); + } + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0[i], + prev_frame_probs.class_0[i]); + + for (s32 j = 0; j < 10; j++) { + const int index = i * 10 + j; + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.prob_bits[index], + prev_frame_probs.prob_bits[index]); + } + } + + for (s32 i = 0; i < 2; i++) { + for (s32 j = 0; j < 2; j++) { + for (s32 k = 0; k < 3; k++) { + const int index = i * 2 * 3 + j * 3 + k; + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0_fr[index], + prev_frame_probs.class_0_fr[index]); + } + } + + for (s32 j = 0; j < 3; j++) { + const int index = i * 3 + j; + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.fr[index], + prev_frame_probs.fr[index]); + } + } + + if (current_frame_info.allow_high_precision_mv) { + for (s32 index = 0; index < 2; index++) { + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0_hp[index], + prev_frame_probs.class_0_hp[index]); + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.high_precision[index], + prev_frame_probs.high_precision[index]); + } + } + + // save previous probs + if (update_probs) { + prev_frame_probs.sign = current_frame_info.entropy.sign; + prev_frame_probs.classes = current_frame_info.entropy.classes; + prev_frame_probs.class_0 = current_frame_info.entropy.class_0; + prev_frame_probs.prob_bits = current_frame_info.entropy.prob_bits; + prev_frame_probs.class_0_fr = current_frame_info.entropy.class_0_fr; + prev_frame_probs.fr = current_frame_info.entropy.fr; + prev_frame_probs.class_0_hp = current_frame_info.entropy.class_0_hp; + prev_frame_probs.high_precision = current_frame_info.entropy.high_precision; + } + } + writer.End(); + return writer.GetBuffer(); +} + +VpxBitStreamWriter VP9::ComposeUncompressedHeader() { + VpxBitStreamWriter uncomp_writer{}; + + uncomp_writer.WriteU(2, 2); // Frame marker. + uncomp_writer.WriteU(0, 2); // Profile. + uncomp_writer.WriteBit(false); // Show existing frame. + uncomp_writer.WriteBit(!current_frame_info.is_key_frame); // is key frame? + uncomp_writer.WriteBit(current_frame_info.show_frame); // show frame? + uncomp_writer.WriteBit(current_frame_info.error_resilient_mode); // error reslience + + if (current_frame_info.is_key_frame) { + uncomp_writer.WriteU(frame_sync_code, 24); + uncomp_writer.WriteU(0, 3); // Color space. + uncomp_writer.WriteU(0, 1); // Color range. + uncomp_writer.WriteU(current_frame_info.frame_size.width - 1, 16); + uncomp_writer.WriteU(current_frame_info.frame_size.height - 1, 16); + uncomp_writer.WriteBit(false); // Render and frame size different. + + // Reset context + prev_frame_probs = default_probs; + swap_next_golden = false; + loop_filter_ref_deltas.fill(0); + loop_filter_mode_deltas.fill(0); + + // allow frames offsets to stabilize before checking for golden frames + grace_period = 4; + + // On key frames, all frame slots are set to the current frame, + // so the value of the selected slot doesn't really matter. + frame_ctxs.fill({current_frame_number, false, default_probs}); + + // intra only, meaning the frame can be recreated with no other references + current_frame_info.intra_only = true; + + } else { + + if (!current_frame_info.show_frame) { + uncomp_writer.WriteBit(current_frame_info.intra_only); + if (!current_frame_info.last_frame_was_key) { + swap_next_golden = !swap_next_golden; + } + } else { + current_frame_info.intra_only = false; + } + if (!current_frame_info.error_resilient_mode) { + uncomp_writer.WriteU(0, 2); // Reset frame context. + } + + // Last, Golden, Altref frames + std::array<s32, 3> ref_frame_index{0, 1, 2}; + + // Set when next frame is hidden + // altref and golden references are swapped + if (swap_next_golden) { + ref_frame_index = std::array<s32, 3>{0, 2, 1}; + } + + // update Last Frame + u64 refresh_frame_flags = 1; + + // golden frame may refresh, determined if the next golden frame offset is changed + bool golden_refresh = false; + if (grace_period <= 0) { + for (s32 index = 1; index < 3; ++index) { + if (current_frame_info.frame_offsets[index] != + next_frame.info.frame_offsets[index]) { + current_frame_info.refresh_frame[index] = true; + golden_refresh = true; + grace_period = 3; + } + } + } + + if (current_frame_info.show_frame && + (!next_frame.info.show_frame || next_frame.info.is_key_frame)) { + // Update golden frame + refresh_frame_flags = swap_next_golden ? 2 : 4; + } + + if (!current_frame_info.show_frame) { + // Update altref + refresh_frame_flags = swap_next_golden ? 2 : 4; + } else if (golden_refresh) { + refresh_frame_flags = 3; + } + + if (current_frame_info.intra_only) { + uncomp_writer.WriteU(frame_sync_code, 24); + uncomp_writer.WriteU(static_cast<s32>(refresh_frame_flags), 8); + uncomp_writer.WriteU(current_frame_info.frame_size.width - 1, 16); + uncomp_writer.WriteU(current_frame_info.frame_size.height - 1, 16); + uncomp_writer.WriteBit(false); // Render and frame size different. + } else { + uncomp_writer.WriteU(static_cast<s32>(refresh_frame_flags), 8); + + for (s32 index = 1; index < 4; index++) { + uncomp_writer.WriteU(ref_frame_index[index - 1], 3); + uncomp_writer.WriteU(current_frame_info.ref_frame_sign_bias[index], 1); + } + + uncomp_writer.WriteBit(true); // Frame size with refs. + uncomp_writer.WriteBit(false); // Render and frame size different. + uncomp_writer.WriteBit(current_frame_info.allow_high_precision_mv); + uncomp_writer.WriteBit(current_frame_info.interp_filter == 4); + + if (current_frame_info.interp_filter != 4) { + uncomp_writer.WriteU(current_frame_info.interp_filter, 2); + } + } + } + + if (!current_frame_info.error_resilient_mode) { + uncomp_writer.WriteBit(true); // Refresh frame context. where do i get this info from? + uncomp_writer.WriteBit(true); // Frame parallel decoding mode. + } + + int frame_ctx_idx = 0; + if (!current_frame_info.show_frame) { + frame_ctx_idx = 1; + } + + uncomp_writer.WriteU(frame_ctx_idx, 2); // Frame context index. + prev_frame_probs = + frame_ctxs[frame_ctx_idx].probs; // reference probabilities for compressed header + frame_ctxs[frame_ctx_idx] = {current_frame_number, false, current_frame_info.entropy}; + + uncomp_writer.WriteU(current_frame_info.first_level, 6); + uncomp_writer.WriteU(current_frame_info.sharpness_level, 3); + uncomp_writer.WriteBit(current_frame_info.mode_ref_delta_enabled); + + if (current_frame_info.mode_ref_delta_enabled) { + // check if ref deltas are different, update accordingly + std::array<bool, 4> update_loop_filter_ref_deltas; + std::array<bool, 2> update_loop_filter_mode_deltas; + + bool loop_filter_delta_update = false; + + for (std::size_t index = 0; index < current_frame_info.ref_deltas.size(); index++) { + const s8 old_deltas = loop_filter_ref_deltas[index]; + const s8 new_deltas = current_frame_info.ref_deltas[index]; + const bool differing_delta = old_deltas != new_deltas; + + update_loop_filter_ref_deltas[index] = differing_delta; + loop_filter_delta_update |= differing_delta; + } + + for (std::size_t index = 0; index < current_frame_info.mode_deltas.size(); index++) { + const s8 old_deltas = loop_filter_mode_deltas[index]; + const s8 new_deltas = current_frame_info.mode_deltas[index]; + const bool differing_delta = old_deltas != new_deltas; + + update_loop_filter_mode_deltas[index] = differing_delta; + loop_filter_delta_update |= differing_delta; + } + + uncomp_writer.WriteBit(loop_filter_delta_update); + + if (loop_filter_delta_update) { + for (std::size_t index = 0; index < current_frame_info.ref_deltas.size(); index++) { + uncomp_writer.WriteBit(update_loop_filter_ref_deltas[index]); + + if (update_loop_filter_ref_deltas[index]) { + uncomp_writer.WriteS(current_frame_info.ref_deltas[index], 6); + } + } + + for (std::size_t index = 0; index < current_frame_info.mode_deltas.size(); index++) { + uncomp_writer.WriteBit(update_loop_filter_mode_deltas[index]); + + if (update_loop_filter_mode_deltas[index]) { + uncomp_writer.WriteS(current_frame_info.mode_deltas[index], 6); + } + } + // save new deltas + loop_filter_ref_deltas = current_frame_info.ref_deltas; + loop_filter_mode_deltas = current_frame_info.mode_deltas; + } + } + + uncomp_writer.WriteU(current_frame_info.base_q_index, 8); + + uncomp_writer.WriteDeltaQ(current_frame_info.y_dc_delta_q); + uncomp_writer.WriteDeltaQ(current_frame_info.uv_dc_delta_q); + uncomp_writer.WriteDeltaQ(current_frame_info.uv_ac_delta_q); + + uncomp_writer.WriteBit(false); // Segmentation enabled (TODO). + + const s32 min_tile_cols_log2 = CalcMinLog2TileCols(current_frame_info.frame_size.width); + const s32 max_tile_cols_log2 = CalcMaxLog2TileCols(current_frame_info.frame_size.width); + + const s32 tile_cols_log2_diff = current_frame_info.log2_tile_cols - min_tile_cols_log2; + const s32 tile_cols_log2_inc_mask = (1 << tile_cols_log2_diff) - 1; + + // If it's less than the maximum, we need to add an extra 0 on the bitstream + // to indicate that it should stop reading. + if (current_frame_info.log2_tile_cols < max_tile_cols_log2) { + uncomp_writer.WriteU(tile_cols_log2_inc_mask << 1, tile_cols_log2_diff + 1); + } else { + uncomp_writer.WriteU(tile_cols_log2_inc_mask, tile_cols_log2_diff); + } + + const bool tile_rows_log2_is_nonzero = current_frame_info.log2_tile_rows != 0; + + uncomp_writer.WriteBit(tile_rows_log2_is_nonzero); + + if (tile_rows_log2_is_nonzero) { + uncomp_writer.WriteBit(current_frame_info.log2_tile_rows > 1); + } + + return uncomp_writer; +} + +const std::vector<u8>& VP9::ComposeFrameHeader(const NvdecCommon::NvdecRegisters& state) { + std::vector<u8> bitstream; + { + Vp9FrameContainer curr_frame = GetCurrentFrame(state); + current_frame_info = curr_frame.info; + bitstream = std::move(curr_frame.bit_stream); + } + + // The uncompressed header routine sets PrevProb parameters needed for the compressed header + auto uncomp_writer = ComposeUncompressedHeader(); + std::vector<u8> compressed_header = ComposeCompressedHeader(); + + uncomp_writer.WriteU(static_cast<s32>(compressed_header.size()), 16); + uncomp_writer.Flush(); + std::vector<u8> uncompressed_header = uncomp_writer.GetByteArray(); + + // Write headers and frame to buffer + frame.resize(uncompressed_header.size() + compressed_header.size() + bitstream.size()); + std::memcpy(frame.data(), uncompressed_header.data(), uncompressed_header.size()); + std::memcpy(frame.data() + uncompressed_header.size(), compressed_header.data(), + compressed_header.size()); + std::memcpy(frame.data() + uncompressed_header.size() + compressed_header.size(), + bitstream.data(), bitstream.size()); + + // keep track of frame number + current_frame_number++; + grace_period--; + + // don't display hidden frames + hidden = !current_frame_info.show_frame; + return frame; +} + +VpxRangeEncoder::VpxRangeEncoder() { + Write(false); +} + +VpxRangeEncoder::~VpxRangeEncoder() = default; + +void VpxRangeEncoder::Write(s32 value, s32 value_size) { + for (s32 bit = value_size - 1; bit >= 0; bit--) { + Write(((value >> bit) & 1) != 0); + } +} + +void VpxRangeEncoder::Write(bool bit) { + Write(bit, half_probability); +} + +void VpxRangeEncoder::Write(bool bit, s32 probability) { + u32 local_range = range; + const u32 split = 1 + (((local_range - 1) * static_cast<u32>(probability)) >> 8); + local_range = split; + + if (bit) { + low_value += split; + local_range = range - split; + } + + s32 shift = norm_lut[local_range]; + local_range <<= shift; + count += shift; + + if (count >= 0) { + const s32 offset = shift - count; + + if (((low_value << (offset - 1)) >> 31) != 0) { + const s32 current_pos = static_cast<s32>(base_stream.GetPosition()); + base_stream.Seek(-1, Common::SeekOrigin::FromCurrentPos); + while (PeekByte() == 0xff) { + base_stream.WriteByte(0); + + base_stream.Seek(-2, Common::SeekOrigin::FromCurrentPos); + } + base_stream.WriteByte(static_cast<u8>((PeekByte() + 1))); + base_stream.Seek(current_pos, Common::SeekOrigin::SetOrigin); + } + base_stream.WriteByte(static_cast<u8>((low_value >> (24 - offset)))); + + low_value <<= offset; + shift = count; + low_value &= 0xffffff; + count -= 8; + } + + low_value <<= shift; + range = local_range; +} + +void VpxRangeEncoder::End() { + for (std::size_t index = 0; index < 32; ++index) { + Write(false); + } +} + +u8 VpxRangeEncoder::PeekByte() { + const u8 value = base_stream.ReadByte(); + base_stream.Seek(-1, Common::SeekOrigin::FromCurrentPos); + + return value; +} + +VpxBitStreamWriter::VpxBitStreamWriter() = default; + +VpxBitStreamWriter::~VpxBitStreamWriter() = default; + +void VpxBitStreamWriter::WriteU(u32 value, u32 value_size) { + WriteBits(value, value_size); +} + +void VpxBitStreamWriter::WriteS(s32 value, u32 value_size) { + const bool sign = value < 0; + if (sign) { + value = -value; + } + + WriteBits(static_cast<u32>(value << 1) | (sign ? 1 : 0), value_size + 1); +} + +void VpxBitStreamWriter::WriteDeltaQ(u32 value) { + const bool delta_coded = value != 0; + WriteBit(delta_coded); + + if (delta_coded) { + WriteBits(value, 4); + } +} + +void VpxBitStreamWriter::WriteBits(u32 value, u32 bit_count) { + s32 value_pos = 0; + s32 remaining = bit_count; + + while (remaining > 0) { + s32 copy_size = remaining; + + const s32 free = GetFreeBufferBits(); + + if (copy_size > free) { + copy_size = free; + } + + const s32 mask = (1 << copy_size) - 1; + + const s32 src_shift = (bit_count - value_pos) - copy_size; + const s32 dst_shift = (buffer_size - buffer_pos) - copy_size; + + buffer |= ((value >> src_shift) & mask) << dst_shift; + + value_pos += copy_size; + buffer_pos += copy_size; + remaining -= copy_size; + } +} + +void VpxBitStreamWriter::WriteBit(bool state) { + WriteBits(state ? 1 : 0, 1); +} + +s32 VpxBitStreamWriter::GetFreeBufferBits() { + if (buffer_pos == buffer_size) { + Flush(); + } + + return buffer_size - buffer_pos; +} + +void VpxBitStreamWriter::Flush() { + if (buffer_pos == 0) { + return; + } + byte_array.push_back(static_cast<u8>(buffer)); + buffer = 0; + buffer_pos = 0; +} + +std::vector<u8>& VpxBitStreamWriter::GetByteArray() { + return byte_array; +} + +const std::vector<u8>& VpxBitStreamWriter::GetByteArray() const { + return byte_array; +} + +} // namespace Tegra::Decoder diff --git a/src/video_core/command_classes/codecs/vp9.h b/src/video_core/command_classes/codecs/vp9.h new file mode 100644 index 000000000..8396c8105 --- /dev/null +++ b/src/video_core/command_classes/codecs/vp9.h @@ -0,0 +1,197 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <array> +#include <vector> + +#include "common/common_types.h" +#include "common/stream.h" +#include "video_core/command_classes/codecs/vp9_types.h" +#include "video_core/command_classes/nvdec_common.h" + +namespace Tegra { +class GPU; +enum class FrameType { KeyFrame = 0, InterFrame = 1 }; +namespace Decoder { + +/// The VpxRangeEncoder, and VpxBitStreamWriter classes are used to compose the +/// VP9 header bitstreams. + +class VpxRangeEncoder { +public: + VpxRangeEncoder(); + ~VpxRangeEncoder(); + + VpxRangeEncoder(const VpxRangeEncoder&) = delete; + VpxRangeEncoder& operator=(const VpxRangeEncoder&) = delete; + + VpxRangeEncoder(VpxRangeEncoder&&) = default; + VpxRangeEncoder& operator=(VpxRangeEncoder&&) = default; + + /// Writes the rightmost value_size bits from value into the stream + void Write(s32 value, s32 value_size); + + /// Writes a single bit with half probability + void Write(bool bit); + + /// Writes a bit to the base_stream encoded with probability + void Write(bool bit, s32 probability); + + /// Signal the end of the bitstream + void End(); + + [[nodiscard]] std::vector<u8>& GetBuffer() { + return base_stream.GetBuffer(); + } + + [[nodiscard]] const std::vector<u8>& GetBuffer() const { + return base_stream.GetBuffer(); + } + +private: + u8 PeekByte(); + Common::Stream base_stream{}; + u32 low_value{}; + u32 range{0xff}; + s32 count{-24}; + s32 half_probability{128}; +}; + +class VpxBitStreamWriter { +public: + VpxBitStreamWriter(); + ~VpxBitStreamWriter(); + + VpxBitStreamWriter(const VpxBitStreamWriter&) = delete; + VpxBitStreamWriter& operator=(const VpxBitStreamWriter&) = delete; + + VpxBitStreamWriter(VpxBitStreamWriter&&) = default; + VpxBitStreamWriter& operator=(VpxBitStreamWriter&&) = default; + + /// Write an unsigned integer value + void WriteU(u32 value, u32 value_size); + + /// Write a signed integer value + void WriteS(s32 value, u32 value_size); + + /// Based on 6.2.10 of VP9 Spec, writes a delta coded value + void WriteDeltaQ(u32 value); + + /// Write a single bit. + void WriteBit(bool state); + + /// Pushes current buffer into buffer_array, resets buffer + void Flush(); + + /// Returns byte_array + [[nodiscard]] std::vector<u8>& GetByteArray(); + + /// Returns const byte_array + [[nodiscard]] const std::vector<u8>& GetByteArray() const; + +private: + /// Write bit_count bits from value into buffer + void WriteBits(u32 value, u32 bit_count); + + /// Gets next available position in buffer, invokes Flush() if buffer is full + s32 GetFreeBufferBits(); + + s32 buffer_size{8}; + + s32 buffer{}; + s32 buffer_pos{}; + std::vector<u8> byte_array; +}; + +class VP9 { +public: + explicit VP9(GPU& gpu_); + ~VP9(); + + VP9(const VP9&) = delete; + VP9& operator=(const VP9&) = delete; + + VP9(VP9&&) = default; + VP9& operator=(VP9&&) = delete; + + /// Composes the VP9 frame from the GPU state information. Based on the official VP9 spec + /// documentation + [[nodiscard]] const std::vector<u8>& ComposeFrameHeader( + const NvdecCommon::NvdecRegisters& state); + + /// Returns true if the most recent frame was a hidden frame. + [[nodiscard]] bool WasFrameHidden() const { + return hidden; + } + +private: + /// Generates compressed header probability updates in the bitstream writer + template <typename T, std::size_t N> + void WriteProbabilityUpdate(VpxRangeEncoder& writer, const std::array<T, N>& new_prob, + const std::array<T, N>& old_prob); + + /// Generates compressed header probability updates in the bitstream writer + /// If probs are not equal, WriteProbabilityDelta is invoked + void WriteProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob); + + /// Generates compressed header probability deltas in the bitstream writer + void WriteProbabilityDelta(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob); + + /// Inverse of 6.3.4 Decode term subexp + void EncodeTermSubExp(VpxRangeEncoder& writer, s32 value); + + /// Writes if the value is less than the test value + bool WriteLessThan(VpxRangeEncoder& writer, s32 value, s32 test); + + /// Writes probability updates for the Coef probabilities + void WriteCoefProbabilityUpdate(VpxRangeEncoder& writer, s32 tx_mode, + const std::array<u8, 1728>& new_prob, + const std::array<u8, 1728>& old_prob); + + /// Write probabilities for 4-byte aligned structures + template <typename T, std::size_t N> + void WriteProbabilityUpdateAligned4(VpxRangeEncoder& writer, const std::array<T, N>& new_prob, + const std::array<T, N>& old_prob); + + /// Write motion vector probability updates. 6.3.17 in the spec + void WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob); + + /// Returns VP9 information from NVDEC provided offset and size + [[nodiscard]] Vp9PictureInfo GetVp9PictureInfo(const NvdecCommon::NvdecRegisters& state); + + /// Read and convert NVDEC provided entropy probs to Vp9EntropyProbs struct + void InsertEntropy(u64 offset, Vp9EntropyProbs& dst); + + /// Returns frame to be decoded after buffering + [[nodiscard]] Vp9FrameContainer GetCurrentFrame(const NvdecCommon::NvdecRegisters& state); + + /// Use NVDEC providied information to compose the headers for the current frame + [[nodiscard]] std::vector<u8> ComposeCompressedHeader(); + [[nodiscard]] VpxBitStreamWriter ComposeUncompressedHeader(); + + GPU& gpu; + std::vector<u8> frame; + + std::array<s8, 4> loop_filter_ref_deltas{}; + std::array<s8, 2> loop_filter_mode_deltas{}; + + bool hidden = false; + s64 current_frame_number = -2; // since we buffer 2 frames + s32 grace_period = 6; // frame offsets need to stabilize + std::array<FrameContexts, 4> frame_ctxs{}; + Vp9FrameContainer next_frame{}; + Vp9FrameContainer next_next_frame{}; + bool swap_next_golden{}; + + Vp9PictureInfo current_frame_info{}; + Vp9EntropyProbs prev_frame_probs{}; + + s32 diff_update_probability = 252; + s32 frame_sync_code = 0x498342; +}; + +} // namespace Decoder +} // namespace Tegra diff --git a/src/video_core/command_classes/codecs/vp9_types.h b/src/video_core/command_classes/codecs/vp9_types.h new file mode 100644 index 000000000..139501a1c --- /dev/null +++ b/src/video_core/command_classes/codecs/vp9_types.h @@ -0,0 +1,302 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <array> +#include <cstring> +#include <vector> +#include "common/common_funcs.h" +#include "common/common_types.h" + +namespace Tegra { +class GPU; + +namespace Decoder { +struct Vp9FrameDimensions { + s16 width{}; + s16 height{}; + s16 luma_pitch{}; + s16 chroma_pitch{}; +}; +static_assert(sizeof(Vp9FrameDimensions) == 0x8, "Vp9 Vp9FrameDimensions is an invalid size"); + +enum FrameFlags : u32 { + IsKeyFrame = 1 << 0, + LastFrameIsKeyFrame = 1 << 1, + FrameSizeChanged = 1 << 2, + ErrorResilientMode = 1 << 3, + LastShowFrame = 1 << 4, + IntraOnly = 1 << 5, +}; + +enum class TxSize { + Tx4x4 = 0, // 4x4 transform + Tx8x8 = 1, // 8x8 transform + Tx16x16 = 2, // 16x16 transform + Tx32x32 = 3, // 32x32 transform + TxSizes = 4 +}; + +enum class TxMode { + Only4X4 = 0, // Only 4x4 transform used + Allow8X8 = 1, // Allow block transform size up to 8x8 + Allow16X16 = 2, // Allow block transform size up to 16x16 + Allow32X32 = 3, // Allow block transform size up to 32x32 + TxModeSelect = 4, // Transform specified for each block + TxModes = 5 +}; + +struct Segmentation { + u8 enabled{}; + u8 update_map{}; + u8 temporal_update{}; + u8 abs_delta{}; + std::array<u32, 8> feature_mask{}; + std::array<std::array<s16, 4>, 8> feature_data{}; +}; +static_assert(sizeof(Segmentation) == 0x64, "Segmentation is an invalid size"); + +struct LoopFilter { + u8 mode_ref_delta_enabled{}; + std::array<s8, 4> ref_deltas{}; + std::array<s8, 2> mode_deltas{}; +}; +static_assert(sizeof(LoopFilter) == 0x7, "LoopFilter is an invalid size"); + +struct Vp9EntropyProbs { + std::array<u8, 36> y_mode_prob{}; + std::array<u8, 64> partition_prob{}; + std::array<u8, 1728> coef_probs{}; + std::array<u8, 8> switchable_interp_prob{}; + std::array<u8, 28> inter_mode_prob{}; + std::array<u8, 4> intra_inter_prob{}; + std::array<u8, 5> comp_inter_prob{}; + std::array<u8, 10> single_ref_prob{}; + std::array<u8, 5> comp_ref_prob{}; + std::array<u8, 6> tx_32x32_prob{}; + std::array<u8, 4> tx_16x16_prob{}; + std::array<u8, 2> tx_8x8_prob{}; + std::array<u8, 3> skip_probs{}; + std::array<u8, 3> joints{}; + std::array<u8, 2> sign{}; + std::array<u8, 20> classes{}; + std::array<u8, 2> class_0{}; + std::array<u8, 20> prob_bits{}; + std::array<u8, 12> class_0_fr{}; + std::array<u8, 6> fr{}; + std::array<u8, 2> class_0_hp{}; + std::array<u8, 2> high_precision{}; +}; +static_assert(sizeof(Vp9EntropyProbs) == 0x7B4, "Vp9EntropyProbs is an invalid size"); + +struct Vp9PictureInfo { + bool is_key_frame{}; + bool intra_only{}; + bool last_frame_was_key{}; + bool frame_size_changed{}; + bool error_resilient_mode{}; + bool last_frame_shown{}; + bool show_frame{}; + std::array<s8, 4> ref_frame_sign_bias{}; + s32 base_q_index{}; + s32 y_dc_delta_q{}; + s32 uv_dc_delta_q{}; + s32 uv_ac_delta_q{}; + bool lossless{}; + s32 transform_mode{}; + bool allow_high_precision_mv{}; + s32 interp_filter{}; + s32 reference_mode{}; + s8 comp_fixed_ref{}; + std::array<s8, 2> comp_var_ref{}; + s32 log2_tile_cols{}; + s32 log2_tile_rows{}; + bool segment_enabled{}; + bool segment_map_update{}; + bool segment_map_temporal_update{}; + s32 segment_abs_delta{}; + std::array<u32, 8> segment_feature_enable{}; + std::array<std::array<s16, 4>, 8> segment_feature_data{}; + bool mode_ref_delta_enabled{}; + bool use_prev_in_find_mv_refs{}; + std::array<s8, 4> ref_deltas{}; + std::array<s8, 2> mode_deltas{}; + Vp9EntropyProbs entropy{}; + Vp9FrameDimensions frame_size{}; + u8 first_level{}; + u8 sharpness_level{}; + u32 bitstream_size{}; + std::array<u64, 4> frame_offsets{}; + std::array<bool, 4> refresh_frame{}; +}; + +struct Vp9FrameContainer { + Vp9PictureInfo info{}; + std::vector<u8> bit_stream; +}; + +struct PictureInfo { + INSERT_PADDING_WORDS(12); + u32 bitstream_size{}; + INSERT_PADDING_WORDS(5); + Vp9FrameDimensions last_frame_size{}; + Vp9FrameDimensions golden_frame_size{}; + Vp9FrameDimensions alt_frame_size{}; + Vp9FrameDimensions current_frame_size{}; + u32 vp9_flags{}; + std::array<s8, 4> ref_frame_sign_bias{}; + u8 first_level{}; + u8 sharpness_level{}; + u8 base_q_index{}; + u8 y_dc_delta_q{}; + u8 uv_ac_delta_q{}; + u8 uv_dc_delta_q{}; + u8 lossless{}; + u8 tx_mode{}; + u8 allow_high_precision_mv{}; + u8 interp_filter{}; + u8 reference_mode{}; + s8 comp_fixed_ref{}; + std::array<s8, 2> comp_var_ref{}; + u8 log2_tile_cols{}; + u8 log2_tile_rows{}; + Segmentation segmentation{}; + LoopFilter loop_filter{}; + INSERT_PADDING_BYTES(5); + u32 surface_params{}; + INSERT_PADDING_WORDS(3); + + [[nodiscard]] Vp9PictureInfo Convert() const { + return { + .is_key_frame = (vp9_flags & FrameFlags::IsKeyFrame) != 0, + .intra_only = (vp9_flags & FrameFlags::IntraOnly) != 0, + .last_frame_was_key = (vp9_flags & FrameFlags::LastFrameIsKeyFrame) != 0, + .frame_size_changed = (vp9_flags & FrameFlags::FrameSizeChanged) != 0, + .error_resilient_mode = (vp9_flags & FrameFlags::ErrorResilientMode) != 0, + .last_frame_shown = (vp9_flags & FrameFlags::LastShowFrame) != 0, + .ref_frame_sign_bias = ref_frame_sign_bias, + .base_q_index = base_q_index, + .y_dc_delta_q = y_dc_delta_q, + .uv_dc_delta_q = uv_dc_delta_q, + .uv_ac_delta_q = uv_ac_delta_q, + .lossless = lossless != 0, + .transform_mode = tx_mode, + .allow_high_precision_mv = allow_high_precision_mv != 0, + .interp_filter = interp_filter, + .reference_mode = reference_mode, + .comp_fixed_ref = comp_fixed_ref, + .comp_var_ref = comp_var_ref, + .log2_tile_cols = log2_tile_cols, + .log2_tile_rows = log2_tile_rows, + .segment_enabled = segmentation.enabled != 0, + .segment_map_update = segmentation.update_map != 0, + .segment_map_temporal_update = segmentation.temporal_update != 0, + .segment_abs_delta = segmentation.abs_delta, + .segment_feature_enable = segmentation.feature_mask, + .segment_feature_data = segmentation.feature_data, + .mode_ref_delta_enabled = loop_filter.mode_ref_delta_enabled != 0, + .use_prev_in_find_mv_refs = !(vp9_flags == (FrameFlags::ErrorResilientMode)) && + !(vp9_flags == (FrameFlags::FrameSizeChanged)) && + !(vp9_flags == (FrameFlags::IntraOnly)) && + (vp9_flags == (FrameFlags::LastShowFrame)) && + !(vp9_flags == (FrameFlags::LastFrameIsKeyFrame)), + .ref_deltas = loop_filter.ref_deltas, + .mode_deltas = loop_filter.mode_deltas, + .frame_size = current_frame_size, + .first_level = first_level, + .sharpness_level = sharpness_level, + .bitstream_size = bitstream_size, + }; + } +}; +static_assert(sizeof(PictureInfo) == 0x100, "PictureInfo is an invalid size"); + +struct EntropyProbs { + INSERT_PADDING_BYTES(1024); + std::array<u8, 28> inter_mode_prob{}; + std::array<u8, 4> intra_inter_prob{}; + INSERT_PADDING_BYTES(80); + std::array<u8, 2> tx_8x8_prob{}; + std::array<u8, 4> tx_16x16_prob{}; + std::array<u8, 6> tx_32x32_prob{}; + std::array<u8, 4> y_mode_prob_e8{}; + std::array<std::array<u8, 8>, 4> y_mode_prob_e0e7{}; + INSERT_PADDING_BYTES(64); + std::array<u8, 64> partition_prob{}; + INSERT_PADDING_BYTES(10); + std::array<u8, 8> switchable_interp_prob{}; + std::array<u8, 5> comp_inter_prob{}; + std::array<u8, 3> skip_probs{}; + INSERT_PADDING_BYTES(1); + std::array<u8, 3> joints{}; + std::array<u8, 2> sign{}; + std::array<u8, 2> class_0{}; + std::array<u8, 6> fr{}; + std::array<u8, 2> class_0_hp{}; + std::array<u8, 2> high_precision{}; + std::array<u8, 20> classes{}; + std::array<u8, 12> class_0_fr{}; + std::array<u8, 20> pred_bits{}; + std::array<u8, 10> single_ref_prob{}; + std::array<u8, 5> comp_ref_prob{}; + INSERT_PADDING_BYTES(17); + std::array<u8, 2304> coef_probs{}; + + void Convert(Vp9EntropyProbs& fc) { + fc.inter_mode_prob = inter_mode_prob; + fc.intra_inter_prob = intra_inter_prob; + fc.tx_8x8_prob = tx_8x8_prob; + fc.tx_16x16_prob = tx_16x16_prob; + fc.tx_32x32_prob = tx_32x32_prob; + + for (std::size_t i = 0; i < 4; i++) { + for (std::size_t j = 0; j < 9; j++) { + fc.y_mode_prob[j + 9 * i] = j < 8 ? y_mode_prob_e0e7[i][j] : y_mode_prob_e8[i]; + } + } + + fc.partition_prob = partition_prob; + fc.switchable_interp_prob = switchable_interp_prob; + fc.comp_inter_prob = comp_inter_prob; + fc.skip_probs = skip_probs; + fc.joints = joints; + fc.sign = sign; + fc.class_0 = class_0; + fc.fr = fr; + fc.class_0_hp = class_0_hp; + fc.high_precision = high_precision; + fc.classes = classes; + fc.class_0_fr = class_0_fr; + fc.prob_bits = pred_bits; + fc.single_ref_prob = single_ref_prob; + fc.comp_ref_prob = comp_ref_prob; + + // Skip the 4th element as it goes unused + for (std::size_t i = 0; i < coef_probs.size(); i += 4) { + const std::size_t j = i - i / 4; + fc.coef_probs[j] = coef_probs[i]; + fc.coef_probs[j + 1] = coef_probs[i + 1]; + fc.coef_probs[j + 2] = coef_probs[i + 2]; + } + } +}; +static_assert(sizeof(EntropyProbs) == 0xEA0, "EntropyProbs is an invalid size"); + +enum class Ref { Last, Golden, AltRef }; + +struct RefPoolElement { + s64 frame{}; + Ref ref{}; + bool refresh{}; +}; + +struct FrameContexts { + s64 from{}; + bool adapted{}; + Vp9EntropyProbs probs{}; +}; + +}; // namespace Decoder +}; // namespace Tegra diff --git a/src/video_core/command_classes/host1x.cpp b/src/video_core/command_classes/host1x.cpp new file mode 100644 index 000000000..b12494528 --- /dev/null +++ b/src/video_core/command_classes/host1x.cpp @@ -0,0 +1,30 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "common/assert.h" +#include "video_core/command_classes/host1x.h" +#include "video_core/gpu.h" + +Tegra::Host1x::Host1x(GPU& gpu_) : gpu(gpu_) {} + +Tegra::Host1x::~Host1x() = default; + +void Tegra::Host1x::ProcessMethod(Method method, u32 argument) { + switch (method) { + case Method::LoadSyncptPayload32: + syncpoint_value = argument; + break; + case Method::WaitSyncpt: + case Method::WaitSyncpt32: + Execute(argument); + break; + default: + UNIMPLEMENTED_MSG("Host1x method 0x{:X}", static_cast<u32>(method)); + break; + } +} + +void Tegra::Host1x::Execute(u32 data) { + gpu.WaitFence(data, syncpoint_value); +} diff --git a/src/video_core/command_classes/host1x.h b/src/video_core/command_classes/host1x.h new file mode 100644 index 000000000..7e94799dd --- /dev/null +++ b/src/video_core/command_classes/host1x.h @@ -0,0 +1,37 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <vector> +#include "common/common_funcs.h" +#include "common/common_types.h" + +namespace Tegra { +class GPU; +class Nvdec; + +class Host1x { +public: + enum class Method : u32 { + WaitSyncpt = 0x8, + LoadSyncptPayload32 = 0x4e, + WaitSyncpt32 = 0x50, + }; + + explicit Host1x(GPU& gpu); + ~Host1x(); + + /// Writes the method into the state, Invoke Execute() if encountered + void ProcessMethod(Method method, u32 argument); + +private: + /// For Host1x, execute is waiting on a syncpoint previously written into the state + void Execute(u32 data); + + u32 syncpoint_value{}; + GPU& gpu; +}; + +} // namespace Tegra diff --git a/src/video_core/command_classes/nvdec.cpp b/src/video_core/command_classes/nvdec.cpp new file mode 100644 index 000000000..79e1f4e13 --- /dev/null +++ b/src/video_core/command_classes/nvdec.cpp @@ -0,0 +1,48 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "common/assert.h" +#include "video_core/command_classes/nvdec.h" +#include "video_core/gpu.h" + +namespace Tegra { + +Nvdec::Nvdec(GPU& gpu_) : gpu(gpu_), codec(std::make_unique<Codec>(gpu)) {} + +Nvdec::~Nvdec() = default; + +void Nvdec::ProcessMethod(Method method, const std::vector<u32>& arguments) { + if (method == Method::SetVideoCodec) { + codec->StateWrite(static_cast<u32>(method), arguments[0]); + } else { + codec->StateWrite(static_cast<u32>(method), static_cast<u64>(arguments[0]) << 8); + } + + switch (method) { + case Method::SetVideoCodec: + codec->SetTargetCodec(static_cast<NvdecCommon::VideoCodec>(arguments[0])); + break; + case Method::Execute: + Execute(); + break; + } +} + +AVFramePtr Nvdec::GetFrame() { + return codec->GetCurrentFrame(); +} + +void Nvdec::Execute() { + switch (codec->GetCurrentCodec()) { + case NvdecCommon::VideoCodec::H264: + case NvdecCommon::VideoCodec::Vp9: + codec->Decode(); + break; + default: + UNIMPLEMENTED_MSG("Unknown codec {}", static_cast<u32>(codec->GetCurrentCodec())); + break; + } +} + +} // namespace Tegra diff --git a/src/video_core/command_classes/nvdec.h b/src/video_core/command_classes/nvdec.h new file mode 100644 index 000000000..e4877c533 --- /dev/null +++ b/src/video_core/command_classes/nvdec.h @@ -0,0 +1,38 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <memory> +#include <vector> +#include "common/common_types.h" +#include "video_core/command_classes/codecs/codec.h" + +namespace Tegra { +class GPU; + +class Nvdec { +public: + enum class Method : u32 { + SetVideoCodec = 0x80, + Execute = 0xc0, + }; + + explicit Nvdec(GPU& gpu); + ~Nvdec(); + + /// Writes the method into the state, Invoke Execute() if encountered + void ProcessMethod(Method method, const std::vector<u32>& arguments); + + /// Return most recently decoded frame + [[nodiscard]] AVFramePtr GetFrame(); + +private: + /// Invoke codec to decode a frame + void Execute(); + + GPU& gpu; + std::unique_ptr<Codec> codec; +}; +} // namespace Tegra diff --git a/src/video_core/command_classes/nvdec_common.h b/src/video_core/command_classes/nvdec_common.h new file mode 100644 index 000000000..01b5e086d --- /dev/null +++ b/src/video_core/command_classes/nvdec_common.h @@ -0,0 +1,48 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include "common/common_funcs.h" +#include "common/common_types.h" + +namespace Tegra::NvdecCommon { + +struct NvdecRegisters { + INSERT_PADDING_WORDS(256); + u64 set_codec_id{}; + INSERT_PADDING_WORDS(254); + u64 set_platform_id{}; + u64 picture_info_offset{}; + u64 frame_bitstream_offset{}; + u64 frame_number{}; + u64 h264_slice_data_offsets{}; + u64 h264_mv_dump_offset{}; + INSERT_PADDING_WORDS(6); + u64 frame_stats_offset{}; + u64 h264_last_surface_luma_offset{}; + u64 h264_last_surface_chroma_offset{}; + std::array<u64, 17> surface_luma_offset{}; + std::array<u64, 17> surface_chroma_offset{}; + INSERT_PADDING_WORDS(132); + u64 vp9_entropy_probs_offset{}; + u64 vp9_backward_updates_offset{}; + u64 vp9_last_frame_segmap_offset{}; + u64 vp9_curr_frame_segmap_offset{}; + INSERT_PADDING_WORDS(2); + u64 vp9_last_frame_mvs_offset{}; + u64 vp9_curr_frame_mvs_offset{}; + INSERT_PADDING_WORDS(2); +}; +static_assert(sizeof(NvdecRegisters) == (0xBC0), "NvdecRegisters is incorrect size"); + +enum class VideoCodec : u32 { + None = 0x0, + H264 = 0x3, + Vp8 = 0x5, + H265 = 0x7, + Vp9 = 0x9, +}; + +} // namespace Tegra::NvdecCommon diff --git a/src/video_core/command_classes/sync_manager.cpp b/src/video_core/command_classes/sync_manager.cpp new file mode 100644 index 000000000..19dc9e0ab --- /dev/null +++ b/src/video_core/command_classes/sync_manager.cpp @@ -0,0 +1,60 @@ +// MIT License +// +// Copyright (c) Ryujinx Team and Contributors +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +// associated documentation files (the "Software"), to deal in the Software without restriction, +// including without limitation the rights to use, copy, modify, merge, publish, distribute, +// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT +// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// + +#include <algorithm> +#include "sync_manager.h" +#include "video_core/gpu.h" + +namespace Tegra { +SyncptIncrManager::SyncptIncrManager(GPU& gpu_) : gpu(gpu_) {} +SyncptIncrManager::~SyncptIncrManager() = default; + +void SyncptIncrManager::Increment(u32 id) { + increments.emplace_back(0, 0, id, true); + IncrementAllDone(); +} + +u32 SyncptIncrManager::IncrementWhenDone(u32 class_id, u32 id) { + const u32 handle = current_id++; + increments.emplace_back(handle, class_id, id); + return handle; +} + +void SyncptIncrManager::SignalDone(u32 handle) { + const auto done_incr = + std::find_if(increments.begin(), increments.end(), + [handle](const SyncptIncr& incr) { return incr.id == handle; }); + if (done_incr != increments.cend()) { + done_incr->complete = true; + } + IncrementAllDone(); +} + +void SyncptIncrManager::IncrementAllDone() { + std::size_t done_count = 0; + for (; done_count < increments.size(); ++done_count) { + if (!increments[done_count].complete) { + break; + } + gpu.IncrementSyncPoint(increments[done_count].syncpt_id); + } + increments.erase(increments.begin(), increments.begin() + done_count); +} +} // namespace Tegra diff --git a/src/video_core/command_classes/sync_manager.h b/src/video_core/command_classes/sync_manager.h new file mode 100644 index 000000000..2c321ec58 --- /dev/null +++ b/src/video_core/command_classes/sync_manager.h @@ -0,0 +1,64 @@ +// MIT License +// +// Copyright (c) Ryujinx Team and Contributors +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +// associated documentation files (the "Software"), to deal in the Software without restriction, +// including without limitation the rights to use, copy, modify, merge, publish, distribute, +// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT +// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// + +#pragma once + +#include <mutex> +#include <vector> +#include "common/common_types.h" + +namespace Tegra { +class GPU; +struct SyncptIncr { + u32 id; + u32 class_id; + u32 syncpt_id; + bool complete; + + SyncptIncr(u32 id_, u32 class_id_, u32 syncpt_id_, bool done = false) + : id(id_), class_id(class_id_), syncpt_id(syncpt_id_), complete(done) {} +}; + +class SyncptIncrManager { +public: + explicit SyncptIncrManager(GPU& gpu); + ~SyncptIncrManager(); + + /// Add syncpoint id and increment all + void Increment(u32 id); + + /// Returns a handle to increment later + u32 IncrementWhenDone(u32 class_id, u32 id); + + /// IncrememntAllDone, including handle + void SignalDone(u32 handle); + + /// Increment all sequential pending increments that are already done. + void IncrementAllDone(); + +private: + std::vector<SyncptIncr> increments; + std::mutex increment_lock; + u32 current_id{}; + + GPU& gpu; +}; + +} // namespace Tegra diff --git a/src/video_core/command_classes/vic.cpp b/src/video_core/command_classes/vic.cpp new file mode 100644 index 000000000..2b7569335 --- /dev/null +++ b/src/video_core/command_classes/vic.cpp @@ -0,0 +1,172 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <array> +#include "common/assert.h" +#include "video_core/command_classes/nvdec.h" +#include "video_core/command_classes/vic.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/gpu.h" +#include "video_core/memory_manager.h" +#include "video_core/textures/decoders.h" + +extern "C" { +#include <libswscale/swscale.h> +} + +namespace Tegra { + +Vic::Vic(GPU& gpu_, std::shared_ptr<Nvdec> nvdec_processor_) + : gpu(gpu_), nvdec_processor(std::move(nvdec_processor_)) {} +Vic::~Vic() = default; + +void Vic::VicStateWrite(u32 offset, u32 arguments) { + u8* const state_offset = reinterpret_cast<u8*>(&vic_state) + offset * sizeof(u32); + std::memcpy(state_offset, &arguments, sizeof(u32)); +} + +void Vic::ProcessMethod(Method method, const std::vector<u32>& arguments) { + LOG_DEBUG(HW_GPU, "Vic method 0x{:X}", method); + VicStateWrite(static_cast<u32>(method), arguments[0]); + const u64 arg = static_cast<u64>(arguments[0]) << 8; + switch (method) { + case Method::Execute: + Execute(); + break; + case Method::SetConfigStructOffset: + config_struct_address = arg; + break; + case Method::SetOutputSurfaceLumaOffset: + output_surface_luma_address = arg; + break; + case Method::SetOutputSurfaceChromaUOffset: + output_surface_chroma_u_address = arg; + break; + case Method::SetOutputSurfaceChromaVOffset: + output_surface_chroma_v_address = arg; + break; + default: + break; + } +} + +void Vic::Execute() { + if (output_surface_luma_address == 0) { + LOG_ERROR(Service_NVDRV, "VIC Luma address not set. Received 0x{:X}", + vic_state.output_surface.luma_offset); + return; + } + const VicConfig config{gpu.MemoryManager().Read<u64>(config_struct_address + 0x20)}; + const AVFramePtr frame_ptr = nvdec_processor->GetFrame(); + const auto* frame = frame_ptr.get(); + if (!frame || frame->width == 0 || frame->height == 0) { + return; + } + const VideoPixelFormat pixel_format = + static_cast<VideoPixelFormat>(config.pixel_format.Value()); + switch (pixel_format) { + case VideoPixelFormat::BGRA8: + case VideoPixelFormat::RGBA8: { + LOG_TRACE(Service_NVDRV, "Writing RGB Frame"); + + if (scaler_ctx == nullptr || frame->width != scaler_width || + frame->height != scaler_height) { + const AVPixelFormat target_format = + (pixel_format == VideoPixelFormat::RGBA8) ? AV_PIX_FMT_RGBA : AV_PIX_FMT_BGRA; + + sws_freeContext(scaler_ctx); + scaler_ctx = nullptr; + + // FFmpeg returns all frames in YUV420, convert it into expected format + scaler_ctx = + sws_getContext(frame->width, frame->height, AV_PIX_FMT_YUV420P, frame->width, + frame->height, target_format, 0, nullptr, nullptr, nullptr); + + scaler_width = frame->width; + scaler_height = frame->height; + } + // Get Converted frame + const std::size_t linear_size = frame->width * frame->height * 4; + + using AVMallocPtr = std::unique_ptr<u8, decltype(&av_free)>; + AVMallocPtr converted_frame_buffer{static_cast<u8*>(av_malloc(linear_size)), av_free}; + + const int converted_stride{frame->width * 4}; + u8* const converted_frame_buf_addr{converted_frame_buffer.get()}; + + sws_scale(scaler_ctx, frame->data, frame->linesize, 0, frame->height, + &converted_frame_buf_addr, &converted_stride); + + const u32 blk_kind = static_cast<u32>(config.block_linear_kind); + if (blk_kind != 0) { + // swizzle pitch linear to block linear + const u32 block_height = static_cast<u32>(config.block_linear_height_log2); + const auto size = Tegra::Texture::CalculateSize(true, 4, frame->width, frame->height, 1, + block_height, 0); + std::vector<u8> swizzled_data(size); + Tegra::Texture::SwizzleSubrect(frame->width, frame->height, frame->width * 4, + frame->width, 4, swizzled_data.data(), + converted_frame_buffer.get(), block_height, 0, 0); + + gpu.MemoryManager().WriteBlock(output_surface_luma_address, swizzled_data.data(), size); + } else { + // send pitch linear frame + gpu.MemoryManager().WriteBlock(output_surface_luma_address, converted_frame_buf_addr, + linear_size); + } + break; + } + case VideoPixelFormat::Yuv420: { + LOG_TRACE(Service_NVDRV, "Writing YUV420 Frame"); + + const std::size_t surface_width = config.surface_width_minus1 + 1; + const std::size_t surface_height = config.surface_height_minus1 + 1; + const std::size_t half_width = surface_width / 2; + const std::size_t half_height = config.surface_height_minus1 / 2; + const std::size_t aligned_width = (surface_width + 0xff) & ~0xff; + + const auto* luma_ptr = frame->data[0]; + const auto* chroma_b_ptr = frame->data[1]; + const auto* chroma_r_ptr = frame->data[2]; + const auto stride = frame->linesize[0]; + const auto half_stride = frame->linesize[1]; + + std::vector<u8> luma_buffer(aligned_width * surface_height); + std::vector<u8> chroma_buffer(aligned_width * half_height); + + // Populate luma buffer + for (std::size_t y = 0; y < surface_height - 1; ++y) { + std::size_t src = y * stride; + std::size_t dst = y * aligned_width; + + std::size_t size = surface_width; + + for (std::size_t offset = 0; offset < size; ++offset) { + luma_buffer[dst + offset] = luma_ptr[src + offset]; + } + } + gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(), + luma_buffer.size()); + + // Populate chroma buffer from both channels with interleaving. + for (std::size_t y = 0; y < half_height; ++y) { + std::size_t src = y * half_stride; + std::size_t dst = y * aligned_width; + + for (std::size_t x = 0; x < half_width; ++x) { + chroma_buffer[dst + x * 2] = chroma_b_ptr[src + x]; + chroma_buffer[dst + x * 2 + 1] = chroma_r_ptr[src + x]; + } + } + gpu.MemoryManager().WriteBlock(output_surface_chroma_u_address, chroma_buffer.data(), + chroma_buffer.size()); + break; + } + default: + UNIMPLEMENTED_MSG("Unknown video pixel format {}", config.pixel_format.Value()); + break; + } +} + +} // namespace Tegra diff --git a/src/video_core/command_classes/vic.h b/src/video_core/command_classes/vic.h new file mode 100644 index 000000000..8c4e284a1 --- /dev/null +++ b/src/video_core/command_classes/vic.h @@ -0,0 +1,110 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <memory> +#include <vector> +#include "common/bit_field.h" +#include "common/common_types.h" + +struct SwsContext; + +namespace Tegra { +class GPU; +class Nvdec; + +struct PlaneOffsets { + u32 luma_offset{}; + u32 chroma_u_offset{}; + u32 chroma_v_offset{}; +}; + +struct VicRegisters { + INSERT_PADDING_WORDS(64); + u32 nop{}; + INSERT_PADDING_WORDS(15); + u32 pm_trigger{}; + INSERT_PADDING_WORDS(47); + u32 set_application_id{}; + u32 set_watchdog_timer{}; + INSERT_PADDING_WORDS(17); + u32 context_save_area{}; + u32 context_switch{}; + INSERT_PADDING_WORDS(43); + u32 execute{}; + INSERT_PADDING_WORDS(63); + std::array<std::array<PlaneOffsets, 8>, 8> surfacex_slots{}; + u32 picture_index{}; + u32 control_params{}; + u32 config_struct_offset{}; + u32 filter_struct_offset{}; + u32 palette_offset{}; + u32 hist_offset{}; + u32 context_id{}; + u32 fce_ucode_size{}; + PlaneOffsets output_surface{}; + u32 fce_ucode_offset{}; + INSERT_PADDING_WORDS(4); + std::array<u32, 8> slot_context_id{}; + INSERT_PADDING_WORDS(16); +}; +static_assert(sizeof(VicRegisters) == 0x7A0, "VicRegisters is an invalid size"); + +class Vic { +public: + enum class Method : u32 { + Execute = 0xc0, + SetControlParams = 0x1c1, + SetConfigStructOffset = 0x1c2, + SetOutputSurfaceLumaOffset = 0x1c8, + SetOutputSurfaceChromaUOffset = 0x1c9, + SetOutputSurfaceChromaVOffset = 0x1ca + }; + + explicit Vic(GPU& gpu, std::shared_ptr<Nvdec> nvdec_processor); + ~Vic(); + + /// Write to the device state. + void ProcessMethod(Method method, const std::vector<u32>& arguments); + +private: + void Execute(); + + void VicStateWrite(u32 offset, u32 arguments); + VicRegisters vic_state{}; + + enum class VideoPixelFormat : u64_le { + RGBA8 = 0x1f, + BGRA8 = 0x20, + Yuv420 = 0x44, + }; + + union VicConfig { + u64_le raw{}; + BitField<0, 7, u64_le> pixel_format; + BitField<7, 2, u64_le> chroma_loc_horiz; + BitField<9, 2, u64_le> chroma_loc_vert; + BitField<11, 4, u64_le> block_linear_kind; + BitField<15, 4, u64_le> block_linear_height_log2; + BitField<19, 3, u64_le> reserved0; + BitField<22, 10, u64_le> reserved1; + BitField<32, 14, u64_le> surface_width_minus1; + BitField<46, 14, u64_le> surface_height_minus1; + }; + + GPU& gpu; + std::shared_ptr<Tegra::Nvdec> nvdec_processor; + + GPUVAddr config_struct_address{}; + GPUVAddr output_surface_luma_address{}; + GPUVAddr output_surface_chroma_u_address{}; + GPUVAddr output_surface_chroma_v_address{}; + + SwsContext* scaler_ctx{}; + s32 scaler_width{}; + s32 scaler_height{}; +}; + +} // namespace Tegra diff --git a/src/video_core/compatible_formats.cpp b/src/video_core/compatible_formats.cpp index b06c32c84..acf2668dc 100644 --- a/src/video_core/compatible_formats.cpp +++ b/src/video_core/compatible_formats.cpp @@ -3,33 +3,33 @@ // Refer to the license.txt file included. #include <array> -#include <bitset> #include <cstddef> +#include "common/common_types.h" #include "video_core/compatible_formats.h" #include "video_core/surface.h" namespace VideoCore::Surface { - namespace { +using Table = std::array<std::array<u64, 2>, MaxPixelFormat>; // Compatibility table taken from Table 3.X.2 in: // https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_view.txt -constexpr std::array VIEW_CLASS_128_BITS = { +constexpr std::array VIEW_CLASS_128_BITS{ PixelFormat::R32G32B32A32_FLOAT, PixelFormat::R32G32B32A32_UINT, PixelFormat::R32G32B32A32_SINT, }; -constexpr std::array VIEW_CLASS_96_BITS = { +constexpr std::array VIEW_CLASS_96_BITS{ PixelFormat::R32G32B32_FLOAT, }; // Missing formats: // PixelFormat::RGB32UI, // PixelFormat::RGB32I, -constexpr std::array VIEW_CLASS_64_BITS = { +constexpr std::array VIEW_CLASS_64_BITS{ PixelFormat::R32G32_FLOAT, PixelFormat::R32G32_UINT, PixelFormat::R32G32_SINT, PixelFormat::R16G16B16A16_FLOAT, PixelFormat::R16G16B16A16_UNORM, PixelFormat::R16G16B16A16_SNORM, @@ -38,7 +38,7 @@ constexpr std::array VIEW_CLASS_64_BITS = { // TODO: How should we handle 48 bits? -constexpr std::array VIEW_CLASS_32_BITS = { +constexpr std::array VIEW_CLASS_32_BITS{ PixelFormat::R16G16_FLOAT, PixelFormat::B10G11R11_FLOAT, PixelFormat::R32_FLOAT, PixelFormat::A2B10G10R10_UNORM, PixelFormat::R16G16_UINT, PixelFormat::R32_UINT, PixelFormat::R16G16_SINT, PixelFormat::R32_SINT, PixelFormat::A8B8G8R8_UNORM, @@ -50,43 +50,105 @@ constexpr std::array VIEW_CLASS_32_BITS = { // TODO: How should we handle 24 bits? -constexpr std::array VIEW_CLASS_16_BITS = { +constexpr std::array VIEW_CLASS_16_BITS{ PixelFormat::R16_FLOAT, PixelFormat::R8G8_UINT, PixelFormat::R16_UINT, PixelFormat::R16_SINT, PixelFormat::R8G8_UNORM, PixelFormat::R16_UNORM, PixelFormat::R8G8_SNORM, PixelFormat::R16_SNORM, PixelFormat::R8G8_SINT, }; -constexpr std::array VIEW_CLASS_8_BITS = { +constexpr std::array VIEW_CLASS_8_BITS{ PixelFormat::R8_UINT, PixelFormat::R8_UNORM, PixelFormat::R8_SINT, PixelFormat::R8_SNORM, }; -constexpr std::array VIEW_CLASS_RGTC1_RED = { +constexpr std::array VIEW_CLASS_RGTC1_RED{ PixelFormat::BC4_UNORM, PixelFormat::BC4_SNORM, }; -constexpr std::array VIEW_CLASS_RGTC2_RG = { +constexpr std::array VIEW_CLASS_RGTC2_RG{ PixelFormat::BC5_UNORM, PixelFormat::BC5_SNORM, }; -constexpr std::array VIEW_CLASS_BPTC_UNORM = { +constexpr std::array VIEW_CLASS_BPTC_UNORM{ PixelFormat::BC7_UNORM, PixelFormat::BC7_SRGB, }; -constexpr std::array VIEW_CLASS_BPTC_FLOAT = { +constexpr std::array VIEW_CLASS_BPTC_FLOAT{ PixelFormat::BC6H_SFLOAT, PixelFormat::BC6H_UFLOAT, }; +constexpr std::array VIEW_CLASS_ASTC_4x4_RGBA{ + PixelFormat::ASTC_2D_4X4_UNORM, + PixelFormat::ASTC_2D_4X4_SRGB, +}; + +constexpr std::array VIEW_CLASS_ASTC_5x4_RGBA{ + PixelFormat::ASTC_2D_5X4_UNORM, + PixelFormat::ASTC_2D_5X4_SRGB, +}; + +constexpr std::array VIEW_CLASS_ASTC_5x5_RGBA{ + PixelFormat::ASTC_2D_5X5_UNORM, + PixelFormat::ASTC_2D_5X5_SRGB, +}; + +constexpr std::array VIEW_CLASS_ASTC_6x5_RGBA{ + PixelFormat::ASTC_2D_6X5_UNORM, + PixelFormat::ASTC_2D_6X5_SRGB, +}; + +constexpr std::array VIEW_CLASS_ASTC_6x6_RGBA{ + PixelFormat::ASTC_2D_6X6_UNORM, + PixelFormat::ASTC_2D_6X6_SRGB, +}; + +constexpr std::array VIEW_CLASS_ASTC_8x5_RGBA{ + PixelFormat::ASTC_2D_8X5_UNORM, + PixelFormat::ASTC_2D_8X5_SRGB, +}; + +constexpr std::array VIEW_CLASS_ASTC_8x8_RGBA{ + PixelFormat::ASTC_2D_8X8_UNORM, + PixelFormat::ASTC_2D_8X8_SRGB, +}; + +// Missing formats: +// PixelFormat::ASTC_2D_10X5_UNORM +// PixelFormat::ASTC_2D_10X5_SRGB + +// Missing formats: +// PixelFormat::ASTC_2D_10X6_UNORM +// PixelFormat::ASTC_2D_10X6_SRGB + +constexpr std::array VIEW_CLASS_ASTC_10x8_RGBA{ + PixelFormat::ASTC_2D_10X8_UNORM, + PixelFormat::ASTC_2D_10X8_SRGB, +}; + +constexpr std::array VIEW_CLASS_ASTC_10x10_RGBA{ + PixelFormat::ASTC_2D_10X10_UNORM, + PixelFormat::ASTC_2D_10X10_SRGB, +}; + +// Missing formats +// ASTC_2D_12X10_UNORM, +// ASTC_2D_12X10_SRGB, + +constexpr std::array VIEW_CLASS_ASTC_12x12_RGBA{ + PixelFormat::ASTC_2D_12X12_UNORM, + PixelFormat::ASTC_2D_12X12_SRGB, +}; + // Compatibility table taken from Table 4.X.1 in: // https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_copy_image.txt -constexpr std::array COPY_CLASS_128_BITS = { +constexpr std::array COPY_CLASS_128_BITS{ PixelFormat::R32G32B32A32_UINT, PixelFormat::R32G32B32A32_FLOAT, PixelFormat::R32G32B32A32_SINT, PixelFormat::BC2_UNORM, PixelFormat::BC2_SRGB, PixelFormat::BC3_UNORM, PixelFormat::BC3_SRGB, PixelFormat::BC5_UNORM, PixelFormat::BC5_SNORM, @@ -97,7 +159,7 @@ constexpr std::array COPY_CLASS_128_BITS = { // PixelFormat::RGBA32I // COMPRESSED_RG_RGTC2 -constexpr std::array COPY_CLASS_64_BITS = { +constexpr std::array COPY_CLASS_64_BITS{ PixelFormat::R16G16B16A16_FLOAT, PixelFormat::R16G16B16A16_UINT, PixelFormat::R16G16B16A16_UNORM, PixelFormat::R16G16B16A16_SNORM, PixelFormat::R16G16B16A16_SINT, PixelFormat::R32G32_UINT, @@ -110,32 +172,36 @@ constexpr std::array COPY_CLASS_64_BITS = { // COMPRESSED_RGBA_S3TC_DXT1_EXT // COMPRESSED_SIGNED_RED_RGTC1 -void Enable(FormatCompatibility::Table& compatiblity, size_t format_a, size_t format_b) { - compatiblity[format_a][format_b] = true; - compatiblity[format_b][format_a] = true; +constexpr void Enable(Table& table, size_t format_a, size_t format_b) { + table[format_a][format_b / 64] |= u64(1) << (format_b % 64); + table[format_b][format_a / 64] |= u64(1) << (format_a % 64); } -void Enable(FormatCompatibility::Table& compatibility, PixelFormat format_a, PixelFormat format_b) { - Enable(compatibility, static_cast<size_t>(format_a), static_cast<size_t>(format_b)); +constexpr void Enable(Table& table, PixelFormat format_a, PixelFormat format_b) { + Enable(table, static_cast<size_t>(format_a), static_cast<size_t>(format_b)); } template <typename Range> -void EnableRange(FormatCompatibility::Table& compatibility, const Range& range) { +constexpr void EnableRange(Table& table, const Range& range) { for (auto it_a = range.begin(); it_a != range.end(); ++it_a) { for (auto it_b = it_a; it_b != range.end(); ++it_b) { - Enable(compatibility, *it_a, *it_b); + Enable(table, *it_a, *it_b); } } } -} // Anonymous namespace +constexpr bool IsSupported(const Table& table, PixelFormat format_a, PixelFormat format_b) { + const size_t a = static_cast<size_t>(format_a); + const size_t b = static_cast<size_t>(format_b); + return ((table[a][b / 64] >> (b % 64)) & 1) != 0; +} -FormatCompatibility::FormatCompatibility() { +constexpr Table MakeViewTable() { + Table view{}; for (size_t i = 0; i < MaxPixelFormat; ++i) { // Identity is allowed Enable(view, i, i); } - EnableRange(view, VIEW_CLASS_128_BITS); EnableRange(view, VIEW_CLASS_96_BITS); EnableRange(view, VIEW_CLASS_64_BITS); @@ -146,10 +212,39 @@ FormatCompatibility::FormatCompatibility() { EnableRange(view, VIEW_CLASS_RGTC2_RG); EnableRange(view, VIEW_CLASS_BPTC_UNORM); EnableRange(view, VIEW_CLASS_BPTC_FLOAT); + EnableRange(view, VIEW_CLASS_ASTC_4x4_RGBA); + EnableRange(view, VIEW_CLASS_ASTC_5x4_RGBA); + EnableRange(view, VIEW_CLASS_ASTC_5x5_RGBA); + EnableRange(view, VIEW_CLASS_ASTC_6x5_RGBA); + EnableRange(view, VIEW_CLASS_ASTC_6x6_RGBA); + EnableRange(view, VIEW_CLASS_ASTC_8x5_RGBA); + EnableRange(view, VIEW_CLASS_ASTC_8x8_RGBA); + EnableRange(view, VIEW_CLASS_ASTC_10x8_RGBA); + EnableRange(view, VIEW_CLASS_ASTC_10x10_RGBA); + EnableRange(view, VIEW_CLASS_ASTC_12x12_RGBA); + return view; +} - copy = view; +constexpr Table MakeCopyTable() { + Table copy = MakeViewTable(); EnableRange(copy, COPY_CLASS_128_BITS); EnableRange(copy, COPY_CLASS_64_BITS); + return copy; +} +} // Anonymous namespace + +bool IsViewCompatible(PixelFormat format_a, PixelFormat format_b, bool broken_views) { + if (broken_views) { + // If format views are broken, only accept formats that are identical. + return format_a == format_b; + } + static constexpr Table TABLE = MakeViewTable(); + return IsSupported(TABLE, format_a, format_b); +} + +bool IsCopyCompatible(PixelFormat format_a, PixelFormat format_b) { + static constexpr Table TABLE = MakeCopyTable(); + return IsSupported(TABLE, format_a, format_b); } } // namespace VideoCore::Surface diff --git a/src/video_core/compatible_formats.h b/src/video_core/compatible_formats.h index 51766349b..9a0522988 100644 --- a/src/video_core/compatible_formats.h +++ b/src/video_core/compatible_formats.h @@ -4,31 +4,12 @@ #pragma once -#include <array> -#include <bitset> -#include <cstddef> - #include "video_core/surface.h" namespace VideoCore::Surface { -class FormatCompatibility { -public: - using Table = std::array<std::bitset<MaxPixelFormat>, MaxPixelFormat>; - - explicit FormatCompatibility(); - - bool TestView(PixelFormat format_a, PixelFormat format_b) const noexcept { - return view[static_cast<size_t>(format_a)][static_cast<size_t>(format_b)]; - } - - bool TestCopy(PixelFormat format_a, PixelFormat format_b) const noexcept { - return copy[static_cast<size_t>(format_a)][static_cast<size_t>(format_b)]; - } +bool IsViewCompatible(PixelFormat format_a, PixelFormat format_b, bool broken_views); -private: - Table view; - Table copy; -}; +bool IsCopyCompatible(PixelFormat format_a, PixelFormat format_b); } // namespace VideoCore::Surface diff --git a/src/video_core/delayed_destruction_ring.h b/src/video_core/delayed_destruction_ring.h new file mode 100644 index 000000000..4f1d29c04 --- /dev/null +++ b/src/video_core/delayed_destruction_ring.h @@ -0,0 +1,32 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <array> +#include <cstddef> +#include <utility> +#include <vector> + +namespace VideoCommon { + +/// Container to push objects to be destroyed a few ticks in the future +template <typename T, size_t TICKS_TO_DESTROY> +class DelayedDestructionRing { +public: + void Tick() { + index = (index + 1) % TICKS_TO_DESTROY; + elements[index].clear(); + } + + void Push(T&& object) { + elements[index].push_back(std::move(object)); + } + +private: + size_t index = 0; + std::array<std::vector<T>, TICKS_TO_DESTROY> elements; +}; + +} // namespace VideoCommon diff --git a/src/video_core/dirty_flags.cpp b/src/video_core/dirty_flags.cpp index e16075993..7149af290 100644 --- a/src/video_core/dirty_flags.cpp +++ b/src/video_core/dirty_flags.cpp @@ -9,13 +9,33 @@ #include "video_core/dirty_flags.h" #define OFF(field_name) MAXWELL3D_REG_INDEX(field_name) -#define NUM(field_name) (sizeof(::Tegra::Engines::Maxwell3D::Regs::field_name) / sizeof(u32)) +#define NUM(field_name) (sizeof(::Tegra::Engines::Maxwell3D::Regs::field_name) / (sizeof(u32))) namespace VideoCommon::Dirty { - +namespace { using Tegra::Engines::Maxwell3D; -void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables) { +void SetupDirtyVertexBuffers(Maxwell3D::DirtyState::Tables& tables) { + static constexpr std::size_t num_array = 3; + for (std::size_t i = 0; i < Maxwell3D::Regs::NumVertexArrays; ++i) { + const std::size_t array_offset = OFF(vertex_array) + i * NUM(vertex_array[0]); + const std::size_t limit_offset = OFF(vertex_array_limit) + i * NUM(vertex_array_limit[0]); + + FillBlock(tables, array_offset, num_array, VertexBuffer0 + i, VertexBuffers); + FillBlock(tables, limit_offset, NUM(vertex_array_limit), VertexBuffer0 + i, VertexBuffers); + } +} + +void SetupIndexBuffer(Maxwell3D::DirtyState::Tables& tables) { + FillBlock(tables[0], OFF(index_array), NUM(index_array), IndexBuffer); +} + +void SetupDirtyDescriptors(Maxwell3D::DirtyState::Tables& tables) { + FillBlock(tables[0], OFF(tic), NUM(tic), Descriptors); + FillBlock(tables[0], OFF(tsc), NUM(tsc), Descriptors); +} + +void SetupDirtyRenderTargets(Maxwell3D::DirtyState::Tables& tables) { static constexpr std::size_t num_per_rt = NUM(rt[0]); static constexpr std::size_t begin = OFF(rt); static constexpr std::size_t num = num_per_rt * Maxwell3D::Regs::NumRenderTargets; @@ -23,6 +43,10 @@ void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tabl FillBlock(tables[0], begin + rt * num_per_rt, num_per_rt, ColorBuffer0 + rt); } FillBlock(tables[1], begin, num, RenderTargets); + FillBlock(tables[0], OFF(render_area), NUM(render_area), RenderTargets); + + tables[0][OFF(rt_control)] = RenderTargets; + tables[1][OFF(rt_control)] = RenderTargetControl; static constexpr std::array zeta_flags{ZetaBuffer, RenderTargets}; for (std::size_t i = 0; i < std::size(zeta_flags); ++i) { @@ -34,5 +58,13 @@ void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tabl FillBlock(table, OFF(zeta), NUM(zeta), flag); } } +} // Anonymous namespace + +void SetupDirtyFlags(Maxwell3D::DirtyState::Tables& tables) { + SetupDirtyVertexBuffers(tables); + SetupIndexBuffer(tables); + SetupDirtyDescriptors(tables); + SetupDirtyRenderTargets(tables); +} } // namespace VideoCommon::Dirty diff --git a/src/video_core/dirty_flags.h b/src/video_core/dirty_flags.h index 3f6c1d83a..702688ace 100644 --- a/src/video_core/dirty_flags.h +++ b/src/video_core/dirty_flags.h @@ -16,7 +16,10 @@ namespace VideoCommon::Dirty { enum : u8 { NullEntry = 0, + Descriptors, + RenderTargets, + RenderTargetControl, ColorBuffer0, ColorBuffer1, ColorBuffer2, @@ -27,6 +30,12 @@ enum : u8 { ColorBuffer7, ZetaBuffer, + VertexBuffers, + VertexBuffer0, + VertexBuffer31 = VertexBuffer0 + 31, + + IndexBuffer, + LastCommonEntry, }; @@ -44,6 +53,6 @@ void FillBlock(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables, std::size_ FillBlock(tables[1], begin, num, index_b); } -void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables); +void SetupDirtyFlags(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables); } // namespace VideoCommon::Dirty diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp index f2f96ac33..8b33c04ab 100644 --- a/src/video_core/dma_pusher.cpp +++ b/src/video_core/dma_pusher.cpp @@ -2,6 +2,7 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include "common/cityhash.h" #include "common/microprofile.h" #include "core/core.h" #include "core/memory.h" @@ -12,7 +13,7 @@ namespace Tegra { -DmaPusher::DmaPusher(Core::System& system, GPU& gpu) : gpu{gpu}, system{system} {} +DmaPusher::DmaPusher(Core::System& system_, GPU& gpu_) : gpu{gpu_}, system{system_} {} DmaPusher::~DmaPusher() = default; @@ -22,8 +23,6 @@ void DmaPusher::DispatchCalls() { MICROPROFILE_SCOPE(DispatchCalls); gpu.SyncGuestHost(); - // On entering GPU code, assume all memory may be touched by the ARM core. - gpu.Maxwell3D().OnMemoryWrite(); dma_pushbuffer_subindex = 0; @@ -45,32 +44,41 @@ bool DmaPusher::Step() { return false; } - const CommandList& command_list{dma_pushbuffer.front()}; - ASSERT_OR_EXECUTE(!command_list.empty(), { - // Somehow the command_list is empty, in order to avoid a crash - // We ignore it and assume its size is 0. - dma_pushbuffer.pop(); - dma_pushbuffer_subindex = 0; - return true; - }); - const CommandListHeader command_list_header{command_list[dma_pushbuffer_subindex++]}; - const GPUVAddr dma_get = command_list_header.addr; - - if (dma_pushbuffer_subindex >= command_list.size()) { - // We've gone through the current list, remove it from the queue - dma_pushbuffer.pop(); - dma_pushbuffer_subindex = 0; - } + CommandList& command_list{dma_pushbuffer.front()}; - if (command_list_header.size == 0) { - return true; - } + ASSERT_OR_EXECUTE( + command_list.command_lists.size() || command_list.prefetch_command_list.size(), { + // Somehow the command_list is empty, in order to avoid a crash + // We ignore it and assume its size is 0. + dma_pushbuffer.pop(); + dma_pushbuffer_subindex = 0; + return true; + }); + + if (command_list.prefetch_command_list.size()) { + // Prefetched command list from nvdrv, used for things like synchronization + command_headers = std::move(command_list.prefetch_command_list); + dma_pushbuffer.pop(); + } else { + const CommandListHeader command_list_header{ + command_list.command_lists[dma_pushbuffer_subindex++]}; + const GPUVAddr dma_get = command_list_header.addr; + + if (dma_pushbuffer_subindex >= command_list.command_lists.size()) { + // We've gone through the current list, remove it from the queue + dma_pushbuffer.pop(); + dma_pushbuffer_subindex = 0; + } - // Push buffer non-empty, read a word - command_headers.resize(command_list_header.size); - gpu.MemoryManager().ReadBlockUnsafe(dma_get, command_headers.data(), - command_list_header.size * sizeof(u32)); + if (command_list_header.size == 0) { + return true; + } + // Push buffer non-empty, read a word + command_headers.resize(command_list_header.size); + gpu.MemoryManager().ReadBlockUnsafe(dma_get, command_headers.data(), + command_list_header.size * sizeof(u32)); + } for (std::size_t index = 0; index < command_headers.size();) { const CommandHeader& command_header = command_headers[index]; @@ -142,7 +150,12 @@ void DmaPusher::SetState(const CommandHeader& command_header) { void DmaPusher::CallMethod(u32 argument) const { if (dma_state.method < non_puller_methods) { - gpu.CallMethod({dma_state.method, argument, dma_state.subchannel, dma_state.method_count}); + gpu.CallMethod(GPU::MethodCall{ + dma_state.method, + argument, + dma_state.subchannel, + dma_state.method_count, + }); } else { subchannels[dma_state.subchannel]->CallMethod(dma_state.method, argument, dma_state.is_last_call); diff --git a/src/video_core/dma_pusher.h b/src/video_core/dma_pusher.h index efa90d170..19f286fa7 100644 --- a/src/video_core/dma_pusher.h +++ b/src/video_core/dma_pusher.h @@ -18,6 +18,8 @@ class System; namespace Tegra { +class GPU; + enum class SubmissionMode : u32 { IncreasingOld = 0, Increasing = 1, @@ -27,6 +29,31 @@ enum class SubmissionMode : u32 { IncreaseOnce = 5 }; +// Note that, traditionally, methods are treated as 4-byte addressable locations, and hence +// their numbers are written down multiplied by 4 in Docs. Here we are not multiply by 4. +// So the values you see in docs might be multiplied by 4. +enum class BufferMethods : u32 { + BindObject = 0x0, + Nop = 0x2, + SemaphoreAddressHigh = 0x4, + SemaphoreAddressLow = 0x5, + SemaphoreSequence = 0x6, + SemaphoreTrigger = 0x7, + NotifyIntr = 0x8, + WrcacheFlush = 0x9, + Unk28 = 0xA, + UnkCacheFlush = 0xB, + RefCnt = 0x14, + SemaphoreAcquire = 0x1A, + SemaphoreRelease = 0x1B, + FenceValue = 0x1C, + FenceAction = 0x1D, + WaitForInterrupt = 0x1E, + Unk7c = 0x1F, + Yield = 0x20, + NonPullerMethods = 0x40, +}; + struct CommandListHeader { union { u64 raw; @@ -49,9 +76,23 @@ union CommandHeader { static_assert(std::is_standard_layout_v<CommandHeader>, "CommandHeader is not standard layout"); static_assert(sizeof(CommandHeader) == sizeof(u32), "CommandHeader has incorrect size!"); -class GPU; +inline CommandHeader BuildCommandHeader(BufferMethods method, u32 arg_count, SubmissionMode mode) { + CommandHeader result{}; + result.method.Assign(static_cast<u32>(method)); + result.arg_count.Assign(arg_count); + result.mode.Assign(mode); + return result; +} -using CommandList = std::vector<Tegra::CommandListHeader>; +struct CommandList final { + CommandList() = default; + explicit CommandList(std::size_t size) : command_lists(size) {} + explicit CommandList(std::vector<CommandHeader>&& prefetch_command_list_) + : prefetch_command_list{std::move(prefetch_command_list_)} {} + + std::vector<CommandListHeader> command_lists; + std::vector<CommandHeader> prefetch_command_list; +}; /** * The DmaPusher class implements DMA submission to FIFOs, providing an area of memory that the @@ -60,9 +101,9 @@ using CommandList = std::vector<Tegra::CommandListHeader>; * See https://envytools.readthedocs.io/en/latest/hw/fifo/dma-pusher.html#fifo-dma-pusher for * details on this implementation. */ -class DmaPusher { +class DmaPusher final { public: - explicit DmaPusher(Core::System& system, GPU& gpu); + explicit DmaPusher(Core::System& system_, GPU& gpu_); ~DmaPusher(); void Push(CommandList&& entries) { @@ -71,7 +112,7 @@ public: void DispatchCalls(); - void BindSubchannel(Tegra::Engines::EngineInterface* engine, u32 subchannel_id) { + void BindSubchannel(Engines::EngineInterface* engine, u32 subchannel_id) { subchannels[subchannel_id] = engine; } @@ -104,7 +145,7 @@ private: bool ib_enable{true}; ///< IB mode enabled - std::array<Tegra::Engines::EngineInterface*, max_subchannels> subchannels{}; + std::array<Engines::EngineInterface*, max_subchannels> subchannels{}; GPU& gpu; Core::System& system; diff --git a/src/video_core/engines/engine_upload.cpp b/src/video_core/engines/engine_upload.cpp index d44ad0cd8..71d7e1473 100644 --- a/src/video_core/engines/engine_upload.cpp +++ b/src/video_core/engines/engine_upload.cpp @@ -11,16 +11,16 @@ namespace Tegra::Engines::Upload { -State::State(MemoryManager& memory_manager, Registers& regs) - : regs{regs}, memory_manager{memory_manager} {} +State::State(MemoryManager& memory_manager_, Registers& regs_) + : regs{regs_}, memory_manager{memory_manager_} {} State::~State() = default; -void State::ProcessExec(const bool is_linear) { +void State::ProcessExec(const bool is_linear_) { write_offset = 0; copy_size = regs.line_length_in * regs.line_count; inner_buffer.resize(copy_size); - this->is_linear = is_linear; + is_linear = is_linear_; } void State::ProcessData(const u32 data, const bool is_last_call) { diff --git a/src/video_core/engines/engine_upload.h b/src/video_core/engines/engine_upload.h index 462da419e..1c7f1effa 100644 --- a/src/video_core/engines/engine_upload.h +++ b/src/video_core/engines/engine_upload.h @@ -54,10 +54,10 @@ struct Registers { class State { public: - State(MemoryManager& memory_manager, Registers& regs); + explicit State(MemoryManager& memory_manager_, Registers& regs_); ~State(); - void ProcessExec(bool is_linear); + void ProcessExec(bool is_linear_); void ProcessData(u32 data, bool is_last_call); private: diff --git a/src/video_core/engines/fermi_2d.cpp b/src/video_core/engines/fermi_2d.cpp index 6e50661a3..0f640fdae 100644 --- a/src/video_core/engines/fermi_2d.cpp +++ b/src/video_core/engines/fermi_2d.cpp @@ -10,90 +10,58 @@ namespace Tegra::Engines { -Fermi2D::Fermi2D() = default; +Fermi2D::Fermi2D() { + // Nvidia's OpenGL driver seems to assume these values + regs.src.depth = 1; + regs.dst.depth = 1; +} Fermi2D::~Fermi2D() = default; -void Fermi2D::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) { - rasterizer = &rasterizer_; +void Fermi2D::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) { + rasterizer = rasterizer_; } void Fermi2D::CallMethod(u32 method, u32 method_argument, bool is_last_call) { ASSERT_MSG(method < Regs::NUM_REGS, "Invalid Fermi2D register, increase the size of the Regs structure"); - regs.reg_array[method] = method_argument; - switch (method) { - // Trigger the surface copy on the last register write. This is blit_src_y, but this is 64-bit, - // so trigger on the second 32-bit write. - case FERMI2D_REG_INDEX(blit_src_y) + 1: { - HandleSurfaceCopy(); - break; - } + if (method == FERMI2D_REG_INDEX(pixels_from_memory.src_y0) + 1) { + Blit(); } } void Fermi2D::CallMultiMethod(u32 method, const u32* base_start, u32 amount, u32 methods_pending) { - for (std::size_t i = 0; i < amount; i++) { - CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1); + for (u32 i = 0; i < amount; ++i) { + CallMethod(method, base_start[i], methods_pending - i <= 1); } } -static std::pair<u32, u32> DelimitLine(u32 src_1, u32 src_2, u32 dst_1, u32 dst_2, u32 src_line) { - const u32 line_a = src_2 - src_1; - const u32 line_b = dst_2 - dst_1; - const u32 excess = std::max<s32>(0, line_a - src_line + src_1); - return {line_b - (excess * line_b) / line_a, excess}; -} +void Fermi2D::Blit() { + LOG_DEBUG(HW_GPU, "called. source address=0x{:x}, destination address=0x{:x}", + regs.src.Address(), regs.dst.Address()); -void Fermi2D::HandleSurfaceCopy() { - LOG_DEBUG(HW_GPU, "Requested a surface copy with operation {}", - static_cast<u32>(regs.operation)); - - // TODO(Subv): Only raw copies are implemented. - ASSERT(regs.operation == Operation::SrcCopy); - - const u32 src_blit_x1{static_cast<u32>(regs.blit_src_x >> 32)}; - const u32 src_blit_y1{static_cast<u32>(regs.blit_src_y >> 32)}; - u32 src_blit_x2, src_blit_y2; - if (regs.blit_control.origin == Origin::Corner) { - src_blit_x2 = - static_cast<u32>((regs.blit_src_x + (regs.blit_du_dx * regs.blit_dst_width)) >> 32); - src_blit_y2 = - static_cast<u32>((regs.blit_src_y + (regs.blit_dv_dy * regs.blit_dst_height)) >> 32); - } else { - src_blit_x2 = static_cast<u32>((regs.blit_src_x >> 32) + regs.blit_dst_width); - src_blit_y2 = static_cast<u32>((regs.blit_src_y >> 32) + regs.blit_dst_height); - } - u32 dst_blit_x2 = regs.blit_dst_x + regs.blit_dst_width; - u32 dst_blit_y2 = regs.blit_dst_y + regs.blit_dst_height; - const auto [new_dst_w, src_excess_x] = - DelimitLine(src_blit_x1, src_blit_x2, regs.blit_dst_x, dst_blit_x2, regs.src.width); - const auto [new_dst_h, src_excess_y] = - DelimitLine(src_blit_y1, src_blit_y2, regs.blit_dst_y, dst_blit_y2, regs.src.height); - dst_blit_x2 = new_dst_w + regs.blit_dst_x; - src_blit_x2 = src_blit_x2 - src_excess_x; - dst_blit_y2 = new_dst_h + regs.blit_dst_y; - src_blit_y2 = src_blit_y2 - src_excess_y; - const auto [new_src_w, dst_excess_x] = - DelimitLine(regs.blit_dst_x, dst_blit_x2, src_blit_x1, src_blit_x2, regs.dst.width); - const auto [new_src_h, dst_excess_y] = - DelimitLine(regs.blit_dst_y, dst_blit_y2, src_blit_y1, src_blit_y2, regs.dst.height); - src_blit_x2 = new_src_w + src_blit_x1; - dst_blit_x2 = dst_blit_x2 - dst_excess_x; - src_blit_y2 = new_src_h + src_blit_y1; - dst_blit_y2 = dst_blit_y2 - dst_excess_y; - const Common::Rectangle<u32> src_rect{src_blit_x1, src_blit_y1, src_blit_x2, src_blit_y2}; - const Common::Rectangle<u32> dst_rect{regs.blit_dst_x, regs.blit_dst_y, dst_blit_x2, - dst_blit_y2}; - Config copy_config; - copy_config.operation = regs.operation; - copy_config.filter = regs.blit_control.filter; - copy_config.src_rect = src_rect; - copy_config.dst_rect = dst_rect; + UNIMPLEMENTED_IF_MSG(regs.operation != Operation::SrcCopy, "Operation is not copy"); + UNIMPLEMENTED_IF_MSG(regs.src.layer != 0, "Source layer is not zero"); + UNIMPLEMENTED_IF_MSG(regs.dst.layer != 0, "Destination layer is not zero"); + UNIMPLEMENTED_IF_MSG(regs.src.depth != 1, "Source depth is not one"); + UNIMPLEMENTED_IF_MSG(regs.clip_enable != 0, "Clipped blit enabled"); - if (!rasterizer->AccelerateSurfaceCopy(regs.src, regs.dst, copy_config)) { + const auto& args = regs.pixels_from_memory; + const Config config{ + .operation = regs.operation, + .filter = args.sample_mode.filter, + .dst_x0 = args.dst_x0, + .dst_y0 = args.dst_y0, + .dst_x1 = args.dst_x0 + args.dst_width, + .dst_y1 = args.dst_y0 + args.dst_height, + .src_x0 = static_cast<s32>(args.src_x0 >> 32), + .src_y0 = static_cast<s32>(args.src_y0 >> 32), + .src_x1 = static_cast<s32>((args.du_dx * args.dst_width + args.src_x0) >> 32), + .src_y1 = static_cast<s32>((args.dv_dy * args.dst_height + args.src_y0) >> 32), + }; + if (!rasterizer->AccelerateSurfaceCopy(regs.src, regs.dst, config)) { UNIMPLEMENTED(); } } diff --git a/src/video_core/engines/fermi_2d.h b/src/video_core/engines/fermi_2d.h index 213abfaae..c808a577d 100644 --- a/src/video_core/engines/fermi_2d.h +++ b/src/video_core/engines/fermi_2d.h @@ -38,7 +38,7 @@ public: ~Fermi2D(); /// Binds a rasterizer to this engine. - void BindRasterizer(VideoCore::RasterizerInterface& rasterizer); + void BindRasterizer(VideoCore::RasterizerInterface* rasterizer); /// Write the value to the register identified by method. void CallMethod(u32 method, u32 method_argument, bool is_last_call) override; @@ -53,8 +53,8 @@ public: }; enum class Filter : u32 { - PointSample = 0, // Nearest - Linear = 1, + Point = 0, + Bilinear = 1, }; enum class Operation : u32 { @@ -67,88 +67,235 @@ public: BlendPremult = 6, }; - struct Regs { - static constexpr std::size_t NUM_REGS = 0x258; + enum class MemoryLayout : u32 { + BlockLinear = 0, + Pitch = 1, + }; - struct Surface { - RenderTargetFormat format; - BitField<0, 1, u32> linear; - union { - BitField<0, 4, u32> block_width; - BitField<4, 4, u32> block_height; - BitField<8, 4, u32> block_depth; - }; - u32 depth; - u32 layer; - u32 pitch; - u32 width; - u32 height; - u32 address_high; - u32 address_low; - - GPUVAddr Address() const { - return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | - address_low); - } - - u32 BlockWidth() const { - return block_width.Value(); - } - - u32 BlockHeight() const { - return block_height.Value(); - } - - u32 BlockDepth() const { - return block_depth.Value(); - } - }; - static_assert(sizeof(Surface) == 0x28, "Surface has incorrect size"); + enum class CpuIndexWrap : u32 { + Wrap = 0, + NoWrap = 1, + }; + struct Surface { + RenderTargetFormat format; + MemoryLayout linear; union { - struct { - INSERT_UNION_PADDING_WORDS(0x80); + BitField<0, 4, u32> block_width; + BitField<4, 4, u32> block_height; + BitField<8, 4, u32> block_depth; + }; + u32 depth; + u32 layer; + u32 pitch; + u32 width; + u32 height; + u32 addr_upper; + u32 addr_lower; + + [[nodiscard]] constexpr GPUVAddr Address() const noexcept { + return (static_cast<GPUVAddr>(addr_upper) << 32) | static_cast<GPUVAddr>(addr_lower); + } + }; + static_assert(sizeof(Surface) == 0x28, "Surface has incorrect size"); - Surface dst; + enum class SectorPromotion : u32 { + NoPromotion = 0, + PromoteTo2V = 1, + PromoteTo2H = 2, + PromoteTo4 = 3, + }; - INSERT_UNION_PADDING_WORDS(2); + enum class NumTpcs : u32 { + All = 0, + One = 1, + }; - Surface src; + enum class RenderEnableMode : u32 { + False = 0, + True = 1, + Conditional = 2, + RenderIfEqual = 3, + RenderIfNotEqual = 4, + }; - INSERT_UNION_PADDING_WORDS(0x15); + enum class ColorKeyFormat : u32 { + A16R56G6B5 = 0, + A1R5G55B5 = 1, + A8R8G8B8 = 2, + A2R10G10B10 = 3, + Y8 = 4, + Y16 = 5, + Y32 = 6, + }; - Operation operation; + union Beta4 { + BitField<0, 8, u32> b; + BitField<8, 8, u32> g; + BitField<16, 8, u32> r; + BitField<24, 8, u32> a; + }; - INSERT_UNION_PADDING_WORDS(0x177); + struct Point { + u32 x; + u32 y; + }; + enum class PatternSelect : u32 { + MonoChrome8x8 = 0, + MonoChrome64x1 = 1, + MonoChrome1x64 = 2, + Color = 3, + }; + + enum class NotifyType : u32 { + WriteOnly = 0, + WriteThenAwaken = 1, + }; + + enum class MonochromePatternColorFormat : u32 { + A8X8R8G6B5 = 0, + A1R5G5B5 = 1, + A8R8G8B8 = 2, + A8Y8 = 3, + A8X8Y16 = 4, + Y32 = 5, + }; + + enum class MonochromePatternFormat : u32 { + CGA6_M1 = 0, + LE_M1 = 1, + }; + + union Regs { + static constexpr std::size_t NUM_REGS = 0x258; + struct { + u32 object; + INSERT_PADDING_WORDS_NOINIT(0x3F); + u32 no_operation; + NotifyType notify; + INSERT_PADDING_WORDS_NOINIT(0x2); + u32 wait_for_idle; + INSERT_PADDING_WORDS_NOINIT(0xB); + u32 pm_trigger; + INSERT_PADDING_WORDS_NOINIT(0xF); + u32 context_dma_notify; + u32 dst_context_dma; + u32 src_context_dma; + u32 semaphore_context_dma; + INSERT_PADDING_WORDS_NOINIT(0x1C); + Surface dst; + CpuIndexWrap pixels_from_cpu_index_wrap; + u32 kind2d_check_enable; + Surface src; + SectorPromotion pixels_from_memory_sector_promotion; + INSERT_PADDING_WORDS_NOINIT(0x1); + NumTpcs num_tpcs; + u32 render_enable_addr_upper; + u32 render_enable_addr_lower; + RenderEnableMode render_enable_mode; + INSERT_PADDING_WORDS_NOINIT(0x4); + u32 clip_x0; + u32 clip_y0; + u32 clip_width; + u32 clip_height; + BitField<0, 1, u32> clip_enable; + BitField<0, 3, ColorKeyFormat> color_key_format; + u32 color_key; + BitField<0, 1, u32> color_key_enable; + BitField<0, 8, u32> rop; + u32 beta1; + Beta4 beta4; + Operation operation; + union { + BitField<0, 6, u32> x; + BitField<8, 6, u32> y; + } pattern_offset; + BitField<0, 2, PatternSelect> pattern_select; + INSERT_PADDING_WORDS_NOINIT(0xC); + struct { + BitField<0, 3, MonochromePatternColorFormat> color_format; + BitField<0, 1, MonochromePatternFormat> format; + u32 color0; + u32 color1; + u32 pattern0; + u32 pattern1; + } monochrome_pattern; + struct { + std::array<u32, 0x40> X8R8G8B8; + std::array<u32, 0x20> R5G6B5; + std::array<u32, 0x20> X1R5G5B5; + std::array<u32, 0x10> Y8; + } color_pattern; + INSERT_PADDING_WORDS_NOINIT(0x10); + struct { + u32 prim_mode; + u32 prim_color_format; + u32 prim_color; + u32 line_tie_break_bits; + INSERT_PADDING_WORDS_NOINIT(0x14); + u32 prim_point_xy; + INSERT_PADDING_WORDS_NOINIT(0x7); + std::array<Point, 0x40> prim_point; + } render_solid; + struct { + u32 data_type; + u32 color_format; + u32 index_format; + u32 mono_format; + u32 wrap; + u32 color0; + u32 color1; + u32 mono_opacity; + INSERT_PADDING_WORDS_NOINIT(0x6); + u32 src_width; + u32 src_height; + u32 dx_du_frac; + u32 dx_du_int; + u32 dx_dv_frac; + u32 dy_dv_int; + u32 dst_x0_frac; + u32 dst_x0_int; + u32 dst_y0_frac; + u32 dst_y0_int; + u32 data; + } pixels_from_cpu; + INSERT_PADDING_WORDS_NOINIT(0x3); + u32 big_endian_control; + INSERT_PADDING_WORDS_NOINIT(0x3); + struct { + BitField<0, 3, u32> block_shape; + BitField<0, 5, u32> corral_size; + BitField<0, 1, u32> safe_overlap; union { - u32 raw; BitField<0, 1, Origin> origin; BitField<4, 1, Filter> filter; - } blit_control; - - INSERT_UNION_PADDING_WORDS(0x8); - - u32 blit_dst_x; - u32 blit_dst_y; - u32 blit_dst_width; - u32 blit_dst_height; - u64 blit_du_dx; - u64 blit_dv_dy; - u64 blit_src_x; - u64 blit_src_y; - - INSERT_UNION_PADDING_WORDS(0x21); - }; - std::array<u32, NUM_REGS> reg_array; + } sample_mode; + INSERT_PADDING_WORDS_NOINIT(0x8); + s32 dst_x0; + s32 dst_y0; + s32 dst_width; + s32 dst_height; + s64 du_dx; + s64 dv_dy; + s64 src_x0; + s64 src_y0; + } pixels_from_memory; }; + std::array<u32, NUM_REGS> reg_array; } regs{}; struct Config { Operation operation; Filter filter; - Common::Rectangle<u32> src_rect; - Common::Rectangle<u32> dst_rect; + s32 dst_x0; + s32 dst_y0; + s32 dst_x1; + s32 dst_y1; + s32 src_x0; + s32 src_y0; + s32 src_x1; + s32 src_y1; }; private: @@ -156,25 +303,49 @@ private: /// Performs the copy from the source surface to the destination surface as configured in the /// registers. - void HandleSurfaceCopy(); + void Blit(); }; #define ASSERT_REG_POSITION(field_name, position) \ - static_assert(offsetof(Fermi2D::Regs, field_name) == position * 4, \ + static_assert(offsetof(Fermi2D::Regs, field_name) == position, \ "Field " #field_name " has invalid position") -ASSERT_REG_POSITION(dst, 0x80); -ASSERT_REG_POSITION(src, 0x8C); -ASSERT_REG_POSITION(operation, 0xAB); -ASSERT_REG_POSITION(blit_control, 0x223); -ASSERT_REG_POSITION(blit_dst_x, 0x22c); -ASSERT_REG_POSITION(blit_dst_y, 0x22d); -ASSERT_REG_POSITION(blit_dst_width, 0x22e); -ASSERT_REG_POSITION(blit_dst_height, 0x22f); -ASSERT_REG_POSITION(blit_du_dx, 0x230); -ASSERT_REG_POSITION(blit_dv_dy, 0x232); -ASSERT_REG_POSITION(blit_src_x, 0x234); -ASSERT_REG_POSITION(blit_src_y, 0x236); +ASSERT_REG_POSITION(object, 0x0); +ASSERT_REG_POSITION(no_operation, 0x100); +ASSERT_REG_POSITION(notify, 0x104); +ASSERT_REG_POSITION(wait_for_idle, 0x110); +ASSERT_REG_POSITION(pm_trigger, 0x140); +ASSERT_REG_POSITION(context_dma_notify, 0x180); +ASSERT_REG_POSITION(dst_context_dma, 0x184); +ASSERT_REG_POSITION(src_context_dma, 0x188); +ASSERT_REG_POSITION(semaphore_context_dma, 0x18C); +ASSERT_REG_POSITION(dst, 0x200); +ASSERT_REG_POSITION(pixels_from_cpu_index_wrap, 0x228); +ASSERT_REG_POSITION(kind2d_check_enable, 0x22C); +ASSERT_REG_POSITION(src, 0x230); +ASSERT_REG_POSITION(pixels_from_memory_sector_promotion, 0x258); +ASSERT_REG_POSITION(num_tpcs, 0x260); +ASSERT_REG_POSITION(render_enable_addr_upper, 0x264); +ASSERT_REG_POSITION(render_enable_addr_lower, 0x268); +ASSERT_REG_POSITION(clip_x0, 0x280); +ASSERT_REG_POSITION(clip_y0, 0x284); +ASSERT_REG_POSITION(clip_width, 0x288); +ASSERT_REG_POSITION(clip_height, 0x28c); +ASSERT_REG_POSITION(clip_enable, 0x290); +ASSERT_REG_POSITION(color_key_format, 0x294); +ASSERT_REG_POSITION(color_key, 0x298); +ASSERT_REG_POSITION(rop, 0x2A0); +ASSERT_REG_POSITION(beta1, 0x2A4); +ASSERT_REG_POSITION(beta4, 0x2A8); +ASSERT_REG_POSITION(operation, 0x2AC); +ASSERT_REG_POSITION(pattern_offset, 0x2B0); +ASSERT_REG_POSITION(pattern_select, 0x2B4); +ASSERT_REG_POSITION(monochrome_pattern, 0x2E8); +ASSERT_REG_POSITION(color_pattern, 0x300); +ASSERT_REG_POSITION(render_solid, 0x580); +ASSERT_REG_POSITION(pixels_from_cpu, 0x800); +ASSERT_REG_POSITION(big_endian_control, 0x870); +ASSERT_REG_POSITION(pixels_from_memory, 0x880); #undef ASSERT_REG_POSITION diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp index 898370739..a9b75091e 100644 --- a/src/video_core/engines/kepler_compute.cpp +++ b/src/video_core/engines/kepler_compute.cpp @@ -21,8 +21,8 @@ KeplerCompute::KeplerCompute(Core::System& system_, MemoryManager& memory_manage KeplerCompute::~KeplerCompute() = default; -void KeplerCompute::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) { - rasterizer = &rasterizer_; +void KeplerCompute::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) { + rasterizer = rasterizer_; } void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_call) { @@ -39,7 +39,6 @@ void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_cal case KEPLER_COMPUTE_REG_INDEX(data_upload): { upload_state.ProcessData(method_argument, is_last_call); if (is_last_call) { - system.GPU().Maxwell3D().OnMemoryWrite(); } break; } @@ -58,24 +57,6 @@ void KeplerCompute::CallMultiMethod(u32 method, const u32* base_start, u32 amoun } } -Texture::FullTextureInfo KeplerCompute::GetTexture(std::size_t offset) const { - const std::bitset<8> cbuf_mask = launch_description.const_buffer_enable_mask.Value(); - ASSERT(cbuf_mask[regs.tex_cb_index]); - - const auto& texinfo = launch_description.const_buffer_config[regs.tex_cb_index]; - ASSERT(texinfo.Address() != 0); - - const GPUVAddr address = texinfo.Address() + offset * sizeof(Texture::TextureHandle); - ASSERT(address < texinfo.Address() + texinfo.size); - - const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(address)}; - return GetTextureInfo(tex_handle); -} - -Texture::FullTextureInfo KeplerCompute::GetTextureInfo(Texture::TextureHandle tex_handle) const { - return Texture::FullTextureInfo{GetTICEntry(tex_handle.tic_id), GetTSCEntry(tex_handle.tsc_id)}; -} - u32 KeplerCompute::AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offset) const { ASSERT(stage == ShaderType::Compute); const auto& buffer = launch_description.const_buffer_config[const_buffer]; @@ -98,9 +79,11 @@ SamplerDescriptor KeplerCompute::AccessBindlessSampler(ShaderType stage, u64 con SamplerDescriptor KeplerCompute::AccessSampler(u32 handle) const { const Texture::TextureHandle tex_handle{handle}; - const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle); - SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic); - result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value()); + const Texture::TICEntry tic = GetTICEntry(tex_handle.tic_id); + const Texture::TSCEntry tsc = GetTSCEntry(tex_handle.tsc_id); + + SamplerDescriptor result = SamplerDescriptor::FromTIC(tic); + result.is_shadow.Assign(tsc.depth_compare_enabled.Value()); return result; } diff --git a/src/video_core/engines/kepler_compute.h b/src/video_core/engines/kepler_compute.h index 7f2500aab..7c40cba38 100644 --- a/src/video_core/engines/kepler_compute.h +++ b/src/video_core/engines/kepler_compute.h @@ -46,7 +46,7 @@ public: ~KeplerCompute(); /// Binds a rasterizer to this engine. - void BindRasterizer(VideoCore::RasterizerInterface& rasterizer); + void BindRasterizer(VideoCore::RasterizerInterface* rasterizer); static constexpr std::size_t NumConstBuffers = 8; @@ -55,7 +55,7 @@ public: union { struct { - INSERT_UNION_PADDING_WORDS(0x60); + INSERT_PADDING_WORDS_NOINIT(0x60); Upload::Registers upload; @@ -67,7 +67,7 @@ public: u32 data_upload; - INSERT_UNION_PADDING_WORDS(0x3F); + INSERT_PADDING_WORDS_NOINIT(0x3F); struct { u32 address; @@ -76,11 +76,11 @@ public: } } launch_desc_loc; - INSERT_UNION_PADDING_WORDS(0x1); + INSERT_PADDING_WORDS_NOINIT(0x1); u32 launch; - INSERT_UNION_PADDING_WORDS(0x4A7); + INSERT_PADDING_WORDS_NOINIT(0x4A7); struct { u32 address_high; @@ -92,7 +92,7 @@ public: } } tsc; - INSERT_UNION_PADDING_WORDS(0x3); + INSERT_PADDING_WORDS_NOINIT(0x3); struct { u32 address_high; @@ -104,7 +104,7 @@ public: } } tic; - INSERT_UNION_PADDING_WORDS(0x22); + INSERT_PADDING_WORDS_NOINIT(0x22); struct { u32 address_high; @@ -115,11 +115,11 @@ public: } } code_loc; - INSERT_UNION_PADDING_WORDS(0x3FE); + INSERT_PADDING_WORDS_NOINIT(0x3FE); u32 tex_cb_index; - INSERT_UNION_PADDING_WORDS(0x374); + INSERT_PADDING_WORDS_NOINIT(0x374); }; std::array<u32, NUM_REGS> reg_array; }; @@ -209,11 +209,6 @@ public: void CallMultiMethod(u32 method, const u32* base_start, u32 amount, u32 methods_pending) override; - Texture::FullTextureInfo GetTexture(std::size_t offset) const; - - /// Given a texture handle, returns the TSC and TIC entries. - Texture::FullTextureInfo GetTextureInfo(Texture::TextureHandle tex_handle) const; - u32 AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offset) const override; SamplerDescriptor AccessBoundSampler(ShaderType stage, u64 offset) const override; diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp index dc71b2eec..560551157 100644 --- a/src/video_core/engines/kepler_memory.cpp +++ b/src/video_core/engines/kepler_memory.cpp @@ -14,8 +14,8 @@ namespace Tegra::Engines { -KeplerMemory::KeplerMemory(Core::System& system, MemoryManager& memory_manager) - : system{system}, upload_state{memory_manager, regs.upload} {} +KeplerMemory::KeplerMemory(Core::System& system_, MemoryManager& memory_manager) + : system{system_}, upload_state{memory_manager, regs.upload} {} KeplerMemory::~KeplerMemory() = default; @@ -33,7 +33,6 @@ void KeplerMemory::CallMethod(u32 method, u32 method_argument, bool is_last_call case KEPLERMEMORY_REG_INDEX(data): { upload_state.ProcessData(method_argument, is_last_call); if (is_last_call) { - system.GPU().Maxwell3D().OnMemoryWrite(); } break; } diff --git a/src/video_core/engines/kepler_memory.h b/src/video_core/engines/kepler_memory.h index 5b7f71a00..19808a5c6 100644 --- a/src/video_core/engines/kepler_memory.h +++ b/src/video_core/engines/kepler_memory.h @@ -35,7 +35,7 @@ namespace Tegra::Engines { class KeplerMemory final : public EngineInterface { public: - KeplerMemory(Core::System& system, MemoryManager& memory_manager); + explicit KeplerMemory(Core::System& system_, MemoryManager& memory_manager); ~KeplerMemory(); /// Write the value to the register identified by method. @@ -50,7 +50,7 @@ public: union { struct { - INSERT_UNION_PADDING_WORDS(0x60); + INSERT_PADDING_WORDS_NOINIT(0x60); Upload::Registers upload; @@ -62,7 +62,7 @@ public: u32 data; - INSERT_UNION_PADDING_WORDS(0x11); + INSERT_PADDING_WORDS_NOINIT(0x11); }; std::array<u32, NUM_REGS> reg_array; }; diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 33854445f..75517a4f7 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -2,7 +2,6 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include <cinttypes> #include <cstring> #include <optional> #include "common/assert.h" @@ -31,8 +30,8 @@ Maxwell3D::Maxwell3D(Core::System& system_, MemoryManager& memory_manager_) Maxwell3D::~Maxwell3D() = default; -void Maxwell3D::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) { - rasterizer = &rasterizer_; +void Maxwell3D::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) { + rasterizer = rasterizer_; } void Maxwell3D::InitializeRegisterDefaults() { @@ -124,6 +123,115 @@ void Maxwell3D::InitializeRegisterDefaults() { mme_inline[MAXWELL3D_REG_INDEX(index_array.count)] = true; } +void Maxwell3D::ProcessMacro(u32 method, const u32* base_start, u32 amount, bool is_last_call) { + if (executing_macro == 0) { + // A macro call must begin by writing the macro method's register, not its argument. + ASSERT_MSG((method % 2) == 0, + "Can't start macro execution by writing to the ARGS register"); + executing_macro = method; + } + + macro_params.insert(macro_params.end(), base_start, base_start + amount); + + // Call the macro when there are no more parameters in the command buffer + if (is_last_call) { + CallMacroMethod(executing_macro, macro_params); + macro_params.clear(); + } +} + +u32 Maxwell3D::ProcessShadowRam(u32 method, u32 argument) { + // Keep track of the register value in shadow_state when requested. + const auto control = shadow_state.shadow_ram_control; + if (control == Regs::ShadowRamControl::Track || + control == Regs::ShadowRamControl::TrackWithFilter) { + shadow_state.reg_array[method] = argument; + return argument; + } + if (control == Regs::ShadowRamControl::Replay) { + return shadow_state.reg_array[method]; + } + return argument; +} + +void Maxwell3D::ProcessDirtyRegisters(u32 method, u32 argument) { + if (regs.reg_array[method] == argument) { + return; + } + regs.reg_array[method] = argument; + + for (const auto& table : dirty.tables) { + dirty.flags[table[method]] = true; + } +} + +void Maxwell3D::ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argument, + bool is_last_call) { + switch (method) { + case MAXWELL3D_REG_INDEX(wait_for_idle): + return rasterizer->WaitForIdle(); + case MAXWELL3D_REG_INDEX(shadow_ram_control): + shadow_state.shadow_ram_control = static_cast<Regs::ShadowRamControl>(nonshadow_argument); + return; + case MAXWELL3D_REG_INDEX(macros.data): + return macro_engine->AddCode(regs.macros.upload_address, argument); + case MAXWELL3D_REG_INDEX(macros.bind): + return ProcessMacroBind(argument); + case MAXWELL3D_REG_INDEX(firmware[4]): + return ProcessFirmwareCall4(); + case MAXWELL3D_REG_INDEX(const_buffer.cb_data): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 1: + case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 2: + case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 3: + case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 4: + case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 5: + case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 6: + case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 7: + case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 8: + case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 9: + case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 10: + case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 11: + case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 12: + case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 13: + case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 14: + case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 15: + return StartCBData(method); + case MAXWELL3D_REG_INDEX(cb_bind[0]): + return ProcessCBBind(0); + case MAXWELL3D_REG_INDEX(cb_bind[1]): + return ProcessCBBind(1); + case MAXWELL3D_REG_INDEX(cb_bind[2]): + return ProcessCBBind(2); + case MAXWELL3D_REG_INDEX(cb_bind[3]): + return ProcessCBBind(3); + case MAXWELL3D_REG_INDEX(cb_bind[4]): + return ProcessCBBind(4); + case MAXWELL3D_REG_INDEX(draw.vertex_end_gl): + return DrawArrays(); + case MAXWELL3D_REG_INDEX(clear_buffers): + return ProcessClearBuffers(); + case MAXWELL3D_REG_INDEX(query.query_get): + return ProcessQueryGet(); + case MAXWELL3D_REG_INDEX(condition.mode): + return ProcessQueryCondition(); + case MAXWELL3D_REG_INDEX(counter_reset): + return ProcessCounterReset(); + case MAXWELL3D_REG_INDEX(sync_info): + return ProcessSyncPoint(); + case MAXWELL3D_REG_INDEX(exec_upload): + return upload_state.ProcessExec(regs.exec_upload.linear != 0); + case MAXWELL3D_REG_INDEX(data_upload): + upload_state.ProcessData(argument, is_last_call); + if (is_last_call) { + } + return; + case MAXWELL3D_REG_INDEX(fragment_barrier): + return rasterizer->FragmentBarrier(); + case MAXWELL3D_REG_INDEX(tiled_cache_barrier): + return rasterizer->TiledCacheBarrier(); + } +} + void Maxwell3D::CallMacroMethod(u32 method, const std::vector<u32>& parameters) { // Reset the current macro. executing_macro = 0; @@ -157,142 +265,16 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) { // Methods after 0xE00 are special, they're actually triggers for some microcode that was // uploaded to the GPU during initialization. if (method >= MacroRegistersStart) { - // We're trying to execute a macro - if (executing_macro == 0) { - // A macro call must begin by writing the macro method's register, not its argument. - ASSERT_MSG((method % 2) == 0, - "Can't start macro execution by writing to the ARGS register"); - executing_macro = method; - } - - macro_params.push_back(method_argument); - - // Call the macro when there are no more parameters in the command buffer - if (is_last_call) { - CallMacroMethod(executing_macro, macro_params); - macro_params.clear(); - } + ProcessMacro(method, &method_argument, 1, is_last_call); return; } ASSERT_MSG(method < Regs::NUM_REGS, "Invalid Maxwell3D register, increase the size of the Regs structure"); - u32 arg = method_argument; - // Keep track of the register value in shadow_state when requested. - if (shadow_state.shadow_ram_control == Regs::ShadowRamControl::Track || - shadow_state.shadow_ram_control == Regs::ShadowRamControl::TrackWithFilter) { - shadow_state.reg_array[method] = arg; - } else if (shadow_state.shadow_ram_control == Regs::ShadowRamControl::Replay) { - arg = shadow_state.reg_array[method]; - } - - if (regs.reg_array[method] != arg) { - regs.reg_array[method] = arg; - - for (const auto& table : dirty.tables) { - dirty.flags[table[method]] = true; - } - } - - switch (method) { - case MAXWELL3D_REG_INDEX(wait_for_idle): { - rasterizer->WaitForIdle(); - break; - } - case MAXWELL3D_REG_INDEX(shadow_ram_control): { - shadow_state.shadow_ram_control = static_cast<Regs::ShadowRamControl>(method_argument); - break; - } - case MAXWELL3D_REG_INDEX(macros.data): { - macro_engine->AddCode(regs.macros.upload_address, arg); - break; - } - case MAXWELL3D_REG_INDEX(macros.bind): { - ProcessMacroBind(arg); - break; - } - case MAXWELL3D_REG_INDEX(firmware[4]): { - ProcessFirmwareCall4(); - break; - } - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[1]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[2]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[3]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[4]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[5]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[6]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[7]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[8]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[9]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[10]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[11]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[12]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[13]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[14]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]): { - StartCBData(method); - break; - } - case MAXWELL3D_REG_INDEX(cb_bind[0]): { - ProcessCBBind(0); - break; - } - case MAXWELL3D_REG_INDEX(cb_bind[1]): { - ProcessCBBind(1); - break; - } - case MAXWELL3D_REG_INDEX(cb_bind[2]): { - ProcessCBBind(2); - break; - } - case MAXWELL3D_REG_INDEX(cb_bind[3]): { - ProcessCBBind(3); - break; - } - case MAXWELL3D_REG_INDEX(cb_bind[4]): { - ProcessCBBind(4); - break; - } - case MAXWELL3D_REG_INDEX(draw.vertex_end_gl): { - DrawArrays(); - break; - } - case MAXWELL3D_REG_INDEX(clear_buffers): { - ProcessClearBuffers(); - break; - } - case MAXWELL3D_REG_INDEX(query.query_get): { - ProcessQueryGet(); - break; - } - case MAXWELL3D_REG_INDEX(condition.mode): { - ProcessQueryCondition(); - break; - } - case MAXWELL3D_REG_INDEX(counter_reset): { - ProcessCounterReset(); - break; - } - case MAXWELL3D_REG_INDEX(sync_info): { - ProcessSyncPoint(); - break; - } - case MAXWELL3D_REG_INDEX(exec_upload): { - upload_state.ProcessExec(regs.exec_upload.linear != 0); - break; - } - case MAXWELL3D_REG_INDEX(data_upload): { - upload_state.ProcessData(arg, is_last_call); - if (is_last_call) { - OnMemoryWrite(); - } - break; - } - default: - break; - } + const u32 argument = ProcessShadowRam(method, method_argument); + ProcessDirtyRegisters(method, argument); + ProcessMethodCall(method, argument, method_argument, is_last_call); } void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount, @@ -300,50 +282,33 @@ void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount, // Methods after 0xE00 are special, they're actually triggers for some microcode that was // uploaded to the GPU during initialization. if (method >= MacroRegistersStart) { - // We're trying to execute a macro - if (executing_macro == 0) { - // A macro call must begin by writing the macro method's register, not its argument. - ASSERT_MSG((method % 2) == 0, - "Can't start macro execution by writing to the ARGS register"); - executing_macro = method; - } - - for (std::size_t i = 0; i < amount; i++) { - macro_params.push_back(base_start[i]); - } - - // Call the macro when there are no more parameters in the command buffer - if (amount == methods_pending) { - CallMacroMethod(executing_macro, macro_params); - macro_params.clear(); - } + ProcessMacro(method, base_start, amount, amount == methods_pending); return; } switch (method) { - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[1]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[2]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[3]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[4]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[5]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[6]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[7]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[8]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[9]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[10]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[11]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[12]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[13]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[14]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]): { + case MAXWELL3D_REG_INDEX(const_buffer.cb_data): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 1: + case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 2: + case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 3: + case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 4: + case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 5: + case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 6: + case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 7: + case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 8: + case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 9: + case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 10: + case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 11: + case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 12: + case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 13: + case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 14: + case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 15: ProcessCBMultiData(method, base_start, amount); break; - } - default: { + default: for (std::size_t i = 0; i < amount; i++) { CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1); } - } + break; } } @@ -396,7 +361,7 @@ void Maxwell3D::CallMethodFromMME(u32 method, u32 method_argument) { } void Maxwell3D::FlushMMEInlineDraw() { - LOG_TRACE(HW_GPU, "called, topology={}, count={}", static_cast<u32>(regs.draw.topology.Value()), + LOG_TRACE(HW_GPU, "called, topology={}, count={}", regs.draw.topology.Value(), regs.vertex_buffer.count); ASSERT_MSG(!(regs.index_array.count && regs.vertex_buffer.count), "Both indexed and direct?"); ASSERT(mme_draw.instance_count == mme_draw.gl_end_count); @@ -541,8 +506,7 @@ void Maxwell3D::ProcessCounterReset() { rasterizer->ResetCounter(QueryType::SamplesPassed); break; default: - LOG_DEBUG(Render_OpenGL, "Unimplemented counter reset={}", - static_cast<int>(regs.counter_reset)); + LOG_DEBUG(Render_OpenGL, "Unimplemented counter reset={}", regs.counter_reset); break; } } @@ -557,7 +521,7 @@ void Maxwell3D::ProcessSyncPoint() { } void Maxwell3D::DrawArrays() { - LOG_TRACE(HW_GPU, "called, topology={}, count={}", static_cast<u32>(regs.draw.topology.Value()), + LOG_TRACE(HW_GPU, "called, topology={}, count={}", regs.draw.topology.Value(), regs.vertex_buffer.count); ASSERT_MSG(!(regs.index_array.count && regs.vertex_buffer.count), "Both indexed and direct?"); @@ -595,27 +559,28 @@ std::optional<u64> Maxwell3D::GetQueryResult() { return 0; case Regs::QuerySelect::SamplesPassed: // Deferred. - rasterizer->Query(regs.query.QueryAddress(), VideoCore::QueryType::SamplesPassed, + rasterizer->Query(regs.query.QueryAddress(), QueryType::SamplesPassed, system.GPU().GetTicks()); - return {}; + return std::nullopt; default: LOG_DEBUG(HW_GPU, "Unimplemented query select type {}", - static_cast<u32>(regs.query.query_get.select.Value())); + regs.query.query_get.select.Value()); return 1; } } -void Maxwell3D::ProcessCBBind(std::size_t stage_index) { +void Maxwell3D::ProcessCBBind(size_t stage_index) { // Bind the buffer currently in CB_ADDRESS to the specified index in the desired shader stage. - auto& shader = state.shader_stages[stage_index]; - auto& bind_data = regs.cb_bind[stage_index]; - - ASSERT(bind_data.index < Regs::MaxConstBuffers); - auto& buffer = shader.const_buffers[bind_data.index]; - + const auto& bind_data = regs.cb_bind[stage_index]; + auto& buffer = state.shader_stages[stage_index].const_buffers[bind_data.index]; buffer.enabled = bind_data.valid.Value() != 0; buffer.address = regs.const_buffer.BufferAddress(); buffer.size = regs.const_buffer.cb_size; + + const bool is_enabled = bind_data.valid.Value() != 0; + const GPUVAddr gpu_addr = is_enabled ? regs.const_buffer.BufferAddress() : 0; + const u32 size = is_enabled ? regs.const_buffer.cb_size : 0; + rasterizer->BindGraphicsUniformBuffer(stage_index, bind_data.index, gpu_addr, size); } void Maxwell3D::ProcessCBData(u32 value) { @@ -627,7 +592,7 @@ void Maxwell3D::ProcessCBData(u32 value) { } void Maxwell3D::StartCBData(u32 method) { - constexpr u32 first_cb_data = MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]); + constexpr u32 first_cb_data = MAXWELL3D_REG_INDEX(const_buffer.cb_data); cb_data_state.start_pos = regs.const_buffer.cb_pos; cb_data_state.id = method - first_cb_data; cb_data_state.current = method; @@ -640,7 +605,7 @@ void Maxwell3D::ProcessCBMultiData(u32 method, const u32* start_base, u32 amount if (cb_data_state.current != null_cb_data) { FinishCBData(); } - constexpr u32 first_cb_data = MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]); + constexpr u32 first_cb_data = MAXWELL3D_REG_INDEX(const_buffer.cb_data); cb_data_state.start_pos = regs.const_buffer.cb_pos; cb_data_state.id = method - first_cb_data; cb_data_state.current = method; @@ -670,14 +635,13 @@ void Maxwell3D::FinishCBData() { const u32 id = cb_data_state.id; memory_manager.WriteBlock(address, cb_data_state.buffer[id].data(), size); - OnMemoryWrite(); cb_data_state.id = null_cb_data; cb_data_state.current = null_cb_data; } Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const { - const GPUVAddr tic_address_gpu{regs.tic.TICAddress() + tic_index * sizeof(Texture::TICEntry)}; + const GPUVAddr tic_address_gpu{regs.tic.Address() + tic_index * sizeof(Texture::TICEntry)}; Texture::TICEntry tic_entry; memory_manager.ReadBlockUnsafe(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry)); @@ -686,43 +650,19 @@ Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const { } Texture::TSCEntry Maxwell3D::GetTSCEntry(u32 tsc_index) const { - const GPUVAddr tsc_address_gpu{regs.tsc.TSCAddress() + tsc_index * sizeof(Texture::TSCEntry)}; + const GPUVAddr tsc_address_gpu{regs.tsc.Address() + tsc_index * sizeof(Texture::TSCEntry)}; Texture::TSCEntry tsc_entry; memory_manager.ReadBlockUnsafe(tsc_address_gpu, &tsc_entry, sizeof(Texture::TSCEntry)); return tsc_entry; } -Texture::FullTextureInfo Maxwell3D::GetTextureInfo(Texture::TextureHandle tex_handle) const { - return Texture::FullTextureInfo{GetTICEntry(tex_handle.tic_id), GetTSCEntry(tex_handle.tsc_id)}; -} - -Texture::FullTextureInfo Maxwell3D::GetStageTexture(ShaderType stage, std::size_t offset) const { - const auto stage_index = static_cast<std::size_t>(stage); - const auto& shader = state.shader_stages[stage_index]; - const auto& tex_info_buffer = shader.const_buffers[regs.tex_cb_index]; - ASSERT(tex_info_buffer.enabled && tex_info_buffer.address != 0); - - const GPUVAddr tex_info_address = - tex_info_buffer.address + offset * sizeof(Texture::TextureHandle); - - ASSERT(tex_info_address < tex_info_buffer.address + tex_info_buffer.size); - - const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)}; - - return GetTextureInfo(tex_handle); -} - u32 Maxwell3D::GetRegisterValue(u32 method) const { ASSERT_MSG(method < Regs::NUM_REGS, "Invalid Maxwell3D register"); return regs.reg_array[method]; } void Maxwell3D::ProcessClearBuffers() { - ASSERT(regs.clear_buffers.R == regs.clear_buffers.G && - regs.clear_buffers.R == regs.clear_buffers.B && - regs.clear_buffers.R == regs.clear_buffers.A); - rasterizer->Clear(); } @@ -730,9 +670,7 @@ u32 Maxwell3D::AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offse ASSERT(stage != ShaderType::Compute); const auto& shader_stage = state.shader_stages[static_cast<std::size_t>(stage)]; const auto& buffer = shader_stage.const_buffers[const_buffer]; - u32 result; - std::memcpy(&result, memory_manager.GetPointer(buffer.address + offset), sizeof(u32)); - return result; + return memory_manager.Read<u32>(buffer.address + offset); } SamplerDescriptor Maxwell3D::AccessBoundSampler(ShaderType stage, u64 offset) const { @@ -750,9 +688,11 @@ SamplerDescriptor Maxwell3D::AccessBindlessSampler(ShaderType stage, u64 const_b SamplerDescriptor Maxwell3D::AccessSampler(u32 handle) const { const Texture::TextureHandle tex_handle{handle}; - const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle); - SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic); - result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value()); + const Texture::TICEntry tic = GetTICEntry(tex_handle.tic_id); + const Texture::TSCEntry tsc = GetTSCEntry(tex_handle.tsc_id); + + SamplerDescriptor result = SamplerDescriptor::FromTIC(tic); + result.is_shadow.Assign(tsc.depth_compare_enabled.Value()); return result; } diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index bc289c55d..ffed42a29 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -55,7 +55,7 @@ public: ~Maxwell3D(); /// Binds a rasterizer to this engine. - void BindRasterizer(VideoCore::RasterizerInterface& rasterizer); + void BindRasterizer(VideoCore::RasterizerInterface* rasterizer); /// Register structure of the Maxwell3D engine. /// TODO(Subv): This structure will need to be made bigger as more registers are discovered. @@ -438,16 +438,6 @@ public: DecrWrapOGL = 0x8508, }; - enum class MemoryLayout : u32 { - Linear = 0, - BlockLinear = 1, - }; - - enum class InvMemoryLayout : u32 { - BlockLinear = 0, - Linear = 1, - }; - enum class CounterReset : u32 { SampleCnt = 0x01, Unk02 = 0x02, @@ -546,7 +536,7 @@ public: Equation equation_a; Factor factor_source_a; Factor factor_dest_a; - INSERT_UNION_PADDING_WORDS(1); + INSERT_PADDING_WORDS_NOINIT(1); }; enum class TessellationPrimitive : u32 { @@ -589,26 +579,36 @@ public: NegativeW = 7, }; + enum class SamplerIndex : u32 { + Independently = 0, + ViaHeaderIndex = 1, + }; + + struct TileMode { + union { + BitField<0, 4, u32> block_width; + BitField<4, 4, u32> block_height; + BitField<8, 4, u32> block_depth; + BitField<12, 1, u32> is_pitch_linear; + BitField<16, 1, u32> is_3d; + }; + }; + static_assert(sizeof(TileMode) == 4); + struct RenderTargetConfig { u32 address_high; u32 address_low; u32 width; u32 height; Tegra::RenderTargetFormat format; + TileMode tile_mode; union { - BitField<0, 3, u32> block_width; - BitField<4, 3, u32> block_height; - BitField<8, 3, u32> block_depth; - BitField<12, 1, InvMemoryLayout> type; - BitField<16, 1, u32> is_3d; - } memory_layout; - union { - BitField<0, 16, u32> layers; + BitField<0, 16, u32> depth; BitField<16, 1, u32> volume; }; u32 layer_stride; u32 base_layer; - INSERT_UNION_PADDING_WORDS(7); + INSERT_PADDING_WORDS_NOINIT(7); GPUVAddr Address() const { return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | @@ -640,7 +640,7 @@ public: BitField<8, 3, ViewportSwizzle> z; BitField<12, 3, ViewportSwizzle> w; } swizzle; - INSERT_UNION_PADDING_WORDS(1); + INSERT_PADDING_WORDS_NOINIT(1); Common::Rectangle<f32> GetRect() const { return { @@ -700,7 +700,7 @@ public: u32 address_low; s32 buffer_size; s32 buffer_offset; - INSERT_UNION_PADDING_WORDS(3); + INSERT_PADDING_WORDS_NOINIT(3); GPUVAddr Address() const { return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | @@ -713,7 +713,7 @@ public: u32 stream; u32 varying_count; u32 stride; - INSERT_UNION_PADDING_WORDS(1); + INSERT_PADDING_WORDS_NOINIT(1); }; static_assert(sizeof(TransformFeedbackLayout) == 16); @@ -731,7 +731,7 @@ public: union { struct { - INSERT_UNION_PADDING_WORDS(0x44); + INSERT_PADDING_WORDS_NOINIT(0x44); u32 wait_for_idle; @@ -744,7 +744,7 @@ public: ShadowRamControl shadow_ram_control; - INSERT_UNION_PADDING_WORDS(0x16); + INSERT_PADDING_WORDS_NOINIT(0x16); Upload::Registers upload; struct { @@ -755,7 +755,11 @@ public: u32 data_upload; - INSERT_UNION_PADDING_WORDS(0x44); + INSERT_PADDING_WORDS_NOINIT(0x16); + + u32 force_early_fragment_tests; + + INSERT_PADDING_WORDS_NOINIT(0x2D); struct { union { @@ -765,7 +769,7 @@ public: }; } sync_info; - INSERT_UNION_PADDING_WORDS(0x15); + INSERT_PADDING_WORDS_NOINIT(0x15); union { BitField<0, 2, TessellationPrimitive> prim; @@ -777,21 +781,21 @@ public: std::array<f32, 4> tess_level_outer; std::array<f32, 2> tess_level_inner; - INSERT_UNION_PADDING_WORDS(0x10); + INSERT_PADDING_WORDS_NOINIT(0x10); u32 rasterize_enable; std::array<TransformFeedbackBinding, NumTransformFeedbackBuffers> tfb_bindings; - INSERT_UNION_PADDING_WORDS(0xC0); + INSERT_PADDING_WORDS_NOINIT(0xC0); std::array<TransformFeedbackLayout, NumTransformFeedbackBuffers> tfb_layouts; - INSERT_UNION_PADDING_WORDS(0x1); + INSERT_PADDING_WORDS_NOINIT(0x1); u32 tfb_enabled; - INSERT_UNION_PADDING_WORDS(0x2E); + INSERT_PADDING_WORDS_NOINIT(0x2E); std::array<RenderTargetConfig, NumRenderTargets> rt; @@ -799,7 +803,7 @@ public: std::array<ViewPort, NumViewports> viewports; - INSERT_UNION_PADDING_WORDS(0x1D); + INSERT_PADDING_WORDS_NOINIT(0x1D); struct { u32 first; @@ -811,16 +815,16 @@ public: float clear_color[4]; float clear_depth; - INSERT_UNION_PADDING_WORDS(0x3); + INSERT_PADDING_WORDS_NOINIT(0x3); s32 clear_stencil; - INSERT_UNION_PADDING_WORDS(0x2); + INSERT_PADDING_WORDS_NOINIT(0x2); PolygonMode polygon_mode_front; PolygonMode polygon_mode_back; - INSERT_UNION_PADDING_WORDS(0x3); + INSERT_PADDING_WORDS_NOINIT(0x3); u32 polygon_offset_point_enable; u32 polygon_offset_line_enable; @@ -828,46 +832,53 @@ public: u32 patch_vertices; - INSERT_UNION_PADDING_WORDS(0xC); + INSERT_PADDING_WORDS_NOINIT(0x4); + + u32 fragment_barrier; + + INSERT_PADDING_WORDS_NOINIT(0x7); std::array<ScissorTest, NumViewports> scissor_test; - INSERT_UNION_PADDING_WORDS(0x15); + INSERT_PADDING_WORDS_NOINIT(0x15); s32 stencil_back_func_ref; u32 stencil_back_mask; u32 stencil_back_func_mask; - INSERT_UNION_PADDING_WORDS(0xC); + INSERT_PADDING_WORDS_NOINIT(0x5); + + u32 invalidate_texture_data_cache; + + INSERT_PADDING_WORDS_NOINIT(0x1); + + u32 tiled_cache_barrier; + + INSERT_PADDING_WORDS_NOINIT(0x4); u32 color_mask_common; - INSERT_UNION_PADDING_WORDS(0x2); + INSERT_PADDING_WORDS_NOINIT(0x2); f32 depth_bounds[2]; - INSERT_UNION_PADDING_WORDS(0x2); + INSERT_PADDING_WORDS_NOINIT(0x2); u32 rt_separate_frag_data; - INSERT_UNION_PADDING_WORDS(0x1); + INSERT_PADDING_WORDS_NOINIT(0x1); u32 multisample_raster_enable; u32 multisample_raster_samples; std::array<u32, 4> multisample_sample_mask; - INSERT_UNION_PADDING_WORDS(0x5); + INSERT_PADDING_WORDS_NOINIT(0x5); struct { u32 address_high; u32 address_low; Tegra::DepthFormat format; - union { - BitField<0, 4, u32> block_width; - BitField<4, 4, u32> block_height; - BitField<8, 4, u32> block_depth; - BitField<20, 1, InvMemoryLayout> type; - } memory_layout; + TileMode tile_mode; u32 layer_stride; GPUVAddr Address() const { @@ -876,7 +887,18 @@ public: } } zeta; - INSERT_UNION_PADDING_WORDS(0x41); + struct { + union { + BitField<0, 16, u32> x; + BitField<16, 16, u32> width; + }; + union { + BitField<0, 16, u32> y; + BitField<16, 16, u32> height; + }; + } render_area; + + INSERT_PADDING_WORDS_NOINIT(0x3F); union { BitField<0, 4, u32> stencil; @@ -885,24 +907,24 @@ public: BitField<12, 4, u32> viewport; } clear_flags; - INSERT_UNION_PADDING_WORDS(0x10); + INSERT_PADDING_WORDS_NOINIT(0x10); u32 fill_rectangle; - INSERT_UNION_PADDING_WORDS(0x8); + INSERT_PADDING_WORDS_NOINIT(0x8); std::array<VertexAttribute, NumVertexAttributes> vertex_attrib_format; std::array<MsaaSampleLocation, 4> multisample_sample_locations; - INSERT_UNION_PADDING_WORDS(0x2); + INSERT_PADDING_WORDS_NOINIT(0x2); union { BitField<0, 1, u32> enable; BitField<4, 3, u32> target; } multisample_coverage_to_color; - INSERT_UNION_PADDING_WORDS(0x8); + INSERT_PADDING_WORDS_NOINIT(0x8); struct { union { @@ -917,7 +939,7 @@ public: BitField<25, 3, u32> map_7; }; - u32 GetMap(std::size_t index) const { + u32 Map(std::size_t index) const { const std::array<u32, NumRenderTargets> maps{map_0, map_1, map_2, map_3, map_4, map_5, map_6, map_7}; ASSERT(index < maps.size()); @@ -925,20 +947,22 @@ public: } } rt_control; - INSERT_UNION_PADDING_WORDS(0x2); + INSERT_PADDING_WORDS_NOINIT(0x2); u32 zeta_width; u32 zeta_height; union { - BitField<0, 16, u32> zeta_layers; + BitField<0, 16, u32> zeta_depth; BitField<16, 1, u32> zeta_volume; }; - INSERT_UNION_PADDING_WORDS(0x26); + SamplerIndex sampler_index; + + INSERT_PADDING_WORDS_NOINIT(0x25); u32 depth_test_enable; - INSERT_UNION_PADDING_WORDS(0x5); + INSERT_PADDING_WORDS_NOINIT(0x5); u32 independent_blend_enable; @@ -946,7 +970,7 @@ public: u32 alpha_test_enabled; - INSERT_UNION_PADDING_WORDS(0x6); + INSERT_PADDING_WORDS_NOINIT(0x6); u32 d3d_cull_mode; @@ -960,7 +984,8 @@ public: float b; float a; } blend_color; - INSERT_UNION_PADDING_WORDS(0x4); + + INSERT_PADDING_WORDS_NOINIT(0x4); struct { u32 separate_alpha; @@ -969,7 +994,7 @@ public: Blend::Factor factor_dest_rgb; Blend::Equation equation_a; Blend::Factor factor_source_a; - INSERT_UNION_PADDING_WORDS(1); + INSERT_PADDING_WORDS_NOINIT(1); Blend::Factor factor_dest_a; u32 enable_common; @@ -985,7 +1010,7 @@ public: u32 stencil_front_func_mask; u32 stencil_front_mask; - INSERT_UNION_PADDING_WORDS(0x2); + INSERT_PADDING_WORDS_NOINIT(0x2); u32 frag_color_clamp; @@ -997,12 +1022,17 @@ public: float line_width_smooth; float line_width_aliased; - INSERT_UNION_PADDING_WORDS(0x1F); + INSERT_PADDING_WORDS_NOINIT(0x1B); + + u32 invalidate_sampler_cache_no_wfi; + u32 invalidate_texture_header_cache_no_wfi; + + INSERT_PADDING_WORDS_NOINIT(0x2); u32 vb_element_base; u32 vb_base_instance; - INSERT_UNION_PADDING_WORDS(0x35); + INSERT_PADDING_WORDS_NOINIT(0x35); u32 clip_distance_enabled; @@ -1010,11 +1040,11 @@ public: float point_size; - INSERT_UNION_PADDING_WORDS(0x1); + INSERT_PADDING_WORDS_NOINIT(0x1); u32 point_sprite_enable; - INSERT_UNION_PADDING_WORDS(0x3); + INSERT_PADDING_WORDS_NOINIT(0x3); CounterReset counter_reset; @@ -1027,7 +1057,7 @@ public: BitField<4, 1, u32> alpha_to_one; } multisample_control; - INSERT_UNION_PADDING_WORDS(0x4); + INSERT_PADDING_WORDS_NOINIT(0x4); struct { u32 address_high; @@ -1041,34 +1071,34 @@ public: } condition; struct { - u32 tsc_address_high; - u32 tsc_address_low; - u32 tsc_limit; + u32 address_high; + u32 address_low; + u32 limit; - GPUVAddr TSCAddress() const { - return static_cast<GPUVAddr>( - (static_cast<GPUVAddr>(tsc_address_high) << 32) | tsc_address_low); + GPUVAddr Address() const { + return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | + address_low); } } tsc; - INSERT_UNION_PADDING_WORDS(0x1); + INSERT_PADDING_WORDS_NOINIT(0x1); float polygon_offset_factor; u32 line_smooth_enable; struct { - u32 tic_address_high; - u32 tic_address_low; - u32 tic_limit; + u32 address_high; + u32 address_low; + u32 limit; - GPUVAddr TICAddress() const { - return static_cast<GPUVAddr>( - (static_cast<GPUVAddr>(tic_address_high) << 32) | tic_address_low); + GPUVAddr Address() const { + return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | + address_low); } } tic; - INSERT_UNION_PADDING_WORDS(0x5); + INSERT_PADDING_WORDS_NOINIT(0x5); u32 stencil_two_side_enable; StencilOp stencil_back_op_fail; @@ -1076,17 +1106,17 @@ public: StencilOp stencil_back_op_zpass; ComparisonOp stencil_back_func_func; - INSERT_UNION_PADDING_WORDS(0x4); + INSERT_PADDING_WORDS_NOINIT(0x4); u32 framebuffer_srgb; float polygon_offset_units; - INSERT_UNION_PADDING_WORDS(0x4); + INSERT_PADDING_WORDS_NOINIT(0x4); Tegra::Texture::MsaaMode multisample_mode; - INSERT_UNION_PADDING_WORDS(0xC); + INSERT_PADDING_WORDS_NOINIT(0xC); union { BitField<2, 1, u32> coord_origin; @@ -1102,7 +1132,7 @@ public: (static_cast<GPUVAddr>(code_address_high) << 32) | code_address_low); } } code_address; - INSERT_UNION_PADDING_WORDS(1); + INSERT_PADDING_WORDS_NOINIT(1); struct { u32 vertex_end_gl; @@ -1114,14 +1144,14 @@ public: }; } draw; - INSERT_UNION_PADDING_WORDS(0xA); + INSERT_PADDING_WORDS_NOINIT(0xA); struct { u32 enabled; u32 index; } primitive_restart; - INSERT_UNION_PADDING_WORDS(0x5F); + INSERT_PADDING_WORDS_NOINIT(0x5F); struct { u32 start_addr_high; @@ -1162,9 +1192,9 @@ public: } } index_array; - INSERT_UNION_PADDING_WORDS(0x7); + INSERT_PADDING_WORDS_NOINIT(0x7); - INSERT_UNION_PADDING_WORDS(0x1F); + INSERT_PADDING_WORDS_NOINIT(0x1F); float polygon_offset_clamp; @@ -1178,14 +1208,14 @@ public: } } instanced_arrays; - INSERT_UNION_PADDING_WORDS(0x4); + INSERT_PADDING_WORDS_NOINIT(0x4); union { BitField<0, 1, u32> enable; BitField<4, 8, u32> unk4; } vp_point_size; - INSERT_UNION_PADDING_WORDS(1); + INSERT_PADDING_WORDS_NOINIT(1); u32 cull_test_enabled; FrontFace front_face; @@ -1193,11 +1223,11 @@ public: u32 pixel_center_integer; - INSERT_UNION_PADDING_WORDS(0x1); + INSERT_PADDING_WORDS_NOINIT(0x1); u32 viewport_transform_enabled; - INSERT_UNION_PADDING_WORDS(0x3); + INSERT_PADDING_WORDS_NOINIT(0x3); union { BitField<0, 1, u32> depth_range_0_1; @@ -1206,18 +1236,18 @@ public: BitField<11, 1, u32> depth_clamp_disabled; } view_volume_clip_control; - INSERT_UNION_PADDING_WORDS(0x1F); + INSERT_PADDING_WORDS_NOINIT(0x1F); u32 depth_bounds_enable; - INSERT_UNION_PADDING_WORDS(1); + INSERT_PADDING_WORDS_NOINIT(1); struct { u32 enable; LogicOperation operation; } logic_op; - INSERT_UNION_PADDING_WORDS(0x1); + INSERT_PADDING_WORDS_NOINIT(0x1); union { u32 raw; @@ -1230,9 +1260,9 @@ public: BitField<6, 4, u32> RT; BitField<10, 11, u32> layer; } clear_buffers; - INSERT_UNION_PADDING_WORDS(0xB); + INSERT_PADDING_WORDS_NOINIT(0xB); std::array<ColorMask, NumRenderTargets> color_mask; - INSERT_UNION_PADDING_WORDS(0x38); + INSERT_PADDING_WORDS_NOINIT(0x38); struct { u32 query_address_high; @@ -1254,7 +1284,7 @@ public: } } query; - INSERT_UNION_PADDING_WORDS(0x3C); + INSERT_PADDING_WORDS_NOINIT(0x3C); struct { union { @@ -1284,8 +1314,7 @@ public: GPUVAddr LimitAddress() const { return static_cast<GPUVAddr>((static_cast<GPUVAddr>(limit_high) << 32) | - limit_low) + - 1; + limit_low); } } vertex_array_limit[NumVertexArrays]; @@ -1295,10 +1324,10 @@ public: BitField<4, 4, ShaderProgram> program; }; u32 offset; - INSERT_UNION_PADDING_WORDS(14); + INSERT_PADDING_WORDS_NOINIT(14); } shader_config[MaxShaderProgram]; - INSERT_UNION_PADDING_WORDS(0x60); + INSERT_PADDING_WORDS_NOINIT(0x60); u32 firmware[0x20]; @@ -1307,7 +1336,7 @@ public: u32 cb_address_high; u32 cb_address_low; u32 cb_pos; - u32 cb_data[NumCBData]; + std::array<u32, NumCBData> cb_data; GPUVAddr BufferAddress() const { return static_cast<GPUVAddr>( @@ -1315,7 +1344,7 @@ public: } } const_buffer; - INSERT_UNION_PADDING_WORDS(0x10); + INSERT_PADDING_WORDS_NOINIT(0x10); struct { union { @@ -1323,18 +1352,18 @@ public: BitField<0, 1, u32> valid; BitField<4, 5, u32> index; }; - INSERT_UNION_PADDING_WORDS(7); + INSERT_PADDING_WORDS_NOINIT(7); } cb_bind[MaxShaderStage]; - INSERT_UNION_PADDING_WORDS(0x56); + INSERT_PADDING_WORDS_NOINIT(0x56); u32 tex_cb_index; - INSERT_UNION_PADDING_WORDS(0x7D); + INSERT_PADDING_WORDS_NOINIT(0x7D); std::array<std::array<u8, 128>, NumTransformFeedbackBuffers> tfb_varying_locs; - INSERT_UNION_PADDING_WORDS(0x298); + INSERT_PADDING_WORDS_NOINIT(0x298); struct { /// Compressed address of a buffer that holds information about bound SSBOs. @@ -1346,14 +1375,14 @@ public: } } ssbo_info; - INSERT_UNION_PADDING_WORDS(0x11); + INSERT_PADDING_WORDS_NOINIT(0x11); struct { u32 address[MaxShaderStage]; u32 size[MaxShaderStage]; } tex_info_buffers; - INSERT_UNION_PADDING_WORDS(0xCC); + INSERT_PADDING_WORDS_NOINIT(0xCC); }; std::array<u32, NUM_REGS> reg_array; }; @@ -1373,6 +1402,7 @@ public: }; std::array<ShaderStageInfo, Regs::MaxShaderStage> shader_stages; + u32 current_instance = 0; ///< Current instance to be used to simulate instanced rendering. }; @@ -1393,12 +1423,6 @@ public: void FlushMMEInlineDraw(); - /// Given a texture handle, returns the TSC and TIC entries. - Texture::FullTextureInfo GetTextureInfo(Texture::TextureHandle tex_handle) const; - - /// Returns the texture information for a specific texture in a specific shader stage. - Texture::FullTextureInfo GetStageTexture(ShaderType stage, std::size_t offset) const; - u32 AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offset) const override; SamplerDescriptor AccessBoundSampler(ShaderType stage, u64 offset) const override; @@ -1428,11 +1452,6 @@ public: return *rasterizer; } - /// Notify a memory write has happened. - void OnMemoryWrite() { - dirty.flags |= dirty.on_write_stores; - } - enum class MMEDrawMode : u32 { Undefined, Array, @@ -1454,45 +1473,19 @@ public: using Tables = std::array<Table, 2>; Flags flags; - Flags on_write_stores; Tables tables{}; } dirty; private: void InitializeRegisterDefaults(); - Core::System& system; - MemoryManager& memory_manager; + void ProcessMacro(u32 method, const u32* base_start, u32 amount, bool is_last_call); - VideoCore::RasterizerInterface* rasterizer = nullptr; + u32 ProcessShadowRam(u32 method, u32 argument); - /// Start offsets of each macro in macro_memory - std::array<u32, 0x80> macro_positions = {}; - - std::array<bool, Regs::NUM_REGS> mme_inline{}; - - /// Macro method that is currently being executed / being fed parameters. - u32 executing_macro = 0; - /// Parameters that have been submitted to the macro call so far. - std::vector<u32> macro_params; - - /// Interpreter for the macro codes uploaded to the GPU. - std::unique_ptr<MacroEngine> macro_engine; - - static constexpr u32 null_cb_data = 0xFFFFFFFF; - struct { - std::array<std::array<u32, 0x4000>, 16> buffer; - u32 current{null_cb_data}; - u32 id{null_cb_data}; - u32 start_pos{}; - u32 counter{}; - } cb_data_state; + void ProcessDirtyRegisters(u32 method, u32 argument); - Upload::State upload_state; - - bool execute_on{true}; - - std::array<u8, Regs::NUM_REGS> dirty_pointers{}; + void ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argument, bool is_last_call); /// Retrieves information about a specific TIC entry from the TIC buffer. Texture::TICEntry GetTICEntry(u32 tic_index) const; @@ -1502,8 +1495,8 @@ private: /** * Call a macro on this engine. + * * @param method Method to call - * @param num_parameters Number of arguments * @param parameters Arguments to the method call */ void CallMacroMethod(u32 method, const std::vector<u32>& parameters); @@ -1542,7 +1535,7 @@ private: void FinishCBData(); /// Handles a write to the CB_BIND register. - void ProcessCBBind(std::size_t stage_index); + void ProcessCBBind(size_t stage_index); /// Handles a write to the VERTEX_END_GL register, triggering a draw. void DrawArrays(); @@ -1552,6 +1545,38 @@ private: /// Returns a query's value or an empty object if the value will be deferred through a cache. std::optional<u64> GetQueryResult(); + + Core::System& system; + MemoryManager& memory_manager; + + VideoCore::RasterizerInterface* rasterizer = nullptr; + + /// Start offsets of each macro in macro_memory + std::array<u32, 0x80> macro_positions{}; + + std::array<bool, Regs::NUM_REGS> mme_inline{}; + + /// Macro method that is currently being executed / being fed parameters. + u32 executing_macro = 0; + /// Parameters that have been submitted to the macro call so far. + std::vector<u32> macro_params; + + /// Interpreter for the macro codes uploaded to the GPU. + std::unique_ptr<MacroEngine> macro_engine; + + static constexpr u32 null_cb_data = 0xFFFFFFFF; + struct CBDataState { + std::array<std::array<u32, 0x4000>, 16> buffer; + u32 current{null_cb_data}; + u32 id{null_cb_data}; + u32 start_pos{}; + u32 counter{}; + }; + CBDataState cb_data_state; + + Upload::State upload_state; + + bool execute_on{true}; }; #define ASSERT_REG_POSITION(field_name, position) \ @@ -1564,6 +1589,7 @@ ASSERT_REG_POSITION(shadow_ram_control, 0x49); ASSERT_REG_POSITION(upload, 0x60); ASSERT_REG_POSITION(exec_upload, 0x6C); ASSERT_REG_POSITION(data_upload, 0x6D); +ASSERT_REG_POSITION(force_early_fragment_tests, 0x84); ASSERT_REG_POSITION(sync_info, 0xB2); ASSERT_REG_POSITION(tess_mode, 0xC8); ASSERT_REG_POSITION(tess_level_outer, 0xC9); @@ -1586,10 +1612,13 @@ ASSERT_REG_POSITION(polygon_offset_point_enable, 0x370); ASSERT_REG_POSITION(polygon_offset_line_enable, 0x371); ASSERT_REG_POSITION(polygon_offset_fill_enable, 0x372); ASSERT_REG_POSITION(patch_vertices, 0x373); +ASSERT_REG_POSITION(fragment_barrier, 0x378); ASSERT_REG_POSITION(scissor_test, 0x380); ASSERT_REG_POSITION(stencil_back_func_ref, 0x3D5); ASSERT_REG_POSITION(stencil_back_mask, 0x3D6); ASSERT_REG_POSITION(stencil_back_func_mask, 0x3D7); +ASSERT_REG_POSITION(invalidate_texture_data_cache, 0x3DD); +ASSERT_REG_POSITION(tiled_cache_barrier, 0x3DF); ASSERT_REG_POSITION(color_mask_common, 0x3E4); ASSERT_REG_POSITION(depth_bounds, 0x3E7); ASSERT_REG_POSITION(rt_separate_frag_data, 0x3EB); @@ -1597,6 +1626,7 @@ ASSERT_REG_POSITION(multisample_raster_enable, 0x3ED); ASSERT_REG_POSITION(multisample_raster_samples, 0x3EE); ASSERT_REG_POSITION(multisample_sample_mask, 0x3EF); ASSERT_REG_POSITION(zeta, 0x3F8); +ASSERT_REG_POSITION(render_area, 0x3FD); ASSERT_REG_POSITION(clear_flags, 0x43E); ASSERT_REG_POSITION(fill_rectangle, 0x44F); ASSERT_REG_POSITION(vertex_attrib_format, 0x458); @@ -1605,7 +1635,8 @@ ASSERT_REG_POSITION(multisample_coverage_to_color, 0x47E); ASSERT_REG_POSITION(rt_control, 0x487); ASSERT_REG_POSITION(zeta_width, 0x48a); ASSERT_REG_POSITION(zeta_height, 0x48b); -ASSERT_REG_POSITION(zeta_layers, 0x48c); +ASSERT_REG_POSITION(zeta_depth, 0x48c); +ASSERT_REG_POSITION(sampler_index, 0x48D); ASSERT_REG_POSITION(depth_test_enable, 0x4B3); ASSERT_REG_POSITION(independent_blend_enable, 0x4B9); ASSERT_REG_POSITION(depth_write_enabled, 0x4BA); @@ -1629,6 +1660,8 @@ ASSERT_REG_POSITION(frag_color_clamp, 0x4EA); ASSERT_REG_POSITION(screen_y_control, 0x4EB); ASSERT_REG_POSITION(line_width_smooth, 0x4EC); ASSERT_REG_POSITION(line_width_aliased, 0x4ED); +ASSERT_REG_POSITION(invalidate_sampler_cache_no_wfi, 0x509); +ASSERT_REG_POSITION(invalidate_texture_header_cache_no_wfi, 0x50A); ASSERT_REG_POSITION(vb_element_base, 0x50D); ASSERT_REG_POSITION(vb_base_instance, 0x50E); ASSERT_REG_POSITION(clip_distance_enabled, 0x544); diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index e88290754..a2f19559f 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -16,8 +16,10 @@ namespace Tegra::Engines { using namespace Texture; -MaxwellDMA::MaxwellDMA(Core::System& system, MemoryManager& memory_manager) - : system{system}, memory_manager{memory_manager} {} +MaxwellDMA::MaxwellDMA(Core::System& system_, MemoryManager& memory_manager_) + : system{system_}, memory_manager{memory_manager_} {} + +MaxwellDMA::~MaxwellDMA() = default; void MaxwellDMA::CallMethod(u32 method, u32 method_argument, bool is_last_call) { ASSERT_MSG(method < NUM_REGS, "Invalid MaxwellDMA register"); @@ -58,9 +60,6 @@ void MaxwellDMA::Launch() { return; } - // All copies here update the main memory, so mark all rasterizer states as invalid. - system.GPU().Maxwell3D().OnMemoryWrite(); - if (is_src_pitch && is_dst_pitch) { CopyPitchToPitch(); } else { @@ -94,6 +93,7 @@ void MaxwellDMA::CopyPitchToPitch() { } void MaxwellDMA::CopyBlockLinearToPitch() { + UNIMPLEMENTED_IF(regs.src_params.block_size.width != 0); UNIMPLEMENTED_IF(regs.src_params.block_size.depth != 0); UNIMPLEMENTED_IF(regs.src_params.layer != 0); @@ -114,8 +114,6 @@ void MaxwellDMA::CopyBlockLinearToPitch() { const u32 block_depth = src_params.block_size.depth; const size_t src_size = CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth); - const size_t src_layer_size = - CalculateSize(true, bytes_per_pixel, width, height, 1, block_height, block_depth); if (read_buffer.size() < src_size) { read_buffer.resize(src_size); @@ -135,6 +133,8 @@ void MaxwellDMA::CopyBlockLinearToPitch() { } void MaxwellDMA::CopyPitchToBlockLinear() { + UNIMPLEMENTED_IF_MSG(regs.dst_params.block_size.width != 0, "Block width is not one"); + const auto& dst_params = regs.dst_params; const u32 bytes_per_pixel = regs.pitch_in / regs.line_length_in; const u32 width = dst_params.width; diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h index 50f445efc..3c59eeb13 100644 --- a/src/video_core/engines/maxwell_dma.h +++ b/src/video_core/engines/maxwell_dma.h @@ -72,11 +72,13 @@ public: struct RenderEnable { enum class Mode : u32 { - FALSE = 0, - TRUE = 1, - CONDITIONAL = 2, - RENDER_IF_EQUAL = 3, - RENDER_IF_NOT_EQUAL = 4, + // Note: This uses Pascal case in order to avoid the identifiers + // FALSE and TRUE, which are reserved on Darwin. + False = 0, + True = 1, + Conditional = 2, + RenderIfEqual = 3, + RenderIfNotEqual = 4, }; PackedGPUVAddr address; @@ -185,8 +187,8 @@ public: }; static_assert(sizeof(RemapConst) == 12); - explicit MaxwellDMA(Core::System& system, MemoryManager& memory_manager); - ~MaxwellDMA() = default; + explicit MaxwellDMA(Core::System& system_, MemoryManager& memory_manager_); + ~MaxwellDMA(); /// Write the value to the register identified by method. void CallMethod(u32 method, u32 method_argument, bool is_last_call) override; diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index d374b73cf..8b45f1b62 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h @@ -32,31 +32,31 @@ struct Register { constexpr Register() = default; - constexpr Register(u64 value) : value(value) {} + constexpr Register(u64 value_) : value(value_) {} - constexpr operator u64() const { + [[nodiscard]] constexpr operator u64() const { return value; } template <typename T> - constexpr u64 operator-(const T& oth) const { + [[nodiscard]] constexpr u64 operator-(const T& oth) const { return value - oth; } template <typename T> - constexpr u64 operator&(const T& oth) const { + [[nodiscard]] constexpr u64 operator&(const T& oth) const { return value & oth; } - constexpr u64 operator&(const Register& oth) const { + [[nodiscard]] constexpr u64 operator&(const Register& oth) const { return value & oth.value; } - constexpr u64 operator~() const { + [[nodiscard]] constexpr u64 operator~() const { return ~value; } - u64 GetSwizzledIndex(u64 elem) const { + [[nodiscard]] u64 GetSwizzledIndex(u64 elem) const { elem = (value + elem) & 3; return (value & ~3) + elem; } @@ -75,7 +75,7 @@ enum class AttributeSize : u64 { union Attribute { Attribute() = default; - constexpr explicit Attribute(u64 value) : value(value) {} + constexpr explicit Attribute(u64 value_) : value(value_) {} enum class Index : u64 { LayerViewportPointSize = 6, @@ -107,7 +107,7 @@ union Attribute { BitField<31, 1, u64> patch; BitField<47, 3, AttributeSize> size; - bool IsPhysical() const { + [[nodiscard]] bool IsPhysical() const { return patch == 0 && element == 0 && static_cast<u64>(index.Value()) == 0; } } fmt20; @@ -124,7 +124,7 @@ union Attribute { union Sampler { Sampler() = default; - constexpr explicit Sampler(u64 value) : value(value) {} + constexpr explicit Sampler(u64 value_) : value(value_) {} enum class Index : u64 { Sampler_0 = 8, @@ -137,7 +137,7 @@ union Sampler { union Image { Image() = default; - constexpr explicit Image(u64 value) : value{value} {} + constexpr explicit Image(u64 value_) : value{value_} {} BitField<36, 13, u64> index; u64 value; @@ -505,14 +505,14 @@ struct IpaMode { IpaInterpMode interpolation_mode; IpaSampleMode sampling_mode; - bool operator==(const IpaMode& a) const { + [[nodiscard]] bool operator==(const IpaMode& a) const { return std::tie(interpolation_mode, sampling_mode) == std::tie(a.interpolation_mode, a.sampling_mode); } - bool operator!=(const IpaMode& a) const { + [[nodiscard]] bool operator!=(const IpaMode& a) const { return !operator==(a); } - bool operator<(const IpaMode& a) const { + [[nodiscard]] bool operator<(const IpaMode& a) const { return std::tie(interpolation_mode, sampling_mode) < std::tie(a.interpolation_mode, a.sampling_mode); } @@ -658,10 +658,10 @@ union Instruction { return *this; } - constexpr Instruction(u64 value) : value{value} {} + constexpr Instruction(u64 value_) : value{value_} {} constexpr Instruction(const Instruction& instr) : value(instr.value) {} - constexpr bool Bit(u64 offset) const { + [[nodiscard]] constexpr bool Bit(u64 offset) const { return ((value >> offset) & 1) != 0; } @@ -746,34 +746,34 @@ union Instruction { BitField<28, 8, u64> imm_lut28; BitField<48, 8, u64> imm_lut48; - u32 GetImmLut28() const { + [[nodiscard]] u32 GetImmLut28() const { return static_cast<u32>(imm_lut28); } - u32 GetImmLut48() const { + [[nodiscard]] u32 GetImmLut48() const { return static_cast<u32>(imm_lut48); } } lop3; - u16 GetImm20_16() const { + [[nodiscard]] u16 GetImm20_16() const { return static_cast<u16>(imm20_16); } - u32 GetImm20_19() const { + [[nodiscard]] u32 GetImm20_19() const { u32 imm{static_cast<u32>(imm20_19)}; imm <<= 12; imm |= negate_imm ? 0x80000000 : 0; return imm; } - u32 GetImm20_32() const { + [[nodiscard]] u32 GetImm20_32() const { return static_cast<u32>(imm20_32); } - s32 GetSignedImm20_20() const { - u32 immediate = static_cast<u32>(imm20_19 | (negate_imm << 19)); + [[nodiscard]] s32 GetSignedImm20_20() const { + const auto immediate = static_cast<u32>(imm20_19 | (negate_imm << 19)); // Sign extend the 20-bit value. - u32 mask = 1U << (20 - 1); + const auto mask = 1U << (20 - 1); return static_cast<s32>((immediate ^ mask) - mask); } } alu; @@ -857,7 +857,7 @@ union Instruction { BitField<56, 1, u64> second_negate; BitField<30, 9, u64> second; - u32 PackImmediates() const { + [[nodiscard]] u32 PackImmediates() const { // Immediates are half floats shifted. constexpr u32 imm_shift = 6; return static_cast<u32>((first << imm_shift) | (second << (16 + imm_shift))); @@ -1033,7 +1033,7 @@ union Instruction { BitField<28, 2, AtomicType> type; BitField<30, 22, s64> offset; - s32 GetImmediateOffset() const { + [[nodiscard]] s32 GetImmediateOffset() const { return static_cast<s32>(offset << 2); } } atoms; @@ -1215,7 +1215,7 @@ union Instruction { BitField<39, 4, u64> rounding; // H0, H1 extract for F16 missing BitField<41, 1, u64> selector; // Guessed as some games set it, TODO: reverse this value - F2fRoundingOp GetRoundingMode() const { + [[nodiscard]] F2fRoundingOp GetRoundingMode() const { constexpr u64 rounding_mask = 0x0B; return static_cast<F2fRoundingOp>(rounding.Value() & rounding_mask); } @@ -1239,15 +1239,15 @@ union Instruction { BitField<54, 1, u64> aoffi_flag; BitField<55, 3, TextureProcessMode> process_mode; - bool IsComponentEnabled(std::size_t component) const { - return ((1ull << component) & component_mask) != 0; + [[nodiscard]] bool IsComponentEnabled(std::size_t component) const { + return ((1ULL << component) & component_mask) != 0; } - TextureProcessMode GetTextureProcessMode() const { + [[nodiscard]] TextureProcessMode GetTextureProcessMode() const { return process_mode; } - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::DC: return dc_flag != 0; @@ -1271,15 +1271,15 @@ union Instruction { BitField<36, 1, u64> aoffi_flag; BitField<37, 3, TextureProcessMode> process_mode; - bool IsComponentEnabled(std::size_t component) const { + [[nodiscard]] bool IsComponentEnabled(std::size_t component) const { return ((1ULL << component) & component_mask) != 0; } - TextureProcessMode GetTextureProcessMode() const { + [[nodiscard]] TextureProcessMode GetTextureProcessMode() const { return process_mode; } - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::DC: return dc_flag != 0; @@ -1299,7 +1299,7 @@ union Instruction { BitField<31, 4, u64> component_mask; BitField<49, 1, u64> nodep_flag; - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::NODEP: return nodep_flag != 0; @@ -1309,7 +1309,7 @@ union Instruction { return false; } - bool IsComponentEnabled(std::size_t component) const { + [[nodiscard]] bool IsComponentEnabled(std::size_t component) const { return ((1ULL << component) & component_mask) != 0; } } txq; @@ -1321,11 +1321,11 @@ union Instruction { BitField<35, 1, u64> ndv_flag; BitField<49, 1, u64> nodep_flag; - bool IsComponentEnabled(std::size_t component) const { - return ((1ull << component) & component_mask) != 0; + [[nodiscard]] bool IsComponentEnabled(std::size_t component) const { + return ((1ULL << component) & component_mask) != 0; } - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::NDV: return (ndv_flag != 0); @@ -1347,7 +1347,7 @@ union Instruction { BitField<54, 2, u64> offset_mode; BitField<56, 2, u64> component; - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::NDV: return ndv_flag != 0; @@ -1373,7 +1373,7 @@ union Instruction { BitField<33, 2, u64> offset_mode; BitField<37, 2, u64> component; - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::NDV: return ndv_flag != 0; @@ -1399,7 +1399,7 @@ union Instruction { BitField<52, 2, u64> component; BitField<55, 1, u64> fp16_flag; - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::DC: return dc_flag != 0; @@ -1422,24 +1422,27 @@ union Instruction { BitField<53, 4, u64> texture_info; BitField<59, 1, u64> fp32_flag; - TextureType GetTextureType() const { + [[nodiscard]] TextureType GetTextureType() const { // The TEXS instruction has a weird encoding for the texture type. - if (texture_info == 0) + if (texture_info == 0) { return TextureType::Texture1D; - if (texture_info >= 1 && texture_info <= 9) + } + if (texture_info >= 1 && texture_info <= 9) { return TextureType::Texture2D; - if (texture_info >= 10 && texture_info <= 11) + } + if (texture_info >= 10 && texture_info <= 11) { return TextureType::Texture3D; - if (texture_info >= 12 && texture_info <= 13) + } + if (texture_info >= 12 && texture_info <= 13) { return TextureType::TextureCube; + } - LOG_CRITICAL(HW_GPU, "Unhandled texture_info: {}", - static_cast<u32>(texture_info.Value())); + LOG_CRITICAL(HW_GPU, "Unhandled texture_info: {}", texture_info.Value()); UNREACHABLE(); return TextureType::Texture1D; } - TextureProcessMode GetTextureProcessMode() const { + [[nodiscard]] TextureProcessMode GetTextureProcessMode() const { switch (texture_info) { case 0: case 2: @@ -1458,7 +1461,7 @@ union Instruction { return TextureProcessMode::None; } - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::DC: return (texture_info >= 4 && texture_info <= 6) || texture_info == 9; @@ -1470,16 +1473,16 @@ union Instruction { return false; } - bool IsArrayTexture() const { + [[nodiscard]] bool IsArrayTexture() const { // TEXS only supports Texture2D arrays. return texture_info >= 7 && texture_info <= 9; } - bool HasTwoDestinations() const { + [[nodiscard]] bool HasTwoDestinations() const { return gpr28.Value() != Register::ZeroIndex; } - bool IsComponentEnabled(std::size_t component) const { + [[nodiscard]] bool IsComponentEnabled(std::size_t component) const { static constexpr std::array<std::array<u32, 8>, 4> mask_lut{{ {}, {0x1, 0x2, 0x4, 0x8, 0x3, 0x9, 0xa, 0xc}, @@ -1506,7 +1509,7 @@ union Instruction { BitField<54, 1, u64> cl; BitField<55, 1, u64> process_mode; - TextureProcessMode GetTextureProcessMode() const { + [[nodiscard]] TextureProcessMode GetTextureProcessMode() const { return process_mode == 0 ? TextureProcessMode::LZ : TextureProcessMode::LL; } } tld; @@ -1516,7 +1519,7 @@ union Instruction { BitField<53, 4, u64> texture_info; BitField<59, 1, u64> fp32_flag; - TextureType GetTextureType() const { + [[nodiscard]] TextureType GetTextureType() const { // The TLDS instruction has a weird encoding for the texture type. if (texture_info <= 1) { return TextureType::Texture1D; @@ -1529,19 +1532,19 @@ union Instruction { return TextureType::Texture3D; } - LOG_CRITICAL(HW_GPU, "Unhandled texture_info: {}", - static_cast<u32>(texture_info.Value())); + LOG_CRITICAL(HW_GPU, "Unhandled texture_info: {}", texture_info.Value()); UNREACHABLE(); return TextureType::Texture1D; } - TextureProcessMode GetTextureProcessMode() const { - if (texture_info == 1 || texture_info == 5 || texture_info == 12) + [[nodiscard]] TextureProcessMode GetTextureProcessMode() const { + if (texture_info == 1 || texture_info == 5 || texture_info == 12) { return TextureProcessMode::LL; + } return TextureProcessMode::LZ; } - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::AOFFI: return texture_info == 12 || texture_info == 4; @@ -1555,7 +1558,7 @@ union Instruction { return false; } - bool IsArrayTexture() const { + [[nodiscard]] bool IsArrayTexture() const { // TEXS only supports Texture2D arrays. return texture_info == 8; } @@ -1567,7 +1570,7 @@ union Instruction { BitField<35, 1, u64> aoffi_flag; BitField<49, 1, u64> nodep_flag; - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::AOFFI: return aoffi_flag != 0; @@ -1591,7 +1594,7 @@ union Instruction { BitField<20, 3, StoreType> store_data_layout; BitField<20, 4, u64> component_mask_selector; - bool IsComponentEnabled(std::size_t component) const { + [[nodiscard]] bool IsComponentEnabled(std::size_t component) const { ASSERT(mode == SurfaceDataMode::P); constexpr u8 R = 0b0001; constexpr u8 G = 0b0010; @@ -1604,7 +1607,7 @@ union Instruction { return std::bitset<4>{mask.at(component_mask_selector)}.test(component); } - StoreType GetStoreDataLayout() const { + [[nodiscard]] StoreType GetStoreDataLayout() const { ASSERT(mode == SurfaceDataMode::D_BA); return store_data_layout; } @@ -1622,14 +1625,15 @@ union Instruction { BitField<20, 24, u64> target; BitField<5, 1, u64> constant_buffer; - s32 GetBranchTarget() const { + [[nodiscard]] s32 GetBranchTarget() const { // Sign extend the branch target offset - u32 mask = 1U << (24 - 1); - u32 value = static_cast<u32>(target); + const auto mask = 1U << (24 - 1); + const auto target_value = static_cast<u32>(target); + constexpr auto instruction_size = static_cast<s32>(sizeof(Instruction)); + // The branch offset is relative to the next instruction and is stored in bytes, so // divide it by the size of an instruction and add 1 to it. - return static_cast<s32>((value ^ mask) - mask) / static_cast<s32>(sizeof(Instruction)) + - 1; + return static_cast<s32>((target_value ^ mask) - mask) / instruction_size + 1; } } bra; @@ -1637,14 +1641,15 @@ union Instruction { BitField<20, 24, u64> target; BitField<5, 1, u64> constant_buffer; - s32 GetBranchExtend() const { + [[nodiscard]] s32 GetBranchExtend() const { // Sign extend the branch target offset - u32 mask = 1U << (24 - 1); - u32 value = static_cast<u32>(target); + const auto mask = 1U << (24 - 1); + const auto target_value = static_cast<u32>(target); + constexpr auto instruction_size = static_cast<s32>(sizeof(Instruction)); + // The branch offset is relative to the next instruction and is stored in bytes, so // divide it by the size of an instruction and add 1 to it. - return static_cast<s32>((value ^ mask) - mask) / static_cast<s32>(sizeof(Instruction)) + - 1; + return static_cast<s32>((target_value ^ mask) - mask) / instruction_size + 1; } } brx; @@ -1697,7 +1702,7 @@ union Instruction { BitField<50, 1, u64> is_op_b_register; BitField<51, 3, VmnmxOperation> operation; - VmnmxType SourceFormatA() const { + [[nodiscard]] VmnmxType SourceFormatA() const { switch (src_format_a) { case 0b11: return VmnmxType::Bits32; @@ -1708,7 +1713,7 @@ union Instruction { } } - VmnmxType SourceFormatB() const { + [[nodiscard]] VmnmxType SourceFormatB() const { switch (src_format_b) { case 0b11: return VmnmxType::Bits32; @@ -1739,7 +1744,7 @@ union Instruction { BitField<20, 14, u64> shifted_offset; BitField<34, 5, u64> index; - u64 GetOffset() const { + [[nodiscard]] u64 GetOffset() const { return shifted_offset * 4; } } cbuf34; @@ -1748,7 +1753,7 @@ union Instruction { BitField<20, 16, s64> offset; BitField<36, 5, u64> index; - s64 GetOffset() const { + [[nodiscard]] s64 GetOffset() const { return offset; } } cbuf36; @@ -1893,6 +1898,7 @@ public: ICMP_IMM, FCMP_RR, FCMP_RC, + FCMP_IMMR, MUFU, // Multi-Function Operator RRO_C, // Range Reduction Operator RRO_R, @@ -1996,29 +2002,29 @@ public: /// Returns whether an opcode has an execution predicate field or not (ie, whether it can be /// conditionally executed). - static bool IsPredicatedInstruction(Id opcode) { + [[nodiscard]] static bool IsPredicatedInstruction(Id opcode) { // TODO(Subv): Add the rest of unpredicated instructions. return opcode != Id::SSY && opcode != Id::PBK; } class Matcher { public: - constexpr Matcher(const char* const name, u16 mask, u16 expected, Id id, Type type) - : name{name}, mask{mask}, expected{expected}, id{id}, type{type} {} + constexpr Matcher(const char* const name_, u16 mask_, u16 expected_, Id id_, Type type_) + : name{name_}, mask{mask_}, expected{expected_}, id{id_}, type{type_} {} - constexpr const char* GetName() const { + [[nodiscard]] constexpr const char* GetName() const { return name; } - constexpr u16 GetMask() const { + [[nodiscard]] constexpr u16 GetMask() const { return mask; } - constexpr Id GetId() const { + [[nodiscard]] constexpr Id GetId() const { return id; } - constexpr Type GetType() const { + [[nodiscard]] constexpr Type GetType() const { return type; } @@ -2027,7 +2033,7 @@ public: * @param instruction The instruction to test * @returns true if the given instruction matches. */ - constexpr bool Matches(u16 instruction) const { + [[nodiscard]] constexpr bool Matches(u16 instruction) const { return (instruction & mask) == expected; } @@ -2039,7 +2045,8 @@ public: Type type; }; - static std::optional<std::reference_wrapper<const Matcher>> Decode(Instruction instr) { + using DecodeResult = std::optional<std::reference_wrapper<const Matcher>>; + [[nodiscard]] static DecodeResult Decode(Instruction instr) { static const auto table{GetDecodeTable()}; const auto matches_instruction = [instr](const auto& matcher) { @@ -2061,7 +2068,7 @@ private: * A '0' in a bitstring indicates that a zero must be present at that bit position. * A '1' in a bitstring indicates that a one must be present at that bit position. */ - static constexpr auto GetMaskAndExpect(const char* const bitstring) { + [[nodiscard]] static constexpr auto GetMaskAndExpect(const char* const bitstring) { u16 mask = 0, expect = 0; for (std::size_t i = 0; i < opcode_bitsize; i++) { const std::size_t bit_position = opcode_bitsize - i - 1; @@ -2083,14 +2090,14 @@ private: public: /// Creates a matcher that can match and parse instructions based on bitstring. - static constexpr auto GetMatcher(const char* const bitstring, Id op, Type type, - const char* const name) { + [[nodiscard]] static constexpr auto GetMatcher(const char* const bitstring, Id op, + Type type, const char* const name) { const auto [mask, expected] = GetMaskAndExpect(bitstring); return Matcher(name, mask, expected, op, type); } }; - static std::vector<Matcher> GetDecodeTable() { + [[nodiscard]] static std::vector<Matcher> GetDecodeTable() { std::vector<Matcher> table = { #define INST(bitstring, op, type, name) Detail::GetMatcher(bitstring, op, type, name) INST("111000110011----", Id::KIL, Type::Flow, "KIL"), @@ -2205,6 +2212,7 @@ private: INST("0111110-0-------", Id::HSET2_IMM, Type::HalfSet, "HSET2_IMM"), INST("010110111010----", Id::FCMP_RR, Type::Arithmetic, "FCMP_RR"), INST("010010111010----", Id::FCMP_RC, Type::Arithmetic, "FCMP_RC"), + INST("0011011-1010----", Id::FCMP_IMMR, Type::Arithmetic, "FCMP_IMMR"), INST("0101000010000---", Id::MUFU, Type::Arithmetic, "MUFU"), INST("0100110010010---", Id::RRO_C, Type::Arithmetic, "RRO_C"), INST("0101110010010---", Id::RRO_R, Type::Arithmetic, "RRO_R"), diff --git a/src/video_core/engines/shader_header.h b/src/video_core/engines/shader_header.h index 72e2a33d5..e0d7b89c5 100644 --- a/src/video_core/engines/shader_header.h +++ b/src/video_core/engines/shader_header.h @@ -41,37 +41,37 @@ struct Header { BitField<26, 1, u32> does_load_or_store; BitField<27, 1, u32> does_fp64; BitField<28, 4, u32> stream_out_mask; - } common0{}; + } common0; union { BitField<0, 24, u32> shader_local_memory_low_size; BitField<24, 8, u32> per_patch_attribute_count; - } common1{}; + } common1; union { BitField<0, 24, u32> shader_local_memory_high_size; BitField<24, 8, u32> threads_per_input_primitive; - } common2{}; + } common2; union { BitField<0, 24, u32> shader_local_memory_crs_size; BitField<24, 4, OutputTopology> output_topology; BitField<28, 4, u32> reserved; - } common3{}; + } common3; union { BitField<0, 12, u32> max_output_vertices; BitField<12, 8, u32> store_req_start; // NOTE: not used by geometry shaders. BitField<20, 4, u32> reserved; BitField<24, 8, u32> store_req_end; // NOTE: not used by geometry shaders. - } common4{}; + } common4; union { struct { - INSERT_UNION_PADDING_BYTES(3); // ImapSystemValuesA - INSERT_UNION_PADDING_BYTES(1); // ImapSystemValuesB - INSERT_UNION_PADDING_BYTES(16); // ImapGenericVector[32] - INSERT_UNION_PADDING_BYTES(2); // ImapColor + INSERT_PADDING_BYTES_NOINIT(3); // ImapSystemValuesA + INSERT_PADDING_BYTES_NOINIT(1); // ImapSystemValuesB + INSERT_PADDING_BYTES_NOINIT(16); // ImapGenericVector[32] + INSERT_PADDING_BYTES_NOINIT(2); // ImapColor union { BitField<0, 8, u16> clip_distances; BitField<8, 1, u16> point_sprite_s; @@ -82,20 +82,20 @@ struct Header { BitField<14, 1, u16> instance_id; BitField<15, 1, u16> vertex_id; }; - INSERT_UNION_PADDING_BYTES(5); // ImapFixedFncTexture[10] - INSERT_UNION_PADDING_BYTES(1); // ImapReserved - INSERT_UNION_PADDING_BYTES(3); // OmapSystemValuesA - INSERT_UNION_PADDING_BYTES(1); // OmapSystemValuesB - INSERT_UNION_PADDING_BYTES(16); // OmapGenericVector[32] - INSERT_UNION_PADDING_BYTES(2); // OmapColor - INSERT_UNION_PADDING_BYTES(2); // OmapSystemValuesC - INSERT_UNION_PADDING_BYTES(5); // OmapFixedFncTexture[10] - INSERT_UNION_PADDING_BYTES(1); // OmapReserved + INSERT_PADDING_BYTES_NOINIT(5); // ImapFixedFncTexture[10] + INSERT_PADDING_BYTES_NOINIT(1); // ImapReserved + INSERT_PADDING_BYTES_NOINIT(3); // OmapSystemValuesA + INSERT_PADDING_BYTES_NOINIT(1); // OmapSystemValuesB + INSERT_PADDING_BYTES_NOINIT(16); // OmapGenericVector[32] + INSERT_PADDING_BYTES_NOINIT(2); // OmapColor + INSERT_PADDING_BYTES_NOINIT(2); // OmapSystemValuesC + INSERT_PADDING_BYTES_NOINIT(5); // OmapFixedFncTexture[10] + INSERT_PADDING_BYTES_NOINIT(1); // OmapReserved } vtg; struct { - INSERT_UNION_PADDING_BYTES(3); // ImapSystemValuesA - INSERT_UNION_PADDING_BYTES(1); // ImapSystemValuesB + INSERT_PADDING_BYTES_NOINIT(3); // ImapSystemValuesA + INSERT_PADDING_BYTES_NOINIT(1); // ImapSystemValuesB union { BitField<0, 2, PixelImap> x; @@ -105,10 +105,10 @@ struct Header { u8 raw; } imap_generic_vector[32]; - INSERT_UNION_PADDING_BYTES(2); // ImapColor - INSERT_UNION_PADDING_BYTES(2); // ImapSystemValuesC - INSERT_UNION_PADDING_BYTES(10); // ImapFixedFncTexture[10] - INSERT_UNION_PADDING_BYTES(2); // ImapReserved + INSERT_PADDING_BYTES_NOINIT(2); // ImapColor + INSERT_PADDING_BYTES_NOINIT(2); // ImapSystemValuesC + INSERT_PADDING_BYTES_NOINIT(10); // ImapFixedFncTexture[10] + INSERT_PADDING_BYTES_NOINIT(2); // ImapReserved struct { u32 target; @@ -145,7 +145,7 @@ struct Header { } } ps; - std::array<u32, 0xF> raw{}; + std::array<u32, 0xF> raw; }; u64 GetLocalMemorySize() const { @@ -153,7 +153,6 @@ struct Header { (common2.shader_local_memory_high_size << 24)); } }; - static_assert(sizeof(Header) == 0x50, "Incorrect structure size"); } // namespace Tegra::Shader diff --git a/src/video_core/fence_manager.h b/src/video_core/fence_manager.h index 06cc12d5a..f055b61e9 100644 --- a/src/video_core/fence_manager.h +++ b/src/video_core/fence_manager.h @@ -9,6 +9,7 @@ #include "common/common_types.h" #include "core/core.h" +#include "video_core/delayed_destruction_ring.h" #include "video_core/gpu.h" #include "video_core/memory_manager.h" #include "video_core/rasterizer_interface.h" @@ -17,11 +18,11 @@ namespace VideoCommon { class FenceBase { public: - FenceBase(u32 payload, bool is_stubbed) - : address{}, payload{payload}, is_semaphore{false}, is_stubbed{is_stubbed} {} + explicit FenceBase(u32 payload_, bool is_stubbed_) + : address{}, payload{payload_}, is_semaphore{false}, is_stubbed{is_stubbed_} {} - FenceBase(GPUVAddr address, u32 payload, bool is_stubbed) - : address{address}, payload{payload}, is_semaphore{true}, is_stubbed{is_stubbed} {} + explicit FenceBase(GPUVAddr address_, u32 payload_, bool is_stubbed_) + : address{address_}, payload{payload_}, is_semaphore{true}, is_stubbed{is_stubbed_} {} GPUVAddr GetAddress() const { return address; @@ -47,6 +48,11 @@ protected: template <typename TFence, typename TTextureCache, typename TTBufferCache, typename TQueryCache> class FenceManager { public: + /// Notify the fence manager about a new frame + void TickFrame() { + delayed_destruction_ring.Tick(); + } + void SignalSemaphore(GPUVAddr addr, u32 value) { TryReleasePendingFences(); const bool should_flush = ShouldFlush(); @@ -74,8 +80,6 @@ public: } void WaitPendingFences() { - auto& gpu{system.GPU()}; - auto& memory_manager{gpu.MemoryManager()}; while (!fences.empty()) { TFence& current_fence = fences.front(); if (ShouldWait()) { @@ -83,23 +87,23 @@ public: } PopAsyncFlushes(); if (current_fence->IsSemaphore()) { - memory_manager.template Write<u32>(current_fence->GetAddress(), - current_fence->GetPayload()); + gpu_memory.template Write<u32>(current_fence->GetAddress(), + current_fence->GetPayload()); } else { gpu.IncrementSyncPoint(current_fence->GetPayload()); } - fences.pop(); + PopFence(); } } protected: - FenceManager(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - TTextureCache& texture_cache, TTBufferCache& buffer_cache, - TQueryCache& query_cache) - : system{system}, rasterizer{rasterizer}, texture_cache{texture_cache}, - buffer_cache{buffer_cache}, query_cache{query_cache} {} + explicit FenceManager(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_, + TTextureCache& texture_cache_, TTBufferCache& buffer_cache_, + TQueryCache& query_cache_) + : rasterizer{rasterizer_}, gpu{gpu_}, gpu_memory{gpu.MemoryManager()}, + texture_cache{texture_cache_}, buffer_cache{buffer_cache_}, query_cache{query_cache_} {} - virtual ~FenceManager() {} + virtual ~FenceManager() = default; /// Creates a Sync Point Fence Interface, does not create a backend fence if 'is_stubbed' is /// true @@ -113,16 +117,15 @@ protected: /// Waits until a fence has been signalled by the host GPU. virtual void WaitFence(TFence& fence) = 0; - Core::System& system; VideoCore::RasterizerInterface& rasterizer; + Tegra::GPU& gpu; + Tegra::MemoryManager& gpu_memory; TTextureCache& texture_cache; TTBufferCache& buffer_cache; TQueryCache& query_cache; private: void TryReleasePendingFences() { - auto& gpu{system.GPU()}; - auto& memory_manager{gpu.MemoryManager()}; while (!fences.empty()) { TFence& current_fence = fences.front(); if (ShouldWait() && !IsFenceSignaled(current_fence)) { @@ -130,38 +133,49 @@ private: } PopAsyncFlushes(); if (current_fence->IsSemaphore()) { - memory_manager.template Write<u32>(current_fence->GetAddress(), - current_fence->GetPayload()); + gpu_memory.template Write<u32>(current_fence->GetAddress(), + current_fence->GetPayload()); } else { gpu.IncrementSyncPoint(current_fence->GetPayload()); } - fences.pop(); + PopFence(); } } bool ShouldWait() const { + std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; return texture_cache.ShouldWaitAsyncFlushes() || buffer_cache.ShouldWaitAsyncFlushes() || query_cache.ShouldWaitAsyncFlushes(); } bool ShouldFlush() const { + std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; return texture_cache.HasUncommittedFlushes() || buffer_cache.HasUncommittedFlushes() || query_cache.HasUncommittedFlushes(); } void PopAsyncFlushes() { + std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; texture_cache.PopAsyncFlushes(); buffer_cache.PopAsyncFlushes(); query_cache.PopAsyncFlushes(); } void CommitAsyncFlushes() { + std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; texture_cache.CommitAsyncFlushes(); buffer_cache.CommitAsyncFlushes(); query_cache.CommitAsyncFlushes(); } + void PopFence() { + delayed_destruction_ring.Push(std::move(fences.front())); + fences.pop(); + } + std::queue<TFence> fences; + + DelayedDestructionRing<TFence, 6> delayed_destruction_ring; }; } // namespace VideoCommon diff --git a/src/video_core/framebuffer_config.h b/src/video_core/framebuffer_config.h new file mode 100644 index 000000000..b86c3a757 --- /dev/null +++ b/src/video_core/framebuffer_config.h @@ -0,0 +1,31 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +namespace Tegra { + +/** + * Struct describing framebuffer configuration + */ +struct FramebufferConfig { + enum class PixelFormat : u32 { + A8B8G8R8_UNORM = 1, + RGB565_UNORM = 4, + B8G8R8A8_UNORM = 5, + }; + + VAddr address{}; + u32 offset{}; + u32 width{}; + u32 height{}; + u32 stride{}; + PixelFormat pixel_format{}; + + using TransformFlags = Service::NVFlinger::BufferQueue::BufferTransformFlags; + TransformFlags transform_flags{}; + Common::Rectangle<int> crop_rect; +}; + +} // namespace Tegra diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index acb6e6d46..2a9bd4121 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -10,6 +10,7 @@ #include "core/core_timing.h" #include "core/core_timing_util.h" #include "core/frontend/emu_window.h" +#include "core/hardware_interrupt_manager.h" #include "core/memory.h" #include "core/settings.h" #include "video_core/engines/fermi_2d.h" @@ -27,22 +28,24 @@ namespace Tegra { MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192)); -GPU::GPU(Core::System& system_, bool is_async_) - : system{system_}, dma_pusher{std::make_unique<Tegra::DmaPusher>(system, *this)}, - memory_manager{std::make_unique<Tegra::MemoryManager>(system)}, +GPU::GPU(Core::System& system_, bool is_async_, bool use_nvdec_) + : system{system_}, memory_manager{std::make_unique<Tegra::MemoryManager>(system)}, + dma_pusher{std::make_unique<Tegra::DmaPusher>(system, *this)}, + cdma_pusher{std::make_unique<Tegra::CDmaPusher>(*this)}, use_nvdec{use_nvdec_}, maxwell_3d{std::make_unique<Engines::Maxwell3D>(system, *memory_manager)}, fermi_2d{std::make_unique<Engines::Fermi2D>()}, kepler_compute{std::make_unique<Engines::KeplerCompute>(system, *memory_manager)}, maxwell_dma{std::make_unique<Engines::MaxwellDMA>(system, *memory_manager)}, kepler_memory{std::make_unique<Engines::KeplerMemory>(system, *memory_manager)}, - shader_notify{std::make_unique<VideoCore::ShaderNotify>()}, is_async{is_async_} {} + shader_notify{std::make_unique<VideoCore::ShaderNotify>()}, is_async{is_async_}, + gpu_thread{system_, is_async_} {} GPU::~GPU() = default; void GPU::BindRenderer(std::unique_ptr<VideoCore::RendererBase> renderer_) { renderer = std::move(renderer_); + rasterizer = renderer->ReadRasterizer(); - VideoCore::RasterizerInterface& rasterizer = renderer->Rasterizer(); memory_manager->BindRasterizer(rasterizer); maxwell_3d->BindRasterizer(rasterizer); fermi_2d->BindRasterizer(rasterizer); @@ -77,31 +80,46 @@ DmaPusher& GPU::DmaPusher() { return *dma_pusher; } +Tegra::CDmaPusher& GPU::CDmaPusher() { + return *cdma_pusher; +} + const DmaPusher& GPU::DmaPusher() const { return *dma_pusher; } +const Tegra::CDmaPusher& GPU::CDmaPusher() const { + return *cdma_pusher; +} + void GPU::WaitFence(u32 syncpoint_id, u32 value) { // Synced GPU, is always in sync if (!is_async) { return; } + if (syncpoint_id == UINT32_MAX) { + // TODO: Research what this does. + LOG_ERROR(HW_GPU, "Waiting for syncpoint -1 not implemented"); + return; + } MICROPROFILE_SCOPE(GPU_wait); std::unique_lock lock{sync_mutex}; - sync_cv.wait(lock, [=, this] { return syncpoints[syncpoint_id].load() >= value; }); + sync_cv.wait(lock, [=, this] { return syncpoints.at(syncpoint_id).load() >= value; }); } void GPU::IncrementSyncPoint(const u32 syncpoint_id) { - syncpoints[syncpoint_id]++; + auto& syncpoint = syncpoints.at(syncpoint_id); + syncpoint++; std::lock_guard lock{sync_mutex}; sync_cv.notify_all(); - if (!syncpt_interrupts[syncpoint_id].empty()) { - u32 value = syncpoints[syncpoint_id].load(); - auto it = syncpt_interrupts[syncpoint_id].begin(); - while (it != syncpt_interrupts[syncpoint_id].end()) { + auto& interrupt = syncpt_interrupts.at(syncpoint_id); + if (!interrupt.empty()) { + u32 value = syncpoint.load(); + auto it = interrupt.begin(); + while (it != interrupt.end()) { if (value >= *it) { TriggerCpuInterrupt(syncpoint_id, *it); - it = syncpt_interrupts[syncpoint_id].erase(it); + it = interrupt.erase(it); continue; } it++; @@ -110,22 +128,22 @@ void GPU::IncrementSyncPoint(const u32 syncpoint_id) { } u32 GPU::GetSyncpointValue(const u32 syncpoint_id) const { - return syncpoints[syncpoint_id].load(); + return syncpoints.at(syncpoint_id).load(); } void GPU::RegisterSyncptInterrupt(const u32 syncpoint_id, const u32 value) { - auto& interrupt = syncpt_interrupts[syncpoint_id]; + auto& interrupt = syncpt_interrupts.at(syncpoint_id); bool contains = std::any_of(interrupt.begin(), interrupt.end(), [value](u32 in_value) { return in_value == value; }); if (contains) { return; } - syncpt_interrupts[syncpoint_id].emplace_back(value); + interrupt.emplace_back(value); } bool GPU::CancelSyncptInterrupt(const u32 syncpoint_id, const u32 value) { std::lock_guard lock{sync_mutex}; - auto& interrupt = syncpt_interrupts[syncpoint_id]; + auto& interrupt = syncpt_interrupts.at(syncpoint_id); const auto iter = std::find_if(interrupt.begin(), interrupt.end(), [value](u32 interrupt_value) { return value == interrupt_value; }); @@ -153,7 +171,7 @@ void GPU::TickWork() { const std::size_t size = request.size; flush_requests.pop_front(); flush_request_mutex.unlock(); - renderer->Rasterizer().FlushRegion(addr, size); + rasterizer->FlushRegion(addr, size); current_flush_fence.store(fence); flush_request_mutex.lock(); } @@ -175,41 +193,13 @@ u64 GPU::GetTicks() const { } void GPU::FlushCommands() { - renderer->Rasterizer().FlushCommands(); + rasterizer->FlushCommands(); } void GPU::SyncGuestHost() { - renderer->Rasterizer().SyncGuestHost(); + rasterizer->SyncGuestHost(); } -void GPU::OnCommandListEnd() { - renderer->Rasterizer().ReleaseFences(); -} -// Note that, traditionally, methods are treated as 4-byte addressable locations, and hence -// their numbers are written down multiplied by 4 in Docs. Here we are not multiply by 4. -// So the values you see in docs might be multiplied by 4. -enum class BufferMethods { - BindObject = 0x0, - Nop = 0x2, - SemaphoreAddressHigh = 0x4, - SemaphoreAddressLow = 0x5, - SemaphoreSequence = 0x6, - SemaphoreTrigger = 0x7, - NotifyIntr = 0x8, - WrcacheFlush = 0x9, - Unk28 = 0xA, - UnkCacheFlush = 0xB, - RefCnt = 0x14, - SemaphoreAcquire = 0x1A, - SemaphoreRelease = 0x1B, - FenceValue = 0x1C, - FenceAction = 0x1D, - Unk78 = 0x1E, - Unk7c = 0x1F, - Yield = 0x20, - NonPullerMethods = 0x40, -}; - enum class GpuSemaphoreOperation { AcquireEqual = 0x1, WriteLong = 0x2, @@ -240,8 +230,12 @@ void GPU::CallMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 CallEngineMultiMethod(method, subchannel, base_start, amount, methods_pending); } else { for (std::size_t i = 0; i < amount; i++) { - CallPullerMethod( - {method, base_start[i], subchannel, methods_pending - static_cast<u32>(i)}); + CallPullerMethod(MethodCall{ + method, + base_start[i], + subchannel, + methods_pending - static_cast<u32>(i), + }); } } } @@ -268,7 +262,12 @@ void GPU::CallPullerMethod(const MethodCall& method_call) { case BufferMethods::UnkCacheFlush: case BufferMethods::WrcacheFlush: case BufferMethods::FenceValue: + break; case BufferMethods::FenceAction: + ProcessFenceActionMethod(); + break; + case BufferMethods::WaitForInterrupt: + ProcessWaitForInterruptMethod(); break; case BufferMethods::SemaphoreTrigger: { ProcessSemaphoreTriggerMethod(); @@ -298,8 +297,7 @@ void GPU::CallPullerMethod(const MethodCall& method_call) { break; } default: - LOG_ERROR(HW_GPU, "Special puller engine method {:X} not implemented", - static_cast<u32>(method)); + LOG_ERROR(HW_GPU, "Special puller engine method {:X} not implemented", method); break; } } @@ -378,10 +376,28 @@ void GPU::ProcessBindMethod(const MethodCall& method_call) { dma_pusher->BindSubchannel(kepler_memory.get(), method_call.subchannel); break; default: - UNIMPLEMENTED_MSG("Unimplemented engine {:04X}", static_cast<u32>(engine_id)); + UNIMPLEMENTED_MSG("Unimplemented engine {:04X}", engine_id); + } +} + +void GPU::ProcessFenceActionMethod() { + switch (regs.fence_action.op) { + case FenceOperation::Acquire: + WaitFence(regs.fence_action.syncpoint_id, regs.fence_value); + break; + case FenceOperation::Increment: + IncrementSyncPoint(regs.fence_action.syncpoint_id); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented operation {}", regs.fence_action.op.Value()); } } +void GPU::ProcessWaitForInterruptMethod() { + // TODO(bunnei) ImplementMe + LOG_WARNING(HW_GPU, "(STUBBED) called"); +} + void GPU::ProcessSemaphoreTriggerMethod() { const auto semaphoreOperationMask = 0xF; const auto op = @@ -443,4 +459,75 @@ void GPU::ProcessSemaphoreAcquire() { } } +void GPU::Start() { + gpu_thread.StartThread(*renderer, renderer->Context(), *dma_pusher, *cdma_pusher); + cpu_context = renderer->GetRenderWindow().CreateSharedContext(); + cpu_context->MakeCurrent(); +} + +void GPU::ObtainContext() { + cpu_context->MakeCurrent(); +} + +void GPU::ReleaseContext() { + cpu_context->DoneCurrent(); +} + +void GPU::PushGPUEntries(Tegra::CommandList&& entries) { + gpu_thread.SubmitList(std::move(entries)); +} + +void GPU::PushCommandBuffer(Tegra::ChCommandHeaderList& entries) { + if (!use_nvdec) { + return; + } + // This condition fires when a video stream ends, clear all intermediary data + if (entries[0].raw == 0xDEADB33F) { + cdma_pusher.reset(); + return; + } + if (!cdma_pusher) { + cdma_pusher = std::make_unique<Tegra::CDmaPusher>(*this); + } + + // SubmitCommandBuffer would make the nvdec operations async, this is not currently working + // TODO(ameerj): RE proper async nvdec operation + // gpu_thread.SubmitCommandBuffer(std::move(entries)); + + cdma_pusher->Push(std::move(entries)); + cdma_pusher->DispatchCalls(); +} + +void GPU::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { + gpu_thread.SwapBuffers(framebuffer); +} + +void GPU::FlushRegion(VAddr addr, u64 size) { + gpu_thread.FlushRegion(addr, size); +} + +void GPU::InvalidateRegion(VAddr addr, u64 size) { + gpu_thread.InvalidateRegion(addr, size); +} + +void GPU::FlushAndInvalidateRegion(VAddr addr, u64 size) { + gpu_thread.FlushAndInvalidateRegion(addr, size); +} + +void GPU::TriggerCpuInterrupt(const u32 syncpoint_id, const u32 value) const { + auto& interrupt_manager = system.InterruptManager(); + interrupt_manager.GPUInterruptSyncpt(syncpoint_id, value); +} + +void GPU::WaitIdle() const { + gpu_thread.WaitIdle(); +} + +void GPU::OnCommandListEnd() { + if (is_async) { + // This command only applies to asynchronous GPU mode + gpu_thread.OnCommandListEnd(); + } +} + } // namespace Tegra diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index c7d11deb2..b2ee45496 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -13,14 +13,17 @@ #include "common/common_types.h" #include "core/hle/service/nvdrv/nvdata.h" #include "core/hle/service/nvflinger/buffer_queue.h" +#include "video_core/cdma_pusher.h" #include "video_core/dma_pusher.h" +#include "video_core/framebuffer_config.h" +#include "video_core/gpu_thread.h" using CacheAddr = std::uintptr_t; -inline CacheAddr ToCacheAddr(const void* host_ptr) { +[[nodiscard]] inline CacheAddr ToCacheAddr(const void* host_ptr) { return reinterpret_cast<CacheAddr>(host_ptr); } -inline u8* FromCacheAddr(CacheAddr cache_addr) { +[[nodiscard]] inline u8* FromCacheAddr(CacheAddr cache_addr) { return reinterpret_cast<u8*>(cache_addr); } @@ -100,28 +103,6 @@ enum class DepthFormat : u32 { struct CommandListHeader; class DebugContext; -/** - * Struct describing framebuffer configuration - */ -struct FramebufferConfig { - enum class PixelFormat : u32 { - A8B8G8R8_UNORM = 1, - RGB565_UNORM = 4, - B8G8R8A8_UNORM = 5, - }; - - VAddr address; - u32 offset; - u32 width; - u32 height; - u32 stride; - PixelFormat pixel_format; - - using TransformFlags = Service::NVFlinger::BufferQueue::BufferTransformFlags; - TransformFlags transform_flags; - Common::Rectangle<int> crop_rect; -}; - namespace Engines { class Fermi2D; class Maxwell3D; @@ -140,7 +121,7 @@ enum class EngineID { class MemoryManager; -class GPU { +class GPU final { public: struct MethodCall { u32 method{}; @@ -148,17 +129,17 @@ public: u32 subchannel{}; u32 method_count{}; - bool IsLastCall() const { + explicit MethodCall(u32 method_, u32 argument_, u32 subchannel_ = 0, u32 method_count_ = 0) + : method(method_), argument(argument_), subchannel(subchannel_), + method_count(method_count_) {} + + [[nodiscard]] bool IsLastCall() const { return method_count <= 1; } - - MethodCall(u32 method, u32 argument, u32 subchannel = 0, u32 method_count = 0) - : method(method), argument(argument), subchannel(subchannel), - method_count(method_count) {} }; - explicit GPU(Core::System& system, bool is_async); - virtual ~GPU(); + explicit GPU(Core::System& system_, bool is_async_, bool use_nvdec_); + ~GPU(); /// Binds a renderer to the GPU. void BindRenderer(std::unique_ptr<VideoCore::RendererBase> renderer); @@ -175,13 +156,13 @@ public: /// Synchronizes CPU writes with Host GPU memory. void SyncGuestHost(); /// Signal the ending of command list. - virtual void OnCommandListEnd(); + void OnCommandListEnd(); /// Request a host GPU memory flush from the CPU. - u64 RequestFlush(VAddr addr, std::size_t size); + [[nodiscard]] u64 RequestFlush(VAddr addr, std::size_t size); /// Obtains current flush request fence id. - u64 CurrentFlushRequestFence() const { + [[nodiscard]] u64 CurrentFlushRequestFence() const { return current_flush_fence.load(std::memory_order_relaxed); } @@ -189,80 +170,112 @@ public: void TickWork(); /// Returns a reference to the Maxwell3D GPU engine. - Engines::Maxwell3D& Maxwell3D(); + [[nodiscard]] Engines::Maxwell3D& Maxwell3D(); /// Returns a const reference to the Maxwell3D GPU engine. - const Engines::Maxwell3D& Maxwell3D() const; + [[nodiscard]] const Engines::Maxwell3D& Maxwell3D() const; /// Returns a reference to the KeplerCompute GPU engine. - Engines::KeplerCompute& KeplerCompute(); + [[nodiscard]] Engines::KeplerCompute& KeplerCompute(); /// Returns a reference to the KeplerCompute GPU engine. - const Engines::KeplerCompute& KeplerCompute() const; + [[nodiscard]] const Engines::KeplerCompute& KeplerCompute() const; /// Returns a reference to the GPU memory manager. - Tegra::MemoryManager& MemoryManager(); + [[nodiscard]] Tegra::MemoryManager& MemoryManager(); /// Returns a const reference to the GPU memory manager. - const Tegra::MemoryManager& MemoryManager() const; + [[nodiscard]] const Tegra::MemoryManager& MemoryManager() const; /// Returns a reference to the GPU DMA pusher. - Tegra::DmaPusher& DmaPusher(); + [[nodiscard]] Tegra::DmaPusher& DmaPusher(); + + /// Returns a const reference to the GPU DMA pusher. + [[nodiscard]] const Tegra::DmaPusher& DmaPusher() const; - VideoCore::RendererBase& Renderer() { + /// Returns a reference to the GPU CDMA pusher. + [[nodiscard]] Tegra::CDmaPusher& CDmaPusher(); + + /// Returns a const reference to the GPU CDMA pusher. + [[nodiscard]] const Tegra::CDmaPusher& CDmaPusher() const; + + /// Returns a reference to the underlying renderer. + [[nodiscard]] VideoCore::RendererBase& Renderer() { return *renderer; } - const VideoCore::RendererBase& Renderer() const { + /// Returns a const reference to the underlying renderer. + [[nodiscard]] const VideoCore::RendererBase& Renderer() const { return *renderer; } - VideoCore::ShaderNotify& ShaderNotify() { + /// Returns a reference to the shader notifier. + [[nodiscard]] VideoCore::ShaderNotify& ShaderNotify() { return *shader_notify; } - const VideoCore::ShaderNotify& ShaderNotify() const { + /// Returns a const reference to the shader notifier. + [[nodiscard]] const VideoCore::ShaderNotify& ShaderNotify() const { return *shader_notify; } // Waits for the GPU to finish working - virtual void WaitIdle() const = 0; + void WaitIdle() const; /// Allows the CPU/NvFlinger to wait on the GPU before presenting a frame. void WaitFence(u32 syncpoint_id, u32 value); void IncrementSyncPoint(u32 syncpoint_id); - u32 GetSyncpointValue(u32 syncpoint_id) const; + [[nodiscard]] u32 GetSyncpointValue(u32 syncpoint_id) const; void RegisterSyncptInterrupt(u32 syncpoint_id, u32 value); - bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value); + [[nodiscard]] bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value); - u64 GetTicks() const; + [[nodiscard]] u64 GetTicks() const; - std::unique_lock<std::mutex> LockSync() { + [[nodiscard]] std::unique_lock<std::mutex> LockSync() { return std::unique_lock{sync_mutex}; } - bool IsAsync() const { + [[nodiscard]] bool IsAsync() const { return is_async; } - /// Returns a const reference to the GPU DMA pusher. - const Tegra::DmaPusher& DmaPusher() const; + [[nodiscard]] bool UseNvdec() const { + return use_nvdec; + } + + enum class FenceOperation : u32 { + Acquire = 0, + Increment = 1, + }; + + union FenceAction { + u32 raw; + BitField<0, 1, FenceOperation> op; + BitField<8, 24, u32> syncpoint_id; + + [[nodiscard]] static CommandHeader Build(FenceOperation op, u32 syncpoint_id) { + FenceAction result{}; + result.op.Assign(op); + result.syncpoint_id.Assign(syncpoint_id); + return {result.raw}; + } + }; struct Regs { static constexpr size_t NUM_REGS = 0x40; union { struct { - INSERT_UNION_PADDING_WORDS(0x4); + INSERT_PADDING_WORDS_NOINIT(0x4); struct { u32 address_high; u32 address_low; - GPUVAddr SemaphoreAddress() const { + [[nodiscard]] GPUVAddr SemaphoreAddress() const { return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | address_low); } @@ -270,21 +283,18 @@ public: u32 semaphore_sequence; u32 semaphore_trigger; - INSERT_UNION_PADDING_WORDS(0xC); + INSERT_PADDING_WORDS_NOINIT(0xC); // The pusher and the puller share the reference counter, the pusher only has read // access u32 reference_count; - INSERT_UNION_PADDING_WORDS(0x5); + INSERT_PADDING_WORDS_NOINIT(0x5); u32 semaphore_acquire; u32 semaphore_release; u32 fence_value; - union { - BitField<4, 4, u32> operation; - BitField<8, 8, u32> id; - } fence_action; - INSERT_UNION_PADDING_WORDS(0xE2); + FenceAction fence_action; + INSERT_PADDING_WORDS_NOINIT(0xE2); // Puller state u32 acquire_mode; @@ -300,34 +310,39 @@ public: /// Performs any additional setup necessary in order to begin GPU emulation. /// This can be used to launch any necessary threads and register any necessary /// core timing events. - virtual void Start() = 0; + void Start(); /// Obtain the CPU Context - virtual void ObtainContext() = 0; + void ObtainContext(); /// Release the CPU Context - virtual void ReleaseContext() = 0; + void ReleaseContext(); /// Push GPU command entries to be processed - virtual void PushGPUEntries(Tegra::CommandList&& entries) = 0; + void PushGPUEntries(Tegra::CommandList&& entries); + + /// Push GPU command buffer entries to be processed + void PushCommandBuffer(Tegra::ChCommandHeaderList& entries); /// Swap buffers (render frame) - virtual void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) = 0; + void SwapBuffers(const Tegra::FramebufferConfig* framebuffer); /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory - virtual void FlushRegion(VAddr addr, u64 size) = 0; + void FlushRegion(VAddr addr, u64 size); /// Notify rasterizer that any caches of the specified region should be invalidated - virtual void InvalidateRegion(VAddr addr, u64 size) = 0; + void InvalidateRegion(VAddr addr, u64 size); /// Notify rasterizer that any caches of the specified region should be flushed and invalidated - virtual void FlushAndInvalidateRegion(VAddr addr, u64 size) = 0; + void FlushAndInvalidateRegion(VAddr addr, u64 size); protected: - virtual void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const = 0; + void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const; private: void ProcessBindMethod(const MethodCall& method_call); + void ProcessFenceActionMethod(); + void ProcessWaitForInterruptMethod(); void ProcessSemaphoreTriggerMethod(); void ProcessSemaphoreRelease(); void ProcessSemaphoreAcquire(); @@ -343,16 +358,18 @@ private: u32 methods_pending); /// Determines where the method should be executed. - bool ExecuteMethodOnEngine(u32 method); + [[nodiscard]] bool ExecuteMethodOnEngine(u32 method); protected: Core::System& system; + std::unique_ptr<Tegra::MemoryManager> memory_manager; std::unique_ptr<Tegra::DmaPusher> dma_pusher; + std::unique_ptr<Tegra::CDmaPusher> cdma_pusher; std::unique_ptr<VideoCore::RendererBase> renderer; + VideoCore::RasterizerInterface* rasterizer = nullptr; + const bool use_nvdec; private: - std::unique_ptr<Tegra::MemoryManager> memory_manager; - /// Mapping of command subchannels to their bound engine ids std::array<EngineID, 8> bound_engines = {}; /// 3D engine @@ -373,12 +390,13 @@ private: std::array<std::list<u32>, Service::Nvidia::MaxSyncPoints> syncpt_interrupts; std::mutex sync_mutex; + std::mutex device_mutex; std::condition_variable sync_cv; struct FlushRequest { - FlushRequest(u64 fence, VAddr addr, std::size_t size) - : fence{fence}, addr{addr}, size{size} {} + explicit FlushRequest(u64 fence_, VAddr addr_, std::size_t size_) + : fence{fence_}, addr{addr_}, size{size_} {} u64 fence; VAddr addr; std::size_t size; @@ -390,6 +408,9 @@ private: std::mutex flush_request_mutex; const bool is_async; + + VideoCommon::GPUThread::ThreadManager gpu_thread; + std::unique_ptr<Core::Frontend::GraphicsContext> cpu_context; }; #define ASSERT_REG_POSITION(field_name, position) \ diff --git a/src/video_core/gpu_asynch.cpp b/src/video_core/gpu_asynch.cpp deleted file mode 100644 index 70a3d5738..000000000 --- a/src/video_core/gpu_asynch.cpp +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include "core/core.h" -#include "core/hardware_interrupt_manager.h" -#include "video_core/gpu_asynch.h" -#include "video_core/gpu_thread.h" -#include "video_core/renderer_base.h" - -namespace VideoCommon { - -GPUAsynch::GPUAsynch(Core::System& system) : GPU{system, true}, gpu_thread{system} {} - -GPUAsynch::~GPUAsynch() = default; - -void GPUAsynch::Start() { - gpu_thread.StartThread(*renderer, renderer->Context(), *dma_pusher); - cpu_context = renderer->GetRenderWindow().CreateSharedContext(); - cpu_context->MakeCurrent(); -} - -void GPUAsynch::ObtainContext() { - cpu_context->MakeCurrent(); -} - -void GPUAsynch::ReleaseContext() { - cpu_context->DoneCurrent(); -} - -void GPUAsynch::PushGPUEntries(Tegra::CommandList&& entries) { - gpu_thread.SubmitList(std::move(entries)); -} - -void GPUAsynch::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { - gpu_thread.SwapBuffers(framebuffer); -} - -void GPUAsynch::FlushRegion(VAddr addr, u64 size) { - gpu_thread.FlushRegion(addr, size); -} - -void GPUAsynch::InvalidateRegion(VAddr addr, u64 size) { - gpu_thread.InvalidateRegion(addr, size); -} - -void GPUAsynch::FlushAndInvalidateRegion(VAddr addr, u64 size) { - gpu_thread.FlushAndInvalidateRegion(addr, size); -} - -void GPUAsynch::TriggerCpuInterrupt(const u32 syncpoint_id, const u32 value) const { - auto& interrupt_manager = system.InterruptManager(); - interrupt_manager.GPUInterruptSyncpt(syncpoint_id, value); -} - -void GPUAsynch::WaitIdle() const { - gpu_thread.WaitIdle(); -} - -void GPUAsynch::OnCommandListEnd() { - gpu_thread.OnCommandListEnd(); -} - -} // namespace VideoCommon diff --git a/src/video_core/gpu_asynch.h b/src/video_core/gpu_asynch.h deleted file mode 100644 index f89c855a5..000000000 --- a/src/video_core/gpu_asynch.h +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include "video_core/gpu.h" -#include "video_core/gpu_thread.h" - -namespace Core::Frontend { -class GraphicsContext; -} - -namespace VideoCore { -class RendererBase; -} // namespace VideoCore - -namespace VideoCommon { - -/// Implementation of GPU interface that runs the GPU asynchronously -class GPUAsynch final : public Tegra::GPU { -public: - explicit GPUAsynch(Core::System& system); - ~GPUAsynch() override; - - void Start() override; - void ObtainContext() override; - void ReleaseContext() override; - void PushGPUEntries(Tegra::CommandList&& entries) override; - void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override; - void FlushRegion(VAddr addr, u64 size) override; - void InvalidateRegion(VAddr addr, u64 size) override; - void FlushAndInvalidateRegion(VAddr addr, u64 size) override; - void WaitIdle() const override; - - void OnCommandListEnd() override; - -protected: - void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const override; - -private: - GPUThread::ThreadManager gpu_thread; - std::unique_ptr<Core::Frontend::GraphicsContext> cpu_context; -}; - -} // namespace VideoCommon diff --git a/src/video_core/gpu_synch.cpp b/src/video_core/gpu_synch.cpp deleted file mode 100644 index 1ca47ddef..000000000 --- a/src/video_core/gpu_synch.cpp +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include "video_core/gpu_synch.h" -#include "video_core/renderer_base.h" - -namespace VideoCommon { - -GPUSynch::GPUSynch(Core::System& system) : GPU{system, false} {} - -GPUSynch::~GPUSynch() = default; - -void GPUSynch::Start() {} - -void GPUSynch::ObtainContext() { - renderer->Context().MakeCurrent(); -} - -void GPUSynch::ReleaseContext() { - renderer->Context().DoneCurrent(); -} - -void GPUSynch::PushGPUEntries(Tegra::CommandList&& entries) { - dma_pusher->Push(std::move(entries)); - dma_pusher->DispatchCalls(); -} - -void GPUSynch::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { - renderer->SwapBuffers(framebuffer); -} - -void GPUSynch::FlushRegion(VAddr addr, u64 size) { - renderer->Rasterizer().FlushRegion(addr, size); -} - -void GPUSynch::InvalidateRegion(VAddr addr, u64 size) { - renderer->Rasterizer().InvalidateRegion(addr, size); -} - -void GPUSynch::FlushAndInvalidateRegion(VAddr addr, u64 size) { - renderer->Rasterizer().FlushAndInvalidateRegion(addr, size); -} - -} // namespace VideoCommon diff --git a/src/video_core/gpu_synch.h b/src/video_core/gpu_synch.h deleted file mode 100644 index 297258cb1..000000000 --- a/src/video_core/gpu_synch.h +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include "video_core/gpu.h" - -namespace Core::Frontend { -class GraphicsContext; -} - -namespace VideoCore { -class RendererBase; -} // namespace VideoCore - -namespace VideoCommon { - -/// Implementation of GPU interface that runs the GPU synchronously -class GPUSynch final : public Tegra::GPU { -public: - explicit GPUSynch(Core::System& system); - ~GPUSynch() override; - - void Start() override; - void ObtainContext() override; - void ReleaseContext() override; - void PushGPUEntries(Tegra::CommandList&& entries) override; - void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override; - void FlushRegion(VAddr addr, u64 size) override; - void InvalidateRegion(VAddr addr, u64 size) override; - void FlushAndInvalidateRegion(VAddr addr, u64 size) override; - void WaitIdle() const override {} - -protected: - void TriggerCpuInterrupt([[maybe_unused]] u32 syncpoint_id, - [[maybe_unused]] u32 value) const override {} -}; - -} // namespace VideoCommon diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp index bf761abf2..50319f1d5 100644 --- a/src/video_core/gpu_thread.cpp +++ b/src/video_core/gpu_thread.cpp @@ -4,6 +4,7 @@ #include "common/assert.h" #include "common/microprofile.h" +#include "common/scope_exit.h" #include "common/thread.h" #include "core/core.h" #include "core/frontend/emu_window.h" @@ -18,9 +19,11 @@ namespace VideoCommon::GPUThread { /// Runs the GPU thread static void RunThread(Core::System& system, VideoCore::RendererBase& renderer, Core::Frontend::GraphicsContext& context, Tegra::DmaPusher& dma_pusher, - SynchState& state) { + SynchState& state, Tegra::CDmaPusher& cdma_pusher) { std::string name = "yuzu:GPU"; MicroProfileOnThreadCreate(name.c_str()); + SCOPE_EXIT({ MicroProfileOnThreadExit(); }); + Common::SetCurrentThreadName(name.c_str()); Common::SetCurrentThreadPriority(Common::ThreadPriority::High); system.RegisterHostThread(); @@ -35,23 +38,28 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer, } auto current_context = context.Acquire(); + VideoCore::RasterizerInterface* const rasterizer = renderer.ReadRasterizer(); CommandDataContainer next; while (state.is_running) { next = state.queue.PopWait(); - if (const auto submit_list = std::get_if<SubmitListCommand>(&next.data)) { + if (auto* submit_list = std::get_if<SubmitListCommand>(&next.data)) { dma_pusher.Push(std::move(submit_list->entries)); dma_pusher.DispatchCalls(); - } else if (const auto data = std::get_if<SwapBuffersCommand>(&next.data)) { + } else if (auto* command_list = std::get_if<SubmitChCommandEntries>(&next.data)) { + // NVDEC + cdma_pusher.Push(std::move(command_list->entries)); + cdma_pusher.DispatchCalls(); + } else if (const auto* data = std::get_if<SwapBuffersCommand>(&next.data)) { renderer.SwapBuffers(data->framebuffer ? &*data->framebuffer : nullptr); } else if (std::holds_alternative<OnCommandListEndCommand>(next.data)) { - renderer.Rasterizer().ReleaseFences(); + rasterizer->ReleaseFences(); } else if (std::holds_alternative<GPUTickCommand>(next.data)) { system.GPU().TickWork(); - } else if (const auto data = std::get_if<FlushRegionCommand>(&next.data)) { - renderer.Rasterizer().FlushRegion(data->addr, data->size); - } else if (const auto data = std::get_if<InvalidateRegionCommand>(&next.data)) { - renderer.Rasterizer().OnCPUWrite(data->addr, data->size); + } else if (const auto* flush = std::get_if<FlushRegionCommand>(&next.data)) { + rasterizer->FlushRegion(flush->addr, flush->size); + } else if (const auto* invalidate = std::get_if<InvalidateRegionCommand>(&next.data)) { + rasterizer->OnCPUWrite(invalidate->addr, invalidate->size); } else if (std::holds_alternative<EndProcessingCommand>(next.data)) { return; } else { @@ -61,7 +69,8 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer, } } -ThreadManager::ThreadManager(Core::System& system) : system{system} {} +ThreadManager::ThreadManager(Core::System& system_, bool is_async_) + : system{system_}, is_async{is_async_} {} ThreadManager::~ThreadManager() { if (!thread.joinable()) { @@ -75,47 +84,64 @@ ThreadManager::~ThreadManager() { void ThreadManager::StartThread(VideoCore::RendererBase& renderer, Core::Frontend::GraphicsContext& context, - Tegra::DmaPusher& dma_pusher) { - thread = std::thread{RunThread, std::ref(system), std::ref(renderer), - std::ref(context), std::ref(dma_pusher), std::ref(state)}; + Tegra::DmaPusher& dma_pusher, Tegra::CDmaPusher& cdma_pusher) { + rasterizer = renderer.ReadRasterizer(); + thread = std::thread(RunThread, std::ref(system), std::ref(renderer), std::ref(context), + std::ref(dma_pusher), std::ref(state), std::ref(cdma_pusher)); } void ThreadManager::SubmitList(Tegra::CommandList&& entries) { PushCommand(SubmitListCommand(std::move(entries))); } +void ThreadManager::SubmitCommandBuffer(Tegra::ChCommandHeaderList&& entries) { + PushCommand(SubmitChCommandEntries(std::move(entries))); +} + void ThreadManager::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { PushCommand(SwapBuffersCommand(framebuffer ? std::make_optional(*framebuffer) : std::nullopt)); } void ThreadManager::FlushRegion(VAddr addr, u64 size) { - if (!Settings::IsGPULevelHigh()) { + if (!is_async) { + // Always flush with synchronous GPU mode PushCommand(FlushRegionCommand(addr, size)); return; } - if (!Settings::IsGPULevelExtreme()) { - return; - } - if (system.Renderer().Rasterizer().MustFlushRegion(addr, size)) { + + // Asynchronous GPU mode + switch (Settings::values.gpu_accuracy.GetValue()) { + case Settings::GPUAccuracy::Normal: + PushCommand(FlushRegionCommand(addr, size)); + break; + case Settings::GPUAccuracy::High: + // TODO(bunnei): Is this right? Preserving existing behavior for now + break; + case Settings::GPUAccuracy::Extreme: { auto& gpu = system.GPU(); u64 fence = gpu.RequestFlush(addr, size); PushCommand(GPUTickCommand()); while (fence > gpu.CurrentFlushRequestFence()) { } + break; + } + default: + UNIMPLEMENTED_MSG("Unsupported gpu_accuracy {}", Settings::values.gpu_accuracy.GetValue()); } } void ThreadManager::InvalidateRegion(VAddr addr, u64 size) { - system.Renderer().Rasterizer().OnCPUWrite(addr, size); + rasterizer->OnCPUWrite(addr, size); } void ThreadManager::FlushAndInvalidateRegion(VAddr addr, u64 size) { // Skip flush on asynch mode, as FlushAndInvalidateRegion is not used for anything too important - system.Renderer().Rasterizer().OnCPUWrite(addr, size); + rasterizer->OnCPUWrite(addr, size); } void ThreadManager::WaitIdle() const { - while (state.last_fence > state.signaled_fence.load(std::memory_order_relaxed)) { + while (state.last_fence > state.signaled_fence.load(std::memory_order_relaxed) && + system.IsPoweredOn()) { } } @@ -126,6 +152,12 @@ void ThreadManager::OnCommandListEnd() { u64 ThreadManager::PushCommand(CommandData&& command_data) { const u64 fence{++state.last_fence}; state.queue.Push(CommandDataContainer(std::move(command_data), fence)); + + if (!is_async) { + // In synchronous GPU mode, block the caller until the command has executed + WaitIdle(); + } + return fence; } diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h index 5a28335d6..4cd951169 100644 --- a/src/video_core/gpu_thread.h +++ b/src/video_core/gpu_thread.h @@ -10,8 +10,9 @@ #include <optional> #include <thread> #include <variant> + #include "common/threadsafe_queue.h" -#include "video_core/gpu.h" +#include "video_core/framebuffer_config.h" namespace Tegra { struct FramebufferConfig; @@ -25,6 +26,11 @@ class GraphicsContext; class System; } // namespace Core +namespace VideoCore { +class RasterizerInterface; +class RendererBase; +} // namespace VideoCore + namespace VideoCommon::GPUThread { /// Command to signal to the GPU thread that processing has ended @@ -32,22 +38,30 @@ struct EndProcessingCommand final {}; /// Command to signal to the GPU thread that a command list is ready for processing struct SubmitListCommand final { - explicit SubmitListCommand(Tegra::CommandList&& entries) : entries{std::move(entries)} {} + explicit SubmitListCommand(Tegra::CommandList&& entries_) : entries{std::move(entries_)} {} Tegra::CommandList entries; }; +/// Command to signal to the GPU thread that a cdma command list is ready for processing +struct SubmitChCommandEntries final { + explicit SubmitChCommandEntries(Tegra::ChCommandHeaderList&& entries_) + : entries{std::move(entries_)} {} + + Tegra::ChCommandHeaderList entries; +}; + /// Command to signal to the GPU thread that a swap buffers is pending struct SwapBuffersCommand final { - explicit SwapBuffersCommand(std::optional<const Tegra::FramebufferConfig> framebuffer) - : framebuffer{std::move(framebuffer)} {} + explicit SwapBuffersCommand(std::optional<const Tegra::FramebufferConfig> framebuffer_) + : framebuffer{std::move(framebuffer_)} {} std::optional<Tegra::FramebufferConfig> framebuffer; }; /// Command to signal to the GPU thread to flush a region struct FlushRegionCommand final { - explicit constexpr FlushRegionCommand(VAddr addr, u64 size) : addr{addr}, size{size} {} + explicit constexpr FlushRegionCommand(VAddr addr_, u64 size_) : addr{addr_}, size{size_} {} VAddr addr; u64 size; @@ -55,7 +69,7 @@ struct FlushRegionCommand final { /// Command to signal to the GPU thread to invalidate a region struct InvalidateRegionCommand final { - explicit constexpr InvalidateRegionCommand(VAddr addr, u64 size) : addr{addr}, size{size} {} + explicit constexpr InvalidateRegionCommand(VAddr addr_, u64 size_) : addr{addr_}, size{size_} {} VAddr addr; u64 size; @@ -63,8 +77,8 @@ struct InvalidateRegionCommand final { /// Command to signal to the GPU thread to flush and invalidate a region struct FlushAndInvalidateRegionCommand final { - explicit constexpr FlushAndInvalidateRegionCommand(VAddr addr, u64 size) - : addr{addr}, size{size} {} + explicit constexpr FlushAndInvalidateRegionCommand(VAddr addr_, u64 size_) + : addr{addr_}, size{size_} {} VAddr addr; u64 size; @@ -77,15 +91,15 @@ struct OnCommandListEndCommand final {}; struct GPUTickCommand final {}; using CommandData = - std::variant<EndProcessingCommand, SubmitListCommand, SwapBuffersCommand, FlushRegionCommand, - InvalidateRegionCommand, FlushAndInvalidateRegionCommand, OnCommandListEndCommand, - GPUTickCommand>; + std::variant<EndProcessingCommand, SubmitListCommand, SubmitChCommandEntries, + SwapBuffersCommand, FlushRegionCommand, InvalidateRegionCommand, + FlushAndInvalidateRegionCommand, OnCommandListEndCommand, GPUTickCommand>; struct CommandDataContainer { CommandDataContainer() = default; - CommandDataContainer(CommandData&& data, u64 next_fence) - : data{std::move(data)}, fence{next_fence} {} + explicit CommandDataContainer(CommandData&& data_, u64 next_fence_) + : data{std::move(data_)}, fence{next_fence_} {} CommandData data; u64 fence{}; @@ -104,16 +118,19 @@ struct SynchState final { /// Class used to manage the GPU thread class ThreadManager final { public: - explicit ThreadManager(Core::System& system); + explicit ThreadManager(Core::System& system_, bool is_async_); ~ThreadManager(); /// Creates and starts the GPU thread. void StartThread(VideoCore::RendererBase& renderer, Core::Frontend::GraphicsContext& context, - Tegra::DmaPusher& dma_pusher); + Tegra::DmaPusher& dma_pusher, Tegra::CDmaPusher& cdma_pusher); /// Push GPU command entries to be processed void SubmitList(Tegra::CommandList&& entries); + /// Push GPU CDMA command buffer entries to be processed + void SubmitCommandBuffer(Tegra::ChCommandHeaderList&& entries); + /// Swap buffers (render frame) void SwapBuffers(const Tegra::FramebufferConfig* framebuffer); @@ -135,11 +152,12 @@ private: /// Pushes a command to be executed by the GPU thread u64 PushCommand(CommandData&& command_data); -private: - SynchState state; Core::System& system; + const bool is_async; + VideoCore::RasterizerInterface* rasterizer = nullptr; + + SynchState state; std::thread thread; - std::thread::id thread_id; }; } // namespace VideoCommon::GPUThread diff --git a/src/video_core/guest_driver.h b/src/video_core/guest_driver.h index 99450777e..21e569ba1 100644 --- a/src/video_core/guest_driver.h +++ b/src/video_core/guest_driver.h @@ -19,8 +19,8 @@ namespace VideoCore { class GuestDriverProfile { public: explicit GuestDriverProfile() = default; - explicit GuestDriverProfile(std::optional<u32> texture_handler_size) - : texture_handler_size{texture_handler_size} {} + explicit GuestDriverProfile(std::optional<u32> texture_handler_size_) + : texture_handler_size{texture_handler_size_} {} void DeduceTextureHandlerSize(std::vector<u32> bound_offsets); diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index aa62363a7..970120acc 100644 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt @@ -1,38 +1,83 @@ set(SHADER_FILES + block_linear_unswizzle_2d.comp + block_linear_unswizzle_3d.comp + convert_depth_to_float.frag + convert_float_to_depth.frag + full_screen_triangle.vert + opengl_copy_bc4.comp opengl_present.frag opengl_present.vert + pitch_unswizzle.comp + vulkan_blit_color_float.frag + vulkan_blit_depth_stencil.frag + vulkan_present.frag + vulkan_present.vert + vulkan_quad_indexed.comp + vulkan_uint8.comp ) +find_program(GLSLANGVALIDATOR "glslangValidator" REQUIRED) + +set(GLSL_FLAGS "") +set(QUIET_FLAG "--quiet") + set(SHADER_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/include) +set(SHADER_DIR ${SHADER_INCLUDE}/video_core/host_shaders) set(HOST_SHADERS_INCLUDE ${SHADER_INCLUDE} PARENT_SCOPE) -set(SHADER_DIR ${SHADER_INCLUDE}/video_core/host_shaders) -add_custom_command( - OUTPUT - ${SHADER_DIR} +set(INPUT_FILE ${CMAKE_CURRENT_SOURCE_DIR}/source_shader.h.in) +set(HEADER_GENERATOR ${CMAKE_CURRENT_SOURCE_DIR}/StringShaderHeader.cmake) + +# Check if `--quiet` is available on host's glslangValidator version +# glslangValidator prints to STDERR iff an unrecognized flag is passed to it +execute_process( COMMAND - ${CMAKE_COMMAND} -E make_directory ${SHADER_DIR} + ${GLSLANGVALIDATOR} ${QUIET_FLAG} + ERROR_VARIABLE + GLSLANG_ERROR + # STDOUT variable defined to silence unnecessary output during CMake configuration + OUTPUT_VARIABLE + GLSLANG_OUTPUT ) -set(INPUT_FILE ${CMAKE_CURRENT_SOURCE_DIR}/source_shader.h.in) -set(HEADER_GENERATOR ${CMAKE_CURRENT_SOURCE_DIR}/StringShaderHeader.cmake) +if (NOT GLSLANG_ERROR STREQUAL "") + message(WARNING "Refusing to use unavailable flag `${QUIET_FLAG}` on `${GLSLANGVALIDATOR}`") + set(QUIET_FLAG "") +endif() foreach(FILENAME IN ITEMS ${SHADER_FILES}) string(REPLACE "." "_" SHADER_NAME ${FILENAME}) set(SOURCE_FILE ${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}) - set(HEADER_FILE ${SHADER_DIR}/${SHADER_NAME}.h) - add_custom_command( - OUTPUT - ${HEADER_FILE} - COMMAND - ${CMAKE_COMMAND} -P ${HEADER_GENERATOR} ${SOURCE_FILE} ${HEADER_FILE} ${INPUT_FILE} - MAIN_DEPENDENCY - ${SOURCE_FILE} - DEPENDS - ${HEADER_GENERATOR} - ${INPUT_FILE} - ) - set(SHADER_HEADERS ${SHADER_HEADERS} ${HEADER_FILE}) + # Skip generating source headers on Vulkan exclusive files + if (NOT ${FILENAME} MATCHES "vulkan.*") + set(SOURCE_HEADER_FILE ${SHADER_DIR}/${SHADER_NAME}.h) + add_custom_command( + OUTPUT + ${SOURCE_HEADER_FILE} + COMMAND + ${CMAKE_COMMAND} -P ${HEADER_GENERATOR} ${SOURCE_FILE} ${SOURCE_HEADER_FILE} ${INPUT_FILE} + MAIN_DEPENDENCY + ${SOURCE_FILE} + DEPENDS + ${INPUT_FILE} + # HEADER_GENERATOR should be included here but msbuild seems to assume it's always modified + ) + set(SHADER_HEADERS ${SHADER_HEADERS} ${SOURCE_HEADER_FILE}) + endif() + # Skip compiling to SPIR-V OpenGL exclusive files + if (NOT ${FILENAME} MATCHES "opengl.*") + string(TOUPPER ${SHADER_NAME}_SPV SPIRV_VARIABLE_NAME) + set(SPIRV_HEADER_FILE ${SHADER_DIR}/${SHADER_NAME}_spv.h) + add_custom_command( + OUTPUT + ${SPIRV_HEADER_FILE} + COMMAND + ${GLSLANGVALIDATOR} -V ${QUIET_FLAG} ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE} + MAIN_DEPENDENCY + ${SOURCE_FILE} + ) + set(SHADER_HEADERS ${SHADER_HEADERS} ${SPIRV_HEADER_FILE}) + endif() endforeach() add_custom_target(host_shaders diff --git a/src/video_core/host_shaders/StringShaderHeader.cmake b/src/video_core/host_shaders/StringShaderHeader.cmake index 368bce0ed..c0fc49768 100644 --- a/src/video_core/host_shaders/StringShaderHeader.cmake +++ b/src/video_core/host_shaders/StringShaderHeader.cmake @@ -8,4 +8,6 @@ string(TOUPPER ${CONTENTS_NAME} CONTENTS_NAME) file(READ ${SOURCE_FILE} CONTENTS) +get_filename_component(OUTPUT_DIR ${HEADER_FILE} DIRECTORY) +make_directory(${OUTPUT_DIR}) configure_file(${INPUT_FILE} ${HEADER_FILE} @ONLY) diff --git a/src/video_core/host_shaders/block_linear_unswizzle_2d.comp b/src/video_core/host_shaders/block_linear_unswizzle_2d.comp new file mode 100644 index 000000000..a131be79e --- /dev/null +++ b/src/video_core/host_shaders/block_linear_unswizzle_2d.comp @@ -0,0 +1,122 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#version 430 + +#ifdef VULKAN + +#extension GL_EXT_shader_16bit_storage : require +#extension GL_EXT_shader_8bit_storage : require +#define HAS_EXTENDED_TYPES 1 +#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants { +#define END_PUSH_CONSTANTS }; +#define UNIFORM(n) +#define BINDING_SWIZZLE_BUFFER 0 +#define BINDING_INPUT_BUFFER 1 +#define BINDING_OUTPUT_IMAGE 2 + +#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv + +#extension GL_NV_gpu_shader5 : enable +#ifdef GL_NV_gpu_shader5 +#define HAS_EXTENDED_TYPES 1 +#else +#define HAS_EXTENDED_TYPES 0 +#endif +#define BEGIN_PUSH_CONSTANTS +#define END_PUSH_CONSTANTS +#define UNIFORM(n) layout (location = n) uniform +#define BINDING_SWIZZLE_BUFFER 0 +#define BINDING_INPUT_BUFFER 1 +#define BINDING_OUTPUT_IMAGE 0 + +#endif + +BEGIN_PUSH_CONSTANTS +UNIFORM(0) uvec3 origin; +UNIFORM(1) ivec3 destination; +UNIFORM(2) uint bytes_per_block_log2; +UNIFORM(3) uint layer_stride; +UNIFORM(4) uint block_size; +UNIFORM(5) uint x_shift; +UNIFORM(6) uint block_height; +UNIFORM(7) uint block_height_mask; +END_PUSH_CONSTANTS + +layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable { + uint swizzle_table[]; +}; + +#if HAS_EXTENDED_TYPES +layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU8 { uint8_t u8data[]; }; +layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU16 { uint16_t u16data[]; }; +#endif +layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU32 { uint u32data[]; }; +layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU64 { uvec2 u64data[]; }; +layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU128 { uvec4 u128data[]; }; + +layout(binding = BINDING_OUTPUT_IMAGE) uniform writeonly uimage2DArray output_image; + +layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in; + +const uint GOB_SIZE_X = 64; +const uint GOB_SIZE_Y = 8; +const uint GOB_SIZE_Z = 1; +const uint GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z; + +const uint GOB_SIZE_X_SHIFT = 6; +const uint GOB_SIZE_Y_SHIFT = 3; +const uint GOB_SIZE_Z_SHIFT = 0; +const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT; + +const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1, GOB_SIZE_Y - 1); + +uint SwizzleOffset(uvec2 pos) { + pos = pos & SWIZZLE_MASK; + return swizzle_table[pos.y * 64 + pos.x]; +} + +uvec4 ReadTexel(uint offset) { + switch (bytes_per_block_log2) { +#if HAS_EXTENDED_TYPES + case 0: + return uvec4(u8data[offset], 0, 0, 0); + case 1: + return uvec4(u16data[offset / 2], 0, 0, 0); +#else + case 0: + return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 24), 8), 0, 0, 0); + case 1: + return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 16), 16), 0, 0, 0); +#endif + case 2: + return uvec4(u32data[offset / 4], 0, 0, 0); + case 3: + return uvec4(u64data[offset / 8], 0, 0); + case 4: + return u128data[offset / 16]; + } + return uvec4(0); +} + +void main() { + uvec3 pos = gl_GlobalInvocationID + origin; + pos.x <<= bytes_per_block_log2; + + // Read as soon as possible due to its latency + const uint swizzle = SwizzleOffset(pos.xy); + + const uint block_y = pos.y >> GOB_SIZE_Y_SHIFT; + + uint offset = 0; + offset += pos.z * layer_stride; + offset += (block_y >> block_height) * block_size; + offset += (block_y & block_height_mask) << GOB_SIZE_SHIFT; + offset += (pos.x >> GOB_SIZE_X_SHIFT) << x_shift; + offset += swizzle; + + const uvec4 texel = ReadTexel(offset); + const ivec3 coord = ivec3(gl_GlobalInvocationID) + destination; + imageStore(output_image, coord, texel); +} diff --git a/src/video_core/host_shaders/block_linear_unswizzle_3d.comp b/src/video_core/host_shaders/block_linear_unswizzle_3d.comp new file mode 100644 index 000000000..bb6872e6b --- /dev/null +++ b/src/video_core/host_shaders/block_linear_unswizzle_3d.comp @@ -0,0 +1,125 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#version 430 + +#ifdef VULKAN + +#extension GL_EXT_shader_16bit_storage : require +#extension GL_EXT_shader_8bit_storage : require +#define HAS_EXTENDED_TYPES 1 +#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants { +#define END_PUSH_CONSTANTS }; +#define UNIFORM(n) +#define BINDING_SWIZZLE_BUFFER 0 +#define BINDING_INPUT_BUFFER 1 +#define BINDING_OUTPUT_IMAGE 2 + +#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv + +#extension GL_NV_gpu_shader5 : enable +#ifdef GL_NV_gpu_shader5 +#define HAS_EXTENDED_TYPES 1 +#else +#define HAS_EXTENDED_TYPES 0 +#endif +#define BEGIN_PUSH_CONSTANTS +#define END_PUSH_CONSTANTS +#define UNIFORM(n) layout (location = n) uniform +#define BINDING_SWIZZLE_BUFFER 0 +#define BINDING_INPUT_BUFFER 1 +#define BINDING_OUTPUT_IMAGE 0 + +#endif + +BEGIN_PUSH_CONSTANTS +UNIFORM(0) uvec3 origin; +UNIFORM(1) ivec3 destination; +UNIFORM(2) uint bytes_per_block_log2; +UNIFORM(3) uint slice_size; +UNIFORM(4) uint block_size; +UNIFORM(5) uint x_shift; +UNIFORM(6) uint block_height; +UNIFORM(7) uint block_height_mask; +UNIFORM(8) uint block_depth; +UNIFORM(9) uint block_depth_mask; +END_PUSH_CONSTANTS + +layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable { + uint swizzle_table[]; +}; + +#if HAS_EXTENDED_TYPES +layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU8 { uint8_t u8data[]; }; +layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU16 { uint16_t u16data[]; }; +#endif +layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU32 { uint u32data[]; }; +layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU64 { uvec2 u64data[]; }; +layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU128 { uvec4 u128data[]; }; + +layout(binding = BINDING_OUTPUT_IMAGE) uniform writeonly uimage3D output_image; + +layout(local_size_x = 16, local_size_y = 8, local_size_z = 8) in; + +const uint GOB_SIZE_X = 64; +const uint GOB_SIZE_Y = 8; +const uint GOB_SIZE_Z = 1; +const uint GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z; + +const uint GOB_SIZE_X_SHIFT = 6; +const uint GOB_SIZE_Y_SHIFT = 3; +const uint GOB_SIZE_Z_SHIFT = 0; +const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT; + +const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1, GOB_SIZE_Y - 1); + +uint SwizzleOffset(uvec2 pos) { + pos = pos & SWIZZLE_MASK; + return swizzle_table[pos.y * 64 + pos.x]; +} + +uvec4 ReadTexel(uint offset) { + switch (bytes_per_block_log2) { +#if HAS_EXTENDED_TYPES + case 0: + return uvec4(u8data[offset], 0, 0, 0); + case 1: + return uvec4(u16data[offset / 2], 0, 0, 0); +#else + case 0: + return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 24), 8), 0, 0, 0); + case 1: + return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 16), 16), 0, 0, 0); +#endif + case 2: + return uvec4(u32data[offset / 4], 0, 0, 0); + case 3: + return uvec4(u64data[offset / 8], 0, 0); + case 4: + return u128data[offset / 16]; + } + return uvec4(0); +} + +void main() { + uvec3 pos = gl_GlobalInvocationID + origin; + pos.x <<= bytes_per_block_log2; + + // Read as soon as possible due to its latency + const uint swizzle = SwizzleOffset(pos.xy); + + const uint block_y = pos.y >> GOB_SIZE_Y_SHIFT; + + uint offset = 0; + offset += (pos.z >> block_depth) * slice_size; + offset += (pos.z & block_depth_mask) << (GOB_SIZE_SHIFT + block_height); + offset += (block_y >> block_height) * block_size; + offset += (block_y & block_height_mask) << GOB_SIZE_SHIFT; + offset += (pos.x >> GOB_SIZE_X_SHIFT) << x_shift; + offset += swizzle; + + const uvec4 texel = ReadTexel(offset); + const ivec3 coord = ivec3(gl_GlobalInvocationID) + destination; + imageStore(output_image, coord, texel); +} diff --git a/src/video_core/host_shaders/convert_depth_to_float.frag b/src/video_core/host_shaders/convert_depth_to_float.frag new file mode 100644 index 000000000..624c58509 --- /dev/null +++ b/src/video_core/host_shaders/convert_depth_to_float.frag @@ -0,0 +1,13 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#version 450 + +layout(binding = 0) uniform sampler2D depth_texture; +layout(location = 0) out float output_color; + +void main() { + ivec2 coord = ivec2(gl_FragCoord.xy); + output_color = texelFetch(depth_texture, coord, 0).r; +} diff --git a/src/video_core/host_shaders/convert_float_to_depth.frag b/src/video_core/host_shaders/convert_float_to_depth.frag new file mode 100644 index 000000000..d86c795f4 --- /dev/null +++ b/src/video_core/host_shaders/convert_float_to_depth.frag @@ -0,0 +1,13 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#version 450 + +layout(binding = 0) uniform sampler2D color_texture; + +void main() { + ivec2 coord = ivec2(gl_FragCoord.xy); + float color = texelFetch(color_texture, coord, 0).r; + gl_FragDepth = color; +} diff --git a/src/video_core/host_shaders/full_screen_triangle.vert b/src/video_core/host_shaders/full_screen_triangle.vert new file mode 100644 index 000000000..452ad6502 --- /dev/null +++ b/src/video_core/host_shaders/full_screen_triangle.vert @@ -0,0 +1,29 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#version 450 + +#ifdef VULKAN +#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants { +#define END_PUSH_CONSTANTS }; +#define UNIFORM(n) +#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv +#define BEGIN_PUSH_CONSTANTS +#define END_PUSH_CONSTANTS +#define UNIFORM(n) layout (location = n) uniform +#endif + +BEGIN_PUSH_CONSTANTS +UNIFORM(0) vec2 tex_scale; +UNIFORM(1) vec2 tex_offset; +END_PUSH_CONSTANTS + +layout(location = 0) out vec2 texcoord; + +void main() { + float x = float((gl_VertexIndex & 1) << 2); + float y = float((gl_VertexIndex & 2) << 1); + gl_Position = vec4(x - 1.0, y - 1.0, 0.0, 1.0); + texcoord = fma(vec2(x, y) / 2.0, tex_scale, tex_offset); +} diff --git a/src/video_core/host_shaders/opengl_copy_bc4.comp b/src/video_core/host_shaders/opengl_copy_bc4.comp new file mode 100644 index 000000000..7b8e20fbe --- /dev/null +++ b/src/video_core/host_shaders/opengl_copy_bc4.comp @@ -0,0 +1,70 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#version 430 core +#extension GL_ARB_gpu_shader_int64 : require + +layout (local_size_x = 4, local_size_y = 4) in; + +layout(binding = 0, rg32ui) readonly uniform uimage3D bc4_input; +layout(binding = 1, rgba8ui) writeonly uniform uimage3D bc4_output; + +layout(location = 0) uniform uvec3 src_offset; +layout(location = 1) uniform uvec3 dst_offset; + +// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_compression_rgtc.txt +uint DecompressBlock(uint64_t bits, uvec2 coord) { + const uint code_offset = 16 + 3 * (4 * coord.y + coord.x); + const uint code = uint(bits >> code_offset) & 7; + const uint red0 = uint(bits >> 0) & 0xff; + const uint red1 = uint(bits >> 8) & 0xff; + if (red0 > red1) { + switch (code) { + case 0: + return red0; + case 1: + return red1; + case 2: + return (6 * red0 + 1 * red1) / 7; + case 3: + return (5 * red0 + 2 * red1) / 7; + case 4: + return (4 * red0 + 3 * red1) / 7; + case 5: + return (3 * red0 + 4 * red1) / 7; + case 6: + return (2 * red0 + 5 * red1) / 7; + case 7: + return (1 * red0 + 6 * red1) / 7; + } + } else { + switch (code) { + case 0: + return red0; + case 1: + return red1; + case 2: + return (4 * red0 + 1 * red1) / 5; + case 3: + return (3 * red0 + 2 * red1) / 5; + case 4: + return (2 * red0 + 3 * red1) / 5; + case 5: + return (1 * red0 + 4 * red1) / 5; + case 6: + return 0; + case 7: + return 0xff; + } + } + return 0; +} + +void main() { + uvec2 packed_bits = imageLoad(bc4_input, ivec3(gl_WorkGroupID + src_offset)).rg; + uint64_t bits = packUint2x32(packed_bits); + uint red = DecompressBlock(bits, gl_LocalInvocationID.xy); + uvec4 color = uvec4(red & 0xff, 0, 0, 0xff); + imageStore(bc4_output, ivec3(gl_GlobalInvocationID + dst_offset), color); +} diff --git a/src/video_core/host_shaders/opengl_present.frag b/src/video_core/host_shaders/opengl_present.frag index 8a4cb024b..84b818227 100644 --- a/src/video_core/host_shaders/opengl_present.frag +++ b/src/video_core/host_shaders/opengl_present.frag @@ -1,3 +1,7 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + #version 430 core layout (location = 0) in vec2 frag_tex_coord; diff --git a/src/video_core/host_shaders/opengl_present.vert b/src/video_core/host_shaders/opengl_present.vert index 2235d31a4..c3b5adbba 100644 --- a/src/video_core/host_shaders/opengl_present.vert +++ b/src/video_core/host_shaders/opengl_present.vert @@ -1,3 +1,7 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + #version 430 core out gl_PerVertex { diff --git a/src/video_core/host_shaders/pitch_unswizzle.comp b/src/video_core/host_shaders/pitch_unswizzle.comp new file mode 100644 index 000000000..cb48ec170 --- /dev/null +++ b/src/video_core/host_shaders/pitch_unswizzle.comp @@ -0,0 +1,86 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#version 430 + +#ifdef VULKAN + +#extension GL_EXT_shader_16bit_storage : require +#extension GL_EXT_shader_8bit_storage : require +#define HAS_EXTENDED_TYPES 1 +#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants { +#define END_PUSH_CONSTANTS }; +#define UNIFORM(n) +#define BINDING_INPUT_BUFFER 0 +#define BINDING_OUTPUT_IMAGE 1 + +#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv + +#extension GL_NV_gpu_shader5 : enable +#ifdef GL_NV_gpu_shader5 +#define HAS_EXTENDED_TYPES 1 +#else +#define HAS_EXTENDED_TYPES 0 +#endif +#define BEGIN_PUSH_CONSTANTS +#define END_PUSH_CONSTANTS +#define UNIFORM(n) layout (location = n) uniform +#define BINDING_INPUT_BUFFER 0 +#define BINDING_OUTPUT_IMAGE 0 + +#endif + +BEGIN_PUSH_CONSTANTS +UNIFORM(0) uvec2 origin; +UNIFORM(1) ivec2 destination; +UNIFORM(2) uint bytes_per_block; +UNIFORM(3) uint pitch; +END_PUSH_CONSTANTS + +#if HAS_EXTENDED_TYPES +layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU8 { uint8_t u8data[]; }; +layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU16 { uint16_t u16data[]; }; +#endif +layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU32 { uint u32data[]; }; +layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU64 { uvec2 u64data[]; }; +layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU128 { uvec4 u128data[]; }; + +layout(binding = BINDING_OUTPUT_IMAGE) writeonly uniform uimage2D output_image; + +layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in; + +uvec4 ReadTexel(uint offset) { + switch (bytes_per_block) { +#if HAS_EXTENDED_TYPES + case 1: + return uvec4(u8data[offset], 0, 0, 0); + case 2: + return uvec4(u16data[offset / 2], 0, 0, 0); +#else + case 1: + return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 24), 8), 0, 0, 0); + case 2: + return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 16), 16), 0, 0, 0); +#endif + case 4: + return uvec4(u32data[offset / 4], 0, 0, 0); + case 8: + return uvec4(u64data[offset / 8], 0, 0); + case 16: + return u128data[offset / 16]; + } + return uvec4(0); +} + +void main() { + uvec2 pos = gl_GlobalInvocationID.xy + origin; + + uint offset = 0; + offset += pos.x * bytes_per_block; + offset += pos.y * pitch; + + const uvec4 texel = ReadTexel(offset); + const ivec2 coord = ivec2(gl_GlobalInvocationID.xy) + destination; + imageStore(output_image, coord, texel); +} diff --git a/src/video_core/host_shaders/vulkan_blit_color_float.frag b/src/video_core/host_shaders/vulkan_blit_color_float.frag new file mode 100644 index 000000000..4a6aae410 --- /dev/null +++ b/src/video_core/host_shaders/vulkan_blit_color_float.frag @@ -0,0 +1,14 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#version 450 + +layout(binding = 0) uniform sampler2D tex; + +layout(location = 0) in vec2 texcoord; +layout(location = 0) out vec4 color; + +void main() { + color = textureLod(tex, texcoord, 0); +} diff --git a/src/video_core/host_shaders/vulkan_blit_depth_stencil.frag b/src/video_core/host_shaders/vulkan_blit_depth_stencil.frag new file mode 100644 index 000000000..19bb23a5a --- /dev/null +++ b/src/video_core/host_shaders/vulkan_blit_depth_stencil.frag @@ -0,0 +1,16 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#version 450 +#extension GL_ARB_shader_stencil_export : require + +layout(binding = 0) uniform sampler2D depth_tex; +layout(binding = 1) uniform isampler2D stencil_tex; + +layout(location = 0) in vec2 texcoord; + +void main() { + gl_FragDepth = textureLod(depth_tex, texcoord, 0).r; + gl_FragStencilRefARB = textureLod(stencil_tex, texcoord, 0).r; +} diff --git a/src/video_core/renderer_vulkan/shaders/blit.frag b/src/video_core/host_shaders/vulkan_present.frag index a06ecd24a..0979ff3e6 100644 --- a/src/video_core/renderer_vulkan/shaders/blit.frag +++ b/src/video_core/host_shaders/vulkan_present.frag @@ -2,15 +2,6 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -/* - * Build instructions: - * $ glslangValidator -V $THIS_FILE -o output.spv - * $ spirv-opt -O --strip-debug output.spv -o optimized.spv - * $ xxd -i optimized.spv - * - * Then copy that bytecode to the C++ file - */ - #version 460 core layout (location = 0) in vec2 frag_tex_coord; diff --git a/src/video_core/renderer_vulkan/shaders/blit.vert b/src/video_core/host_shaders/vulkan_present.vert index c64d9235a..00b868958 100644 --- a/src/video_core/renderer_vulkan/shaders/blit.vert +++ b/src/video_core/host_shaders/vulkan_present.vert @@ -2,15 +2,6 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -/* - * Build instructions: - * $ glslangValidator -V $THIS_FILE -o output.spv - * $ spirv-opt -O --strip-debug output.spv -o optimized.spv - * $ xxd -i optimized.spv - * - * Then copy that bytecode to the C++ file - */ - #version 460 core layout (location = 0) in vec2 vert_position; diff --git a/src/video_core/renderer_vulkan/shaders/quad_indexed.comp b/src/video_core/host_shaders/vulkan_quad_indexed.comp index 5a472ba9b..8655591d0 100644 --- a/src/video_core/renderer_vulkan/shaders/quad_indexed.comp +++ b/src/video_core/host_shaders/vulkan_quad_indexed.comp @@ -2,15 +2,6 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -/* - * Build instructions: - * $ glslangValidator -V quad_indexed.comp -o output.spv - * $ spirv-opt -O --strip-debug output.spv -o optimized.spv - * $ xxd -i optimized.spv - * - * Then copy that bytecode to the C++ file - */ - #version 460 core layout (local_size_x = 1024) in; diff --git a/src/video_core/renderer_vulkan/shaders/uint8.comp b/src/video_core/host_shaders/vulkan_uint8.comp index a320f3ae0..872291670 100644 --- a/src/video_core/renderer_vulkan/shaders/uint8.comp +++ b/src/video_core/host_shaders/vulkan_uint8.comp @@ -2,15 +2,6 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -/* - * Build instructions: - * $ glslangValidator -V $THIS_FILE -o output.spv - * $ spirv-opt -O --strip-debug output.spv -o optimized.spv - * $ xxd -i optimized.spv - * - * Then copy that bytecode to the C++ file - */ - #version 460 core #extension GL_EXT_shader_16bit_storage : require #extension GL_EXT_shader_8bit_storage : require @@ -25,9 +16,16 @@ layout (std430, set = 0, binding = 1) writeonly buffer OutputBuffer { uint16_t output_indexes[]; }; +uint AssembleIndex(uint id) { + // Most primitive restart indices are 0xFF + // Hardcode this to 0xFF for now + uint index = uint(input_indexes[id]); + return index == 0xFF ? 0xFFFF : index; +} + void main() { uint id = gl_GlobalInvocationID.x; if (id < input_indexes.length()) { - output_indexes[id] = uint16_t(input_indexes[id]); + output_indexes[id] = uint16_t(AssembleIndex(id)); } } diff --git a/src/video_core/macro/macro.cpp b/src/video_core/macro/macro.cpp index a50e7b4e0..cd21a2112 100644 --- a/src/video_core/macro/macro.cpp +++ b/src/video_core/macro/macro.cpp @@ -36,7 +36,7 @@ void MacroEngine::Execute(Engines::Maxwell3D& maxwell3d, u32 method, } } else { // Macro not compiled, check if it's uploaded and if so, compile it - std::optional<u32> mid_method = std::nullopt; + std::optional<u32> mid_method; const auto macro_code = uploaded_macro_code.find(method); if (macro_code == uploaded_macro_code.end()) { for (const auto& [method_base, code] : uploaded_macro_code) { diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp index df00b57df..70ac7c620 100644 --- a/src/video_core/macro/macro_hle.cpp +++ b/src/video_core/macro/macro_hle.cpp @@ -85,7 +85,7 @@ constexpr std::array<std::pair<u64, HLEFunction>, 3> hle_funcs{{ {0x0217920100488FF7, &HLE_0217920100488FF7}, }}; -HLEMacro::HLEMacro(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {} +HLEMacro::HLEMacro(Engines::Maxwell3D& maxwell3d_) : maxwell3d{maxwell3d_} {} HLEMacro::~HLEMacro() = default; std::optional<std::unique_ptr<CachedMacro>> HLEMacro::GetHLEProgram(u64 hash) const { @@ -99,8 +99,8 @@ std::optional<std::unique_ptr<CachedMacro>> HLEMacro::GetHLEProgram(u64 hash) co HLEMacroImpl::~HLEMacroImpl() = default; -HLEMacroImpl::HLEMacroImpl(Engines::Maxwell3D& maxwell3d, HLEFunction func) - : maxwell3d(maxwell3d), func(func) {} +HLEMacroImpl::HLEMacroImpl(Engines::Maxwell3D& maxwell3d_, HLEFunction func_) + : maxwell3d{maxwell3d_}, func{func_} {} void HLEMacroImpl::Execute(const std::vector<u32>& parameters, u32 method) { func(maxwell3d, parameters); diff --git a/src/video_core/macro/macro_hle.h b/src/video_core/macro/macro_hle.h index 37af875a0..cb3bd1600 100644 --- a/src/video_core/macro/macro_hle.h +++ b/src/video_core/macro/macro_hle.h @@ -20,7 +20,7 @@ using HLEFunction = void (*)(Engines::Maxwell3D& maxwell3d, const std::vector<u3 class HLEMacro { public: - explicit HLEMacro(Engines::Maxwell3D& maxwell3d); + explicit HLEMacro(Engines::Maxwell3D& maxwell3d_); ~HLEMacro(); std::optional<std::unique_ptr<CachedMacro>> GetHLEProgram(u64 hash) const; diff --git a/src/video_core/macro/macro_interpreter.cpp b/src/video_core/macro/macro_interpreter.cpp index bd01fd1f2..8da26fd59 100644 --- a/src/video_core/macro/macro_interpreter.cpp +++ b/src/video_core/macro/macro_interpreter.cpp @@ -11,29 +11,29 @@ MICROPROFILE_DEFINE(MacroInterp, "GPU", "Execute macro interpreter", MP_RGB(128, 128, 192)); namespace Tegra { -MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) - : MacroEngine::MacroEngine(maxwell3d), maxwell3d(maxwell3d) {} +MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d_) + : MacroEngine{maxwell3d_}, maxwell3d{maxwell3d_} {} std::unique_ptr<CachedMacro> MacroInterpreter::Compile(const std::vector<u32>& code) { return std::make_unique<MacroInterpreterImpl>(maxwell3d, code); } -MacroInterpreterImpl::MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d, - const std::vector<u32>& code) - : maxwell3d(maxwell3d), code(code) {} +MacroInterpreterImpl::MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d_, + const std::vector<u32>& code_) + : maxwell3d{maxwell3d_}, code{code_} {} -void MacroInterpreterImpl::Execute(const std::vector<u32>& parameters, u32 method) { +void MacroInterpreterImpl::Execute(const std::vector<u32>& params, u32 method) { MICROPROFILE_SCOPE(MacroInterp); Reset(); - registers[1] = parameters[0]; - num_parameters = parameters.size(); + registers[1] = params[0]; + num_parameters = params.size(); if (num_parameters > parameters_capacity) { parameters_capacity = num_parameters; - this->parameters = std::make_unique<u32[]>(num_parameters); + parameters = std::make_unique<u32[]>(num_parameters); } - std::memcpy(this->parameters.get(), parameters.data(), num_parameters * sizeof(u32)); + std::memcpy(parameters.get(), params.data(), num_parameters * sizeof(u32)); // Execute the code until we hit an exit condition. bool keep_executing = true; @@ -133,8 +133,7 @@ bool MacroInterpreterImpl::Step(bool is_delay_slot) { break; } default: - UNIMPLEMENTED_MSG("Unimplemented macro operation {}", - static_cast<u32>(opcode.operation.Value())); + UNIMPLEMENTED_MSG("Unimplemented macro operation {}", opcode.operation.Value()); } // An instruction with the Exit flag will not actually @@ -182,7 +181,7 @@ u32 MacroInterpreterImpl::GetALUResult(Macro::ALUOperation operation, u32 src_a, return ~(src_a & src_b); default: - UNIMPLEMENTED_MSG("Unimplemented ALU operation {}", static_cast<u32>(operation)); + UNIMPLEMENTED_MSG("Unimplemented ALU operation {}", operation); return 0; } } @@ -230,7 +229,7 @@ void MacroInterpreterImpl::ProcessResult(Macro::ResultOperation operation, u32 r Send((result >> 12) & 0b111111); break; default: - UNIMPLEMENTED_MSG("Unimplemented result operation {}", static_cast<u32>(operation)); + UNIMPLEMENTED_MSG("Unimplemented result operation {}", operation); } } diff --git a/src/video_core/macro/macro_interpreter.h b/src/video_core/macro/macro_interpreter.h index 90217fc89..d50c619ce 100644 --- a/src/video_core/macro/macro_interpreter.h +++ b/src/video_core/macro/macro_interpreter.h @@ -17,7 +17,7 @@ class Maxwell3D; class MacroInterpreter final : public MacroEngine { public: - explicit MacroInterpreter(Engines::Maxwell3D& maxwell3d); + explicit MacroInterpreter(Engines::Maxwell3D& maxwell3d_); protected: std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override; @@ -28,8 +28,8 @@ private: class MacroInterpreterImpl : public CachedMacro { public: - MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code); - void Execute(const std::vector<u32>& parameters, u32 method) override; + explicit MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d_, const std::vector<u32>& code_); + void Execute(const std::vector<u32>& params, u32 method) override; private: /// Resets the execution engine state, zeroing registers, etc. @@ -38,9 +38,9 @@ private: /** * Executes a single macro instruction located at the current program counter. Returns whether * the interpreter should keep running. - * @param offset Offset to start execution at. + * * @param is_delay_slot Whether the current step is being executed due to a delay slot in a - * previous instruction. + * previous instruction. */ bool Step(bool is_delay_slot); diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp index c1b9e4ad9..c6b2b2109 100644 --- a/src/video_core/macro/macro_jit_x64.cpp +++ b/src/video_core/macro/macro_jit_x64.cpp @@ -14,11 +14,11 @@ MICROPROFILE_DEFINE(MacroJitCompile, "GPU", "Compile macro JIT", MP_RGB(173, 255 MICROPROFILE_DEFINE(MacroJitExecute, "GPU", "Execute macro JIT", MP_RGB(255, 255, 0)); namespace Tegra { -static const Xbyak::Reg64 STATE = Xbyak::util::rbx; -static const Xbyak::Reg32 RESULT = Xbyak::util::ebp; -static const Xbyak::Reg64 PARAMETERS = Xbyak::util::r12; -static const Xbyak::Reg32 METHOD_ADDRESS = Xbyak::util::r14d; -static const Xbyak::Reg64 BRANCH_HOLDER = Xbyak::util::r15; +constexpr Xbyak::Reg64 STATE = Xbyak::util::rbx; +constexpr Xbyak::Reg32 RESULT = Xbyak::util::ebp; +constexpr Xbyak::Reg64 PARAMETERS = Xbyak::util::r12; +constexpr Xbyak::Reg32 METHOD_ADDRESS = Xbyak::util::r14d; +constexpr Xbyak::Reg64 BRANCH_HOLDER = Xbyak::util::r15; static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({ STATE, @@ -28,15 +28,15 @@ static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({ BRANCH_HOLDER, }); -MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d) - : MacroEngine::MacroEngine(maxwell3d), maxwell3d(maxwell3d) {} +MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d_) + : MacroEngine{maxwell3d_}, maxwell3d{maxwell3d_} {} std::unique_ptr<CachedMacro> MacroJITx64::Compile(const std::vector<u32>& code) { return std::make_unique<MacroJITx64Impl>(maxwell3d, code); } -MacroJITx64Impl::MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code) - : Xbyak::CodeGenerator(MAX_CODE_SIZE), code(code), maxwell3d(maxwell3d) { +MacroJITx64Impl::MacroJITx64Impl(Engines::Maxwell3D& maxwell3d_, const std::vector<u32>& code_) + : CodeGenerator{MAX_CODE_SIZE}, code{code_}, maxwell3d{maxwell3d_} { Compile(); } @@ -165,8 +165,7 @@ void MacroJITx64Impl::Compile_ALU(Macro::Opcode opcode) { } break; default: - UNIMPLEMENTED_MSG("Unimplemented ALU operation {}", - static_cast<std::size_t>(opcode.alu_operation.Value())); + UNIMPLEMENTED_MSG("Unimplemented ALU operation {}", opcode.alu_operation.Value()); break; } Compile_ProcessResult(opcode.result_operation, opcode.dst); @@ -553,15 +552,15 @@ Xbyak::Reg32 MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg32 dst) { } void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u32 reg) { - const auto SetRegister = [this](u32 reg, const Xbyak::Reg32& result) { + const auto SetRegister = [this](u32 reg_index, const Xbyak::Reg32& result) { // Register 0 is supposed to always return 0. NOP is implemented as a store to the zero // register. - if (reg == 0) { + if (reg_index == 0) { return; } - mov(dword[STATE + offsetof(JITState, registers) + reg * sizeof(u32)], result); + mov(dword[STATE + offsetof(JITState, registers) + reg_index * sizeof(u32)], result); }; - const auto SetMethodAddress = [this](const Xbyak::Reg32& reg) { mov(METHOD_ADDRESS, reg); }; + const auto SetMethodAddress = [this](const Xbyak::Reg32& reg32) { mov(METHOD_ADDRESS, reg32); }; switch (operation) { case Macro::ResultOperation::IgnoreAndFetch: @@ -604,7 +603,7 @@ void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u3 Compile_Send(RESULT); break; default: - UNIMPLEMENTED_MSG("Unimplemented macro operation {}", static_cast<std::size_t>(operation)); + UNIMPLEMENTED_MSG("Unimplemented macro operation {}", operation); } } diff --git a/src/video_core/macro/macro_jit_x64.h b/src/video_core/macro/macro_jit_x64.h index a180e7428..7f50ac2f8 100644 --- a/src/video_core/macro/macro_jit_x64.h +++ b/src/video_core/macro/macro_jit_x64.h @@ -23,7 +23,7 @@ constexpr size_t MAX_CODE_SIZE = 0x10000; class MacroJITx64 final : public MacroEngine { public: - explicit MacroJITx64(Engines::Maxwell3D& maxwell3d); + explicit MacroJITx64(Engines::Maxwell3D& maxwell3d_); protected: std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override; @@ -34,7 +34,7 @@ private: class MacroJITx64Impl : public Xbyak::CodeGenerator, public CachedMacro { public: - MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code); + explicit MacroJITx64Impl(Engines::Maxwell3D& maxwell3d_, const std::vector<u32>& code_); ~MacroJITx64Impl(); void Execute(const std::vector<u32>& parameters, u32 method) override; diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp index 16b2aaa27..44240a9c4 100644 --- a/src/video_core/memory_manager.cpp +++ b/src/video_core/memory_manager.cpp @@ -4,6 +4,7 @@ #include "common/alignment.h" #include "common/assert.h" +#include "common/logging/log.h" #include "core/core.h" #include "core/hle/kernel/memory/page_table.h" #include "core/hle/kernel/process.h" @@ -11,6 +12,7 @@ #include "video_core/gpu.h" #include "video_core/memory_manager.h" #include "video_core/rasterizer_interface.h" +#include "video_core/renderer_base.h" namespace Tegra { @@ -19,8 +21,8 @@ MemoryManager::MemoryManager(Core::System& system_) MemoryManager::~MemoryManager() = default; -void MemoryManager::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) { - rasterizer = &rasterizer_; +void MemoryManager::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) { + rasterizer = rasterizer_; } GPUVAddr MemoryManager::UpdateRange(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size) { @@ -37,6 +39,12 @@ GPUVAddr MemoryManager::UpdateRange(GPUVAddr gpu_addr, PageEntry page_entry, std } GPUVAddr MemoryManager::Map(VAddr cpu_addr, GPUVAddr gpu_addr, std::size_t size) { + const auto it = std::ranges::lower_bound(map_ranges, gpu_addr, {}, &MapRange::first); + if (it != map_ranges.end() && it->first == gpu_addr) { + it->second = size; + } else { + map_ranges.insert(it, MapRange{gpu_addr, size}); + } return UpdateRange(gpu_addr, cpu_addr, size); } @@ -44,13 +52,28 @@ GPUVAddr MemoryManager::MapAllocate(VAddr cpu_addr, std::size_t size, std::size_ return Map(cpu_addr, *FindFreeRange(size, align), size); } +GPUVAddr MemoryManager::MapAllocate32(VAddr cpu_addr, std::size_t size) { + const std::optional<GPUVAddr> gpu_addr = FindFreeRange(size, 1, true); + ASSERT(gpu_addr); + return Map(cpu_addr, *gpu_addr, size); +} + void MemoryManager::Unmap(GPUVAddr gpu_addr, std::size_t size) { - if (!size) { + if (size == 0) { return; } - + const auto it = std::ranges::lower_bound(map_ranges, gpu_addr, {}, &MapRange::first); + if (it != map_ranges.end()) { + ASSERT(it->first == gpu_addr); + map_ranges.erase(it); + } else { + UNREACHABLE_MSG("Unmapping non-existent GPU address=0x{:x}", gpu_addr); + } // Flush and invalidate through the GPU interface, to be asynchronous if possible. - system.GPU().FlushAndInvalidateRegion(*GpuToCpuAddress(gpu_addr), size); + const std::optional<VAddr> cpu_addr = GpuToCpuAddress(gpu_addr); + ASSERT(cpu_addr); + + rasterizer->UnmapMemory(*cpu_addr, size); UpdateRange(gpu_addr, PageEntry::State::Unmapped, size); } @@ -58,7 +81,7 @@ void MemoryManager::Unmap(GPUVAddr gpu_addr, std::size_t size) { std::optional<GPUVAddr> MemoryManager::AllocateFixed(GPUVAddr gpu_addr, std::size_t size) { for (u64 offset{}; offset < size; offset += page_size) { if (!GetPageEntry(gpu_addr + offset).IsUnmapped()) { - return {}; + return std::nullopt; } } @@ -108,7 +131,8 @@ void MemoryManager::SetPageEntry(GPUVAddr gpu_addr, PageEntry page_entry, std::s page_table[PageEntryIndex(gpu_addr)] = page_entry; } -std::optional<GPUVAddr> MemoryManager::FindFreeRange(std::size_t size, std::size_t align) const { +std::optional<GPUVAddr> MemoryManager::FindFreeRange(std::size_t size, std::size_t align, + bool start_32bit_address) const { if (!align) { align = page_size; } else { @@ -116,7 +140,7 @@ std::optional<GPUVAddr> MemoryManager::FindFreeRange(std::size_t size, std::size } u64 available_size{}; - GPUVAddr gpu_addr{address_space_start}; + GPUVAddr gpu_addr{start_32bit_address ? address_space_start_low : address_space_start}; while (gpu_addr + available_size < address_space_size) { if (GetPageEntry(gpu_addr + available_size).IsUnmapped()) { available_size += page_size; @@ -135,13 +159,13 @@ std::optional<GPUVAddr> MemoryManager::FindFreeRange(std::size_t size, std::size } } - return {}; + return std::nullopt; } std::optional<VAddr> MemoryManager::GpuToCpuAddress(GPUVAddr gpu_addr) const { const auto page_entry{GetPageEntry(gpu_addr)}; if (!page_entry.IsValid()) { - return {}; + return std::nullopt; } return page_entry.ToAddress() + (gpu_addr & page_mask); @@ -207,6 +231,12 @@ const u8* MemoryManager::GetPointer(GPUVAddr gpu_addr) const { return system.Memory().GetPointer(*address); } +size_t MemoryManager::BytesToMapEnd(GPUVAddr gpu_addr) const noexcept { + auto it = std::ranges::upper_bound(map_ranges, gpu_addr, {}, &MapRange::first); + --it; + return it->second - (gpu_addr - it->first); +} + void MemoryManager::ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const { std::size_t remaining_size{size}; std::size_t page_index{gpu_src_addr >> page_bits}; @@ -303,17 +333,29 @@ void MemoryManager::WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buf } } +void MemoryManager::FlushRegion(GPUVAddr gpu_addr, size_t size) const { + size_t remaining_size{size}; + size_t page_index{gpu_addr >> page_bits}; + size_t page_offset{gpu_addr & page_mask}; + while (remaining_size > 0) { + const size_t num_bytes{std::min(page_size - page_offset, remaining_size)}; + if (const auto page_addr{GpuToCpuAddress(page_index << page_bits)}; page_addr) { + rasterizer->FlushRegion(*page_addr + page_offset, num_bytes); + } + ++page_index; + page_offset = 0; + remaining_size -= num_bytes; + } +} + void MemoryManager::CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size) { std::vector<u8> tmp_buffer(size); ReadBlock(gpu_src_addr, tmp_buffer.data(), size); - WriteBlock(gpu_dest_addr, tmp_buffer.data(), size); -} -void MemoryManager::CopyBlockUnsafe(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, - std::size_t size) { - std::vector<u8> tmp_buffer(size); - ReadBlockUnsafe(gpu_src_addr, tmp_buffer.data(), size); - WriteBlockUnsafe(gpu_dest_addr, tmp_buffer.data(), size); + // The output block must be flushed in case it has data modified from the GPU. + // Fixes NPC geometry in Zombie Panic in Wonderland DX + FlushRegion(gpu_dest_addr, size); + WriteBlock(gpu_dest_addr, tmp_buffer.data(), size); } bool MemoryManager::IsGranularRange(GPUVAddr gpu_addr, std::size_t size) const { diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index 53c8d122a..b3538d503 100644 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -28,7 +28,7 @@ public: }; constexpr PageEntry() = default; - constexpr PageEntry(State state) : state{state} {} + constexpr PageEntry(State state_) : state{state_} {} constexpr PageEntry(VAddr addr) : state{static_cast<State>(addr >> ShiftBits)} {} [[nodiscard]] constexpr bool IsUnmapped() const { @@ -68,11 +68,11 @@ static_assert(sizeof(PageEntry) == 4, "PageEntry is too large"); class MemoryManager final { public: - explicit MemoryManager(Core::System& system); + explicit MemoryManager(Core::System& system_); ~MemoryManager(); /// Binds a renderer to the memory manager. - void BindRasterizer(VideoCore::RasterizerInterface& rasterizer); + void BindRasterizer(VideoCore::RasterizerInterface* rasterizer); [[nodiscard]] std::optional<VAddr> GpuToCpuAddress(GPUVAddr addr) const; @@ -85,6 +85,9 @@ public: [[nodiscard]] u8* GetPointer(GPUVAddr addr); [[nodiscard]] const u8* GetPointer(GPUVAddr addr) const; + /// Returns the number of bytes until the end of the memory map containing the given GPU address + [[nodiscard]] size_t BytesToMapEnd(GPUVAddr gpu_addr) const noexcept; + /** * ReadBlock and WriteBlock are full read and write operations over virtual * GPU Memory. It's important to use these when GPU memory may not be continuous @@ -107,7 +110,6 @@ public: */ void ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const; void WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size); - void CopyBlockUnsafe(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size); /** * IsGranularRange checks if a gpu region can be simply read with a pointer. @@ -116,6 +118,7 @@ public: [[nodiscard]] GPUVAddr Map(VAddr cpu_addr, GPUVAddr gpu_addr, std::size_t size); [[nodiscard]] GPUVAddr MapAllocate(VAddr cpu_addr, std::size_t size, std::size_t align); + [[nodiscard]] GPUVAddr MapAllocate32(VAddr cpu_addr, std::size_t size); [[nodiscard]] std::optional<GPUVAddr> AllocateFixed(GPUVAddr gpu_addr, std::size_t size); [[nodiscard]] GPUVAddr Allocate(std::size_t size, std::size_t align); void Unmap(GPUVAddr gpu_addr, std::size_t size); @@ -124,17 +127,21 @@ private: [[nodiscard]] PageEntry GetPageEntry(GPUVAddr gpu_addr) const; void SetPageEntry(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size = page_size); GPUVAddr UpdateRange(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size); - [[nodiscard]] std::optional<GPUVAddr> FindFreeRange(std::size_t size, std::size_t align) const; + [[nodiscard]] std::optional<GPUVAddr> FindFreeRange(std::size_t size, std::size_t align, + bool start_32bit_address = false) const; void TryLockPage(PageEntry page_entry, std::size_t size); void TryUnlockPage(PageEntry page_entry, std::size_t size); + void FlushRegion(GPUVAddr gpu_addr, size_t size) const; + [[nodiscard]] static constexpr std::size_t PageEntryIndex(GPUVAddr gpu_addr) { return (gpu_addr >> page_bits) & page_table_mask; } static constexpr u64 address_space_size = 1ULL << 40; static constexpr u64 address_space_start = 1ULL << 32; + static constexpr u64 address_space_start_low = 1ULL << 16; static constexpr u64 page_bits{16}; static constexpr u64 page_size{1 << page_bits}; static constexpr u64 page_mask{page_size - 1}; @@ -147,6 +154,11 @@ private: VideoCore::RasterizerInterface* rasterizer = nullptr; std::vector<PageEntry> page_table; + + using MapRange = std::pair<GPUVAddr, size_t>; + std::vector<MapRange> map_ranges; + + std::vector<std::pair<VAddr, std::size_t>> cache_invalidate_queue; }; } // namespace Tegra diff --git a/src/video_core/morton.cpp b/src/video_core/morton.cpp deleted file mode 100644 index 9da9fb4ff..000000000 --- a/src/video_core/morton.cpp +++ /dev/null @@ -1,250 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include <array> -#include <cstring> -#include "common/assert.h" -#include "common/common_types.h" -#include "video_core/morton.h" -#include "video_core/surface.h" -#include "video_core/textures/decoders.h" - -namespace VideoCore { - -using Surface::GetBytesPerPixel; -using Surface::PixelFormat; - -using MortonCopyFn = void (*)(u32, u32, u32, u32, u32, u32, u8*, u8*); -using ConversionArray = std::array<MortonCopyFn, Surface::MaxPixelFormat>; - -template <bool morton_to_linear, PixelFormat format> -static void MortonCopy(u32 stride, u32 block_height, u32 height, u32 block_depth, u32 depth, - u32 tile_width_spacing, u8* buffer, u8* addr) { - constexpr u32 bytes_per_pixel = GetBytesPerPixel(format); - - // With the BCn formats (DXT and DXN), each 4x4 tile is swizzled instead of just individual - // pixel values. - constexpr u32 tile_size_x{GetDefaultBlockWidth(format)}; - constexpr u32 tile_size_y{GetDefaultBlockHeight(format)}; - - if constexpr (morton_to_linear) { - Tegra::Texture::UnswizzleTexture(buffer, addr, tile_size_x, tile_size_y, bytes_per_pixel, - stride, height, depth, block_height, block_depth, - tile_width_spacing); - } else { - Tegra::Texture::CopySwizzledData((stride + tile_size_x - 1) / tile_size_x, - (height + tile_size_y - 1) / tile_size_y, depth, - bytes_per_pixel, bytes_per_pixel, addr, buffer, false, - block_height, block_depth, tile_width_spacing); - } -} - -static constexpr ConversionArray morton_to_linear_fns = { - MortonCopy<true, PixelFormat::A8B8G8R8_UNORM>, - MortonCopy<true, PixelFormat::A8B8G8R8_SNORM>, - MortonCopy<true, PixelFormat::A8B8G8R8_SINT>, - MortonCopy<true, PixelFormat::A8B8G8R8_UINT>, - MortonCopy<true, PixelFormat::R5G6B5_UNORM>, - MortonCopy<true, PixelFormat::B5G6R5_UNORM>, - MortonCopy<true, PixelFormat::A1R5G5B5_UNORM>, - MortonCopy<true, PixelFormat::A2B10G10R10_UNORM>, - MortonCopy<true, PixelFormat::A2B10G10R10_UINT>, - MortonCopy<true, PixelFormat::A1B5G5R5_UNORM>, - MortonCopy<true, PixelFormat::R8_UNORM>, - MortonCopy<true, PixelFormat::R8_SNORM>, - MortonCopy<true, PixelFormat::R8_SINT>, - MortonCopy<true, PixelFormat::R8_UINT>, - MortonCopy<true, PixelFormat::R16G16B16A16_FLOAT>, - MortonCopy<true, PixelFormat::R16G16B16A16_UNORM>, - MortonCopy<true, PixelFormat::R16G16B16A16_SNORM>, - MortonCopy<true, PixelFormat::R16G16B16A16_SINT>, - MortonCopy<true, PixelFormat::R16G16B16A16_UINT>, - MortonCopy<true, PixelFormat::B10G11R11_FLOAT>, - MortonCopy<true, PixelFormat::R32G32B32A32_UINT>, - MortonCopy<true, PixelFormat::BC1_RGBA_UNORM>, - MortonCopy<true, PixelFormat::BC2_UNORM>, - MortonCopy<true, PixelFormat::BC3_UNORM>, - MortonCopy<true, PixelFormat::BC4_UNORM>, - MortonCopy<true, PixelFormat::BC4_SNORM>, - MortonCopy<true, PixelFormat::BC5_UNORM>, - MortonCopy<true, PixelFormat::BC5_SNORM>, - MortonCopy<true, PixelFormat::BC7_UNORM>, - MortonCopy<true, PixelFormat::BC6H_UFLOAT>, - MortonCopy<true, PixelFormat::BC6H_SFLOAT>, - MortonCopy<true, PixelFormat::ASTC_2D_4X4_UNORM>, - MortonCopy<true, PixelFormat::B8G8R8A8_UNORM>, - MortonCopy<true, PixelFormat::R32G32B32A32_FLOAT>, - MortonCopy<true, PixelFormat::R32G32B32A32_SINT>, - MortonCopy<true, PixelFormat::R32G32_FLOAT>, - MortonCopy<true, PixelFormat::R32G32_SINT>, - MortonCopy<true, PixelFormat::R32_FLOAT>, - MortonCopy<true, PixelFormat::R16_FLOAT>, - MortonCopy<true, PixelFormat::R16_UNORM>, - MortonCopy<true, PixelFormat::R16_SNORM>, - MortonCopy<true, PixelFormat::R16_UINT>, - MortonCopy<true, PixelFormat::R16_SINT>, - MortonCopy<true, PixelFormat::R16G16_UNORM>, - MortonCopy<true, PixelFormat::R16G16_FLOAT>, - MortonCopy<true, PixelFormat::R16G16_UINT>, - MortonCopy<true, PixelFormat::R16G16_SINT>, - MortonCopy<true, PixelFormat::R16G16_SNORM>, - MortonCopy<true, PixelFormat::R32G32B32_FLOAT>, - MortonCopy<true, PixelFormat::A8B8G8R8_SRGB>, - MortonCopy<true, PixelFormat::R8G8_UNORM>, - MortonCopy<true, PixelFormat::R8G8_SNORM>, - MortonCopy<true, PixelFormat::R8G8_SINT>, - MortonCopy<true, PixelFormat::R8G8_UINT>, - MortonCopy<true, PixelFormat::R32G32_UINT>, - MortonCopy<true, PixelFormat::R16G16B16X16_FLOAT>, - MortonCopy<true, PixelFormat::R32_UINT>, - MortonCopy<true, PixelFormat::R32_SINT>, - MortonCopy<true, PixelFormat::ASTC_2D_8X8_UNORM>, - MortonCopy<true, PixelFormat::ASTC_2D_8X5_UNORM>, - MortonCopy<true, PixelFormat::ASTC_2D_5X4_UNORM>, - MortonCopy<true, PixelFormat::B8G8R8A8_SRGB>, - MortonCopy<true, PixelFormat::BC1_RGBA_SRGB>, - MortonCopy<true, PixelFormat::BC2_SRGB>, - MortonCopy<true, PixelFormat::BC3_SRGB>, - MortonCopy<true, PixelFormat::BC7_SRGB>, - MortonCopy<true, PixelFormat::A4B4G4R4_UNORM>, - MortonCopy<true, PixelFormat::ASTC_2D_4X4_SRGB>, - MortonCopy<true, PixelFormat::ASTC_2D_8X8_SRGB>, - MortonCopy<true, PixelFormat::ASTC_2D_8X5_SRGB>, - MortonCopy<true, PixelFormat::ASTC_2D_5X4_SRGB>, - MortonCopy<true, PixelFormat::ASTC_2D_5X5_UNORM>, - MortonCopy<true, PixelFormat::ASTC_2D_5X5_SRGB>, - MortonCopy<true, PixelFormat::ASTC_2D_10X8_UNORM>, - MortonCopy<true, PixelFormat::ASTC_2D_10X8_SRGB>, - MortonCopy<true, PixelFormat::ASTC_2D_6X6_UNORM>, - MortonCopy<true, PixelFormat::ASTC_2D_6X6_SRGB>, - MortonCopy<true, PixelFormat::ASTC_2D_10X10_UNORM>, - MortonCopy<true, PixelFormat::ASTC_2D_10X10_SRGB>, - MortonCopy<true, PixelFormat::ASTC_2D_12X12_UNORM>, - MortonCopy<true, PixelFormat::ASTC_2D_12X12_SRGB>, - MortonCopy<true, PixelFormat::ASTC_2D_8X6_UNORM>, - MortonCopy<true, PixelFormat::ASTC_2D_8X6_SRGB>, - MortonCopy<true, PixelFormat::ASTC_2D_6X5_UNORM>, - MortonCopy<true, PixelFormat::ASTC_2D_6X5_SRGB>, - MortonCopy<true, PixelFormat::E5B9G9R9_FLOAT>, - MortonCopy<true, PixelFormat::D32_FLOAT>, - MortonCopy<true, PixelFormat::D16_UNORM>, - MortonCopy<true, PixelFormat::D24_UNORM_S8_UINT>, - MortonCopy<true, PixelFormat::S8_UINT_D24_UNORM>, - MortonCopy<true, PixelFormat::D32_FLOAT_S8_UINT>, -}; - -static constexpr ConversionArray linear_to_morton_fns = { - MortonCopy<false, PixelFormat::A8B8G8R8_UNORM>, - MortonCopy<false, PixelFormat::A8B8G8R8_SNORM>, - MortonCopy<false, PixelFormat::A8B8G8R8_SINT>, - MortonCopy<false, PixelFormat::A8B8G8R8_UINT>, - MortonCopy<false, PixelFormat::R5G6B5_UNORM>, - MortonCopy<false, PixelFormat::B5G6R5_UNORM>, - MortonCopy<false, PixelFormat::A1R5G5B5_UNORM>, - MortonCopy<false, PixelFormat::A2B10G10R10_UNORM>, - MortonCopy<false, PixelFormat::A2B10G10R10_UINT>, - MortonCopy<false, PixelFormat::A1B5G5R5_UNORM>, - MortonCopy<false, PixelFormat::R8_UNORM>, - MortonCopy<false, PixelFormat::R8_SNORM>, - MortonCopy<false, PixelFormat::R8_SINT>, - MortonCopy<false, PixelFormat::R8_UINT>, - MortonCopy<false, PixelFormat::R16G16B16A16_FLOAT>, - MortonCopy<false, PixelFormat::R16G16B16A16_SNORM>, - MortonCopy<false, PixelFormat::R16G16B16A16_SINT>, - MortonCopy<false, PixelFormat::R16G16B16A16_UNORM>, - MortonCopy<false, PixelFormat::R16G16B16A16_UINT>, - MortonCopy<false, PixelFormat::B10G11R11_FLOAT>, - MortonCopy<false, PixelFormat::R32G32B32A32_UINT>, - MortonCopy<false, PixelFormat::BC1_RGBA_UNORM>, - MortonCopy<false, PixelFormat::BC2_UNORM>, - MortonCopy<false, PixelFormat::BC3_UNORM>, - MortonCopy<false, PixelFormat::BC4_UNORM>, - MortonCopy<false, PixelFormat::BC4_SNORM>, - MortonCopy<false, PixelFormat::BC5_UNORM>, - MortonCopy<false, PixelFormat::BC5_SNORM>, - MortonCopy<false, PixelFormat::BC7_UNORM>, - MortonCopy<false, PixelFormat::BC6H_UFLOAT>, - MortonCopy<false, PixelFormat::BC6H_SFLOAT>, - // TODO(Subv): Swizzling ASTC formats are not supported - nullptr, - MortonCopy<false, PixelFormat::B8G8R8A8_UNORM>, - MortonCopy<false, PixelFormat::R32G32B32A32_FLOAT>, - MortonCopy<false, PixelFormat::R32G32B32A32_SINT>, - MortonCopy<false, PixelFormat::R32G32_FLOAT>, - MortonCopy<false, PixelFormat::R32G32_SINT>, - MortonCopy<false, PixelFormat::R32_FLOAT>, - MortonCopy<false, PixelFormat::R16_FLOAT>, - MortonCopy<false, PixelFormat::R16_UNORM>, - MortonCopy<false, PixelFormat::R16_SNORM>, - MortonCopy<false, PixelFormat::R16_UINT>, - MortonCopy<false, PixelFormat::R16_SINT>, - MortonCopy<false, PixelFormat::R16G16_UNORM>, - MortonCopy<false, PixelFormat::R16G16_FLOAT>, - MortonCopy<false, PixelFormat::R16G16_UINT>, - MortonCopy<false, PixelFormat::R16G16_SINT>, - MortonCopy<false, PixelFormat::R16G16_SNORM>, - MortonCopy<false, PixelFormat::R32G32B32_FLOAT>, - MortonCopy<false, PixelFormat::A8B8G8R8_SRGB>, - MortonCopy<false, PixelFormat::R8G8_UNORM>, - MortonCopy<false, PixelFormat::R8G8_SNORM>, - MortonCopy<false, PixelFormat::R8G8_SINT>, - MortonCopy<false, PixelFormat::R8G8_UINT>, - MortonCopy<false, PixelFormat::R32G32_UINT>, - MortonCopy<false, PixelFormat::R16G16B16X16_FLOAT>, - MortonCopy<false, PixelFormat::R32_UINT>, - MortonCopy<false, PixelFormat::R32_SINT>, - nullptr, - nullptr, - nullptr, - MortonCopy<false, PixelFormat::B8G8R8A8_SRGB>, - MortonCopy<false, PixelFormat::BC1_RGBA_SRGB>, - MortonCopy<false, PixelFormat::BC2_SRGB>, - MortonCopy<false, PixelFormat::BC3_SRGB>, - MortonCopy<false, PixelFormat::BC7_SRGB>, - MortonCopy<false, PixelFormat::A4B4G4R4_UNORM>, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - MortonCopy<false, PixelFormat::E5B9G9R9_FLOAT>, - MortonCopy<false, PixelFormat::D32_FLOAT>, - MortonCopy<false, PixelFormat::D16_UNORM>, - MortonCopy<false, PixelFormat::D24_UNORM_S8_UINT>, - MortonCopy<false, PixelFormat::S8_UINT_D24_UNORM>, - MortonCopy<false, PixelFormat::D32_FLOAT_S8_UINT>, -}; - -static MortonCopyFn GetSwizzleFunction(MortonSwizzleMode mode, Surface::PixelFormat format) { - switch (mode) { - case MortonSwizzleMode::MortonToLinear: - return morton_to_linear_fns[static_cast<std::size_t>(format)]; - case MortonSwizzleMode::LinearToMorton: - return linear_to_morton_fns[static_cast<std::size_t>(format)]; - } - UNREACHABLE(); - return morton_to_linear_fns[static_cast<std::size_t>(format)]; -} - -void MortonSwizzle(MortonSwizzleMode mode, Surface::PixelFormat format, u32 stride, - u32 block_height, u32 height, u32 block_depth, u32 depth, u32 tile_width_spacing, - u8* buffer, u8* addr) { - GetSwizzleFunction(mode, format)(stride, block_height, height, block_depth, depth, - tile_width_spacing, buffer, addr); -} - -} // namespace VideoCore diff --git a/src/video_core/morton.h b/src/video_core/morton.h deleted file mode 100644 index b714a7e3f..000000000 --- a/src/video_core/morton.h +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include "common/common_types.h" -#include "video_core/surface.h" - -namespace VideoCore { - -enum class MortonSwizzleMode { MortonToLinear, LinearToMorton }; - -void MortonSwizzle(MortonSwizzleMode mode, VideoCore::Surface::PixelFormat format, u32 stride, - u32 block_height, u32 height, u32 block_depth, u32 depth, u32 tile_width_spacing, - u8* buffer, u8* addr); - -} // namespace VideoCore diff --git a/src/video_core/query_cache.h b/src/video_core/query_cache.h index 0d3a88765..203f2af05 100644 --- a/src/video_core/query_cache.h +++ b/src/video_core/query_cache.h @@ -28,8 +28,8 @@ namespace VideoCommon { template <class QueryCache, class HostCounter> class CounterStreamBase { public: - explicit CounterStreamBase(QueryCache& cache, VideoCore::QueryType type) - : cache{cache}, type{type} {} + explicit CounterStreamBase(QueryCache& cache_, VideoCore::QueryType type_) + : cache{cache_}, type{type_} {} /// Updates the state of the stream, enabling or disabling as needed. void Update(bool enabled) { @@ -91,14 +91,15 @@ private: std::shared_ptr<HostCounter> last; }; -template <class QueryCache, class CachedQuery, class CounterStream, class HostCounter, - class QueryPool> +template <class QueryCache, class CachedQuery, class CounterStream, class HostCounter> class QueryCacheBase { public: - explicit QueryCacheBase(Core::System& system, VideoCore::RasterizerInterface& rasterizer) - : system{system}, rasterizer{rasterizer}, streams{{CounterStream{ - static_cast<QueryCache&>(*this), - VideoCore::QueryType::SamplesPassed}}} {} + explicit QueryCacheBase(VideoCore::RasterizerInterface& rasterizer_, + Tegra::Engines::Maxwell3D& maxwell3d_, + Tegra::MemoryManager& gpu_memory_) + : rasterizer{rasterizer_}, maxwell3d{maxwell3d_}, + gpu_memory{gpu_memory_}, streams{{CounterStream{static_cast<QueryCache&>(*this), + VideoCore::QueryType::SamplesPassed}}} {} void InvalidateRegion(VAddr addr, std::size_t size) { std::unique_lock lock{mutex}; @@ -118,29 +119,27 @@ public: */ void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) { std::unique_lock lock{mutex}; - auto& memory_manager = system.GPU().MemoryManager(); - const std::optional<VAddr> cpu_addr_opt = memory_manager.GpuToCpuAddress(gpu_addr); - ASSERT(cpu_addr_opt); - VAddr cpu_addr = *cpu_addr_opt; + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); + ASSERT(cpu_addr); - CachedQuery* query = TryGet(cpu_addr); + CachedQuery* query = TryGet(*cpu_addr); if (!query) { - ASSERT_OR_EXECUTE(cpu_addr_opt, return;); - const auto host_ptr = memory_manager.GetPointer(gpu_addr); + ASSERT_OR_EXECUTE(cpu_addr, return;); + u8* const host_ptr = gpu_memory.GetPointer(gpu_addr); - query = Register(type, cpu_addr, host_ptr, timestamp.has_value()); + query = Register(type, *cpu_addr, host_ptr, timestamp.has_value()); } query->BindCounter(Stream(type).Current(), timestamp); if (Settings::values.use_asynchronous_gpu_emulation.GetValue()) { - AsyncFlushQuery(cpu_addr); + AsyncFlushQuery(*cpu_addr); } } /// Updates counters from GPU state. Expected to be called once per draw, clear or dispatch. void UpdateCounters() { std::unique_lock lock{mutex}; - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; Stream(VideoCore::QueryType::SamplesPassed).Update(regs.samplecnt_enable); } @@ -206,9 +205,6 @@ public: committed_flushes.pop_front(); } -protected: - std::array<QueryPool, VideoCore::NumQueryTypes> query_pools; - private: /// Flushes a memory range to guest memory and removes it from the cache. void FlushAndRemoveRegion(VAddr addr, std::size_t size) { @@ -270,8 +266,9 @@ private: static constexpr std::uintptr_t PAGE_SIZE = 4096; static constexpr unsigned PAGE_BITS = 12; - Core::System& system; VideoCore::RasterizerInterface& rasterizer; + Tegra::Engines::Maxwell3D& maxwell3d; + Tegra::MemoryManager& gpu_memory; std::recursive_mutex mutex; @@ -337,8 +334,8 @@ private: template <class HostCounter> class CachedQueryBase { public: - explicit CachedQueryBase(VAddr cpu_addr, u8* host_ptr) - : cpu_addr{cpu_addr}, host_ptr{host_ptr} {} + explicit CachedQueryBase(VAddr cpu_addr_, u8* host_ptr_) + : cpu_addr{cpu_addr_}, host_ptr{host_ptr_} {} virtual ~CachedQueryBase() = default; CachedQueryBase(CachedQueryBase&&) noexcept = default; diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index 3cbdac8e7..50491b758 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -7,6 +7,7 @@ #include <atomic> #include <functional> #include <optional> +#include <span> #include "common/common_types.h" #include "video_core/engines/fermi_2d.h" #include "video_core/gpu.h" @@ -32,7 +33,7 @@ using DiskResourceLoadCallback = std::function<void(LoadCallbackStage, std::size class RasterizerInterface { public: - virtual ~RasterizerInterface() {} + virtual ~RasterizerInterface() = default; /// Dispatches a draw invocation virtual void Draw(bool is_indexed, bool is_instanced) = 0; @@ -49,6 +50,10 @@ public: /// Records a GPU query and caches it virtual void Query(GPUVAddr gpu_addr, QueryType type, std::optional<u64> timestamp) = 0; + /// Signal an uniform buffer binding + virtual void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, + u32 size) = 0; + /// Signal a GPU based semaphore as a fence virtual void SignalSemaphore(GPUVAddr addr, u32 value) = 0; @@ -76,6 +81,9 @@ public: /// Sync memory between guest and host. virtual void SyncGuestHost() = 0; + /// Unmap memory range + virtual void UnmapMemory(VAddr addr, u64 size) = 0; + /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory /// and invalidated virtual void FlushAndInvalidateRegion(VAddr addr, u64 size) = 0; @@ -83,6 +91,12 @@ public: /// Notify the host renderer to wait for previous primitive and compute operations. virtual void WaitForIdle() = 0; + /// Notify the host renderer to wait for reads and writes to render targets and flush caches. + virtual void FragmentBarrier() = 0; + + /// Notify the host renderer to make available previous render target writes. + virtual void TiledCacheBarrier() = 0; + /// Notify the rasterizer to send all written commands to the host GPU. virtual void FlushCommands() = 0; @@ -90,15 +104,15 @@ public: virtual void TickFrame() = 0; /// Attempt to use a faster method to perform a surface copy - virtual bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src, - const Tegra::Engines::Fermi2D::Regs::Surface& dst, - const Tegra::Engines::Fermi2D::Config& copy_config) { + [[nodiscard]] virtual bool AccelerateSurfaceCopy( + const Tegra::Engines::Fermi2D::Surface& src, const Tegra::Engines::Fermi2D::Surface& dst, + const Tegra::Engines::Fermi2D::Config& copy_config) { return false; } /// Attempt to use a faster method to display the framebuffer to screen - virtual bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, - u32 pixel_stride) { + [[nodiscard]] virtual bool AccelerateDisplay(const Tegra::FramebufferConfig& config, + VAddr framebuffer_addr, u32 pixel_stride) { return false; } @@ -106,19 +120,16 @@ public: virtual void UpdatePagesCachedCount(VAddr addr, u64 size, int delta) {} /// Initialize disk cached resources for the game being emulated - virtual void LoadDiskResources(const std::atomic_bool& stop_loading = false, - const DiskResourceLoadCallback& callback = {}) {} - - /// Initializes renderer dirty flags - virtual void SetupDirtyFlags() {} + virtual void LoadDiskResources(u64 title_id, const std::atomic_bool& stop_loading, + const DiskResourceLoadCallback& callback) {} /// Grant access to the Guest Driver Profile for recording/obtaining info on the guest driver. - GuestDriverProfile& AccessGuestDriverProfile() { + [[nodiscard]] GuestDriverProfile& AccessGuestDriverProfile() { return guest_driver_profile; } /// Grant access to the Guest Driver Profile for recording/obtaining info on the guest driver. - const GuestDriverProfile& AccessGuestDriverProfile() const { + [[nodiscard]] const GuestDriverProfile& AccessGuestDriverProfile() const { return guest_driver_profile; } diff --git a/src/video_core/renderer_base.h b/src/video_core/renderer_base.h index 649074acd..320ee8d30 100644 --- a/src/video_core/renderer_base.h +++ b/src/video_core/renderer_base.h @@ -37,60 +37,43 @@ public: std::unique_ptr<Core::Frontend::GraphicsContext> context); virtual ~RendererBase(); - /// Initialize the renderer - virtual bool Init() = 0; - - /// Shutdown the renderer - virtual void ShutDown() = 0; - /// Finalize rendering the guest frame and draw into the presentation texture virtual void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) = 0; - /// Draws the latest frame to the window waiting timeout_ms for a frame to arrive (Renderer - /// specific implementation) - /// Returns true if a frame was drawn - virtual bool TryPresent(int timeout_ms) = 0; + [[nodiscard]] virtual RasterizerInterface* ReadRasterizer() = 0; // Getter/setter functions: // ------------------------ - f32 GetCurrentFPS() const { + [[nodiscard]] f32 GetCurrentFPS() const { return m_current_fps; } - int GetCurrentFrame() const { + [[nodiscard]] int GetCurrentFrame() const { return m_current_frame; } - RasterizerInterface& Rasterizer() { - return *rasterizer; - } - - const RasterizerInterface& Rasterizer() const { - return *rasterizer; - } - - Core::Frontend::GraphicsContext& Context() { + [[nodiscard]] Core::Frontend::GraphicsContext& Context() { return *context; } - const Core::Frontend::GraphicsContext& Context() const { + [[nodiscard]] const Core::Frontend::GraphicsContext& Context() const { return *context; } - Core::Frontend::EmuWindow& GetRenderWindow() { + [[nodiscard]] Core::Frontend::EmuWindow& GetRenderWindow() { return render_window; } - const Core::Frontend::EmuWindow& GetRenderWindow() const { + [[nodiscard]] const Core::Frontend::EmuWindow& GetRenderWindow() const { return render_window; } - RendererSettings& Settings() { + [[nodiscard]] RendererSettings& Settings() { return renderer_settings; } - const RendererSettings& Settings() const { + [[nodiscard]] const RendererSettings& Settings() const { return renderer_settings; } @@ -103,7 +86,6 @@ public: protected: Core::Frontend::EmuWindow& render_window; ///< Reference to the render window handle. - std::unique_ptr<RasterizerInterface> rasterizer; std::unique_ptr<Core::Frontend::GraphicsContext> context; f32 m_current_fps = 0.0f; ///< Current framerate, should be set by the renderer int m_current_frame = 0; ///< Current frame, should be set by the renderer diff --git a/src/video_core/renderer_opengl/gl_arb_decompiler.cpp b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp index b7e9ed2e9..3e4d88c30 100644 --- a/src/video_core/renderer_opengl/gl_arb_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp @@ -39,8 +39,8 @@ using Operation = const OperationNode&; constexpr std::array INTERNAL_FLAG_NAMES = {"ZERO", "SIGN", "CARRY", "OVERFLOW"}; char Swizzle(std::size_t component) { - ASSERT(component < 4); - return component["xyzw"]; + static constexpr std::string_view SWIZZLE{"xyzw"}; + return SWIZZLE.at(component); } constexpr bool IsGenericAttribute(Attribute::Index index) { @@ -71,7 +71,7 @@ std::string_view GetInputFlags(PixelImap attribute) { case PixelImap::Unused: break; } - UNIMPLEMENTED_MSG("Unknown attribute usage index={}", static_cast<int>(attribute)); + UNIMPLEMENTED_MSG("Unknown attribute usage index={}", attribute); return {}; } @@ -123,7 +123,7 @@ std::string_view PrimitiveDescription(Tegra::Engines::Maxwell3D::Regs::Primitive case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleStripAdjacency: return "TRIANGLES_ADJACENCY"; default: - UNIMPLEMENTED_MSG("topology={}", static_cast<int>(topology)); + UNIMPLEMENTED_MSG("topology={}", topology); return "POINTS"; } } @@ -137,7 +137,7 @@ std::string_view TopologyName(Tegra::Shader::OutputTopology topology) { case Tegra::Shader::OutputTopology::TriangleStrip: return "TRIANGLE_STRIP"; default: - UNIMPLEMENTED_MSG("Unknown output topology: {}", static_cast<u32>(topology)); + UNIMPLEMENTED_MSG("Unknown output topology: {}", topology); return "points"; } } @@ -187,8 +187,8 @@ std::string TextureType(const MetaTexture& meta) { class ARBDecompiler final { public: - explicit ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry, - ShaderType stage, std::string_view identifier); + explicit ARBDecompiler(const Device& device_, const ShaderIR& ir_, const Registry& registry_, + ShaderType stage_, std::string_view identifier); std::string Code() const { return shader_source; @@ -224,7 +224,7 @@ private: std::string Visit(const Node& node); - std::pair<std::string, std::size_t> BuildCoords(Operation); + std::tuple<std::string, std::string, std::size_t> BuildCoords(Operation); std::string BuildAoffi(Operation); std::string GlobalMemoryPointer(const GmemNode& gmem); void Exit(); @@ -376,9 +376,11 @@ private: std::string temporary = AllocTemporary(); std::string address; std::string_view opname; + bool robust = false; if (const auto gmem = std::get_if<GmemNode>(&*operation[0])) { address = GlobalMemoryPointer(*gmem); opname = "ATOM"; + robust = true; } else if (const auto smem = std::get_if<SmemNode>(&*operation[0])) { address = fmt::format("shared_mem[{}]", Visit(smem->GetAddress())); opname = "ATOMS"; @@ -386,7 +388,15 @@ private: UNREACHABLE(); return "{0, 0, 0, 0}"; } + if (robust) { + AddLine("IF NE.x;"); + } AddLine("{}.{}.{} {}, {}, {};", opname, op, type, temporary, Visit(operation[1]), address); + if (robust) { + AddLine("ELSE;"); + AddLine("MOV.S {}, 0;", temporary); + AddLine("ENDIF;"); + } return temporary; } @@ -792,9 +802,9 @@ private: }; }; -ARBDecompiler::ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry, - ShaderType stage, std::string_view identifier) - : device{device}, ir{ir}, registry{registry}, stage{stage} { +ARBDecompiler::ARBDecompiler(const Device& device_, const ShaderIR& ir_, const Registry& registry_, + ShaderType stage_, std::string_view identifier) + : device{device_}, ir{ir_}, registry{registry_}, stage{stage_} { DefineGlobalMemory(); AddLine("TEMP RC;"); @@ -980,10 +990,9 @@ void ARBDecompiler::DeclareLocalMemory() { } void ARBDecompiler::DeclareGlobalMemory() { - const std::size_t num_entries = ir.GetGlobalMemory().size(); + const size_t num_entries = ir.GetGlobalMemory().size(); if (num_entries > 0) { - const std::size_t num_vectors = Common::AlignUp(num_entries, 2) / 2; - AddLine("PARAM c[{}] = {{ program.local[0..{}] }};", num_vectors, num_vectors - 1); + AddLine("PARAM c[{}] = {{ program.local[0..{}] }};", num_entries, num_entries - 1); } } @@ -1125,44 +1134,44 @@ void ARBDecompiler::VisitAST(const ASTNode& node) { for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) { VisitAST(current); } - } else if (const auto ast = std::get_if<ASTIfThen>(&*node->GetInnerData())) { - const std::string condition = VisitExpression(ast->condition); + } else if (const auto if_then = std::get_if<ASTIfThen>(&*node->GetInnerData())) { + const std::string condition = VisitExpression(if_then->condition); ResetTemporaries(); AddLine("MOVC.U RC.x, {};", condition); AddLine("IF NE.x;"); - for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) { + for (ASTNode current = if_then->nodes.GetFirst(); current; current = current->GetNext()) { VisitAST(current); } AddLine("ENDIF;"); - } else if (const auto ast = std::get_if<ASTIfElse>(&*node->GetInnerData())) { + } else if (const auto if_else = std::get_if<ASTIfElse>(&*node->GetInnerData())) { AddLine("ELSE;"); - for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) { + for (ASTNode current = if_else->nodes.GetFirst(); current; current = current->GetNext()) { VisitAST(current); } - } else if (const auto ast = std::get_if<ASTBlockDecoded>(&*node->GetInnerData())) { - VisitBlock(ast->nodes); - } else if (const auto ast = std::get_if<ASTVarSet>(&*node->GetInnerData())) { - AddLine("MOV.U F{}, {};", ast->index, VisitExpression(ast->condition)); + } else if (const auto decoded = std::get_if<ASTBlockDecoded>(&*node->GetInnerData())) { + VisitBlock(decoded->nodes); + } else if (const auto var_set = std::get_if<ASTVarSet>(&*node->GetInnerData())) { + AddLine("MOV.U F{}, {};", var_set->index, VisitExpression(var_set->condition)); ResetTemporaries(); - } else if (const auto ast = std::get_if<ASTDoWhile>(&*node->GetInnerData())) { - const std::string condition = VisitExpression(ast->condition); + } else if (const auto do_while = std::get_if<ASTDoWhile>(&*node->GetInnerData())) { + const std::string condition = VisitExpression(do_while->condition); ResetTemporaries(); AddLine("REP;"); - for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) { + for (ASTNode current = do_while->nodes.GetFirst(); current; current = current->GetNext()) { VisitAST(current); } AddLine("MOVC.U RC.x, {};", condition); AddLine("BRK (NE.x);"); AddLine("ENDREP;"); - } else if (const auto ast = std::get_if<ASTReturn>(&*node->GetInnerData())) { - const bool is_true = ExprIsTrue(ast->condition); + } else if (const auto ast_return = std::get_if<ASTReturn>(&*node->GetInnerData())) { + const bool is_true = ExprIsTrue(ast_return->condition); if (!is_true) { - AddLine("MOVC.U RC.x, {};", VisitExpression(ast->condition)); + AddLine("MOVC.U RC.x, {};", VisitExpression(ast_return->condition)); AddLine("IF NE.x;"); ResetTemporaries(); } - if (ast->kills) { + if (ast_return->kills) { AddLine("KIL TR;"); } else { Exit(); @@ -1170,11 +1179,11 @@ void ARBDecompiler::VisitAST(const ASTNode& node) { if (!is_true) { AddLine("ENDIF;"); } - } else if (const auto ast = std::get_if<ASTBreak>(&*node->GetInnerData())) { - if (ExprIsTrue(ast->condition)) { + } else if (const auto ast_break = std::get_if<ASTBreak>(&*node->GetInnerData())) { + if (ExprIsTrue(ast_break->condition)) { AddLine("BRK;"); } else { - AddLine("MOVC.U RC.x, {};", VisitExpression(ast->condition)); + AddLine("MOVC.U RC.x, {};", VisitExpression(ast_break->condition)); AddLine("BRK (NE.x);"); ResetTemporaries(); } @@ -1342,7 +1351,7 @@ std::string ARBDecompiler::Visit(const Node& node) { GetGenericAttributeIndex(index), swizzle); } } - UNIMPLEMENTED_MSG("Unimplemented input attribute={}", static_cast<int>(index)); + UNIMPLEMENTED_MSG("Unimplemented input attribute={}", index); break; } return "{0, 0, 0, 0}.x"; @@ -1363,7 +1372,8 @@ std::string ARBDecompiler::Visit(const Node& node) { if (const auto gmem = std::get_if<GmemNode>(&*node)) { std::string temporary = AllocTemporary(); - AddLine("LOAD.U32 {}, {};", temporary, GlobalMemoryPointer(*gmem)); + AddLine("MOV {}, 0;", temporary); + AddLine("LOAD.U32 {} (NE.x), {};", temporary, GlobalMemoryPointer(*gmem)); return temporary; } @@ -1406,12 +1416,12 @@ std::string ARBDecompiler::Visit(const Node& node) { return {}; } -std::pair<std::string, std::size_t> ARBDecompiler::BuildCoords(Operation operation) { +std::tuple<std::string, std::string, std::size_t> ARBDecompiler::BuildCoords(Operation operation) { const auto& meta = std::get<MetaTexture>(operation.GetMeta()); UNIMPLEMENTED_IF(meta.sampler.is_indexed); - UNIMPLEMENTED_IF(meta.sampler.is_shadow && meta.sampler.is_array && - meta.sampler.type == Tegra::Shader::TextureType::TextureCube); + const bool is_extended = meta.sampler.is_shadow && meta.sampler.is_array && + meta.sampler.type == Tegra::Shader::TextureType::TextureCube; const std::size_t count = operation.GetOperandsCount(); std::string temporary = AllocVectorTemporary(); std::size_t i = 0; @@ -1419,12 +1429,21 @@ std::pair<std::string, std::size_t> ARBDecompiler::BuildCoords(Operation operati AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i), Visit(operation[i])); } if (meta.sampler.is_array) { - AddLine("I2F.S {}.{}, {};", temporary, Swizzle(i++), Visit(meta.array)); + AddLine("I2F.S {}.{}, {};", temporary, Swizzle(i), Visit(meta.array)); + ++i; } if (meta.sampler.is_shadow) { - AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i++), Visit(meta.depth_compare)); + std::string compare = Visit(meta.depth_compare); + if (is_extended) { + ASSERT(i == 4); + std::string extra_coord = AllocVectorTemporary(); + AddLine("MOV.F {}.x, {};", extra_coord, compare); + return {fmt::format("{}, {}", temporary, extra_coord), extra_coord, 0}; + } + AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i), compare); + ++i; } - return {std::move(temporary), i}; + return {temporary, temporary, i}; } std::string ARBDecompiler::BuildAoffi(Operation operation) { @@ -1441,18 +1460,21 @@ std::string ARBDecompiler::BuildAoffi(Operation operation) { } std::string ARBDecompiler::GlobalMemoryPointer(const GmemNode& gmem) { + // Read a bindless SSBO, return its address and set CC accordingly + // address = c[binding].xy + // length = c[binding].z const u32 binding = global_memory_names.at(gmem.GetDescriptor()); - const char result_swizzle = binding % 2 == 0 ? 'x' : 'y'; const std::string pointer = AllocLongVectorTemporary(); std::string temporary = AllocTemporary(); - const u32 local_index = binding / 2; - AddLine("PK64.U {}, c[{}];", pointer, local_index); + AddLine("PK64.U {}, c[{}];", pointer, binding); AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem.GetRealAddress()), Visit(gmem.GetBaseAddress())); AddLine("CVT.U64.U32 {}.z, {};", pointer, temporary); - AddLine("ADD.U64 {}.x, {}.{}, {}.z;", pointer, pointer, result_swizzle, pointer); + AddLine("ADD.U64 {}.x, {}.x, {}.z;", pointer, pointer, pointer); + // Compare offset to length and set CC + AddLine("SLT.U.CC RC.x, {}, c[{}].z;", temporary, binding); return fmt::format("{}.x", pointer); } @@ -1463,9 +1485,7 @@ void ARBDecompiler::Exit() { } const auto safe_get_register = [this](u32 reg) -> std::string { - // TODO(Rodrigo): Replace with contains once C++20 releases - const auto& used_registers = ir.GetRegisters(); - if (used_registers.find(reg) != used_registers.end()) { + if (ir.GetRegisters().contains(reg)) { return fmt::format("R{}.x", reg); } return "{0, 0, 0, 0}.x"; @@ -1552,7 +1572,9 @@ std::string ARBDecompiler::Assign(Operation operation) { ResetTemporaries(); return {}; } else if (const auto gmem = std::get_if<GmemNode>(&*dest)) { + AddLine("IF NE.x;"); AddLine("STORE.U32 {}, {};", Visit(src), GlobalMemoryPointer(*gmem)); + AddLine("ENDIF;"); ResetTemporaries(); return {}; } else { @@ -1844,7 +1866,7 @@ std::string ARBDecompiler::LogicalAddCarry(Operation operation) { std::string ARBDecompiler::Texture(Operation operation) { const auto& meta = std::get<MetaTexture>(operation.GetMeta()); const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; - const auto [temporary, swizzle] = BuildCoords(operation); + const auto [coords, temporary, swizzle] = BuildCoords(operation); std::string_view opcode = "TEX"; std::string extra; @@ -1873,7 +1895,7 @@ std::string ARBDecompiler::Texture(Operation operation) { } } - AddLine("{}.F {}, {},{} texture[{}], {}{};", opcode, temporary, temporary, extra, sampler_id, + AddLine("{}.F {}, {},{} texture[{}], {}{};", opcode, temporary, coords, extra, sampler_id, TextureType(meta), BuildAoffi(operation)); AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element)); return fmt::format("{}.x", temporary); @@ -1882,7 +1904,7 @@ std::string ARBDecompiler::Texture(Operation operation) { std::string ARBDecompiler::TextureGather(Operation operation) { const auto& meta = std::get<MetaTexture>(operation.GetMeta()); const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; - const auto [temporary, swizzle] = BuildCoords(operation); + const auto [coords, temporary, swizzle] = BuildCoords(operation); std::string comp; if (!meta.sampler.is_shadow) { @@ -1892,7 +1914,7 @@ std::string ARBDecompiler::TextureGather(Operation operation) { AddLine("TXG.F {}, {}, texture[{}]{}, {}{};", temporary, temporary, sampler_id, comp, TextureType(meta), BuildAoffi(operation)); - AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element)); + AddLine("MOV.U {}.x, {}.{};", temporary, coords, Swizzle(meta.element)); return fmt::format("{}.x", temporary); } @@ -1930,13 +1952,13 @@ std::string ARBDecompiler::TextureQueryLod(Operation operation) { std::string ARBDecompiler::TexelFetch(Operation operation) { const auto& meta = std::get<MetaTexture>(operation.GetMeta()); const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; - const auto [temporary, swizzle] = BuildCoords(operation); + const auto [coords, temporary, swizzle] = BuildCoords(operation); if (!meta.sampler.is_buffer) { ASSERT(swizzle < 4); AddLine("MOV.F {}.w, {};", temporary, Visit(meta.lod)); } - AddLine("TXF.F {}, {}, texture[{}], {}{};", temporary, temporary, sampler_id, TextureType(meta), + AddLine("TXF.F {}, {}, texture[{}], {}{};", temporary, coords, sampler_id, TextureType(meta), BuildAoffi(operation)); AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element)); return fmt::format("{}.x", temporary); @@ -1947,7 +1969,7 @@ std::string ARBDecompiler::TextureGradient(Operation operation) { const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; const std::string ddx = AllocVectorTemporary(); const std::string ddy = AllocVectorTemporary(); - const std::string coord = BuildCoords(operation).first; + const std::string coord = std::get<1>(BuildCoords(operation)); const std::size_t num_components = meta.derivates.size() / 2; for (std::size_t index = 0; index < num_components; ++index) { diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp index e866d8f2f..6da3906a4 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp @@ -2,98 +2,208 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include <memory> +#include <span> -#include <glad/glad.h> - -#include "common/assert.h" -#include "common/microprofile.h" #include "video_core/buffer_cache/buffer_cache.h" -#include "video_core/engines/maxwell_3d.h" -#include "video_core/rasterizer_interface.h" #include "video_core/renderer_opengl/gl_buffer_cache.h" #include "video_core/renderer_opengl/gl_device.h" -#include "video_core/renderer_opengl/gl_rasterizer.h" -#include "video_core/renderer_opengl/gl_resource_manager.h" namespace OpenGL { +namespace { +struct BindlessSSBO { + GLuint64EXT address; + GLsizei length; + GLsizei padding; +}; +static_assert(sizeof(BindlessSSBO) == sizeof(GLuint) * 4); + +constexpr std::array PROGRAM_LUT{ + GL_VERTEX_PROGRAM_NV, GL_TESS_CONTROL_PROGRAM_NV, GL_TESS_EVALUATION_PROGRAM_NV, + GL_GEOMETRY_PROGRAM_NV, GL_FRAGMENT_PROGRAM_NV, +}; +} // Anonymous namespace + +Buffer::Buffer(BufferCacheRuntime&, VideoCommon::NullBufferParams null_params) + : VideoCommon::BufferBase<VideoCore::RasterizerInterface>(null_params) {} + +Buffer::Buffer(BufferCacheRuntime& runtime, VideoCore::RasterizerInterface& rasterizer_, + VAddr cpu_addr_, u64 size_bytes_) + : VideoCommon::BufferBase<VideoCore::RasterizerInterface>(rasterizer_, cpu_addr_, size_bytes_) { + buffer.Create(); + const std::string name = fmt::format("Buffer 0x{:x}", CpuAddr()); + glObjectLabel(GL_BUFFER, buffer.handle, static_cast<GLsizei>(name.size()), name.data()); + glNamedBufferData(buffer.handle, SizeBytes(), nullptr, GL_DYNAMIC_DRAW); + + if (runtime.has_unified_vertex_buffers) { + glGetNamedBufferParameterui64vNV(buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &address); + } +} -using Maxwell = Tegra::Engines::Maxwell3D::Regs; +void Buffer::ImmediateUpload(size_t offset, std::span<const u8> data) noexcept { + glNamedBufferSubData(buffer.handle, static_cast<GLintptr>(offset), + static_cast<GLsizeiptr>(data.size_bytes()), data.data()); +} -MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128)); +void Buffer::ImmediateDownload(size_t offset, std::span<u8> data) noexcept { + glGetNamedBufferSubData(buffer.handle, static_cast<GLintptr>(offset), + static_cast<GLsizeiptr>(data.size_bytes()), data.data()); +} -Buffer::Buffer(const Device& device, VAddr cpu_addr, std::size_t size) - : VideoCommon::BufferBlock{cpu_addr, size} { - gl_buffer.Create(); - glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW); - if (device.UseAssemblyShaders() || device.HasVertexBufferUnifiedMemory()) { - glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_WRITE); - glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address); +void Buffer::MakeResident(GLenum access) noexcept { + // Abuse GLenum's order to exit early + // GL_NONE (default) < GL_READ_ONLY < GL_READ_WRITE + if (access <= current_residency_access || buffer.handle == 0) { + return; + } + if (std::exchange(current_residency_access, access) != GL_NONE) { + // If the buffer is already resident, remove its residency before promoting it + glMakeNamedBufferNonResidentNV(buffer.handle); } + glMakeNamedBufferResidentNV(buffer.handle, access); } -Buffer::~Buffer() = default; - -void Buffer::Upload(std::size_t offset, std::size_t size, const u8* data) { - glNamedBufferSubData(Handle(), static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size), - data); +BufferCacheRuntime::BufferCacheRuntime(const Device& device_) + : device{device_}, has_fast_buffer_sub_data{device.HasFastBufferSubData()}, + use_assembly_shaders{device.UseAssemblyShaders()}, + has_unified_vertex_buffers{device.HasVertexBufferUnifiedMemory()}, + stream_buffer{has_fast_buffer_sub_data ? std::nullopt : std::make_optional<StreamBuffer>()} { + GLint gl_max_attributes; + glGetIntegerv(GL_MAX_VERTEX_ATTRIBS, &gl_max_attributes); + max_attributes = static_cast<u32>(gl_max_attributes); + for (auto& stage_uniforms : fast_uniforms) { + for (OGLBuffer& buffer : stage_uniforms) { + buffer.Create(); + glNamedBufferData(buffer.handle, BufferCache::SKIP_CACHE_SIZE, nullptr, GL_STREAM_DRAW); + } + } + for (auto& stage_uniforms : copy_uniforms) { + for (OGLBuffer& buffer : stage_uniforms) { + buffer.Create(); + glNamedBufferData(buffer.handle, 0x10'000, nullptr, GL_STREAM_COPY); + } + } + for (OGLBuffer& buffer : copy_compute_uniforms) { + buffer.Create(); + glNamedBufferData(buffer.handle, 0x10'000, nullptr, GL_STREAM_COPY); + } } -void Buffer::Download(std::size_t offset, std::size_t size, u8* data) { - MICROPROFILE_SCOPE(OpenGL_Buffer_Download); - const GLsizeiptr gl_size = static_cast<GLsizeiptr>(size); - const GLintptr gl_offset = static_cast<GLintptr>(offset); - if (read_buffer.handle == 0) { - read_buffer.Create(); - glNamedBufferData(read_buffer.handle, static_cast<GLsizeiptr>(Size()), nullptr, - GL_STREAM_READ); +void BufferCacheRuntime::CopyBuffer(Buffer& dst_buffer, Buffer& src_buffer, + std::span<const VideoCommon::BufferCopy> copies) { + for (const VideoCommon::BufferCopy& copy : copies) { + glCopyNamedBufferSubData( + src_buffer.Handle(), dst_buffer.Handle(), static_cast<GLintptr>(copy.src_offset), + static_cast<GLintptr>(copy.dst_offset), static_cast<GLsizeiptr>(copy.size)); } - glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT); - glCopyNamedBufferSubData(gl_buffer.handle, read_buffer.handle, gl_offset, gl_offset, gl_size); - glGetNamedBufferSubData(read_buffer.handle, gl_offset, gl_size, data); } -void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, - std::size_t size) { - glCopyNamedBufferSubData(src.Handle(), Handle(), static_cast<GLintptr>(src_offset), - static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size)); +void BufferCacheRuntime::BindIndexBuffer(Buffer& buffer, u32 offset, u32 size) { + if (has_unified_vertex_buffers) { + buffer.MakeResident(GL_READ_ONLY); + glBufferAddressRangeNV(GL_ELEMENT_ARRAY_ADDRESS_NV, 0, buffer.HostGpuAddr() + offset, + static_cast<GLsizeiptr>(size)); + } else { + glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, buffer.Handle()); + index_buffer_offset = offset; + } } -OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system, - const Device& device_, std::size_t stream_size) - : GenericBufferCache{rasterizer, system, - std::make_unique<OGLStreamBuffer>(device_, stream_size, true)}, - device{device_} { - if (!device.HasFastBufferSubData()) { +void BufferCacheRuntime::BindVertexBuffer(u32 index, Buffer& buffer, u32 offset, u32 size, + u32 stride) { + if (index >= max_attributes) { return; } - - static constexpr GLsizeiptr size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize); - glCreateBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs)); - for (const GLuint cbuf : cbufs) { - glNamedBufferData(cbuf, size, nullptr, GL_STREAM_DRAW); + if (has_unified_vertex_buffers) { + buffer.MakeResident(GL_READ_ONLY); + glBindVertexBuffer(index, 0, 0, static_cast<GLsizei>(stride)); + glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, index, + buffer.HostGpuAddr() + offset, static_cast<GLsizeiptr>(size)); + } else { + glBindVertexBuffer(index, buffer.Handle(), static_cast<GLintptr>(offset), + static_cast<GLsizei>(stride)); } } -OGLBufferCache::~OGLBufferCache() { - glDeleteBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs)); +void BufferCacheRuntime::BindUniformBuffer(size_t stage, u32 binding_index, Buffer& buffer, + u32 offset, u32 size) { + if (use_assembly_shaders) { + GLuint handle; + if (offset != 0) { + handle = copy_uniforms[stage][binding_index].handle; + glCopyNamedBufferSubData(buffer.Handle(), handle, offset, 0, size); + } else { + handle = buffer.Handle(); + } + glBindBufferRangeNV(PABO_LUT[stage], binding_index, handle, 0, + static_cast<GLsizeiptr>(size)); + } else { + const GLuint base_binding = device.GetBaseBindings(stage).uniform_buffer; + const GLuint binding = base_binding + binding_index; + glBindBufferRange(GL_UNIFORM_BUFFER, binding, buffer.Handle(), + static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size)); + } } -std::shared_ptr<Buffer> OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) { - return std::make_shared<Buffer>(device, cpu_addr, size); +void BufferCacheRuntime::BindComputeUniformBuffer(u32 binding_index, Buffer& buffer, u32 offset, + u32 size) { + if (use_assembly_shaders) { + GLuint handle; + if (offset != 0) { + handle = copy_compute_uniforms[binding_index].handle; + glCopyNamedBufferSubData(buffer.Handle(), handle, offset, 0, size); + } else { + handle = buffer.Handle(); + } + glBindBufferRangeNV(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding_index, handle, 0, + static_cast<GLsizeiptr>(size)); + } else { + glBindBufferRange(GL_UNIFORM_BUFFER, binding_index, buffer.Handle(), + static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size)); + } } -OGLBufferCache::BufferInfo OGLBufferCache::GetEmptyBuffer(std::size_t) { - return {0, 0, 0}; +void BufferCacheRuntime::BindStorageBuffer(size_t stage, u32 binding_index, Buffer& buffer, + u32 offset, u32 size, bool is_written) { + if (use_assembly_shaders) { + const BindlessSSBO ssbo{ + .address = buffer.HostGpuAddr() + offset, + .length = static_cast<GLsizei>(size), + .padding = 0, + }; + buffer.MakeResident(is_written ? GL_READ_WRITE : GL_READ_ONLY); + glProgramLocalParametersI4uivNV(PROGRAM_LUT[stage], binding_index, 1, + reinterpret_cast<const GLuint*>(&ssbo)); + } else { + const GLuint base_binding = device.GetBaseBindings(stage).shader_storage_buffer; + const GLuint binding = base_binding + binding_index; + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, buffer.Handle(), + static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size)); + } } -OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_pointer, - std::size_t size) { - DEBUG_ASSERT(cbuf_cursor < std::size(cbufs)); - const GLuint cbuf = cbufs[cbuf_cursor++]; +void BufferCacheRuntime::BindComputeStorageBuffer(u32 binding_index, Buffer& buffer, u32 offset, + u32 size, bool is_written) { + if (use_assembly_shaders) { + const BindlessSSBO ssbo{ + .address = buffer.HostGpuAddr() + offset, + .length = static_cast<GLsizei>(size), + .padding = 0, + }; + buffer.MakeResident(is_written ? GL_READ_WRITE : GL_READ_ONLY); + glProgramLocalParametersI4uivNV(GL_COMPUTE_PROGRAM_NV, binding_index, 1, + reinterpret_cast<const GLuint*>(&ssbo)); + } else if (size == 0) { + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding_index, 0, 0, 0); + } else { + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding_index, buffer.Handle(), + static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size)); + } +} - glNamedBufferSubData(cbuf, 0, static_cast<GLsizeiptr>(size), raw_pointer); - return {cbuf, 0, 0}; +void BufferCacheRuntime::BindTransformFeedbackBuffer(u32 index, Buffer& buffer, u32 offset, + u32 size) { + glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, index, buffer.Handle(), + static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size)); } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index 88fdc0536..d8b20a9af 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -5,76 +5,157 @@ #pragma once #include <array> -#include <memory> +#include <span> +#include "common/alignment.h" #include "common/common_types.h" +#include "common/dynamic_library.h" #include "video_core/buffer_cache/buffer_cache.h" -#include "video_core/engines/maxwell_3d.h" +#include "video_core/rasterizer_interface.h" +#include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_stream_buffer.h" -namespace Core { -class System; -} - namespace OpenGL { -class Device; -class OGLStreamBuffer; -class RasterizerOpenGL; +class BufferCacheRuntime; -class Buffer : public VideoCommon::BufferBlock { +class Buffer : public VideoCommon::BufferBase<VideoCore::RasterizerInterface> { public: - explicit Buffer(const Device& device, VAddr cpu_addr, std::size_t size); - ~Buffer(); + explicit Buffer(BufferCacheRuntime&, VideoCore::RasterizerInterface& rasterizer, VAddr cpu_addr, + u64 size_bytes); + explicit Buffer(BufferCacheRuntime&, VideoCommon::NullBufferParams); - void Upload(std::size_t offset, std::size_t size, const u8* data); + void ImmediateUpload(size_t offset, std::span<const u8> data) noexcept; - void Download(std::size_t offset, std::size_t size, u8* data); + void ImmediateDownload(size_t offset, std::span<u8> data) noexcept; - void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, - std::size_t size); + void MakeResident(GLenum access) noexcept; - GLuint Handle() const noexcept { - return gl_buffer.handle; + [[nodiscard]] GLuint64EXT HostGpuAddr() const noexcept { + return address; } - u64 Address() const noexcept { - return gpu_address; + [[nodiscard]] GLuint Handle() const noexcept { + return buffer.handle; } private: - OGLBuffer gl_buffer; - OGLBuffer read_buffer; - u64 gpu_address = 0; + GLuint64EXT address = 0; + OGLBuffer buffer; + GLenum current_residency_access = GL_NONE; }; -using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>; -class OGLBufferCache final : public GenericBufferCache { +class BufferCacheRuntime { + friend Buffer; + public: - explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system, - const Device& device, std::size_t stream_size); - ~OGLBufferCache(); + static constexpr u8 INVALID_BINDING = std::numeric_limits<u8>::max(); + + explicit BufferCacheRuntime(const Device& device_); + + void CopyBuffer(Buffer& dst_buffer, Buffer& src_buffer, + std::span<const VideoCommon::BufferCopy> copies); + + void BindIndexBuffer(Buffer& buffer, u32 offset, u32 size); + + void BindVertexBuffer(u32 index, Buffer& buffer, u32 offset, u32 size, u32 stride); + + void BindUniformBuffer(size_t stage, u32 binding_index, Buffer& buffer, u32 offset, u32 size); + + void BindComputeUniformBuffer(u32 binding_index, Buffer& buffer, u32 offset, u32 size); + + void BindStorageBuffer(size_t stage, u32 binding_index, Buffer& buffer, u32 offset, u32 size, + bool is_written); + + void BindComputeStorageBuffer(u32 binding_index, Buffer& buffer, u32 offset, u32 size, + bool is_written); + + void BindTransformFeedbackBuffer(u32 index, Buffer& buffer, u32 offset, u32 size); + + void BindFastUniformBuffer(size_t stage, u32 binding_index, u32 size) { + if (use_assembly_shaders) { + const GLuint handle = fast_uniforms[stage][binding_index].handle; + const GLsizeiptr gl_size = static_cast<GLsizeiptr>(size); + glBindBufferRangeNV(PABO_LUT[stage], binding_index, handle, 0, gl_size); + } else { + const GLuint base_binding = device.GetBaseBindings(stage).uniform_buffer; + const GLuint binding = base_binding + binding_index; + glBindBufferRange(GL_UNIFORM_BUFFER, binding, + fast_uniforms[stage][binding_index].handle, 0, + static_cast<GLsizeiptr>(size)); + } + } - BufferInfo GetEmptyBuffer(std::size_t) override; + void PushFastUniformBuffer(size_t stage, u32 binding_index, std::span<const u8> data) { + if (use_assembly_shaders) { + glProgramBufferParametersIuivNV( + PABO_LUT[stage], binding_index, 0, + static_cast<GLsizei>(data.size_bytes() / sizeof(GLuint)), + reinterpret_cast<const GLuint*>(data.data())); + } else { + glNamedBufferSubData(fast_uniforms[stage][binding_index].handle, 0, + static_cast<GLsizeiptr>(data.size_bytes()), data.data()); + } + } - void Acquire() noexcept { - cbuf_cursor = 0; + std::span<u8> BindMappedUniformBuffer(size_t stage, u32 binding_index, u32 size) noexcept { + const auto [mapped_span, offset] = stream_buffer->Request(static_cast<size_t>(size)); + const GLuint base_binding = device.GetBaseBindings(stage).uniform_buffer; + const GLuint binding = base_binding + binding_index; + glBindBufferRange(GL_UNIFORM_BUFFER, binding, stream_buffer->Handle(), + static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size)); + return mapped_span; } -protected: - std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override; + [[nodiscard]] const GLvoid* IndexOffset() const noexcept { + return reinterpret_cast<const GLvoid*>(static_cast<uintptr_t>(index_buffer_offset)); + } - BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) override; + [[nodiscard]] bool HasFastBufferSubData() const noexcept { + return has_fast_buffer_sub_data; + } private: - static constexpr std::size_t NUM_CBUFS = Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers * - Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram; + static constexpr std::array PABO_LUT{ + GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV, + GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV, + GL_FRAGMENT_PROGRAM_PARAMETER_BUFFER_NV, + }; const Device& device; - std::size_t cbuf_cursor = 0; - std::array<GLuint, NUM_CBUFS> cbufs{}; + bool has_fast_buffer_sub_data = false; + bool use_assembly_shaders = false; + bool has_unified_vertex_buffers = false; + + u32 max_attributes = 0; + + std::optional<StreamBuffer> stream_buffer; + + std::array<std::array<OGLBuffer, VideoCommon::NUM_GRAPHICS_UNIFORM_BUFFERS>, + VideoCommon::NUM_STAGES> + fast_uniforms; + std::array<std::array<OGLBuffer, VideoCommon::NUM_GRAPHICS_UNIFORM_BUFFERS>, + VideoCommon::NUM_STAGES> + copy_uniforms; + std::array<OGLBuffer, VideoCommon::NUM_COMPUTE_UNIFORM_BUFFERS> copy_compute_uniforms; + + u32 index_buffer_offset = 0; +}; + +struct BufferCacheParams { + using Runtime = OpenGL::BufferCacheRuntime; + using Buffer = OpenGL::Buffer; + + static constexpr bool IS_OPENGL = true; + static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = true; + static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT = true; + static constexpr bool NEEDS_BIND_UNIFORM_INDEX = true; + static constexpr bool NEEDS_BIND_STORAGE_INDEX = true; + static constexpr bool USE_MEMORY_MAPS = false; }; +using BufferCache = VideoCommon::BufferCache<BufferCacheParams>; + } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index e7d95149f..48d5c4a5e 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -5,9 +5,11 @@ #include <algorithm> #include <array> #include <cstddef> +#include <cstdlib> #include <cstring> #include <limits> #include <optional> +#include <span> #include <vector> #include <glad/glad.h> @@ -19,35 +21,35 @@ #include "video_core/renderer_opengl/gl_resource_manager.h" namespace OpenGL { - namespace { - // One uniform block is reserved for emulation purposes constexpr u32 ReservedUniformBlocks = 1; constexpr u32 NumStages = 5; -constexpr std::array LimitUBOs = { +constexpr std::array LIMIT_UBOS = { GL_MAX_VERTEX_UNIFORM_BLOCKS, GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS, GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS, GL_MAX_GEOMETRY_UNIFORM_BLOCKS, - GL_MAX_FRAGMENT_UNIFORM_BLOCKS, GL_MAX_COMPUTE_UNIFORM_BLOCKS}; - -constexpr std::array LimitSSBOs = { + GL_MAX_FRAGMENT_UNIFORM_BLOCKS, GL_MAX_COMPUTE_UNIFORM_BLOCKS, +}; +constexpr std::array LIMIT_SSBOS = { GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_EVALUATION_SHADER_STORAGE_BLOCKS, GL_MAX_GEOMETRY_SHADER_STORAGE_BLOCKS, - GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS, GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS}; - -constexpr std::array LimitSamplers = {GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS, - GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS, - GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS, - GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS, - GL_MAX_TEXTURE_IMAGE_UNITS, - GL_MAX_COMPUTE_TEXTURE_IMAGE_UNITS}; - -constexpr std::array LimitImages = { + GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS, GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS, +}; +constexpr std::array LIMIT_SAMPLERS = { + GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS, + GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS, + GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS, + GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS, + GL_MAX_TEXTURE_IMAGE_UNITS, + GL_MAX_COMPUTE_TEXTURE_IMAGE_UNITS, +}; +constexpr std::array LIMIT_IMAGES = { GL_MAX_VERTEX_IMAGE_UNIFORMS, GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS, GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS, GL_MAX_GEOMETRY_IMAGE_UNIFORMS, - GL_MAX_FRAGMENT_IMAGE_UNIFORMS, GL_MAX_COMPUTE_IMAGE_UNIFORMS}; + GL_MAX_FRAGMENT_IMAGE_UNIFORMS, GL_MAX_COMPUTE_IMAGE_UNIFORMS, +}; template <typename T> T GetInteger(GLenum pname) { @@ -76,8 +78,8 @@ std::vector<std::string_view> GetExtensions() { return extensions; } -bool HasExtension(const std::vector<std::string_view>& images, std::string_view extension) { - return std::find(images.begin(), images.end(), extension) != images.end(); +bool HasExtension(std::span<const std::string_view> extensions, std::string_view extension) { + return std::ranges::find(extensions, extension) != extensions.end(); } u32 Extract(u32& base, u32& num, u32 amount, std::optional<GLenum> limit = {}) { @@ -91,8 +93,8 @@ u32 Extract(u32& base, u32& num, u32 amount, std::optional<GLenum> limit = {}) { std::array<u32, Tegra::Engines::MaxShaderTypes> BuildMaxUniformBuffers() noexcept { std::array<u32, Tegra::Engines::MaxShaderTypes> max; - std::transform(LimitUBOs.begin(), LimitUBOs.end(), max.begin(), - [](GLenum pname) { return GetInteger<u32>(pname); }); + std::ranges::transform(LIMIT_UBOS, max.begin(), + [](GLenum pname) { return GetInteger<u32>(pname); }); return max; } @@ -115,9 +117,10 @@ std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindin for (std::size_t i = 0; i < NumStages; ++i) { const std::size_t stage = stage_swizzle[i]; bindings[stage] = { - Extract(base_ubo, num_ubos, total_ubos / NumStages, LimitUBOs[stage]), - Extract(base_ssbo, num_ssbos, total_ssbos / NumStages, LimitSSBOs[stage]), - Extract(base_samplers, num_samplers, total_samplers / NumStages, LimitSamplers[stage])}; + Extract(base_ubo, num_ubos, total_ubos / NumStages, LIMIT_UBOS[stage]), + Extract(base_ssbo, num_ssbos, total_ssbos / NumStages, LIMIT_SSBOS[stage]), + Extract(base_samplers, num_samplers, total_samplers / NumStages, + LIMIT_SAMPLERS[stage])}; } u32 num_images = GetInteger<u32>(GL_MAX_IMAGE_UNITS); @@ -130,7 +133,7 @@ std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindin // Reserve at least 4 image bindings on the fragment stage. bindings[4].image = - Extract(base_images, num_images, std::max(4U, num_images / NumStages), LimitImages[4]); + Extract(base_images, num_images, std::max(4U, num_images / NumStages), LIMIT_IMAGES[4]); // This is guaranteed to be at least 1. const u32 total_extracted_images = num_images / (NumStages - 1); @@ -142,7 +145,7 @@ std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindin continue; } bindings[stage].image = - Extract(base_images, num_images, total_extracted_images, LimitImages[stage]); + Extract(base_images, num_images, total_extracted_images, LIMIT_IMAGES[stage]); } // Compute doesn't care about any of this. @@ -188,17 +191,24 @@ bool IsASTCSupported() { return true; } +[[nodiscard]] bool IsDebugToolAttached(std::span<const std::string_view> extensions) { + const bool nsight = std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED"); + return nsight || HasExtension(extensions, "GL_EXT_debug_tool"); +} } // Anonymous namespace -Device::Device() - : max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} { +Device::Device() { + if (!GLAD_GL_VERSION_4_6) { + LOG_ERROR(Render_OpenGL, "OpenGL 4.6 is not available"); + throw std::runtime_error{"Insufficient version"}; + } const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR)); - const std::string_view renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER)); const std::string_view version = reinterpret_cast<const char*>(glGetString(GL_VERSION)); const std::vector extensions = GetExtensions(); const bool is_nvidia = vendor == "NVIDIA Corporation"; const bool is_amd = vendor == "ATI Technologies Inc."; + const bool is_intel = vendor == "Intel"; bool disable_fast_buffer_sub_data = false; if (is_nvidia && version == "4.6.0 NVIDIA 443.24") { @@ -208,8 +218,10 @@ Device::Device() disable_fast_buffer_sub_data = true; } - uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT); - shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT); + max_uniform_buffers = BuildMaxUniformBuffers(); + base_bindings = BuildBaseBindings(); + uniform_buffer_alignment = GetInteger<size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT); + shader_storage_alignment = GetInteger<size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT); max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS); max_varyings = GetInteger<u32>(GL_MAX_VARYING_VECTORS); max_compute_shared_memory_size = GetInteger<u32>(GL_MAX_COMPUTE_SHARED_MEMORY_SIZE); @@ -223,8 +235,10 @@ Device::Device() has_variable_aoffi = TestVariableAoffi(); has_component_indexing_bug = is_amd; has_precise_bug = TestPreciseBug(); + has_broken_texture_view_formats = is_amd || is_intel; has_nv_viewport_array2 = GLAD_GL_NV_viewport_array2; has_vertex_buffer_unified_memory = GLAD_GL_NV_vertex_buffer_unified_memory; + has_debugging_tool_attached = IsDebugToolAttached(extensions); // At the moment of writing this, only Nvidia's driver optimizes BufferSubData on exclusive // uniform buffers as "push constants" @@ -235,10 +249,13 @@ Device::Device() GLAD_GL_NV_transform_feedback && GLAD_GL_NV_transform_feedback2; use_asynchronous_shaders = Settings::values.use_asynchronous_shaders.GetValue(); + use_driver_cache = is_nvidia; LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi); LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug); LOG_INFO(Render_OpenGL, "Renderer_PreciseBug: {}", has_precise_bug); + LOG_INFO(Render_OpenGL, "Renderer_BrokenTextureViewFormats: {}", + has_broken_texture_view_formats); if (Settings::values.use_assembly_shaders.GetValue() && !use_assembly_shaders) { LOG_ERROR(Render_OpenGL, "Assembly shaders enabled but not supported"); diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h index 8a4b6b9fc..ee053776d 100644 --- a/src/video_core/renderer_opengl/gl_device.h +++ b/src/video_core/renderer_opengl/gl_device.h @@ -10,11 +10,9 @@ namespace OpenGL { -static constexpr u32 EmulationUniformBlockBinding = 0; - -class Device final { +class Device { public: - struct BaseBindings final { + struct BaseBindings { u32 uniform_buffer{}; u32 shader_storage_buffer{}; u32 sampler{}; @@ -36,11 +34,11 @@ public: return GetBaseBindings(static_cast<std::size_t>(shader_type)); } - std::size_t GetUniformBufferAlignment() const { + size_t GetUniformBufferAlignment() const { return uniform_buffer_alignment; } - std::size_t GetShaderStorageBufferAlignment() const { + size_t GetShaderStorageBufferAlignment() const { return shader_storage_alignment; } @@ -96,6 +94,10 @@ public: return has_precise_bug; } + bool HasBrokenTextureViewFormats() const { + return has_broken_texture_view_formats; + } + bool HasFastBufferSubData() const { return has_fast_buffer_sub_data; } @@ -104,6 +106,10 @@ public: return has_nv_viewport_array2; } + bool HasDebuggingToolAttached() const { + return has_debugging_tool_attached; + } + bool UseAssemblyShaders() const { return use_assembly_shaders; } @@ -112,14 +118,18 @@ public: return use_asynchronous_shaders; } + bool UseDriverCache() const { + return use_driver_cache; + } + private: static bool TestVariableAoffi(); static bool TestPreciseBug(); std::array<u32, Tegra::Engines::MaxShaderTypes> max_uniform_buffers{}; std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings{}; - std::size_t uniform_buffer_alignment{}; - std::size_t shader_storage_alignment{}; + size_t uniform_buffer_alignment{}; + size_t shader_storage_alignment{}; u32 max_vertex_attributes{}; u32 max_varyings{}; u32 max_compute_shared_memory_size{}; @@ -133,10 +143,13 @@ private: bool has_variable_aoffi{}; bool has_component_indexing_bug{}; bool has_precise_bug{}; + bool has_broken_texture_view_formats{}; bool has_fast_buffer_sub_data{}; bool has_nv_viewport_array2{}; + bool has_debugging_tool_attached{}; bool use_assembly_shaders{}; bool use_asynchronous_shaders{}; + bool use_driver_cache{}; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_fence_manager.cpp b/src/video_core/renderer_opengl/gl_fence_manager.cpp index 3d2588dd2..151290101 100644 --- a/src/video_core/renderer_opengl/gl_fence_manager.cpp +++ b/src/video_core/renderer_opengl/gl_fence_manager.cpp @@ -11,10 +11,10 @@ namespace OpenGL { -GLInnerFence::GLInnerFence(u32 payload, bool is_stubbed) : FenceBase(payload, is_stubbed) {} +GLInnerFence::GLInnerFence(u32 payload_, bool is_stubbed_) : FenceBase{payload_, is_stubbed_} {} -GLInnerFence::GLInnerFence(GPUVAddr address, u32 payload, bool is_stubbed) - : FenceBase(address, payload, is_stubbed) {} +GLInnerFence::GLInnerFence(GPUVAddr address_, u32 payload_, bool is_stubbed_) + : FenceBase{address_, payload_, is_stubbed_} {} GLInnerFence::~GLInnerFence() = default; @@ -45,11 +45,10 @@ void GLInnerFence::Wait() { glClientWaitSync(sync_object.handle, 0, GL_TIMEOUT_IGNORED); } -FenceManagerOpenGL::FenceManagerOpenGL(Core::System& system, - VideoCore::RasterizerInterface& rasterizer, - TextureCacheOpenGL& texture_cache, - OGLBufferCache& buffer_cache, QueryCache& query_cache) - : GenericFenceManager(system, rasterizer, texture_cache, buffer_cache, query_cache) {} +FenceManagerOpenGL::FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer_, + Tegra::GPU& gpu_, TextureCache& texture_cache_, + BufferCache& buffer_cache_, QueryCache& query_cache_) + : GenericFenceManager{rasterizer_, gpu_, texture_cache_, buffer_cache_, query_cache_} {} Fence FenceManagerOpenGL::CreateFence(u32 value, bool is_stubbed) { return std::make_shared<GLInnerFence>(value, is_stubbed); diff --git a/src/video_core/renderer_opengl/gl_fence_manager.h b/src/video_core/renderer_opengl/gl_fence_manager.h index 1686cf5c8..e714aa115 100644 --- a/src/video_core/renderer_opengl/gl_fence_manager.h +++ b/src/video_core/renderer_opengl/gl_fence_manager.h @@ -17,8 +17,8 @@ namespace OpenGL { class GLInnerFence : public VideoCommon::FenceBase { public: - GLInnerFence(u32 payload, bool is_stubbed); - GLInnerFence(GPUVAddr address, u32 payload, bool is_stubbed); + explicit GLInnerFence(u32 payload_, bool is_stubbed_); + explicit GLInnerFence(GPUVAddr address_, u32 payload_, bool is_stubbed_); ~GLInnerFence(); void Queue(); @@ -32,14 +32,13 @@ private: }; using Fence = std::shared_ptr<GLInnerFence>; -using GenericFenceManager = - VideoCommon::FenceManager<Fence, TextureCacheOpenGL, OGLBufferCache, QueryCache>; +using GenericFenceManager = VideoCommon::FenceManager<Fence, TextureCache, BufferCache, QueryCache>; class FenceManagerOpenGL final : public GenericFenceManager { public: - FenceManagerOpenGL(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - TextureCacheOpenGL& texture_cache, OGLBufferCache& buffer_cache, - QueryCache& query_cache); + explicit FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu, + TextureCache& texture_cache, BufferCache& buffer_cache, + QueryCache& query_cache); protected: Fence CreateFence(u32 value, bool is_stubbed) override; diff --git a/src/video_core/renderer_opengl/gl_framebuffer_cache.cpp b/src/video_core/renderer_opengl/gl_framebuffer_cache.cpp deleted file mode 100644 index b8a512cb6..000000000 --- a/src/video_core/renderer_opengl/gl_framebuffer_cache.cpp +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include <tuple> -#include <unordered_map> -#include <utility> - -#include <glad/glad.h> - -#include "common/common_types.h" -#include "video_core/engines/maxwell_3d.h" -#include "video_core/renderer_opengl/gl_framebuffer_cache.h" - -namespace OpenGL { - -using Maxwell = Tegra::Engines::Maxwell3D::Regs; -using VideoCore::Surface::SurfaceType; - -FramebufferCacheOpenGL::FramebufferCacheOpenGL() = default; - -FramebufferCacheOpenGL::~FramebufferCacheOpenGL() = default; - -GLuint FramebufferCacheOpenGL::GetFramebuffer(const FramebufferCacheKey& key) { - const auto [entry, is_cache_miss] = cache.try_emplace(key); - auto& framebuffer{entry->second}; - if (is_cache_miss) { - framebuffer = CreateFramebuffer(key); - } - return framebuffer.handle; -} - -OGLFramebuffer FramebufferCacheOpenGL::CreateFramebuffer(const FramebufferCacheKey& key) { - OGLFramebuffer framebuffer; - framebuffer.Create(); - - // TODO(Rodrigo): Use DSA here after Nvidia fixes their framebuffer DSA bugs. - glBindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer.handle); - - if (key.zeta) { - const bool stencil = key.zeta->GetSurfaceParams().type == SurfaceType::DepthStencil; - const GLenum attach_target = stencil ? GL_DEPTH_STENCIL_ATTACHMENT : GL_DEPTH_ATTACHMENT; - key.zeta->Attach(attach_target, GL_DRAW_FRAMEBUFFER); - } - - std::size_t num_buffers = 0; - std::array<GLenum, Maxwell::NumRenderTargets> targets; - - for (std::size_t index = 0; index < Maxwell::NumRenderTargets; ++index) { - if (!key.colors[index]) { - targets[index] = GL_NONE; - continue; - } - const GLenum attach_target = GL_COLOR_ATTACHMENT0 + static_cast<GLenum>(index); - key.colors[index]->Attach(attach_target, GL_DRAW_FRAMEBUFFER); - - const u32 attachment = (key.color_attachments >> (BitsPerAttachment * index)) & 0b1111; - targets[index] = GL_COLOR_ATTACHMENT0 + attachment; - num_buffers = index + 1; - } - - if (num_buffers > 0) { - glDrawBuffers(static_cast<GLsizei>(num_buffers), std::data(targets)); - } else { - glDrawBuffer(GL_NONE); - } - - return framebuffer; -} - -std::size_t FramebufferCacheKey::Hash() const noexcept { - std::size_t hash = std::hash<View>{}(zeta); - for (const auto& color : colors) { - hash ^= std::hash<View>{}(color); - } - hash ^= static_cast<std::size_t>(color_attachments) << 16; - return hash; -} - -bool FramebufferCacheKey::operator==(const FramebufferCacheKey& rhs) const noexcept { - return std::tie(colors, zeta, color_attachments) == - std::tie(rhs.colors, rhs.zeta, rhs.color_attachments); -} - -} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_framebuffer_cache.h b/src/video_core/renderer_opengl/gl_framebuffer_cache.h deleted file mode 100644 index 8f698fee0..000000000 --- a/src/video_core/renderer_opengl/gl_framebuffer_cache.h +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <array> -#include <cstddef> -#include <unordered_map> - -#include <glad/glad.h> - -#include "common/common_types.h" -#include "video_core/engines/maxwell_3d.h" -#include "video_core/renderer_opengl/gl_resource_manager.h" -#include "video_core/renderer_opengl/gl_texture_cache.h" - -namespace OpenGL { - -constexpr std::size_t BitsPerAttachment = 4; - -struct FramebufferCacheKey { - View zeta; - std::array<View, Tegra::Engines::Maxwell3D::Regs::NumRenderTargets> colors; - u32 color_attachments = 0; - - std::size_t Hash() const noexcept; - - bool operator==(const FramebufferCacheKey& rhs) const noexcept; - - bool operator!=(const FramebufferCacheKey& rhs) const noexcept { - return !operator==(rhs); - } - - void SetAttachment(std::size_t index, u32 attachment) { - color_attachments |= attachment << (BitsPerAttachment * index); - } -}; - -} // namespace OpenGL - -namespace std { - -template <> -struct hash<OpenGL::FramebufferCacheKey> { - std::size_t operator()(const OpenGL::FramebufferCacheKey& k) const noexcept { - return k.Hash(); - } -}; - -} // namespace std - -namespace OpenGL { - -class FramebufferCacheOpenGL { -public: - FramebufferCacheOpenGL(); - ~FramebufferCacheOpenGL(); - - GLuint GetFramebuffer(const FramebufferCacheKey& key); - -private: - OGLFramebuffer CreateFramebuffer(const FramebufferCacheKey& key); - - std::unordered_map<FramebufferCacheKey, OGLFramebuffer> cache; -}; - -} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_query_cache.cpp b/src/video_core/renderer_opengl/gl_query_cache.cpp index d7ba57aca..acebbf5f4 100644 --- a/src/video_core/renderer_opengl/gl_query_cache.cpp +++ b/src/video_core/renderer_opengl/gl_query_cache.cpp @@ -30,12 +30,9 @@ constexpr GLenum GetTarget(VideoCore::QueryType type) { } // Anonymous namespace -QueryCache::QueryCache(Core::System& system, RasterizerOpenGL& gl_rasterizer) - : VideoCommon::QueryCacheBase< - QueryCache, CachedQuery, CounterStream, HostCounter, - std::vector<OGLQuery>>{system, - static_cast<VideoCore::RasterizerInterface&>(gl_rasterizer)}, - gl_rasterizer{gl_rasterizer} {} +QueryCache::QueryCache(RasterizerOpenGL& rasterizer_, Tegra::Engines::Maxwell3D& maxwell3d_, + Tegra::MemoryManager& gpu_memory_) + : QueryCacheBase(rasterizer_, maxwell3d_, gpu_memory_), gl_rasterizer{rasterizer_} {} QueryCache::~QueryCache() = default; @@ -60,10 +57,11 @@ bool QueryCache::AnyCommandQueued() const noexcept { return gl_rasterizer.AnyCommandQueued(); } -HostCounter::HostCounter(QueryCache& cache, std::shared_ptr<HostCounter> dependency, - VideoCore::QueryType type) - : VideoCommon::HostCounterBase<QueryCache, HostCounter>{std::move(dependency)}, cache{cache}, - type{type}, query{cache.AllocateQuery(type)} { +HostCounter::HostCounter(QueryCache& cache_, std::shared_ptr<HostCounter> dependency_, + VideoCore::QueryType type_) + : HostCounterBase{std::move(dependency_)}, cache{cache_}, type{type_}, query{ + cache.AllocateQuery( + type)} { glBeginQuery(GetTarget(type), query.handle); } @@ -87,11 +85,14 @@ u64 HostCounter::BlockingQuery() const { return static_cast<u64>(value); } -CachedQuery::CachedQuery(QueryCache& cache, VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr) - : VideoCommon::CachedQueryBase<HostCounter>{cpu_addr, host_ptr}, cache{&cache}, type{type} {} +CachedQuery::CachedQuery(QueryCache& cache_, VideoCore::QueryType type_, VAddr cpu_addr_, + u8* host_ptr_) + : CachedQueryBase{cpu_addr_, host_ptr_}, cache{&cache_}, type{type_} {} + +CachedQuery::~CachedQuery() = default; CachedQuery::CachedQuery(CachedQuery&& rhs) noexcept - : VideoCommon::CachedQueryBase<HostCounter>(std::move(rhs)), cache{rhs.cache}, type{rhs.type} {} + : CachedQueryBase(std::move(rhs)), cache{rhs.cache}, type{rhs.type} {} CachedQuery& CachedQuery::operator=(CachedQuery&& rhs) noexcept { cache = rhs.cache; diff --git a/src/video_core/renderer_opengl/gl_query_cache.h b/src/video_core/renderer_opengl/gl_query_cache.h index d8e7052a1..7bbe5cfe9 100644 --- a/src/video_core/renderer_opengl/gl_query_cache.h +++ b/src/video_core/renderer_opengl/gl_query_cache.h @@ -26,10 +26,11 @@ class RasterizerOpenGL; using CounterStream = VideoCommon::CounterStreamBase<QueryCache, HostCounter>; -class QueryCache final : public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, - HostCounter, std::vector<OGLQuery>> { +class QueryCache final + : public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, HostCounter> { public: - explicit QueryCache(Core::System& system, RasterizerOpenGL& rasterizer); + explicit QueryCache(RasterizerOpenGL& rasterizer_, Tegra::Engines::Maxwell3D& maxwell3d_, + Tegra::MemoryManager& gpu_memory_); ~QueryCache(); OGLQuery AllocateQuery(VideoCore::QueryType type); @@ -40,12 +41,13 @@ public: private: RasterizerOpenGL& gl_rasterizer; + std::array<std::vector<OGLQuery>, VideoCore::NumQueryTypes> query_pools; }; class HostCounter final : public VideoCommon::HostCounterBase<QueryCache, HostCounter> { public: - explicit HostCounter(QueryCache& cache, std::shared_ptr<HostCounter> dependency, - VideoCore::QueryType type); + explicit HostCounter(QueryCache& cache_, std::shared_ptr<HostCounter> dependency_, + VideoCore::QueryType type_); ~HostCounter(); void EndQuery(); @@ -60,12 +62,14 @@ private: class CachedQuery final : public VideoCommon::CachedQueryBase<HostCounter> { public: - explicit CachedQuery(QueryCache& cache, VideoCore::QueryType type, VAddr cpu_addr, - u8* host_ptr); - CachedQuery(CachedQuery&& rhs) noexcept; - CachedQuery(const CachedQuery&) = delete; + explicit CachedQuery(QueryCache& cache_, VideoCore::QueryType type_, VAddr cpu_addr_, + u8* host_ptr_); + ~CachedQuery() override; + CachedQuery(CachedQuery&& rhs) noexcept; CachedQuery& operator=(CachedQuery&& rhs) noexcept; + + CachedQuery(const CachedQuery&) = delete; CachedQuery& operator=(const CachedQuery&) = delete; void Flush() override; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 4af5824cd..418644108 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -25,48 +25,49 @@ #include "video_core/engines/maxwell_3d.h" #include "video_core/engines/shader_type.h" #include "video_core/memory_manager.h" +#include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_query_cache.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_shader_cache.h" +#include "video_core/renderer_opengl/gl_texture_cache.h" #include "video_core/renderer_opengl/maxwell_to_gl.h" #include "video_core/renderer_opengl/renderer_opengl.h" #include "video_core/shader_cache.h" +#include "video_core/texture_cache/texture_cache.h" namespace OpenGL { using Maxwell = Tegra::Engines::Maxwell3D::Regs; +using GLvec4 = std::array<GLfloat, 4>; using Tegra::Engines::ShaderType; using VideoCore::Surface::PixelFormat; using VideoCore::Surface::SurfaceTarget; using VideoCore::Surface::SurfaceType; -MICROPROFILE_DEFINE(OpenGL_VAO, "OpenGL", "Vertex Format Setup", MP_RGB(128, 128, 192)); -MICROPROFILE_DEFINE(OpenGL_VB, "OpenGL", "Vertex Buffer Setup", MP_RGB(128, 128, 192)); -MICROPROFILE_DEFINE(OpenGL_Shader, "OpenGL", "Shader Setup", MP_RGB(128, 128, 192)); -MICROPROFILE_DEFINE(OpenGL_UBO, "OpenGL", "Const Buffer Setup", MP_RGB(128, 128, 192)); -MICROPROFILE_DEFINE(OpenGL_Index, "OpenGL", "Index Buffer Setup", MP_RGB(128, 128, 192)); -MICROPROFILE_DEFINE(OpenGL_Texture, "OpenGL", "Texture Setup", MP_RGB(128, 128, 192)); -MICROPROFILE_DEFINE(OpenGL_Framebuffer, "OpenGL", "Framebuffer Setup", MP_RGB(128, 128, 192)); MICROPROFILE_DEFINE(OpenGL_Drawing, "OpenGL", "Drawing", MP_RGB(128, 128, 192)); +MICROPROFILE_DEFINE(OpenGL_Clears, "OpenGL", "Clears", MP_RGB(128, 128, 192)); MICROPROFILE_DEFINE(OpenGL_Blits, "OpenGL", "Blits", MP_RGB(128, 128, 192)); -MICROPROFILE_DEFINE(OpenGL_CacheManagement, "OpenGL", "Cache Mgmt", MP_RGB(100, 255, 100)); -MICROPROFILE_DEFINE(OpenGL_PrimitiveAssembly, "OpenGL", "Prim Asmbl", MP_RGB(255, 100, 100)); +MICROPROFILE_DEFINE(OpenGL_CacheManagement, "OpenGL", "Cache Management", MP_RGB(100, 255, 100)); namespace { -constexpr std::size_t NUM_CONST_BUFFERS_PER_STAGE = 18; -constexpr std::size_t NUM_CONST_BUFFERS_BYTES_PER_STAGE = - NUM_CONST_BUFFERS_PER_STAGE * Maxwell::MaxConstBufferSize; -constexpr std::size_t TOTAL_CONST_BUFFER_BYTES = - NUM_CONST_BUFFERS_BYTES_PER_STAGE * Maxwell::MaxShaderStage; +constexpr size_t NUM_SUPPORTED_VERTEX_ATTRIBUTES = 16; -constexpr std::size_t NUM_SUPPORTED_VERTEX_ATTRIBUTES = 16; -constexpr std::size_t NUM_SUPPORTED_VERTEX_BINDINGS = 16; +struct TextureHandle { + constexpr TextureHandle(u32 data, bool via_header_index) { + const Tegra::Texture::TextureHandle handle{data}; + image = handle.tic_id; + sampler = via_header_index ? image : handle.tsc_id.Value(); + } + + u32 image; + u32 sampler; +}; template <typename Engine, typename Entry> -Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry, - ShaderType shader_type, std::size_t index = 0) { +TextureHandle GetTextureInfo(const Engine& engine, bool via_header_index, const Entry& entry, + ShaderType shader_type, size_t index = 0) { if constexpr (std::is_same_v<Entry, SamplerEntry>) { if (entry.is_separated) { const u32 buffer_1 = entry.buffer; @@ -75,36 +76,16 @@ Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry const u32 offset_2 = entry.secondary_offset; const u32 handle_1 = engine.AccessConstBuffer32(shader_type, buffer_1, offset_1); const u32 handle_2 = engine.AccessConstBuffer32(shader_type, buffer_2, offset_2); - return engine.GetTextureInfo(handle_1 | handle_2); + return TextureHandle(handle_1 | handle_2, via_header_index); } } if (entry.is_bindless) { - const u32 handle = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset); - return engine.GetTextureInfo(handle); - } - - const auto& gpu_profile = engine.AccessGuestDriverProfile(); - const u32 offset = entry.offset + static_cast<u32>(index * gpu_profile.GetTextureHandlerSize()); - if constexpr (std::is_same_v<Engine, Tegra::Engines::Maxwell3D>) { - return engine.GetStageTexture(shader_type, offset); - } else { - return engine.GetTexture(offset); - } -} - -std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer, - const ConstBufferEntry& entry) { - if (!entry.IsIndirect()) { - return entry.GetSize(); + const u32 raw = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset); + return TextureHandle(raw, via_header_index); } - - if (buffer.size > Maxwell::MaxConstBufferSize) { - LOG_WARNING(Render_OpenGL, "Indirect constbuffer size {} exceeds maximum {}", buffer.size, - Maxwell::MaxConstBufferSize); - return Maxwell::MaxConstBufferSize; - } - - return buffer.size; + const u32 buffer = engine.GetBoundBuffer(); + const u64 offset = (entry.offset + index) * sizeof(u32); + return TextureHandle(engine.AccessConstBuffer32(shader_type, buffer, offset), via_header_index); } /// Translates hardware transform feedback indices @@ -131,7 +112,7 @@ std::pair<GLint, GLint> TransformFeedbackEnum(u8 location) { case 43: return {GL_BACK_SECONDARY_COLOR_NV, 0}; } - UNIMPLEMENTED_MSG("index={}", static_cast<int>(index)); + UNIMPLEMENTED_MSG("index={}", index); return {GL_POSITION, 0}; } @@ -139,72 +120,74 @@ void oglEnable(GLenum cap, bool state) { (state ? glEnable : glDisable)(cap); } -void UpdateBindlessPointers(GLenum target, GLuint64EXT* pointers, std::size_t num_entries) { - if (num_entries == 0) { - return; +ImageViewType ImageViewTypeFromEntry(const SamplerEntry& entry) { + if (entry.is_buffer) { + return ImageViewType::Buffer; } - if (num_entries % 2 == 1) { - pointers[num_entries] = 0; + switch (entry.type) { + case Tegra::Shader::TextureType::Texture1D: + return entry.is_array ? ImageViewType::e1DArray : ImageViewType::e1D; + case Tegra::Shader::TextureType::Texture2D: + return entry.is_array ? ImageViewType::e2DArray : ImageViewType::e2D; + case Tegra::Shader::TextureType::Texture3D: + return ImageViewType::e3D; + case Tegra::Shader::TextureType::TextureCube: + return entry.is_array ? ImageViewType::CubeArray : ImageViewType::Cube; } - const GLsizei num_vectors = static_cast<GLsizei>((num_entries + 1) / 2); - glProgramLocalParametersI4uivNV(target, 0, num_vectors, - reinterpret_cast<const GLuint*>(pointers)); + UNREACHABLE(); + return ImageViewType::e2D; } -} // Anonymous namespace - -RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window, - const Device& device, ScreenInfo& info, - ProgramManager& program_manager, StateTracker& state_tracker) - : RasterizerAccelerated{system.Memory()}, device{device}, texture_cache{system, *this, device, - state_tracker}, - shader_cache{*this, system, emu_window, device}, query_cache{system, *this}, - buffer_cache{*this, system, device, STREAM_BUFFER_SIZE}, - fence_manager{system, *this, texture_cache, buffer_cache, query_cache}, system{system}, - screen_info{info}, program_manager{program_manager}, state_tracker{state_tracker}, - async_shaders{emu_window} { - CheckExtensions(); - - unified_uniform_buffer.Create(); - glNamedBufferStorage(unified_uniform_buffer.handle, TOTAL_CONST_BUFFER_BYTES, nullptr, 0); - - if (device.UseAssemblyShaders()) { - glCreateBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data()); - for (const GLuint cbuf : staging_cbufs) { - glNamedBufferStorage(cbuf, static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize), - nullptr, 0); - } +ImageViewType ImageViewTypeFromEntry(const ImageEntry& entry) { + switch (entry.type) { + case Tegra::Shader::ImageType::Texture1D: + return ImageViewType::e1D; + case Tegra::Shader::ImageType::Texture1DArray: + return ImageViewType::e1DArray; + case Tegra::Shader::ImageType::Texture2D: + return ImageViewType::e2D; + case Tegra::Shader::ImageType::Texture2DArray: + return ImageViewType::e2DArray; + case Tegra::Shader::ImageType::Texture3D: + return ImageViewType::e3D; + case Tegra::Shader::ImageType::TextureBuffer: + return ImageViewType::Buffer; } + UNREACHABLE(); + return ImageViewType::e2D; +} + +} // Anonymous namespace +RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_, + Core::Memory::Memory& cpu_memory_, const Device& device_, + ScreenInfo& screen_info_, ProgramManager& program_manager_, + StateTracker& state_tracker_) + : RasterizerAccelerated(cpu_memory_), gpu(gpu_), maxwell3d(gpu.Maxwell3D()), + kepler_compute(gpu.KeplerCompute()), gpu_memory(gpu.MemoryManager()), device(device_), + screen_info(screen_info_), program_manager(program_manager_), state_tracker(state_tracker_), + texture_cache_runtime(device, program_manager, state_tracker), + texture_cache(texture_cache_runtime, *this, maxwell3d, kepler_compute, gpu_memory), + buffer_cache_runtime(device), + buffer_cache(*this, maxwell3d, kepler_compute, gpu_memory, cpu_memory_, buffer_cache_runtime), + shader_cache(*this, emu_window_, gpu, maxwell3d, kepler_compute, gpu_memory, device), + query_cache(*this, maxwell3d, gpu_memory), + fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache), + async_shaders(emu_window_) { if (device.UseAsynchronousShaders()) { async_shaders.AllocateWorkers(); } } -RasterizerOpenGL::~RasterizerOpenGL() { - if (device.UseAssemblyShaders()) { - glDeleteBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data()); - } -} - -void RasterizerOpenGL::CheckExtensions() { - if (!GLAD_GL_ARB_texture_filter_anisotropic && !GLAD_GL_EXT_texture_filter_anisotropic) { - LOG_WARNING( - Render_OpenGL, - "Anisotropic filter is not supported! This can cause graphical issues in some games."); - } -} +RasterizerOpenGL::~RasterizerOpenGL() = default; -void RasterizerOpenGL::SetupVertexFormat() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; +void RasterizerOpenGL::SyncVertexFormats() { + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::VertexFormats]) { return; } flags[Dirty::VertexFormats] = false; - MICROPROFILE_SCOPE(OpenGL_VAO); - // Use the vertex array as-is, assumes that the data is formatted correctly for OpenGL. Enables // the first 16 vertex attributes always, as we don't know which ones are actually used until // shader time. Note, Tegra technically supports 32, but we're capping this to 16 for now to @@ -217,7 +200,7 @@ void RasterizerOpenGL::SetupVertexFormat() { } flags[Dirty::VertexFormat0 + index] = false; - const auto attrib = gpu.regs.vertex_attrib_format[index]; + const auto attrib = maxwell3d.regs.vertex_attrib_format[index]; const auto gl_index = static_cast<GLuint>(index); // Disable constant attributes. @@ -240,64 +223,14 @@ void RasterizerOpenGL::SetupVertexFormat() { } } -void RasterizerOpenGL::SetupVertexBuffer() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; - if (!flags[Dirty::VertexBuffers]) { - return; - } - flags[Dirty::VertexBuffers] = false; - - MICROPROFILE_SCOPE(OpenGL_VB); - - const bool use_unified_memory = device.HasVertexBufferUnifiedMemory(); - - // Upload all guest vertex arrays sequentially to our buffer - const auto& regs = gpu.regs; - for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_BINDINGS; ++index) { - if (!flags[Dirty::VertexBuffer0 + index]) { - continue; - } - flags[Dirty::VertexBuffer0 + index] = false; - - const auto& vertex_array = regs.vertex_array[index]; - if (!vertex_array.IsEnabled()) { - continue; - } - - const GPUVAddr start = vertex_array.StartAddress(); - const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress(); - ASSERT(end >= start); - - const GLuint gl_index = static_cast<GLuint>(index); - const u64 size = end - start; - if (size == 0) { - glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride); - if (use_unified_memory) { - glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index, 0, 0); - } - continue; - } - const auto info = buffer_cache.UploadMemory(start, size); - if (use_unified_memory) { - glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride); - glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index, - info.address + info.offset, size); - } else { - glBindVertexBuffer(gl_index, info.handle, info.offset, vertex_array.stride); - } - } -} - -void RasterizerOpenGL::SetupVertexInstances() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; +void RasterizerOpenGL::SyncVertexInstances() { + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::VertexInstances]) { return; } flags[Dirty::VertexInstances] = false; - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_ATTRIBUTES; ++index) { if (!flags[Dirty::VertexInstance0 + index]) { continue; @@ -311,26 +244,21 @@ void RasterizerOpenGL::SetupVertexInstances() { } } -GLintptr RasterizerOpenGL::SetupIndexBuffer() { - MICROPROFILE_SCOPE(OpenGL_Index); - const auto& regs = system.GPU().Maxwell3D().regs; - const std::size_t size = CalculateIndexBufferSize(); - const auto info = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size); - glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, info.handle); - return info.offset; -} - -void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { - MICROPROFILE_SCOPE(OpenGL_Shader); - auto& gpu = system.GPU().Maxwell3D(); +void RasterizerOpenGL::SetupShaders(bool is_indexed) { u32 clip_distances = 0; + std::array<Shader*, Maxwell::MaxShaderStage> shaders{}; + image_view_indices.clear(); + sampler_handles.clear(); + + texture_cache.SynchronizeGraphicsDescriptors(); + for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) { - const auto& shader_config = gpu.regs.shader_config[index]; + const auto& shader_config = maxwell3d.regs.shader_config[index]; const auto program{static_cast<Maxwell::ShaderProgram>(index)}; // Skip stages that are not enabled - if (!gpu.regs.IsShaderConfigEnabled(index)) { + if (!maxwell3d.regs.IsShaderConfigEnabled(index)) { switch (program) { case Maxwell::ShaderProgram::Geometry: program_manager.UseGeometryShader(0); @@ -343,7 +271,6 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { } continue; } - // Currently this stages are not supported in the OpenGL backend. // TODO(Blinkhawk): Port tesselation shaders from Vulkan to OpenGL if (program == Maxwell::ShaderProgram::TesselationControl || @@ -352,7 +279,6 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { } Shader* const shader = shader_cache.GetStageProgram(program, async_shaders); - const GLuint program_handle = shader->IsBuilt() ? shader->GetHandle() : 0; switch (program) { case Maxwell::ShaderProgram::VertexA: @@ -368,14 +294,25 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { default: UNIMPLEMENTED_MSG("Unimplemented shader index={}, enable={}, offset=0x{:08X}", index, shader_config.enable.Value(), shader_config.offset); + break; } // Stage indices are 0 - 5 - const std::size_t stage = index == 0 ? 0 : index - 1; - SetupDrawConstBuffers(stage, shader); - SetupDrawGlobalMemory(stage, shader); - SetupDrawTextures(stage, shader); - SetupDrawImages(stage, shader); + const size_t stage = index == 0 ? 0 : index - 1; + shaders[stage] = shader; + + SetupDrawTextures(shader, stage); + SetupDrawImages(shader, stage); + + buffer_cache.SetEnabledUniformBuffers(stage, shader->GetEntries().enabled_uniform_buffers); + + buffer_cache.UnbindGraphicsStorageBuffers(stage); + u32 ssbo_index = 0; + for (const auto& buffer : shader->GetEntries().global_memory_entries) { + buffer_cache.BindGraphicsStorageBuffer(stage, ssbo_index, buffer.cbuf_index, + buffer.cbuf_offset, buffer.is_written); + ++ssbo_index; + } // Workaround for Intel drivers. // When a clip distance is enabled but not set in the shader it crops parts of the screen @@ -389,146 +326,43 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { ++index; } } - SyncClipEnabled(clip_distances); - gpu.dirty.flags[Dirty::Shaders] = false; -} - -std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const { - const auto& regs = system.GPU().Maxwell3D().regs; - - std::size_t size = 0; - for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) { - if (!regs.vertex_array[index].IsEnabled()) - continue; - - const GPUVAddr start = regs.vertex_array[index].StartAddress(); - const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress(); + maxwell3d.dirty.flags[Dirty::Shaders] = false; - size += end - start; - ASSERT(end >= start); - } - - return size; -} - -std::size_t RasterizerOpenGL::CalculateIndexBufferSize() const { - const auto& regs = system.GPU().Maxwell3D().regs; - - return static_cast<std::size_t>(regs.index_array.count) * - static_cast<std::size_t>(regs.index_array.FormatSizeInBytes()); -} - -void RasterizerOpenGL::LoadDiskResources(const std::atomic_bool& stop_loading, - const VideoCore::DiskResourceLoadCallback& callback) { - shader_cache.LoadDiskCache(stop_loading, callback); -} - -void RasterizerOpenGL::SetupDirtyFlags() { - state_tracker.Initialize(); -} - -void RasterizerOpenGL::ConfigureFramebuffers() { - MICROPROFILE_SCOPE(OpenGL_Framebuffer); - auto& gpu = system.GPU().Maxwell3D(); - if (!gpu.dirty.flags[VideoCommon::Dirty::RenderTargets]) { - return; - } - gpu.dirty.flags[VideoCommon::Dirty::RenderTargets] = false; + buffer_cache.UpdateGraphicsBuffers(is_indexed); - texture_cache.GuardRenderTargets(true); + const std::span indices_span(image_view_indices.data(), image_view_indices.size()); + texture_cache.FillGraphicsImageViews(indices_span, image_view_ids); - View depth_surface = texture_cache.GetDepthBufferSurface(true); + buffer_cache.BindHostGeometryBuffers(is_indexed); - const auto& regs = gpu.regs; - UNIMPLEMENTED_IF(regs.rt_separate_frag_data == 0); - - // Bind the framebuffer surfaces - FramebufferCacheKey key; - const auto colors_count = static_cast<std::size_t>(regs.rt_control.count); - for (std::size_t index = 0; index < colors_count; ++index) { - View color_surface{texture_cache.GetColorBufferSurface(index, true)}; - if (!color_surface) { + size_t image_view_index = 0; + size_t texture_index = 0; + size_t image_index = 0; + for (size_t stage = 0; stage < Maxwell::MaxShaderStage; ++stage) { + const Shader* const shader = shaders[stage]; + if (!shader) { continue; } - // Assume that a surface will be written to if it is used as a framebuffer, even - // if the shader doesn't actually write to it. - texture_cache.MarkColorBufferInUse(index); - - key.SetAttachment(index, regs.rt_control.GetMap(index)); - key.colors[index] = std::move(color_surface); + buffer_cache.BindHostStageBuffers(stage); + const auto& base = device.GetBaseBindings(stage); + BindTextures(shader->GetEntries(), base.sampler, base.image, image_view_index, + texture_index, image_index); } - - if (depth_surface) { - // Assume that a surface will be written to if it is used as a framebuffer, even if - // the shader doesn't actually write to it. - texture_cache.MarkDepthBufferInUse(); - key.zeta = std::move(depth_surface); - } - - texture_cache.GuardRenderTargets(false); - - glBindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer_cache.GetFramebuffer(key)); } -void RasterizerOpenGL::ConfigureClearFramebuffer(bool using_color, bool using_depth_stencil) { - auto& gpu = system.GPU().Maxwell3D(); - const auto& regs = gpu.regs; - - texture_cache.GuardRenderTargets(true); - View color_surface; - - if (using_color) { - // Determine if we have to preserve the contents. - // First we have to make sure all clear masks are enabled. - bool preserve_contents = !regs.clear_buffers.R || !regs.clear_buffers.G || - !regs.clear_buffers.B || !regs.clear_buffers.A; - const std::size_t index = regs.clear_buffers.RT; - if (regs.clear_flags.scissor) { - // Then we have to confirm scissor testing clears the whole image. - const auto& scissor = regs.scissor_test[0]; - preserve_contents |= scissor.min_x > 0; - preserve_contents |= scissor.min_y > 0; - preserve_contents |= scissor.max_x < regs.rt[index].width; - preserve_contents |= scissor.max_y < regs.rt[index].height; - } - - color_surface = texture_cache.GetColorBufferSurface(index, preserve_contents); - texture_cache.MarkColorBufferInUse(index); - } - - View depth_surface; - if (using_depth_stencil) { - bool preserve_contents = false; - if (regs.clear_flags.scissor) { - // For depth stencil clears we only have to confirm scissor test covers the whole image. - const auto& scissor = regs.scissor_test[0]; - preserve_contents |= scissor.min_x > 0; - preserve_contents |= scissor.min_y > 0; - preserve_contents |= scissor.max_x < regs.zeta_width; - preserve_contents |= scissor.max_y < regs.zeta_height; - } - - depth_surface = texture_cache.GetDepthBufferSurface(preserve_contents); - texture_cache.MarkDepthBufferInUse(); - } - texture_cache.GuardRenderTargets(false); - - FramebufferCacheKey key; - key.colors[0] = std::move(color_surface); - key.zeta = std::move(depth_surface); - - state_tracker.NotifyFramebuffer(); - glBindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer_cache.GetFramebuffer(key)); +void RasterizerOpenGL::LoadDiskResources(u64 title_id, const std::atomic_bool& stop_loading, + const VideoCore::DiskResourceLoadCallback& callback) { + shader_cache.LoadDiskCache(title_id, stop_loading, callback); } void RasterizerOpenGL::Clear() { - const auto& gpu = system.GPU().Maxwell3D(); - if (!gpu.ShouldExecute()) { + MICROPROFILE_SCOPE(OpenGL_Clears); + if (!maxwell3d.ShouldExecute()) { return; } - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; bool use_color{}; bool use_depth{}; bool use_stencil{}; @@ -537,8 +371,9 @@ void RasterizerOpenGL::Clear() { regs.clear_buffers.A) { use_color = true; - state_tracker.NotifyColorMask0(); - glColorMaski(0, regs.clear_buffers.R != 0, regs.clear_buffers.G != 0, + const GLuint index = regs.clear_buffers.RT; + state_tracker.NotifyColorMask(index); + glColorMaski(index, regs.clear_buffers.R != 0, regs.clear_buffers.G != 0, regs.clear_buffers.B != 0, regs.clear_buffers.A != 0); // TODO(Rodrigo): Determine if clamping is used on clears @@ -571,15 +406,15 @@ void RasterizerOpenGL::Clear() { state_tracker.NotifyScissor0(); glDisablei(GL_SCISSOR_TEST, 0); } - UNIMPLEMENTED_IF(regs.clear_flags.viewport); - ConfigureClearFramebuffer(use_color, use_depth || use_stencil); + std::scoped_lock lock{texture_cache.mutex}; + texture_cache.UpdateRenderTargets(true); + state_tracker.BindFramebuffer(texture_cache.GetFramebuffer()->Handle()); if (use_color) { - glClearBufferfv(GL_COLOR, 0, regs.clear_color); + glClearBufferfv(GL_COLOR, regs.clear_buffers.RT, regs.clear_color); } - if (use_depth && use_stencil) { glClearBufferfi(GL_DEPTH_STENCIL, 0, regs.clear_depth, regs.clear_stencil); } else if (use_depth) { @@ -587,115 +422,35 @@ void RasterizerOpenGL::Clear() { } else if (use_stencil) { glClearBufferiv(GL_STENCIL, 0, ®s.clear_stencil); } - ++num_queued_commands; } void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { MICROPROFILE_SCOPE(OpenGL_Drawing); - auto& gpu = system.GPU().Maxwell3D(); query_cache.UpdateCounters(); - SyncViewport(); - SyncRasterizeEnable(); - SyncPolygonModes(); - SyncColorMask(); - SyncFragmentColorClampState(); - SyncMultiSampleState(); - SyncDepthTestState(); - SyncDepthClamp(); - SyncStencilTestState(); - SyncBlendState(); - SyncLogicOpState(); - SyncCullMode(); - SyncPrimitiveRestart(); - SyncScissorTest(); - SyncPointState(); - SyncLineState(); - SyncPolygonOffset(); - SyncAlphaTest(); - SyncFramebufferSRGB(); - - buffer_cache.Acquire(); - current_cbuf = 0; - - std::size_t buffer_size = CalculateVertexArraysSize(); - - // Add space for index buffer - if (is_indexed) { - buffer_size = Common::AlignUp(buffer_size, 4) + CalculateIndexBufferSize(); - } - - // Uniform space for the 5 shader stages - buffer_size = - Common::AlignUp<std::size_t>(buffer_size, 4) + - (sizeof(MaxwellUniformData) + device.GetUniformBufferAlignment()) * Maxwell::MaxShaderStage; - - // Add space for at least 18 constant buffers - buffer_size += Maxwell::MaxConstBuffers * - (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); - - // Prepare the vertex array. - const bool invalidated = buffer_cache.Map(buffer_size); - - if (invalidated) { - // When the stream buffer has been invalidated, we have to consider vertex buffers as dirty - auto& dirty = gpu.dirty.flags; - dirty[Dirty::VertexBuffers] = true; - for (int index = Dirty::VertexBuffer0; index <= Dirty::VertexBuffer31; ++index) { - dirty[index] = true; - } - } - - // Prepare vertex array format. - SetupVertexFormat(); - - // Upload vertex and index data. - SetupVertexBuffer(); - SetupVertexInstances(); - GLintptr index_buffer_offset = 0; - if (is_indexed) { - index_buffer_offset = SetupIndexBuffer(); - } - - // Setup emulation uniform buffer. - if (!device.UseAssemblyShaders()) { - MaxwellUniformData ubo; - ubo.SetFromRegs(gpu); - const auto info = - buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment()); - glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, info.handle, info.offset, - static_cast<GLsizeiptr>(sizeof(ubo))); - } + SyncState(); // Setup shaders and their used resources. - texture_cache.GuardSamplers(true); - const GLenum primitive_mode = MaxwellToGL::PrimitiveTopology(gpu.regs.draw.topology); - SetupShaders(primitive_mode); - texture_cache.GuardSamplers(false); - - ConfigureFramebuffers(); - - // Signal the buffer cache that we are not going to upload more things. - buffer_cache.Unmap(); + std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; + SetupShaders(is_indexed); + texture_cache.UpdateRenderTargets(false); + state_tracker.BindFramebuffer(texture_cache.GetFramebuffer()->Handle()); program_manager.BindGraphicsPipeline(); - if (texture_cache.TextureBarrier()) { - glTextureBarrier(); - } - + const GLenum primitive_mode = MaxwellToGL::PrimitiveTopology(maxwell3d.regs.draw.topology); BeginTransformFeedback(primitive_mode); - const GLuint base_instance = static_cast<GLuint>(gpu.regs.vb_base_instance); + const GLuint base_instance = static_cast<GLuint>(maxwell3d.regs.vb_base_instance); const GLsizei num_instances = - static_cast<GLsizei>(is_instanced ? gpu.mme_draw.instance_count : 1); + static_cast<GLsizei>(is_instanced ? maxwell3d.mme_draw.instance_count : 1); if (is_indexed) { - const GLint base_vertex = static_cast<GLint>(gpu.regs.vb_element_base); - const GLsizei num_vertices = static_cast<GLsizei>(gpu.regs.index_array.count); - const GLvoid* offset = reinterpret_cast<const GLvoid*>(index_buffer_offset); - const GLenum format = MaxwellToGL::IndexFormat(gpu.regs.index_array.format); + const GLint base_vertex = static_cast<GLint>(maxwell3d.regs.vb_element_base); + const GLsizei num_vertices = static_cast<GLsizei>(maxwell3d.regs.index_array.count); + const GLvoid* const offset = buffer_cache_runtime.IndexOffset(); + const GLenum format = MaxwellToGL::IndexFormat(maxwell3d.regs.index_array.format); if (num_instances == 1 && base_instance == 0 && base_vertex == 0) { glDrawElements(primitive_mode, num_vertices, format, offset); } else if (num_instances == 1 && base_instance == 0) { @@ -714,8 +469,8 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { base_instance); } } else { - const GLint base_vertex = static_cast<GLint>(gpu.regs.vertex_buffer.first); - const GLsizei num_vertices = static_cast<GLsizei>(gpu.regs.vertex_buffer.count); + const GLint base_vertex = static_cast<GLint>(maxwell3d.regs.vertex_buffer.first); + const GLsizei num_vertices = static_cast<GLsizei>(maxwell3d.regs.vertex_buffer.count); if (num_instances == 1 && base_instance == 0) { glDrawArrays(primitive_mode, base_vertex, num_vertices); } else if (base_instance == 0) { @@ -730,30 +485,28 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { ++num_queued_commands; - system.GPU().TickWork(); + gpu.TickWork(); } void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { - buffer_cache.Acquire(); - current_cbuf = 0; - - auto kernel = shader_cache.GetComputeKernel(code_addr); - program_manager.BindCompute(kernel->GetHandle()); - - SetupComputeTextures(kernel); - SetupComputeImages(kernel); + Shader* const kernel = shader_cache.GetComputeKernel(code_addr); - const std::size_t buffer_size = - Tegra::Engines::KeplerCompute::NumConstBuffers * - (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); - buffer_cache.Map(buffer_size); + std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; + BindComputeTextures(kernel); - SetupComputeConstBuffers(kernel); - SetupComputeGlobalMemory(kernel); - - buffer_cache.Unmap(); - - const auto& launch_desc = system.GPU().KeplerCompute().launch_description; + const auto& entries = kernel->GetEntries(); + buffer_cache.SetEnabledComputeUniformBuffers(entries.enabled_uniform_buffers); + buffer_cache.UnbindComputeStorageBuffers(); + u32 ssbo_index = 0; + for (const auto& buffer : entries.global_memory_entries) { + buffer_cache.BindComputeStorageBuffer(ssbo_index, buffer.cbuf_index, buffer.cbuf_offset, + buffer.is_written); + ++ssbo_index; + } + buffer_cache.UpdateComputeBuffers(); + buffer_cache.BindHostComputeBuffers(); + + const auto& launch_desc = kepler_compute.launch_description; glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z); ++num_queued_commands; } @@ -767,6 +520,12 @@ void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCore::QueryType type, query_cache.Query(gpu_addr, type, timestamp); } +void RasterizerOpenGL::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, + u32 size) { + std::scoped_lock lock{buffer_cache.mutex}; + buffer_cache.BindGraphicsUniformBuffer(stage, index, gpu_addr, size); +} + void RasterizerOpenGL::FlushAll() {} void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) { @@ -774,16 +533,24 @@ void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) { if (addr == 0 || size == 0) { return; } - texture_cache.FlushRegion(addr, size); - buffer_cache.FlushRegion(addr, size); + { + std::scoped_lock lock{texture_cache.mutex}; + texture_cache.DownloadMemory(addr, size); + } + { + std::scoped_lock lock{buffer_cache.mutex}; + buffer_cache.DownloadMemory(addr, size); + } query_cache.FlushRegion(addr, size); } bool RasterizerOpenGL::MustFlushRegion(VAddr addr, u64 size) { + std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; if (!Settings::IsGPULevelHigh()) { - return buffer_cache.MustFlushRegion(addr, size); + return buffer_cache.IsRegionGpuModified(addr, size); } - return texture_cache.MustFlushRegion(addr, size) || buffer_cache.MustFlushRegion(addr, size); + return texture_cache.IsRegionGpuModified(addr, size) || + buffer_cache.IsRegionGpuModified(addr, size); } void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) { @@ -791,9 +558,15 @@ void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) { if (addr == 0 || size == 0) { return; } - texture_cache.InvalidateRegion(addr, size); + { + std::scoped_lock lock{texture_cache.mutex}; + texture_cache.WriteMemory(addr, size); + } + { + std::scoped_lock lock{buffer_cache.mutex}; + buffer_cache.WriteMemory(addr, size); + } shader_cache.InvalidateRegion(addr, size); - buffer_cache.InvalidateRegion(addr, size); query_cache.InvalidateRegion(addr, size); } @@ -802,30 +575,47 @@ void RasterizerOpenGL::OnCPUWrite(VAddr addr, u64 size) { if (addr == 0 || size == 0) { return; } - texture_cache.OnCPUWrite(addr, size); shader_cache.OnCPUWrite(addr, size); - buffer_cache.OnCPUWrite(addr, size); + { + std::scoped_lock lock{texture_cache.mutex}; + texture_cache.WriteMemory(addr, size); + } + { + std::scoped_lock lock{buffer_cache.mutex}; + buffer_cache.CachedWriteMemory(addr, size); + } } void RasterizerOpenGL::SyncGuestHost() { MICROPROFILE_SCOPE(OpenGL_CacheManagement); - texture_cache.SyncGuestHost(); - buffer_cache.SyncGuestHost(); shader_cache.SyncGuestHost(); + { + std::scoped_lock lock{buffer_cache.mutex}; + buffer_cache.FlushCachedWrites(); + } +} + +void RasterizerOpenGL::UnmapMemory(VAddr addr, u64 size) { + { + std::scoped_lock lock{texture_cache.mutex}; + texture_cache.UnmapMemory(addr, size); + } + { + std::scoped_lock lock{buffer_cache.mutex}; + buffer_cache.WriteMemory(addr, size); + } + shader_cache.OnCPUWrite(addr, size); } void RasterizerOpenGL::SignalSemaphore(GPUVAddr addr, u32 value) { - auto& gpu{system.GPU()}; if (!gpu.IsAsync()) { - auto& memory_manager{gpu.MemoryManager()}; - memory_manager.Write<u32>(addr, value); + gpu_memory.Write<u32>(addr, value); return; } fence_manager.SignalSemaphore(addr, value); } void RasterizerOpenGL::SignalSyncPoint(u32 value) { - auto& gpu{system.GPU()}; if (!gpu.IsAsync()) { gpu.IncrementSyncPoint(value); return; @@ -834,7 +624,6 @@ void RasterizerOpenGL::SignalSyncPoint(u32 value) { } void RasterizerOpenGL::ReleaseFences() { - auto& gpu{system.GPU()}; if (!gpu.IsAsync()) { return; } @@ -849,14 +638,15 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size) { } void RasterizerOpenGL::WaitForIdle() { - // Place a barrier on everything that is not framebuffer related. - // This is related to another flag that is not currently implemented. - glMemoryBarrier(GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT | GL_ELEMENT_ARRAY_BARRIER_BIT | - GL_UNIFORM_BARRIER_BIT | GL_TEXTURE_FETCH_BARRIER_BIT | - GL_SHADER_IMAGE_ACCESS_BARRIER_BIT | GL_COMMAND_BARRIER_BIT | - GL_PIXEL_BUFFER_BARRIER_BIT | GL_TEXTURE_UPDATE_BARRIER_BIT | - GL_BUFFER_UPDATE_BARRIER_BIT | GL_TRANSFORM_FEEDBACK_BARRIER_BIT | - GL_SHADER_STORAGE_BARRIER_BIT | GL_QUERY_BUFFER_BARRIER_BIT); + glMemoryBarrier(GL_ALL_BARRIER_BITS); +} + +void RasterizerOpenGL::FragmentBarrier() { + glMemoryBarrier(GL_FRAMEBUFFER_BARRIER_BIT); +} + +void RasterizerOpenGL::TiledCacheBarrier() { + glTextureBarrier(); } void RasterizerOpenGL::FlushCommands() { @@ -872,288 +662,174 @@ void RasterizerOpenGL::TickFrame() { // Ticking a frame means that buffers will be swapped, calling glFlush implicitly. num_queued_commands = 0; - buffer_cache.TickFrame(); + fence_manager.TickFrame(); + { + std::scoped_lock lock{texture_cache.mutex}; + texture_cache.TickFrame(); + } + { + std::scoped_lock lock{buffer_cache.mutex}; + buffer_cache.TickFrame(); + } } -bool RasterizerOpenGL::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src, - const Tegra::Engines::Fermi2D::Regs::Surface& dst, +bool RasterizerOpenGL::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src, + const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Config& copy_config) { MICROPROFILE_SCOPE(OpenGL_Blits); - texture_cache.DoFermiCopy(src, dst, copy_config); + std::scoped_lock lock{texture_cache.mutex}; + texture_cache.BlitImage(dst, src, copy_config); return true; } bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, u32 pixel_stride) { - if (!framebuffer_addr) { - return {}; + if (framebuffer_addr == 0) { + return false; } - MICROPROFILE_SCOPE(OpenGL_CacheManagement); - const auto surface{texture_cache.TryFindFramebufferSurface(framebuffer_addr)}; - if (!surface) { - return {}; + std::scoped_lock lock{texture_cache.mutex}; + ImageView* const image_view{texture_cache.TryFindFramebufferImageView(framebuffer_addr)}; + if (!image_view) { + return false; } - // Verify that the cached surface is the same size and format as the requested framebuffer - const auto& params{surface->GetSurfaceParams()}; - const auto& pixel_format{ - VideoCore::Surface::PixelFormatFromGPUPixelFormat(config.pixel_format)}; - ASSERT_MSG(params.width == config.width, "Framebuffer width is different"); - ASSERT_MSG(params.height == config.height, "Framebuffer height is different"); - - if (params.pixel_format != pixel_format) { - LOG_DEBUG(Render_OpenGL, "Framebuffer pixel_format is different"); - } - - screen_info.display_texture = surface->GetTexture(); - screen_info.display_srgb = surface->GetSurfaceParams().srgb_conversion; + // ASSERT_MSG(image_view->size.width == config.width, "Framebuffer width is different"); + // ASSERT_MSG(image_view->size.height == config.height, "Framebuffer height is different"); + screen_info.display_texture = image_view->Handle(ImageViewType::e2D); + screen_info.display_srgb = VideoCore::Surface::IsPixelFormatSRGB(image_view->format); return true; } -void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, Shader* shader) { - static constexpr std::array PARAMETER_LUT = { - GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV, - GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV, - GL_FRAGMENT_PROGRAM_PARAMETER_BUFFER_NV}; - - MICROPROFILE_SCOPE(OpenGL_UBO); - const auto& stages = system.GPU().Maxwell3D().state.shader_stages; - const auto& shader_stage = stages[stage_index]; - const auto& entries = shader->GetEntries(); - const bool use_unified = entries.use_unified_uniforms; - const std::size_t base_unified_offset = stage_index * NUM_CONST_BUFFERS_BYTES_PER_STAGE; - - const auto base_bindings = device.GetBaseBindings(stage_index); - u32 binding = device.UseAssemblyShaders() ? 0 : base_bindings.uniform_buffer; - for (const auto& entry : entries.const_buffers) { - const u32 index = entry.GetIndex(); - const auto& buffer = shader_stage.const_buffers[index]; - SetupConstBuffer(PARAMETER_LUT[stage_index], binding, buffer, entry, use_unified, - base_unified_offset + index * Maxwell::MaxConstBufferSize); - ++binding; - } - if (use_unified) { - const u32 index = static_cast<u32>(base_bindings.shader_storage_buffer + - entries.global_memory_entries.size()); - glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle, - base_unified_offset, NUM_CONST_BUFFERS_BYTES_PER_STAGE); - } -} - -void RasterizerOpenGL::SetupComputeConstBuffers(Shader* kernel) { - MICROPROFILE_SCOPE(OpenGL_UBO); - const auto& launch_desc = system.GPU().KeplerCompute().launch_description; - const auto& entries = kernel->GetEntries(); - const bool use_unified = entries.use_unified_uniforms; - - u32 binding = 0; - for (const auto& entry : entries.const_buffers) { - const auto& config = launch_desc.const_buffer_config[entry.GetIndex()]; - const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value(); - Tegra::Engines::ConstBufferInfo buffer; - buffer.address = config.Address(); - buffer.size = config.size; - buffer.enabled = mask[entry.GetIndex()]; - SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding, buffer, entry, - use_unified, entry.GetIndex() * Maxwell::MaxConstBufferSize); - ++binding; - } - if (use_unified) { - const GLuint index = static_cast<GLuint>(entries.global_memory_entries.size()); - glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle, 0, - NUM_CONST_BUFFERS_BYTES_PER_STAGE); - } -} - -void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding, - const Tegra::Engines::ConstBufferInfo& buffer, - const ConstBufferEntry& entry, bool use_unified, - std::size_t unified_offset) { - if (!buffer.enabled) { - // Set values to zero to unbind buffers - if (device.UseAssemblyShaders()) { - glBindBufferRangeNV(stage, entry.GetIndex(), 0, 0, 0); - } else { - glBindBufferRange(GL_UNIFORM_BUFFER, binding, 0, 0, sizeof(float)); - } - return; - } +void RasterizerOpenGL::BindComputeTextures(Shader* kernel) { + image_view_indices.clear(); + sampler_handles.clear(); - // Align the actual size so it ends up being a multiple of vec4 to meet the OpenGL std140 - // UBO alignment requirements. - const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4)); + texture_cache.SynchronizeComputeDescriptors(); - const bool fast_upload = !use_unified && device.HasFastBufferSubData(); + SetupComputeTextures(kernel); + SetupComputeImages(kernel); - const std::size_t alignment = use_unified ? 4 : device.GetUniformBufferAlignment(); - const GPUVAddr gpu_addr = buffer.address; - auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload); + const std::span indices_span(image_view_indices.data(), image_view_indices.size()); + texture_cache.FillComputeImageViews(indices_span, image_view_ids); - if (device.UseAssemblyShaders()) { - UNIMPLEMENTED_IF(use_unified); - if (info.offset != 0) { - const GLuint staging_cbuf = staging_cbufs[current_cbuf++]; - glCopyNamedBufferSubData(info.handle, staging_cbuf, info.offset, 0, size); - info.handle = staging_cbuf; - info.offset = 0; + program_manager.BindCompute(kernel->GetHandle()); + size_t image_view_index = 0; + size_t texture_index = 0; + size_t image_index = 0; + BindTextures(kernel->GetEntries(), 0, 0, image_view_index, texture_index, image_index); +} + +void RasterizerOpenGL::BindTextures(const ShaderEntries& entries, GLuint base_texture, + GLuint base_image, size_t& image_view_index, + size_t& texture_index, size_t& image_index) { + const GLuint* const samplers = sampler_handles.data() + texture_index; + const GLuint* const textures = texture_handles.data() + texture_index; + const GLuint* const images = image_handles.data() + image_index; + + const size_t num_samplers = entries.samplers.size(); + for (const auto& sampler : entries.samplers) { + for (size_t i = 0; i < sampler.size; ++i) { + const ImageViewId image_view_id = image_view_ids[image_view_index++]; + const ImageView& image_view = texture_cache.GetImageView(image_view_id); + const GLuint handle = image_view.Handle(ImageViewTypeFromEntry(sampler)); + texture_handles[texture_index++] = handle; } - glBindBufferRangeNV(stage, binding, info.handle, info.offset, size); - return; } - - if (use_unified) { - glCopyNamedBufferSubData(info.handle, unified_uniform_buffer.handle, info.offset, - unified_offset, size); - } else { - glBindBufferRange(GL_UNIFORM_BUFFER, binding, info.handle, info.offset, size); + const size_t num_images = entries.images.size(); + for (size_t unit = 0; unit < num_images; ++unit) { + // TODO: Mark as modified + const ImageViewId image_view_id = image_view_ids[image_view_index++]; + const ImageView& image_view = texture_cache.GetImageView(image_view_id); + const GLuint handle = image_view.Handle(ImageViewTypeFromEntry(entries.images[unit])); + image_handles[image_index] = handle; + ++image_index; } -} - -void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader) { - static constexpr std::array TARGET_LUT = { - GL_VERTEX_PROGRAM_NV, GL_TESS_CONTROL_PROGRAM_NV, GL_TESS_EVALUATION_PROGRAM_NV, - GL_GEOMETRY_PROGRAM_NV, GL_FRAGMENT_PROGRAM_NV, - }; - - auto& gpu{system.GPU()}; - auto& memory_manager{gpu.MemoryManager()}; - const auto& cbufs{gpu.Maxwell3D().state.shader_stages[stage_index]}; - const auto& entries{shader->GetEntries().global_memory_entries}; - - std::array<GLuint64EXT, 32> pointers; - ASSERT(entries.size() < pointers.size()); - - const bool assembly_shaders = device.UseAssemblyShaders(); - u32 binding = assembly_shaders ? 0 : device.GetBaseBindings(stage_index).shader_storage_buffer; - for (const auto& entry : entries) { - const GPUVAddr addr{cbufs.const_buffers[entry.cbuf_index].address + entry.cbuf_offset}; - const GPUVAddr gpu_addr{memory_manager.Read<u64>(addr)}; - const u32 size{memory_manager.Read<u32>(addr + 8)}; - SetupGlobalMemory(binding, entry, gpu_addr, size, &pointers[binding]); - ++binding; - } - if (assembly_shaders) { - UpdateBindlessPointers(TARGET_LUT[stage_index], pointers.data(), entries.size()); - } -} - -void RasterizerOpenGL::SetupComputeGlobalMemory(Shader* kernel) { - auto& gpu{system.GPU()}; - auto& memory_manager{gpu.MemoryManager()}; - const auto& cbufs{gpu.KeplerCompute().launch_description.const_buffer_config}; - const auto& entries{kernel->GetEntries().global_memory_entries}; - - std::array<GLuint64EXT, 32> pointers; - ASSERT(entries.size() < pointers.size()); - - u32 binding = 0; - for (const auto& entry : entries) { - const GPUVAddr addr{cbufs[entry.cbuf_index].Address() + entry.cbuf_offset}; - const GPUVAddr gpu_addr{memory_manager.Read<u64>(addr)}; - const u32 size{memory_manager.Read<u32>(addr + 8)}; - SetupGlobalMemory(binding, entry, gpu_addr, size, &pointers[binding]); - ++binding; - } - if (device.UseAssemblyShaders()) { - UpdateBindlessPointers(GL_COMPUTE_PROGRAM_NV, pointers.data(), entries.size()); + if (num_samplers > 0) { + glBindSamplers(base_texture, static_cast<GLsizei>(num_samplers), samplers); + glBindTextures(base_texture, static_cast<GLsizei>(num_samplers), textures); } -} - -void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, - GPUVAddr gpu_addr, std::size_t size, - GLuint64EXT* pointer) { - const std::size_t alignment{device.GetShaderStorageBufferAlignment()}; - const auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written); - if (device.UseAssemblyShaders()) { - *pointer = info.address + info.offset; - } else { - glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, info.handle, info.offset, - static_cast<GLsizeiptr>(size)); + if (num_images > 0) { + glBindImageTextures(base_image, static_cast<GLsizei>(num_images), images); } } -void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, Shader* shader) { - MICROPROFILE_SCOPE(OpenGL_Texture); - const auto& maxwell3d = system.GPU().Maxwell3D(); - u32 binding = device.GetBaseBindings(stage_index).sampler; +void RasterizerOpenGL::SetupDrawTextures(const Shader* shader, size_t stage_index) { + const bool via_header_index = + maxwell3d.regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex; for (const auto& entry : shader->GetEntries().samplers) { const auto shader_type = static_cast<ShaderType>(stage_index); - for (std::size_t i = 0; i < entry.size; ++i) { - const auto texture = GetTextureInfo(maxwell3d, entry, shader_type, i); - SetupTexture(binding++, texture, entry); + for (size_t index = 0; index < entry.size; ++index) { + const auto handle = + GetTextureInfo(maxwell3d, via_header_index, entry, shader_type, index); + const Sampler* const sampler = texture_cache.GetGraphicsSampler(handle.sampler); + sampler_handles.push_back(sampler->Handle()); + image_view_indices.push_back(handle.image); } } } -void RasterizerOpenGL::SetupComputeTextures(Shader* kernel) { - MICROPROFILE_SCOPE(OpenGL_Texture); - const auto& compute = system.GPU().KeplerCompute(); - u32 binding = 0; +void RasterizerOpenGL::SetupComputeTextures(const Shader* kernel) { + const bool via_header_index = kepler_compute.launch_description.linked_tsc; for (const auto& entry : kernel->GetEntries().samplers) { - for (std::size_t i = 0; i < entry.size; ++i) { - const auto texture = GetTextureInfo(compute, entry, ShaderType::Compute, i); - SetupTexture(binding++, texture, entry); + for (size_t i = 0; i < entry.size; ++i) { + const auto handle = + GetTextureInfo(kepler_compute, via_header_index, entry, ShaderType::Compute, i); + const Sampler* const sampler = texture_cache.GetComputeSampler(handle.sampler); + sampler_handles.push_back(sampler->Handle()); + image_view_indices.push_back(handle.image); } } } -void RasterizerOpenGL::SetupTexture(u32 binding, const Tegra::Texture::FullTextureInfo& texture, - const SamplerEntry& entry) { - const auto view = texture_cache.GetTextureSurface(texture.tic, entry); - if (!view) { - // Can occur when texture addr is null or its memory is unmapped/invalid - glBindSampler(binding, 0); - glBindTextureUnit(binding, 0); - return; - } - const GLuint handle = view->GetTexture(texture.tic.x_source, texture.tic.y_source, - texture.tic.z_source, texture.tic.w_source); - glBindTextureUnit(binding, handle); - if (!view->GetSurfaceParams().IsBuffer()) { - glBindSampler(binding, sampler_cache.GetSampler(texture.tsc)); - } -} - -void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, Shader* shader) { - const auto& maxwell3d = system.GPU().Maxwell3D(); - u32 binding = device.GetBaseBindings(stage_index).image; +void RasterizerOpenGL::SetupDrawImages(const Shader* shader, size_t stage_index) { + const bool via_header_index = + maxwell3d.regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex; for (const auto& entry : shader->GetEntries().images) { - const auto shader_type = static_cast<Tegra::Engines::ShaderType>(stage_index); - const auto tic = GetTextureInfo(maxwell3d, entry, shader_type).tic; - SetupImage(binding++, tic, entry); + const auto shader_type = static_cast<ShaderType>(stage_index); + const auto handle = GetTextureInfo(maxwell3d, via_header_index, entry, shader_type); + image_view_indices.push_back(handle.image); } } -void RasterizerOpenGL::SetupComputeImages(Shader* shader) { - const auto& compute = system.GPU().KeplerCompute(); - u32 binding = 0; +void RasterizerOpenGL::SetupComputeImages(const Shader* shader) { + const bool via_header_index = kepler_compute.launch_description.linked_tsc; for (const auto& entry : shader->GetEntries().images) { - const auto tic = GetTextureInfo(compute, entry, Tegra::Engines::ShaderType::Compute).tic; - SetupImage(binding++, tic, entry); + const auto handle = + GetTextureInfo(kepler_compute, via_header_index, entry, ShaderType::Compute); + image_view_indices.push_back(handle.image); } } -void RasterizerOpenGL::SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic, - const ImageEntry& entry) { - const auto view = texture_cache.GetImageSurface(tic, entry); - if (!view) { - glBindImageTexture(binding, 0, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8); - return; - } - if (entry.is_written) { - view->MarkAsModified(texture_cache.Tick()); - } - const GLuint handle = view->GetTexture(tic.x_source, tic.y_source, tic.z_source, tic.w_source); - glBindImageTexture(binding, handle, 0, GL_TRUE, 0, GL_READ_WRITE, view->GetFormat()); +void RasterizerOpenGL::SyncState() { + SyncViewport(); + SyncRasterizeEnable(); + SyncPolygonModes(); + SyncColorMask(); + SyncFragmentColorClampState(); + SyncMultiSampleState(); + SyncDepthTestState(); + SyncDepthClamp(); + SyncStencilTestState(); + SyncBlendState(); + SyncLogicOpState(); + SyncCullMode(); + SyncPrimitiveRestart(); + SyncScissorTest(); + SyncPointState(); + SyncLineState(); + SyncPolygonOffset(); + SyncAlphaTest(); + SyncFramebufferSRGB(); + SyncVertexFormats(); + SyncVertexInstances(); } void RasterizerOpenGL::SyncViewport() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; - const auto& regs = gpu.regs; + auto& flags = maxwell3d.dirty.flags; + const auto& regs = maxwell3d.regs; const bool dirty_viewport = flags[Dirty::Viewports]; const bool dirty_clip_control = flags[Dirty::ClipControl]; @@ -1180,15 +856,17 @@ void RasterizerOpenGL::SyncViewport() { flags[Dirty::ClipControl] = false; bool flip_y = false; - if (regs.viewport_transform[0].scale_y < 0.0) { + if (regs.viewport_transform[0].scale_y < 0.0f) { flip_y = !flip_y; } if (regs.screen_y_control.y_negate != 0) { flip_y = !flip_y; } - glClipControl(flip_y ? GL_UPPER_LEFT : GL_LOWER_LEFT, - regs.depth_mode == Maxwell::DepthMode::ZeroToOne ? GL_ZERO_TO_ONE - : GL_NEGATIVE_ONE_TO_ONE); + const bool is_zero_to_one = regs.depth_mode == Maxwell::DepthMode::ZeroToOne; + const GLenum origin = flip_y ? GL_UPPER_LEFT : GL_LOWER_LEFT; + const GLenum depth = is_zero_to_one ? GL_ZERO_TO_ONE : GL_NEGATIVE_ONE_TO_ONE; + state_tracker.ClipControl(origin, depth); + state_tracker.SetYNegate(regs.screen_y_control.y_negate != 0); } if (dirty_viewport) { @@ -1225,25 +903,23 @@ void RasterizerOpenGL::SyncViewport() { } void RasterizerOpenGL::SyncDepthClamp() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::DepthClampEnabled]) { return; } flags[Dirty::DepthClampEnabled] = false; - oglEnable(GL_DEPTH_CLAMP, gpu.regs.view_volume_clip_control.depth_clamp_disabled == 0); + oglEnable(GL_DEPTH_CLAMP, maxwell3d.regs.view_volume_clip_control.depth_clamp_disabled == 0); } void RasterizerOpenGL::SyncClipEnabled(u32 clip_mask) { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::ClipDistances] && !flags[Dirty::Shaders]) { return; } flags[Dirty::ClipDistances] = false; - clip_mask &= gpu.regs.clip_distance_enabled; + clip_mask &= maxwell3d.regs.clip_distance_enabled; if (clip_mask == last_clip_distance_mask) { return; } @@ -1259,9 +935,8 @@ void RasterizerOpenGL::SyncClipCoef() { } void RasterizerOpenGL::SyncCullMode() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; - const auto& regs = gpu.regs; + auto& flags = maxwell3d.dirty.flags; + const auto& regs = maxwell3d.regs; if (flags[Dirty::CullTest]) { flags[Dirty::CullTest] = false; @@ -1276,26 +951,24 @@ void RasterizerOpenGL::SyncCullMode() { } void RasterizerOpenGL::SyncPrimitiveRestart() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::PrimitiveRestart]) { return; } flags[Dirty::PrimitiveRestart] = false; - if (gpu.regs.primitive_restart.enabled) { + if (maxwell3d.regs.primitive_restart.enabled) { glEnable(GL_PRIMITIVE_RESTART); - glPrimitiveRestartIndex(gpu.regs.primitive_restart.index); + glPrimitiveRestartIndex(maxwell3d.regs.primitive_restart.index); } else { glDisable(GL_PRIMITIVE_RESTART); } } void RasterizerOpenGL::SyncDepthTestState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; + const auto& regs = maxwell3d.regs; - const auto& regs = gpu.regs; if (flags[Dirty::DepthMask]) { flags[Dirty::DepthMask] = false; glDepthMask(regs.depth_write_enabled ? GL_TRUE : GL_FALSE); @@ -1313,14 +986,13 @@ void RasterizerOpenGL::SyncDepthTestState() { } void RasterizerOpenGL::SyncStencilTestState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::StencilTest]) { return; } flags[Dirty::StencilTest] = false; - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; oglEnable(GL_STENCIL_TEST, regs.stencil_enable); glStencilFuncSeparate(GL_FRONT, MaxwellToGL::ComparisonOp(regs.stencil_front_func_func), @@ -1345,25 +1017,24 @@ void RasterizerOpenGL::SyncStencilTestState() { } void RasterizerOpenGL::SyncRasterizeEnable() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::RasterizeEnable]) { return; } flags[Dirty::RasterizeEnable] = false; - oglEnable(GL_RASTERIZER_DISCARD, gpu.regs.rasterize_enable == 0); + oglEnable(GL_RASTERIZER_DISCARD, maxwell3d.regs.rasterize_enable == 0); } void RasterizerOpenGL::SyncPolygonModes() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::PolygonModes]) { return; } flags[Dirty::PolygonModes] = false; - if (gpu.regs.fill_rectangle) { + const auto& regs = maxwell3d.regs; + if (regs.fill_rectangle) { if (!GLAD_GL_NV_fill_rectangle) { LOG_ERROR(Render_OpenGL, "GL_NV_fill_rectangle used and not supported"); glPolygonMode(GL_FRONT_AND_BACK, GL_FILL); @@ -1376,27 +1047,26 @@ void RasterizerOpenGL::SyncPolygonModes() { return; } - if (gpu.regs.polygon_mode_front == gpu.regs.polygon_mode_back) { + if (regs.polygon_mode_front == regs.polygon_mode_back) { flags[Dirty::PolygonModeFront] = false; flags[Dirty::PolygonModeBack] = false; - glPolygonMode(GL_FRONT_AND_BACK, MaxwellToGL::PolygonMode(gpu.regs.polygon_mode_front)); + glPolygonMode(GL_FRONT_AND_BACK, MaxwellToGL::PolygonMode(regs.polygon_mode_front)); return; } if (flags[Dirty::PolygonModeFront]) { flags[Dirty::PolygonModeFront] = false; - glPolygonMode(GL_FRONT, MaxwellToGL::PolygonMode(gpu.regs.polygon_mode_front)); + glPolygonMode(GL_FRONT, MaxwellToGL::PolygonMode(regs.polygon_mode_front)); } if (flags[Dirty::PolygonModeBack]) { flags[Dirty::PolygonModeBack] = false; - glPolygonMode(GL_BACK, MaxwellToGL::PolygonMode(gpu.regs.polygon_mode_back)); + glPolygonMode(GL_BACK, MaxwellToGL::PolygonMode(regs.polygon_mode_back)); } } void RasterizerOpenGL::SyncColorMask() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::ColorMasks]) { return; } @@ -1405,7 +1075,7 @@ void RasterizerOpenGL::SyncColorMask() { const bool force = flags[Dirty::ColorMaskCommon]; flags[Dirty::ColorMaskCommon] = false; - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; if (regs.color_mask_common) { if (!force && !flags[Dirty::ColorMask0]) { return; @@ -1430,33 +1100,30 @@ void RasterizerOpenGL::SyncColorMask() { } void RasterizerOpenGL::SyncMultiSampleState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::MultisampleControl]) { return; } flags[Dirty::MultisampleControl] = false; - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; oglEnable(GL_SAMPLE_ALPHA_TO_COVERAGE, regs.multisample_control.alpha_to_coverage); oglEnable(GL_SAMPLE_ALPHA_TO_ONE, regs.multisample_control.alpha_to_one); } void RasterizerOpenGL::SyncFragmentColorClampState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::FragmentClampColor]) { return; } flags[Dirty::FragmentClampColor] = false; - glClampColor(GL_CLAMP_FRAGMENT_COLOR, gpu.regs.frag_color_clamp ? GL_TRUE : GL_FALSE); + glClampColor(GL_CLAMP_FRAGMENT_COLOR, maxwell3d.regs.frag_color_clamp ? GL_TRUE : GL_FALSE); } void RasterizerOpenGL::SyncBlendState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; - const auto& regs = gpu.regs; + auto& flags = maxwell3d.dirty.flags; + const auto& regs = maxwell3d.regs; if (flags[Dirty::BlendColor]) { flags[Dirty::BlendColor] = false; @@ -1513,14 +1180,13 @@ void RasterizerOpenGL::SyncBlendState() { } void RasterizerOpenGL::SyncLogicOpState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::LogicOp]) { return; } flags[Dirty::LogicOp] = false; - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; if (regs.logic_op.enable) { glEnable(GL_COLOR_LOGIC_OP); glLogicOp(MaxwellToGL::LogicOp(regs.logic_op.operation)); @@ -1530,14 +1196,13 @@ void RasterizerOpenGL::SyncLogicOpState() { } void RasterizerOpenGL::SyncScissorTest() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::Scissors]) { return; } flags[Dirty::Scissors] = false; - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; for (std::size_t index = 0; index < Maxwell::NumViewports; ++index) { if (!flags[Dirty::Scissor0 + index]) { continue; @@ -1556,49 +1221,38 @@ void RasterizerOpenGL::SyncScissorTest() { } void RasterizerOpenGL::SyncPointState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::PointSize]) { return; } flags[Dirty::PointSize] = false; - oglEnable(GL_POINT_SPRITE, gpu.regs.point_sprite_enable); - - if (gpu.regs.vp_point_size.enable) { - // By definition of GL_POINT_SIZE, it only matters if GL_PROGRAM_POINT_SIZE is disabled. - glEnable(GL_PROGRAM_POINT_SIZE); - return; - } + oglEnable(GL_POINT_SPRITE, maxwell3d.regs.point_sprite_enable); + oglEnable(GL_PROGRAM_POINT_SIZE, maxwell3d.regs.vp_point_size.enable); - // Limit the point size to 1 since nouveau sometimes sets a point size of 0 (and that's invalid - // in OpenGL). - glPointSize(std::max(1.0f, gpu.regs.point_size)); - glDisable(GL_PROGRAM_POINT_SIZE); + glPointSize(std::max(1.0f, maxwell3d.regs.point_size)); } void RasterizerOpenGL::SyncLineState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::LineWidth]) { return; } flags[Dirty::LineWidth] = false; - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; oglEnable(GL_LINE_SMOOTH, regs.line_smooth_enable); glLineWidth(regs.line_smooth_enable ? regs.line_width_smooth : regs.line_width_aliased); } void RasterizerOpenGL::SyncPolygonOffset() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::PolygonOffset]) { return; } flags[Dirty::PolygonOffset] = false; - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; oglEnable(GL_POLYGON_OFFSET_FILL, regs.polygon_offset_fill_enable); oglEnable(GL_POLYGON_OFFSET_LINE, regs.polygon_offset_line_enable); oglEnable(GL_POLYGON_OFFSET_POINT, regs.polygon_offset_point_enable); @@ -1612,18 +1266,13 @@ void RasterizerOpenGL::SyncPolygonOffset() { } void RasterizerOpenGL::SyncAlphaTest() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::AlphaTest]) { return; } flags[Dirty::AlphaTest] = false; - const auto& regs = gpu.regs; - if (regs.alpha_test_enabled && regs.rt_control.count > 1) { - LOG_WARNING(Render_OpenGL, "Alpha testing with more than one render target is not tested"); - } - + const auto& regs = maxwell3d.regs; if (regs.alpha_test_enabled) { glEnable(GL_ALPHA_TEST); glAlphaFunc(MaxwellToGL::ComparisonOp(regs.alpha_test_func), regs.alpha_test_ref); @@ -1633,20 +1282,19 @@ void RasterizerOpenGL::SyncAlphaTest() { } void RasterizerOpenGL::SyncFramebufferSRGB() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::FramebufferSRGB]) { return; } flags[Dirty::FramebufferSRGB] = false; - oglEnable(GL_FRAMEBUFFER_SRGB, gpu.regs.framebuffer_srgb); + oglEnable(GL_FRAMEBUFFER_SRGB, maxwell3d.regs.framebuffer_srgb); } void RasterizerOpenGL::SyncTransformFeedback() { // TODO(Rodrigo): Inject SKIP_COMPONENTS*_NV when required. An unimplemented message will signal // when this is required. - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; static constexpr std::size_t STRIDE = 3; std::array<GLint, 128 * STRIDE * Maxwell::NumTransformFeedbackBuffers> attribs; @@ -1698,40 +1346,17 @@ void RasterizerOpenGL::SyncTransformFeedback() { } void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) { - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; if (regs.tfb_enabled == 0) { return; } - if (device.UseAssemblyShaders()) { SyncTransformFeedback(); } - UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) || regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) || regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry)); - - for (std::size_t index = 0; index < Maxwell::NumTransformFeedbackBuffers; ++index) { - const auto& binding = regs.tfb_bindings[index]; - if (!binding.buffer_enable) { - if (enabled_transform_feedback_buffers[index]) { - glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, static_cast<GLuint>(index), 0, 0, - 0); - } - enabled_transform_feedback_buffers[index] = false; - continue; - } - enabled_transform_feedback_buffers[index] = true; - - auto& tfb_buffer = transform_feedback_buffers[index]; - tfb_buffer.Create(); - - const GLuint handle = tfb_buffer.handle; - const std::size_t size = binding.buffer_size; - glNamedBufferData(handle, static_cast<GLsizeiptr>(size), nullptr, GL_STREAM_COPY); - glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, static_cast<GLuint>(index), handle, 0, - static_cast<GLsizeiptr>(size)); - } + UNIMPLEMENTED_IF(primitive_mode != GL_POINTS); // We may have to call BeginTransformFeedbackNV here since they seem to call different // implementations on Nvidia's driver (the pointer is different) but we are using @@ -1741,27 +1366,11 @@ void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) { } void RasterizerOpenGL::EndTransformFeedback() { - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; if (regs.tfb_enabled == 0) { return; } - glEndTransformFeedback(); - - for (std::size_t index = 0; index < Maxwell::NumTransformFeedbackBuffers; ++index) { - const auto& binding = regs.tfb_bindings[index]; - if (!binding.buffer_enable) { - continue; - } - UNIMPLEMENTED_IF(binding.buffer_offset != 0); - - const GLuint handle = transform_feedback_buffers[index].handle; - const GPUVAddr gpu_addr = binding.Address(); - const std::size_t size = binding.buffer_size; - const auto info = buffer_cache.UploadMemory(gpu_addr, size, 4, true); - glCopyNamedBufferSubData(handle, info.handle, 0, info.offset, - static_cast<GLsizeiptr>(size)); - } } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index ccc6f50f6..3745cf637 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -7,12 +7,13 @@ #include <array> #include <atomic> #include <cstddef> -#include <map> #include <memory> #include <optional> #include <tuple> #include <utility> +#include <boost/container/static_vector.hpp> + #include <glad/glad.h> #include "common/common_types.h" @@ -23,21 +24,18 @@ #include "video_core/renderer_opengl/gl_buffer_cache.h" #include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_fence_manager.h" -#include "video_core/renderer_opengl/gl_framebuffer_cache.h" #include "video_core/renderer_opengl/gl_query_cache.h" #include "video_core/renderer_opengl/gl_resource_manager.h" -#include "video_core/renderer_opengl/gl_sampler_cache.h" #include "video_core/renderer_opengl/gl_shader_cache.h" #include "video_core/renderer_opengl/gl_shader_decompiler.h" #include "video_core/renderer_opengl/gl_shader_manager.h" #include "video_core/renderer_opengl/gl_state_tracker.h" #include "video_core/renderer_opengl/gl_texture_cache.h" -#include "video_core/renderer_opengl/utils.h" #include "video_core/shader/async_shaders.h" #include "video_core/textures/texture.h" -namespace Core { -class System; +namespace Core::Memory { +class Memory; } namespace Core::Frontend { @@ -51,13 +49,21 @@ class MemoryManager; namespace OpenGL { struct ScreenInfo; -struct DrawParameters; +struct ShaderEntries; + +struct BindlessSSBO { + GLuint64EXT address; + GLsizei length; + GLsizei padding; +}; +static_assert(sizeof(BindlessSSBO) * CHAR_BIT == 128); class RasterizerOpenGL : public VideoCore::RasterizerAccelerated { public: - explicit RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window, - const Device& device, ScreenInfo& info, - ProgramManager& program_manager, StateTracker& state_tracker); + explicit RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_, + Core::Memory::Memory& cpu_memory_, const Device& device_, + ScreenInfo& screen_info_, ProgramManager& program_manager_, + StateTracker& state_tracker_); ~RasterizerOpenGL() override; void Draw(bool is_indexed, bool is_instanced) override; @@ -65,27 +71,30 @@ public: void DispatchCompute(GPUVAddr code_addr) override; void ResetCounter(VideoCore::QueryType type) override; void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override; + void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override; void FlushAll() override; void FlushRegion(VAddr addr, u64 size) override; bool MustFlushRegion(VAddr addr, u64 size) override; void InvalidateRegion(VAddr addr, u64 size) override; void OnCPUWrite(VAddr addr, u64 size) override; void SyncGuestHost() override; + void UnmapMemory(VAddr addr, u64 size) override; void SignalSemaphore(GPUVAddr addr, u32 value) override; void SignalSyncPoint(u32 value) override; void ReleaseFences() override; void FlushAndInvalidateRegion(VAddr addr, u64 size) override; void WaitForIdle() override; + void FragmentBarrier() override; + void TiledCacheBarrier() override; void FlushCommands() override; void TickFrame() override; - bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src, - const Tegra::Engines::Fermi2D::Regs::Surface& dst, + bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src, + const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Config& copy_config) override; bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, u32 pixel_stride) override; - void LoadDiskResources(const std::atomic_bool& stop_loading, + void LoadDiskResources(u64 title_id, const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback) override; - void SetupDirtyFlags() override; /// Returns true when there are commands queued to the OpenGL server. bool AnyCommandQueued() const { @@ -101,51 +110,29 @@ public: } private: - /// Configures the color and depth framebuffer states. - void ConfigureFramebuffers(); - - /// Configures the color and depth framebuffer for clearing. - void ConfigureClearFramebuffer(bool using_color, bool using_depth_stencil); - - /// Configures the current constbuffers to use for the draw command. - void SetupDrawConstBuffers(std::size_t stage_index, Shader* shader); - - /// Configures the current constbuffers to use for the kernel invocation. - void SetupComputeConstBuffers(Shader* kernel); - - /// Configures a constant buffer. - void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer, - const ConstBufferEntry& entry, bool use_unified, - std::size_t unified_offset); + static constexpr size_t MAX_TEXTURES = 192; + static constexpr size_t MAX_IMAGES = 48; + static constexpr size_t MAX_IMAGE_VIEWS = MAX_TEXTURES + MAX_IMAGES; - /// Configures the current global memory entries to use for the draw command. - void SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader); + void BindComputeTextures(Shader* kernel); - /// Configures the current global memory entries to use for the kernel invocation. - void SetupComputeGlobalMemory(Shader* kernel); - - /// Configures a global memory buffer. - void SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, GPUVAddr gpu_addr, - std::size_t size, GLuint64EXT* pointer); + void BindTextures(const ShaderEntries& entries, GLuint base_texture, GLuint base_image, + size_t& image_view_index, size_t& texture_index, size_t& image_index); /// Configures the current textures to use for the draw command. - void SetupDrawTextures(std::size_t stage_index, Shader* shader); + void SetupDrawTextures(const Shader* shader, size_t stage_index); /// Configures the textures used in a compute shader. - void SetupComputeTextures(Shader* kernel); - - /// Configures a texture. - void SetupTexture(u32 binding, const Tegra::Texture::FullTextureInfo& texture, - const SamplerEntry& entry); + void SetupComputeTextures(const Shader* kernel); /// Configures images in a graphics shader. - void SetupDrawImages(std::size_t stage_index, Shader* shader); + void SetupDrawImages(const Shader* shader, size_t stage_index); /// Configures images in a compute shader. - void SetupComputeImages(Shader* shader); + void SetupComputeImages(const Shader* shader); - /// Configures an image. - void SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic, const ImageEntry& entry); + /// Syncs state to match guest's + void SyncState(); /// Syncs the viewport and depth range to match the guest state void SyncViewport(); @@ -210,6 +197,12 @@ private: /// Syncs the framebuffer sRGB state to match the guest state void SyncFramebufferSRGB(); + /// Syncs vertex formats to match the guest state + void SyncVertexFormats(); + + /// Syncs vertex instances to match the guest state + void SyncVertexInstances(); + /// Syncs transform feedback state to match guest state /// @note Only valid on assembly shaders void SyncTransformFeedback(); @@ -220,56 +213,35 @@ private: /// End a transform feedback void EndTransformFeedback(); - /// Check for extension that are not strictly required but are needed for correct emulation - void CheckExtensions(); - - std::size_t CalculateVertexArraysSize() const; - - std::size_t CalculateIndexBufferSize() const; + void SetupShaders(bool is_indexed); - /// Updates the current vertex format - void SetupVertexFormat(); - - void SetupVertexBuffer(); - void SetupVertexInstances(); - - GLintptr SetupIndexBuffer(); - - void SetupShaders(GLenum primitive_mode); + Tegra::GPU& gpu; + Tegra::Engines::Maxwell3D& maxwell3d; + Tegra::Engines::KeplerCompute& kepler_compute; + Tegra::MemoryManager& gpu_memory; const Device& device; + ScreenInfo& screen_info; + ProgramManager& program_manager; + StateTracker& state_tracker; - TextureCacheOpenGL texture_cache; + TextureCacheRuntime texture_cache_runtime; + TextureCache texture_cache; + BufferCacheRuntime buffer_cache_runtime; + BufferCache buffer_cache; ShaderCacheOpenGL shader_cache; - SamplerCacheOpenGL sampler_cache; - FramebufferCacheOpenGL framebuffer_cache; QueryCache query_cache; - OGLBufferCache buffer_cache; FenceManagerOpenGL fence_manager; - Core::System& system; - ScreenInfo& screen_info; - ProgramManager& program_manager; - StateTracker& state_tracker; VideoCommon::Shader::AsyncShaders async_shaders; - static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024; - - GLint vertex_binding = 0; - - std::array<OGLBuffer, Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers> - transform_feedback_buffers; - std::bitset<Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers> - enabled_transform_feedback_buffers; - - static constexpr std::size_t NUM_CONSTANT_BUFFERS = - Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers * - Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram; - std::array<GLuint, NUM_CONSTANT_BUFFERS> staging_cbufs{}; - std::size_t current_cbuf = 0; - OGLBuffer unified_uniform_buffer; + boost::container::static_vector<u32, MAX_IMAGE_VIEWS> image_view_indices; + std::array<ImageViewId, MAX_IMAGE_VIEWS> image_view_ids; + boost::container::static_vector<GLuint, MAX_TEXTURES> sampler_handles; + std::array<GLuint, MAX_TEXTURES> texture_handles{}; + std::array<GLuint, MAX_IMAGES> image_handles{}; - /// Number of commands queued to the OpenGL driver. Reseted on flush. + /// Number of commands queued to the OpenGL driver. Resetted on flush. std::size_t num_queued_commands = 0; u32 last_clip_distance_mask = 0; diff --git a/src/video_core/renderer_opengl/gl_resource_manager.cpp b/src/video_core/renderer_opengl/gl_resource_manager.cpp index 0ebcec427..3428e5e21 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.cpp +++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp @@ -71,7 +71,7 @@ void OGLSampler::Create() { return; MICROPROFILE_SCOPE(OpenGL_ResourceCreation); - glGenSamplers(1, &handle); + glCreateSamplers(1, &handle); } void OGLSampler::Release() { @@ -171,12 +171,6 @@ void OGLBuffer::Release() { handle = 0; } -void OGLBuffer::MakeStreamCopy(std::size_t buffer_size) { - ASSERT_OR_EXECUTE((handle != 0 && buffer_size != 0), { return; }); - - glNamedBufferData(handle, buffer_size, nullptr, GL_STREAM_COPY); -} - void OGLSync::Create() { if (handle != 0) return; diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h index f48398669..552d79db4 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.h +++ b/src/video_core/renderer_opengl/gl_resource_manager.h @@ -234,9 +234,6 @@ public: /// Deletes the internal OpenGL resource void Release(); - // Converts the buffer into a stream copy buffer with a fixed size - void MakeStreamCopy(std::size_t buffer_size); - GLuint handle = 0; }; diff --git a/src/video_core/renderer_opengl/gl_sampler_cache.cpp b/src/video_core/renderer_opengl/gl_sampler_cache.cpp deleted file mode 100644 index 5c174879a..000000000 --- a/src/video_core/renderer_opengl/gl_sampler_cache.cpp +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include "common/logging/log.h" -#include "video_core/renderer_opengl/gl_resource_manager.h" -#include "video_core/renderer_opengl/gl_sampler_cache.h" -#include "video_core/renderer_opengl/maxwell_to_gl.h" - -namespace OpenGL { - -SamplerCacheOpenGL::SamplerCacheOpenGL() = default; - -SamplerCacheOpenGL::~SamplerCacheOpenGL() = default; - -OGLSampler SamplerCacheOpenGL::CreateSampler(const Tegra::Texture::TSCEntry& tsc) const { - OGLSampler sampler; - sampler.Create(); - - const GLuint sampler_id{sampler.handle}; - glSamplerParameteri( - sampler_id, GL_TEXTURE_MAG_FILTER, - MaxwellToGL::TextureFilterMode(tsc.mag_filter, Tegra::Texture::TextureMipmapFilter::None)); - glSamplerParameteri(sampler_id, GL_TEXTURE_MIN_FILTER, - MaxwellToGL::TextureFilterMode(tsc.min_filter, tsc.mipmap_filter)); - glSamplerParameteri(sampler_id, GL_TEXTURE_WRAP_S, MaxwellToGL::WrapMode(tsc.wrap_u)); - glSamplerParameteri(sampler_id, GL_TEXTURE_WRAP_T, MaxwellToGL::WrapMode(tsc.wrap_v)); - glSamplerParameteri(sampler_id, GL_TEXTURE_WRAP_R, MaxwellToGL::WrapMode(tsc.wrap_p)); - glSamplerParameteri(sampler_id, GL_TEXTURE_COMPARE_MODE, - tsc.depth_compare_enabled == 1 ? GL_COMPARE_REF_TO_TEXTURE : GL_NONE); - glSamplerParameteri(sampler_id, GL_TEXTURE_COMPARE_FUNC, - MaxwellToGL::DepthCompareFunc(tsc.depth_compare_func)); - glSamplerParameterfv(sampler_id, GL_TEXTURE_BORDER_COLOR, tsc.GetBorderColor().data()); - glSamplerParameterf(sampler_id, GL_TEXTURE_MIN_LOD, tsc.GetMinLod()); - glSamplerParameterf(sampler_id, GL_TEXTURE_MAX_LOD, tsc.GetMaxLod()); - glSamplerParameterf(sampler_id, GL_TEXTURE_LOD_BIAS, tsc.GetLodBias()); - if (GLAD_GL_ARB_texture_filter_anisotropic) { - glSamplerParameterf(sampler_id, GL_TEXTURE_MAX_ANISOTROPY, tsc.GetMaxAnisotropy()); - } else if (GLAD_GL_EXT_texture_filter_anisotropic) { - glSamplerParameterf(sampler_id, GL_TEXTURE_MAX_ANISOTROPY_EXT, tsc.GetMaxAnisotropy()); - } else { - LOG_WARNING(Render_OpenGL, "Anisotropy not supported by host GPU driver"); - } - - return sampler; -} - -GLuint SamplerCacheOpenGL::ToSamplerType(const OGLSampler& sampler) const { - return sampler.handle; -} - -} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_sampler_cache.h b/src/video_core/renderer_opengl/gl_sampler_cache.h deleted file mode 100644 index 34ee37f00..000000000 --- a/src/video_core/renderer_opengl/gl_sampler_cache.h +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <glad/glad.h> - -#include "video_core/renderer_opengl/gl_resource_manager.h" -#include "video_core/sampler_cache.h" - -namespace OpenGL { - -class SamplerCacheOpenGL final : public VideoCommon::SamplerCache<GLuint, OGLSampler> { -public: - explicit SamplerCacheOpenGL(); - ~SamplerCacheOpenGL(); - -protected: - OGLSampler CreateSampler(const Tegra::Texture::TSCEntry& tsc) const override; - - GLuint ToSamplerType(const OGLSampler& sampler) const override; -}; - -} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index a07d56ef0..529570ff0 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -27,7 +27,6 @@ #include "video_core/renderer_opengl/gl_shader_decompiler.h" #include "video_core/renderer_opengl/gl_shader_disk_cache.h" #include "video_core/renderer_opengl/gl_state_tracker.h" -#include "video_core/renderer_opengl/utils.h" #include "video_core/shader/memory_util.h" #include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" @@ -160,6 +159,10 @@ std::unordered_set<GLenum> GetSupportedFormats() { ProgramSharedPtr BuildShader(const Device& device, ShaderType shader_type, u64 unique_identifier, const ShaderIR& ir, const Registry& registry, bool hint_retrievable) { + if (device.UseDriverCache()) { + // Ignore hint retrievable if we are using the driver cache + hint_retrievable = false; + } const std::string shader_id = MakeShaderID(unique_identifier, shader_type); LOG_INFO(Render_OpenGL, "{}", shader_id); @@ -198,10 +201,10 @@ ProgramSharedPtr BuildShader(const Device& device, ShaderType shader_type, u64 u return program; } -Shader::Shader(std::shared_ptr<VideoCommon::Shader::Registry> registry_, ShaderEntries entries_, - ProgramSharedPtr program_, bool is_built) +Shader::Shader(std::shared_ptr<Registry> registry_, ShaderEntries entries_, + ProgramSharedPtr program_, bool is_built_) : registry{std::move(registry_)}, entries{std::move(entries_)}, program{std::move(program_)}, - is_built(is_built) { + is_built{is_built_} { handle = program->assembly_program.handle; if (handle == 0) { handle = program->source_program.handle; @@ -239,12 +242,11 @@ std::unique_ptr<Shader> Shader::CreateStageFromMemory( ProgramCode code_b, VideoCommon::Shader::AsyncShaders& async_shaders, VAddr cpu_addr) { const auto shader_type = GetShaderType(program_type); - auto& gpu = params.system.GPU(); + auto& gpu = params.gpu; gpu.ShaderNotify().MarkSharderBuilding(); auto registry = std::make_shared<Registry>(shader_type, gpu.Maxwell3D()); - if (!async_shaders.IsShaderAsync(params.system.GPU()) || - !params.device.UseAsynchronousShaders()) { + if (!async_shaders.IsShaderAsync(gpu) || !params.device.UseAsynchronousShaders()) { const ShaderIR ir(code, STAGE_MAIN_OFFSET, COMPILER_SETTINGS, *registry); // TODO(Rodrigo): Handle VertexA shaders // std::optional<ShaderIR> ir_b; @@ -287,11 +289,10 @@ std::unique_ptr<Shader> Shader::CreateStageFromMemory( std::unique_ptr<Shader> Shader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code) { - auto& gpu = params.system.GPU(); + auto& gpu = params.gpu; gpu.ShaderNotify().MarkSharderBuilding(); - auto& engine = gpu.KeplerCompute(); - auto registry = std::make_shared<Registry>(ShaderType::Compute, engine); + auto registry = std::make_shared<Registry>(ShaderType::Compute, params.engine); const ShaderIR ir(code, KERNEL_MAIN_OFFSET, COMPILER_SETTINGS, *registry); const u64 uid = params.unique_identifier; auto program = BuildShader(params.device, ShaderType::Compute, uid, ir, *registry); @@ -320,22 +321,26 @@ std::unique_ptr<Shader> Shader::CreateFromCache(const ShaderParameters& params, precompiled_shader.registry, precompiled_shader.entries, precompiled_shader.program)); } -ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system, - Core::Frontend::EmuWindow& emu_window, const Device& device) - : VideoCommon::ShaderCache<Shader>{rasterizer}, system{system}, - emu_window{emu_window}, device{device}, disk_cache{system} {} +ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer_, + Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_, + Tegra::Engines::Maxwell3D& maxwell3d_, + Tegra::Engines::KeplerCompute& kepler_compute_, + Tegra::MemoryManager& gpu_memory_, const Device& device_) + : ShaderCache{rasterizer_}, emu_window{emu_window_}, gpu{gpu_}, gpu_memory{gpu_memory_}, + maxwell3d{maxwell3d_}, kepler_compute{kepler_compute_}, device{device_} {} ShaderCacheOpenGL::~ShaderCacheOpenGL() = default; -void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, +void ShaderCacheOpenGL::LoadDiskCache(u64 title_id, const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback) { + disk_cache.BindTitleID(title_id); const std::optional transferable = disk_cache.LoadTransferable(); if (!transferable) { return; } std::vector<ShaderDiskCachePrecompiled> gl_cache; - if (!device.UseAssemblyShaders()) { + if (!device.UseAssemblyShaders() && !device.UseDriverCache()) { // Only load precompiled cache when we are not using assembly shaders gl_cache = disk_cache.LoadPrecompiled(); } @@ -355,8 +360,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, std::atomic_bool gl_cache_failed = false; const auto find_precompiled = [&gl_cache](u64 id) { - return std::find_if(gl_cache.begin(), gl_cache.end(), - [id](const auto& entry) { return entry.unique_identifier == id; }); + return std::ranges::find(gl_cache, id, &ShaderDiskCachePrecompiled::unique_identifier); }; const auto worker = [&](Core::Frontend::GraphicsContext* context, std::size_t begin, @@ -431,8 +435,8 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, return; } - if (device.UseAssemblyShaders()) { - // Don't store precompiled binaries for assembly shaders. + if (device.UseAssemblyShaders() || device.UseDriverCache()) { + // Don't store precompiled binaries for assembly shaders or when using the driver cache return; } @@ -457,7 +461,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, ProgramSharedPtr ShaderCacheOpenGL::GeneratePrecompiledProgram( const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry, const std::unordered_set<GLenum>& supported_formats) { - if (supported_formats.find(precompiled_entry.binary_format) == supported_formats.end()) { + if (!supported_formats.contains(precompiled_entry.binary_format)) { LOG_INFO(Render_OpenGL, "Precompiled cache entry with unsupported format, removing"); return {}; } @@ -481,21 +485,19 @@ ProgramSharedPtr ShaderCacheOpenGL::GeneratePrecompiledProgram( Shader* ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program, VideoCommon::Shader::AsyncShaders& async_shaders) { - if (!system.GPU().Maxwell3D().dirty.flags[Dirty::Shaders]) { + if (!maxwell3d.dirty.flags[Dirty::Shaders]) { auto* last_shader = last_shaders[static_cast<std::size_t>(program)]; if (last_shader->IsBuilt()) { return last_shader; } } - auto& memory_manager{system.GPU().MemoryManager()}; - const GPUVAddr address{GetShaderAddress(system, program)}; + const GPUVAddr address{GetShaderAddress(maxwell3d, program)}; if (device.UseAsynchronousShaders() && async_shaders.HasCompletedWork()) { auto completed_work = async_shaders.GetCompletedWork(); for (auto& work : completed_work) { Shader* shader = TryGet(work.cpu_address); - auto& gpu = system.GPU(); gpu.ShaderNotify().MarkShaderComplete(); if (shader == nullptr) { continue; @@ -507,14 +509,13 @@ Shader* ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program, shader->AsyncGLASMBuilt(std::move(work.program.glasm)); } + auto& registry = shader->GetRegistry(); + ShaderDiskCacheEntry entry; entry.type = work.shader_type; entry.code = std::move(work.code); entry.code_b = std::move(work.code_b); entry.unique_identifier = work.uid; - - auto& registry = shader->GetRegistry(); - entry.bound_buffer = registry.GetBoundBuffer(); entry.graphics_info = registry.GetGraphicsInfo(); entry.keys = registry.GetKeys(); @@ -525,28 +526,28 @@ Shader* ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program, } // Look up shader in the cache based on address - const auto cpu_addr{memory_manager.GpuToCpuAddress(address)}; + const std::optional<VAddr> cpu_addr{gpu_memory.GpuToCpuAddress(address)}; if (Shader* const shader{cpu_addr ? TryGet(*cpu_addr) : null_shader.get()}) { return last_shaders[static_cast<std::size_t>(program)] = shader; } - const auto host_ptr{memory_manager.GetPointer(address)}; + const u8* const host_ptr{gpu_memory.GetPointer(address)}; // No shader found - create a new one - ProgramCode code{GetShaderCode(memory_manager, address, host_ptr, false)}; + ProgramCode code{GetShaderCode(gpu_memory, address, host_ptr, false)}; ProgramCode code_b; if (program == Maxwell::ShaderProgram::VertexA) { - const GPUVAddr address_b{GetShaderAddress(system, Maxwell::ShaderProgram::VertexB)}; - const u8* host_ptr_b = memory_manager.GetPointer(address_b); - code_b = GetShaderCode(memory_manager, address_b, host_ptr_b, false); + const GPUVAddr address_b{GetShaderAddress(maxwell3d, Maxwell::ShaderProgram::VertexB)}; + const u8* host_ptr_b = gpu_memory.GetPointer(address_b); + code_b = GetShaderCode(gpu_memory, address_b, host_ptr_b, false); } const std::size_t code_size = code.size() * sizeof(u64); const u64 unique_identifier = GetUniqueIdentifier( GetShaderType(program), program == Maxwell::ShaderProgram::VertexA, code, code_b); - const ShaderParameters params{system, disk_cache, device, - *cpu_addr, host_ptr, unique_identifier}; + const ShaderParameters params{gpu, maxwell3d, disk_cache, device, + *cpu_addr, host_ptr, unique_identifier}; std::unique_ptr<Shader> shader; const auto found = runtime_cache.find(unique_identifier); @@ -568,21 +569,20 @@ Shader* ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program, } Shader* ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) { - auto& memory_manager{system.GPU().MemoryManager()}; - const auto cpu_addr{memory_manager.GpuToCpuAddress(code_addr)}; + const std::optional<VAddr> cpu_addr{gpu_memory.GpuToCpuAddress(code_addr)}; if (Shader* const kernel = cpu_addr ? TryGet(*cpu_addr) : null_kernel.get()) { return kernel; } - const auto host_ptr{memory_manager.GetPointer(code_addr)}; // No kernel found, create a new one - ProgramCode code{GetShaderCode(memory_manager, code_addr, host_ptr, true)}; + const u8* host_ptr{gpu_memory.GetPointer(code_addr)}; + ProgramCode code{GetShaderCode(gpu_memory, code_addr, host_ptr, true)}; const std::size_t code_size{code.size() * sizeof(u64)}; const u64 unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code)}; - const ShaderParameters params{system, disk_cache, device, - *cpu_addr, host_ptr, unique_identifier}; + const ShaderParameters params{gpu, kepler_compute, disk_cache, device, + *cpu_addr, host_ptr, unique_identifier}; std::unique_ptr<Shader> kernel; const auto found = runtime_cache.find(unique_identifier); diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h index 7528ac686..2aed0697e 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_cache.h @@ -25,8 +25,8 @@ #include "video_core/shader/shader_ir.h" #include "video_core/shader_cache.h" -namespace Core { -class System; +namespace Tegra { +class MemoryManager; } namespace Core::Frontend { @@ -57,11 +57,12 @@ struct PrecompiledShader { }; struct ShaderParameters { - Core::System& system; + Tegra::GPU& gpu; + Tegra::Engines::ConstBufferEngineInterface& engine; ShaderDiskCacheOpenGL& disk_cache; const Device& device; VAddr cpu_addr; - u8* host_ptr; + const u8* host_ptr; u64 unique_identifier; }; @@ -107,7 +108,7 @@ public: private: explicit Shader(std::shared_ptr<VideoCommon::Shader::Registry> registry, ShaderEntries entries, - ProgramSharedPtr program, bool is_built = true); + ProgramSharedPtr program, bool is_built_ = true); std::shared_ptr<VideoCommon::Shader::Registry> registry; ShaderEntries entries; @@ -118,12 +119,15 @@ private: class ShaderCacheOpenGL final : public VideoCommon::ShaderCache<Shader> { public: - explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system, - Core::Frontend::EmuWindow& emu_window, const Device& device); + explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer_, + Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu, + Tegra::Engines::Maxwell3D& maxwell3d_, + Tegra::Engines::KeplerCompute& kepler_compute_, + Tegra::MemoryManager& gpu_memory_, const Device& device_); ~ShaderCacheOpenGL() override; /// Loads disk cache for the current game - void LoadDiskCache(const std::atomic_bool& stop_loading, + void LoadDiskCache(u64 title_id, const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback); /// Gets the current specified shader stage program @@ -138,9 +142,13 @@ private: const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry, const std::unordered_set<GLenum>& supported_formats); - Core::System& system; Core::Frontend::EmuWindow& emu_window; + Tegra::GPU& gpu; + Tegra::MemoryManager& gpu_memory; + Tegra::Engines::Maxwell3D& maxwell3d; + Tegra::Engines::KeplerCompute& kepler_compute; const Device& device; + ShaderDiskCacheOpenGL disk_cache; std::unordered_map<u64, PrecompiledShader> runtime_cache; diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index 3f75fcd2b..ac78d344c 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -14,6 +14,7 @@ #include "common/alignment.h" #include "common/assert.h" #include "common/common_types.h" +#include "common/div_ceil.h" #include "common/logging/log.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/engines/shader_type.h" @@ -38,11 +39,9 @@ using Tegra::Shader::IpaSampleMode; using Tegra::Shader::PixelImap; using Tegra::Shader::Register; using Tegra::Shader::TextureType; -using VideoCommon::Shader::BuildTransformFeedback; -using VideoCommon::Shader::Registry; -using namespace std::string_literals; using namespace VideoCommon::Shader; +using namespace std::string_literals; using Maxwell = Tegra::Engines::Maxwell3D::Regs; using Operation = const OperationNode&; @@ -65,7 +64,7 @@ using TextureIR = std::variant<TextureOffset, TextureDerivates, TextureArgument> constexpr u32 MAX_CONSTBUFFER_SCALARS = static_cast<u32>(Maxwell::MaxConstBufferSize) / sizeof(u32); constexpr u32 MAX_CONSTBUFFER_ELEMENTS = MAX_CONSTBUFFER_SCALARS / sizeof(u32); -constexpr std::string_view CommonDeclarations = R"(#define ftoi floatBitsToInt +constexpr std::string_view COMMON_DECLARATIONS = R"(#define ftoi floatBitsToInt #define ftou floatBitsToUint #define itof intBitsToFloat #define utof uintBitsToFloat @@ -78,10 +77,6 @@ bvec2 HalfFloatNanComparison(bvec2 comparison, vec2 pair1, vec2 pair2) {{ const float fswzadd_modifiers_a[] = float[4](-1.0f, 1.0f, -1.0f, 0.0f ); const float fswzadd_modifiers_b[] = float[4](-1.0f, -1.0f, 1.0f, -1.0f ); - -layout (std140, binding = {}) uniform vs_config {{ - float y_direction; -}}; )"; class ShaderWriter final { @@ -131,7 +126,7 @@ private: class Expression final { public: - Expression(std::string code, Type type) : code{std::move(code)}, type{type} { + Expression(std::string code_, Type type_) : code{std::move(code_)}, type{type_} { ASSERT(type != Type::Void); } Expression() : type{Type::Void} {} @@ -148,8 +143,8 @@ public: ASSERT(type == Type::Void); } - std::string As(Type type) const { - switch (type) { + std::string As(Type type_) const { + switch (type_) { case Type::Bool: return AsBool(); case Type::Bool2: @@ -316,7 +311,7 @@ std::pair<const char*, u32> GetPrimitiveDescription(Maxwell::PrimitiveTopology t case Maxwell::PrimitiveTopology::TriangleStripAdjacency: return {"triangles_adjacency", 6}; default: - UNIMPLEMENTED_MSG("topology={}", static_cast<int>(topology)); + UNIMPLEMENTED_MSG("topology={}", topology); return {"points", 1}; } } @@ -342,7 +337,7 @@ std::string GetTopologyName(Tegra::Shader::OutputTopology topology) { case Tegra::Shader::OutputTopology::TriangleStrip: return "triangle_strip"; default: - UNIMPLEMENTED_MSG("Unknown output topology: {}", static_cast<u32>(topology)); + UNIMPLEMENTED_MSG("Unknown output topology: {}", topology); return "points"; } } @@ -403,13 +398,6 @@ std::string FlowStackTopName(MetaStackClass stack) { return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack)); } -bool UseUnifiedUniforms(const Device& device, const ShaderIR& ir, ShaderType stage) { - const u32 num_ubos = static_cast<u32>(ir.GetConstantBuffers().size()); - // We waste one UBO for emulation - const u32 num_available_ubos = device.GetMaxUniformBuffers(stage) - 1; - return num_ubos > num_available_ubos; -} - struct GenericVaryingDescription { std::string name; u8 first_element = 0; @@ -418,11 +406,11 @@ struct GenericVaryingDescription { class GLSLDecompiler final { public: - explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry, - ShaderType stage, std::string_view identifier, std::string_view suffix) - : device{device}, ir{ir}, registry{registry}, stage{stage}, identifier{identifier}, - suffix{suffix}, header{ir.GetHeader()}, use_unified_uniforms{ - UseUnifiedUniforms(device, ir, stage)} { + explicit GLSLDecompiler(const Device& device_, const ShaderIR& ir_, const Registry& registry_, + ShaderType stage_, std::string_view identifier_, + std::string_view suffix_) + : device{device_}, ir{ir_}, registry{registry_}, stage{stage_}, + identifier{identifier_}, suffix{suffix_}, header{ir.GetHeader()} { if (stage != ShaderType::Compute) { transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo()); } @@ -516,7 +504,8 @@ private: if (!identifier.empty()) { code.AddLine("// {}", identifier); } - code.AddLine("#version 440 {}", ir.UsesLegacyVaryings() ? "compatibility" : "core"); + const bool use_compatibility = ir.UsesLegacyVaryings() || ir.UsesYNegate(); + code.AddLine("#version 440 {}", use_compatibility ? "compatibility" : "core"); code.AddLine("#extension GL_ARB_separate_shader_objects : enable"); if (device.HasShaderBallot()) { code.AddLine("#extension GL_ARB_shader_ballot : require"); @@ -542,7 +531,7 @@ private: code.AddNewLine(); - code.AddLine(CommonDeclarations, EmulationUniformBlockBinding); + code.AddLine(COMMON_DECLARATIONS); } void DeclareVertex() { @@ -744,7 +733,7 @@ private: case PixelImap::Unused: break; } - UNIMPLEMENTED_MSG("Unknown attribute usage index={}", static_cast<int>(attribute)); + UNIMPLEMENTED_MSG("Unknown attribute usage index={}", attribute); return {}; } @@ -777,16 +766,16 @@ private: name = "gs_" + name + "[]"; } - std::string suffix; + std::string suffix_; if (stage == ShaderType::Fragment) { const auto input_mode{header.ps.GetPixelImap(location)}; if (input_mode == PixelImap::Unused) { return; } - suffix = GetInputFlags(input_mode); + suffix_ = GetInputFlags(input_mode); } - code.AddLine("layout (location = {}) {} in vec4 {};", location, suffix, name); + code.AddLine("layout (location = {}) {} in vec4 {};", location, suffix_, name); } void DeclareOutputAttributes() { @@ -813,7 +802,7 @@ private: const u8 location = static_cast<u8>(static_cast<u32>(index) * 4 + element); const auto it = transform_feedback.find(location); if (it == transform_feedback.end()) { - return {}; + return std::nullopt; } return it->second.components; } @@ -865,20 +854,9 @@ private: } void DeclareConstantBuffers() { - if (use_unified_uniforms) { - const u32 binding = device.GetBaseBindings(stage).shader_storage_buffer + - static_cast<u32>(ir.GetGlobalMemory().size()); - code.AddLine("layout (std430, binding = {}) readonly buffer UnifiedUniforms {{", - binding); - code.AddLine(" uint cbufs[];"); - code.AddLine("}};"); - code.AddNewLine(); - return; - } - u32 binding = device.GetBaseBindings(stage).uniform_buffer; - for (const auto [index, info] : ir.GetConstantBuffers()) { - const u32 num_elements = Common::AlignUp(info.GetSize(), 4) / 4; + for (const auto& [index, info] : ir.GetConstantBuffers()) { + const u32 num_elements = Common::DivCeil(info.GetSize(), 4 * sizeof(u32)); const u32 size = info.IsIndirect() ? MAX_CONSTBUFFER_ELEMENTS : num_elements; code.AddLine("layout (std140, binding = {}) uniform {} {{", binding++, GetConstBufferBlock(index)); @@ -1081,29 +1059,17 @@ private: if (const auto cbuf = std::get_if<CbufNode>(&*node)) { const Node offset = cbuf->GetOffset(); - const u32 base_unified_offset = cbuf->GetIndex() * MAX_CONSTBUFFER_SCALARS; if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) { // Direct access const u32 offset_imm = immediate->GetValue(); ASSERT_MSG(offset_imm % 4 == 0, "Unaligned cbuf direct access"); - if (use_unified_uniforms) { - return {fmt::format("cbufs[{}]", base_unified_offset + offset_imm / 4), - Type::Uint}; - } else { - return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()), - offset_imm / (4 * 4), (offset_imm / 4) % 4), - Type::Uint}; - } - } - - // Indirect access - if (use_unified_uniforms) { - return {fmt::format("cbufs[{} + ({} >> 2)]", base_unified_offset, - Visit(offset).AsUint()), + return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()), + offset_imm / (4 * 4), (offset_imm / 4) % 4), Type::Uint}; } + // Indirect access const std::string final_offset = code.GenerateTemporary(); code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint()); @@ -1251,7 +1217,7 @@ private: } break; } - UNIMPLEMENTED_MSG("Unhandled input attribute: {}", static_cast<u32>(attribute)); + UNIMPLEMENTED_MSG("Unhandled input attribute: {}", attribute); return {"0", Type::Int}; } @@ -1295,21 +1261,21 @@ private: switch (element) { case 0: UNIMPLEMENTED(); - return {}; + return std::nullopt; case 1: if (stage == ShaderType::Vertex && !device.HasVertexViewportLayer()) { - return {}; + return std::nullopt; } return {{"gl_Layer", Type::Int}}; case 2: if (stage == ShaderType::Vertex && !device.HasVertexViewportLayer()) { - return {}; + return std::nullopt; } return {{"gl_ViewportIndex", Type::Int}}; case 3: return {{"gl_PointSize", Type::Float}}; } - return {}; + return std::nullopt; case Attribute::Index::FrontColor: return {{"gl_FrontColor"s + GetSwizzle(element), Type::Float}}; case Attribute::Index::FrontSecondaryColor: @@ -1331,8 +1297,8 @@ private: GetSwizzle(element)), Type::Float}}; } - UNIMPLEMENTED_MSG("Unhandled output attribute: {}", static_cast<u32>(attribute)); - return {}; + UNIMPLEMENTED_MSG("Unhandled output attribute: {}", attribute); + return std::nullopt; } } @@ -1443,8 +1409,10 @@ private: return expr + ", vec2(0.0), vec2(0.0))"; case TextureType::TextureCube: return expr + ", vec3(0.0), vec3(0.0))"; + default: + UNREACHABLE(); + break; } - UNREACHABLE(); } for (const auto& variant : extras) { @@ -2054,15 +2022,19 @@ private: } Expression Texture(Operation operation) { - const auto meta = std::get_if<MetaTexture>(&operation.GetMeta()); - ASSERT(meta); - - std::string expr = GenerateTexture( - operation, "", {TextureOffset{}, TextureArgument{Type::Float, meta->bias}}); - if (meta->sampler.is_shadow) { - expr = "vec4(" + expr + ')'; + const auto meta = std::get<MetaTexture>(operation.GetMeta()); + const bool separate_dc = meta.sampler.type == TextureType::TextureCube && + meta.sampler.is_array && meta.sampler.is_shadow; + // TODO: Replace this with an array and make GenerateTexture use C++20 std::span + const std::vector<TextureIR> extras{ + TextureOffset{}, + TextureArgument{Type::Float, meta.bias}, + }; + std::string expr = GenerateTexture(operation, "", extras, separate_dc); + if (meta.sampler.is_shadow) { + expr = fmt::format("vec4({})", expr); } - return {expr + GetSwizzle(meta->element), Type::Float}; + return {expr + GetSwizzle(meta.element), Type::Float}; } Expression TextureLod(Operation operation) { @@ -2094,13 +2066,13 @@ private: const auto type = meta.sampler.is_shadow ? Type::Float : Type::Int; const bool separate_dc = meta.sampler.is_shadow; - std::vector<TextureIR> ir; + std::vector<TextureIR> ir_; if (meta.sampler.is_shadow) { - ir = {TextureOffset{}}; + ir_ = {TextureOffset{}}; } else { - ir = {TextureOffset{}, TextureArgument{type, meta.component}}; + ir_ = {TextureOffset{}, TextureArgument{type, meta.component}}; } - return {GenerateTexture(operation, "Gather", ir, separate_dc) + GetSwizzle(meta.element), + return {GenerateTexture(operation, "Gather", ir_, separate_dc) + GetSwizzle(meta.element), Type::Float}; } @@ -2287,7 +2259,6 @@ private: } } } - if (header.ps.omap.depth) { // The depth output is always 2 registers after the last color output, and current_reg // already contains one past the last color register. @@ -2331,7 +2302,8 @@ private: } Expression YNegate(Operation operation) { - return {"y_direction", Type::Float}; + // Y_NEGATE is mapped to this uniform value + return {"gl_FrontMaterial.ambient.a", Type::Float}; } template <u32 element> @@ -2746,11 +2718,11 @@ private: } } - std::string GetSampler(const Sampler& sampler) const { + std::string GetSampler(const SamplerEntry& sampler) const { return AppendSuffix(sampler.index, "sampler"); } - std::string GetImage(const Image& image) const { + std::string GetImage(const ImageEntry& image) const { return AppendSuffix(image.index, "image"); } @@ -2781,7 +2753,6 @@ private: const std::string_view identifier; const std::string_view suffix; const Header header; - const bool use_unified_uniforms; std::unordered_map<u8, VaryingTFB> transform_feedback; ShaderWriter code; @@ -2795,7 +2766,7 @@ std::string GetFlowVariable(u32 index) { class ExprDecompiler { public: - explicit ExprDecompiler(GLSLDecompiler& decomp) : decomp{decomp} {} + explicit ExprDecompiler(GLSLDecompiler& decomp_) : decomp{decomp_} {} void operator()(const ExprAnd& expr) { inner += '('; @@ -2850,7 +2821,7 @@ private: class ASTDecompiler { public: - explicit ASTDecompiler(GLSLDecompiler& decomp) : decomp{decomp} {} + explicit ASTDecompiler(GLSLDecompiler& decomp_) : decomp{decomp_} {} void operator()(const ASTProgram& ast) { ASTNode current = ast.nodes.GetFirst(); @@ -2997,8 +2968,10 @@ ShaderEntries MakeEntries(const Device& device, const ShaderIR& ir, ShaderType s for (std::size_t i = 0; i < std::size(clip_distances); ++i) { entries.clip_distances = (clip_distances[i] ? 1U : 0U) << i; } + for (const auto& buffer : entries.const_buffers) { + entries.enabled_uniform_buffers |= 1U << buffer.GetIndex(); + } entries.shader_length = ir.GetLength(); - entries.use_unified_uniforms = UseUnifiedUniforms(device, ir, stage); return entries; } diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h index 451c9689a..0397a000c 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.h +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h @@ -20,13 +20,13 @@ namespace OpenGL { class Device; using Maxwell = Tegra::Engines::Maxwell3D::Regs; -using SamplerEntry = VideoCommon::Shader::Sampler; -using ImageEntry = VideoCommon::Shader::Image; +using SamplerEntry = VideoCommon::Shader::SamplerEntry; +using ImageEntry = VideoCommon::Shader::ImageEntry; class ConstBufferEntry : public VideoCommon::Shader::ConstBuffer { public: - explicit ConstBufferEntry(u32 max_offset, bool is_indirect, u32 index) - : VideoCommon::Shader::ConstBuffer{max_offset, is_indirect}, index{index} {} + explicit ConstBufferEntry(u32 max_offset_, bool is_indirect_, u32 index_) + : ConstBuffer{max_offset_, is_indirect_}, index{index_} {} u32 GetIndex() const { return index; @@ -37,10 +37,10 @@ private: }; struct GlobalMemoryEntry { - constexpr explicit GlobalMemoryEntry(u32 cbuf_index, u32 cbuf_offset, bool is_read, - bool is_written) - : cbuf_index{cbuf_index}, cbuf_offset{cbuf_offset}, is_read{is_read}, is_written{ - is_written} {} + constexpr explicit GlobalMemoryEntry(u32 cbuf_index_, u32 cbuf_offset_, bool is_read_, + bool is_written_) + : cbuf_index{cbuf_index_}, cbuf_offset{cbuf_offset_}, is_read{is_read_}, is_written{ + is_written_} {} u32 cbuf_index = 0; u32 cbuf_offset = 0; @@ -55,7 +55,7 @@ struct ShaderEntries { std::vector<ImageEntry> images; std::size_t shader_length{}; u32 clip_distances{}; - bool use_unified_uniforms{}; + u32 enabled_uniform_buffers{}; }; ShaderEntries MakeEntries(const Device& device, const VideoCommon::Shader::ShaderIR& ir, diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp index 40c0877c1..955b2abc4 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp @@ -206,13 +206,17 @@ bool ShaderDiskCacheEntry::Save(Common::FS::IOFile& file) const { flat_bindless_samplers.size(); } -ShaderDiskCacheOpenGL::ShaderDiskCacheOpenGL(Core::System& system) : system{system} {} +ShaderDiskCacheOpenGL::ShaderDiskCacheOpenGL() = default; ShaderDiskCacheOpenGL::~ShaderDiskCacheOpenGL() = default; +void ShaderDiskCacheOpenGL::BindTitleID(u64 title_id_) { + title_id = title_id_; +} + std::optional<std::vector<ShaderDiskCacheEntry>> ShaderDiskCacheOpenGL::LoadTransferable() { // Skip games without title id - const bool has_title_id = system.CurrentProcess()->GetTitleID() != 0; + const bool has_title_id = title_id != 0; if (!Settings::values.use_disk_shader_cache.GetValue() || !has_title_id) { return std::nullopt; } @@ -313,8 +317,7 @@ std::optional<std::vector<ShaderDiskCachePrecompiled>> ShaderDiskCacheOpenGL::Lo return std::nullopt; } } - - return std::move(entries); + return entries; } void ShaderDiskCacheOpenGL::InvalidateTransferable() { @@ -340,7 +343,7 @@ void ShaderDiskCacheOpenGL::SaveEntry(const ShaderDiskCacheEntry& entry) { } const u64 id = entry.unique_identifier; - if (stored_transferable.find(id) != stored_transferable.end()) { + if (stored_transferable.contains(id)) { // The shader already exists return; } @@ -474,7 +477,7 @@ std::string ShaderDiskCacheOpenGL::GetBaseDir() const { } std::string ShaderDiskCacheOpenGL::GetTitleID() const { - return fmt::format("{:016X}", system.CurrentProcess()->GetTitleID()); + return fmt::format("{:016X}", title_id); } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.h b/src/video_core/renderer_opengl/gl_shader_disk_cache.h index db2bb73bc..aef841c1d 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h @@ -21,10 +21,6 @@ #include "video_core/engines/shader_type.h" #include "video_core/shader/registry.h" -namespace Core { -class System; -} - namespace Common::FS { class IOFile; } @@ -70,9 +66,12 @@ struct ShaderDiskCachePrecompiled { class ShaderDiskCacheOpenGL { public: - explicit ShaderDiskCacheOpenGL(Core::System& system); + explicit ShaderDiskCacheOpenGL(); ~ShaderDiskCacheOpenGL(); + /// Binds a title ID for all future operations. + void BindTitleID(u64 title_id); + /// Loads transferable cache. If file has a old version or on failure, it deletes the file. std::optional<std::vector<ShaderDiskCacheEntry>> LoadTransferable(); @@ -157,8 +156,6 @@ private: return LoadArrayFromPrecompiled(&object, 1); } - Core::System& system; - // Stores whole precompiled cache which will be read from or saved to the precompiled chache // file FileSys::VectorVfsFile precompiled_cache_virtual_file; @@ -168,8 +165,11 @@ private: // Stored transferable shaders std::unordered_set<u64> stored_transferable; + /// Title ID to operate on + u64 title_id = 0; + // The cache has been loaded at boot - bool is_usable{}; + bool is_usable = false; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp index 691c6c79b..553e6e8d6 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.cpp +++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp @@ -83,6 +83,21 @@ void ProgramManager::RestoreGuestPipeline() { } } +void ProgramManager::BindHostCompute(GLuint program) { + if (use_assembly_programs) { + glDisable(GL_COMPUTE_PROGRAM_NV); + } + glUseProgram(program); + is_graphics_bound = false; +} + +void ProgramManager::RestoreGuestCompute() { + if (use_assembly_programs) { + glEnable(GL_COMPUTE_PROGRAM_NV); + glUseProgram(0); + } +} + void ProgramManager::UseVertexShader(GLuint program) { if (use_assembly_programs) { BindProgram(GL_VERTEX_PROGRAM_NV, program, current_state.vertex, vertex_enabled); diff --git a/src/video_core/renderer_opengl/gl_shader_manager.h b/src/video_core/renderer_opengl/gl_shader_manager.h index 950e0dfcb..ad42cce74 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.h +++ b/src/video_core/renderer_opengl/gl_shader_manager.h @@ -45,6 +45,12 @@ public: /// Rewinds BindHostPipeline state changes. void RestoreGuestPipeline(); + /// Binds an OpenGL GLSL program object unsynchronized with the guest state. + void BindHostCompute(GLuint program); + + /// Rewinds BindHostCompute state changes. + void RestoreGuestCompute(); + void UseVertexShader(GLuint program); void UseGeometryShader(GLuint program); void UseFragmentShader(GLuint program); diff --git a/src/video_core/renderer_opengl/gl_state_tracker.cpp b/src/video_core/renderer_opengl/gl_state_tracker.cpp index d24fad3de..dbdf5230f 100644 --- a/src/video_core/renderer_opengl/gl_state_tracker.cpp +++ b/src/video_core/renderer_opengl/gl_state_tracker.cpp @@ -13,7 +13,7 @@ #include "video_core/renderer_opengl/gl_state_tracker.h" #define OFF(field_name) MAXWELL3D_REG_INDEX(field_name) -#define NUM(field_name) (sizeof(Maxwell3D::Regs::field_name) / sizeof(u32)) +#define NUM(field_name) (sizeof(Maxwell3D::Regs::field_name) / (sizeof(u32))) namespace OpenGL { @@ -36,16 +36,10 @@ void SetupDirtyColorMasks(Tables& tables) { FillBlock(tables[1], OFF(color_mask), NUM(color_mask), ColorMasks); } -void SetupDirtyVertexArrays(Tables& tables) { - static constexpr std::size_t num_array = 3; +void SetupDirtyVertexInstances(Tables& tables) { static constexpr std::size_t instance_base_offset = 3; for (std::size_t i = 0; i < Regs::NumVertexArrays; ++i) { const std::size_t array_offset = OFF(vertex_array) + i * NUM(vertex_array[0]); - const std::size_t limit_offset = OFF(vertex_array_limit) + i * NUM(vertex_array_limit[0]); - - FillBlock(tables, array_offset, num_array, VertexBuffer0 + i, VertexBuffers); - FillBlock(tables, limit_offset, NUM(vertex_array_limit), VertexBuffer0 + i, VertexBuffers); - const std::size_t instance_array_offset = array_offset + instance_base_offset; tables[0][instance_array_offset] = static_cast<u8>(VertexInstance0 + i); tables[1][instance_array_offset] = VertexInstances; @@ -214,16 +208,14 @@ void SetupDirtyMisc(Tables& tables) { } // Anonymous namespace -StateTracker::StateTracker(Core::System& system) : system{system} {} - -void StateTracker::Initialize() { - auto& dirty = system.GPU().Maxwell3D().dirty; +StateTracker::StateTracker(Tegra::GPU& gpu) : flags{gpu.Maxwell3D().dirty.flags} { + auto& dirty = gpu.Maxwell3D().dirty; auto& tables = dirty.tables; - SetupDirtyRenderTargets(tables); + SetupDirtyFlags(tables); SetupDirtyColorMasks(tables); SetupDirtyViewports(tables); SetupDirtyScissors(tables); - SetupDirtyVertexArrays(tables); + SetupDirtyVertexInstances(tables); SetupDirtyVertexFormat(tables); SetupDirtyShaders(tables); SetupDirtyPolygonModes(tables); @@ -243,12 +235,6 @@ void StateTracker::Initialize() { SetupDirtyClipControl(tables); SetupDirtyDepthClampEnabled(tables); SetupDirtyMisc(tables); - - auto& store = dirty.on_write_stores; - store[VertexBuffers] = true; - for (std::size_t i = 0; i < Regs::NumVertexArrays; ++i) { - store[VertexBuffer0 + i] = true; - } } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_state_tracker.h b/src/video_core/renderer_opengl/gl_state_tracker.h index 0f823288e..94c905116 100644 --- a/src/video_core/renderer_opengl/gl_state_tracker.h +++ b/src/video_core/renderer_opengl/gl_state_tracker.h @@ -13,8 +13,8 @@ #include "video_core/dirty_flags.h" #include "video_core/engines/maxwell_3d.h" -namespace Core { -class System; +namespace Tegra { +class GPU; } namespace OpenGL { @@ -28,10 +28,6 @@ enum : u8 { VertexFormat0, VertexFormat31 = VertexFormat0 + 31, - VertexBuffers, - VertexBuffer0, - VertexBuffer31 = VertexBuffer0 + 31, - VertexInstances, VertexInstance0, VertexInstance31 = VertexInstance0 + 31, @@ -90,9 +86,7 @@ static_assert(Last <= std::numeric_limits<u8>::max()); class StateTracker { public: - explicit StateTracker(Core::System& system); - - void Initialize(); + explicit StateTracker(Tegra::GPU& gpu); void BindIndexBuffer(GLuint new_index_buffer) { if (index_buffer == new_index_buffer) { @@ -102,14 +96,40 @@ public: glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, new_index_buffer); } + void BindFramebuffer(GLuint new_framebuffer) { + if (framebuffer == new_framebuffer) { + return; + } + framebuffer = new_framebuffer; + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer); + } + + void ClipControl(GLenum new_origin, GLenum new_depth) { + if (new_origin == origin && new_depth == depth) { + return; + } + origin = new_origin; + depth = new_depth; + glClipControl(origin, depth); + } + + void SetYNegate(bool new_y_negate) { + if (new_y_negate == y_negate) { + return; + } + // Y_NEGATE is mapped to gl_FrontMaterial.ambient.a + y_negate = new_y_negate; + const std::array ambient{0.0f, 0.0f, 0.0f, y_negate ? -1.0f : 1.0f}; + glMaterialfv(GL_FRONT, GL_AMBIENT, ambient.data()); + } + void NotifyScreenDrawVertexArray() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::VertexFormats] = true; flags[OpenGL::Dirty::VertexFormat0 + 0] = true; flags[OpenGL::Dirty::VertexFormat0 + 1] = true; - flags[OpenGL::Dirty::VertexBuffers] = true; - flags[OpenGL::Dirty::VertexBuffer0] = true; + flags[VideoCommon::Dirty::VertexBuffers] = true; + flags[VideoCommon::Dirty::VertexBuffer0] = true; flags[OpenGL::Dirty::VertexInstances] = true; flags[OpenGL::Dirty::VertexInstance0 + 0] = true; @@ -117,100 +137,87 @@ public: } void NotifyPolygonModes() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::PolygonModes] = true; flags[OpenGL::Dirty::PolygonModeFront] = true; flags[OpenGL::Dirty::PolygonModeBack] = true; } void NotifyViewport0() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::Viewports] = true; flags[OpenGL::Dirty::Viewport0] = true; } void NotifyScissor0() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::Scissors] = true; flags[OpenGL::Dirty::Scissor0] = true; } - void NotifyColorMask0() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; + void NotifyColorMask(size_t index) { flags[OpenGL::Dirty::ColorMasks] = true; - flags[OpenGL::Dirty::ColorMask0] = true; + flags[OpenGL::Dirty::ColorMask0 + index] = true; } void NotifyBlend0() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::BlendStates] = true; flags[OpenGL::Dirty::BlendState0] = true; } void NotifyFramebuffer() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[VideoCommon::Dirty::RenderTargets] = true; } void NotifyFrontFace() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::FrontFace] = true; } void NotifyCullTest() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::CullTest] = true; } void NotifyDepthMask() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::DepthMask] = true; } void NotifyDepthTest() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::DepthTest] = true; } void NotifyStencilTest() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::StencilTest] = true; } void NotifyPolygonOffset() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::PolygonOffset] = true; } void NotifyRasterizeEnable() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::RasterizeEnable] = true; } void NotifyFramebufferSRGB() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::FramebufferSRGB] = true; } void NotifyLogicOp() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::LogicOp] = true; } void NotifyClipControl() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::ClipControl] = true; } void NotifyAlphaTest() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::AlphaTest] = true; } private: - Core::System& system; + Tegra::Engines::Maxwell3D::DirtyState::Flags& flags; + GLuint framebuffer = 0; GLuint index_buffer = 0; + GLenum origin = GL_LOWER_LEFT; + GLenum depth = GL_NEGATIVE_ONE_TO_ONE; + bool y_negate = false; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp index 887995cf4..77b3ee0fe 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp +++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp @@ -1,80 +1,64 @@ -// Copyright 2018 Citra Emulator Project +// Copyright 2021 yuzu Emulator Project // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include <tuple> -#include <vector> +#include <array> +#include <memory> +#include <span> + +#include <glad/glad.h> #include "common/alignment.h" #include "common/assert.h" -#include "common/microprofile.h" -#include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_stream_buffer.h" -MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning", - MP_RGB(128, 128, 192)); - namespace OpenGL { -OGLStreamBuffer::OGLStreamBuffer(const Device& device, GLsizeiptr size, bool vertex_data_usage) - : buffer_size(size) { - gl_buffer.Create(); - - GLsizeiptr allocate_size = size; - if (vertex_data_usage) { - // On AMD GPU there is a strange crash in indexed drawing. The crash happens when the buffer - // read position is near the end and is an out-of-bound access to the vertex buffer. This is - // probably a bug in the driver and is related to the usage of vec3<byte> attributes in the - // vertex array. Doubling the allocation size for the vertex buffer seems to avoid the - // crash. - allocate_size *= 2; - } - - static constexpr GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT; - glNamedBufferStorage(gl_buffer.handle, allocate_size, nullptr, flags); - mapped_ptr = static_cast<u8*>( - glMapNamedBufferRange(gl_buffer.handle, 0, buffer_size, flags | GL_MAP_FLUSH_EXPLICIT_BIT)); - - if (device.UseAssemblyShaders() || device.HasVertexBufferUnifiedMemory()) { - glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_ONLY); - glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address); +StreamBuffer::StreamBuffer() { + static constexpr GLenum flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT; + buffer.Create(); + glObjectLabel(GL_BUFFER, buffer.handle, -1, "Stream Buffer"); + glNamedBufferStorage(buffer.handle, STREAM_BUFFER_SIZE, nullptr, flags); + mapped_pointer = + static_cast<u8*>(glMapNamedBufferRange(buffer.handle, 0, STREAM_BUFFER_SIZE, flags)); + for (OGLSync& sync : fences) { + sync.Create(); } } -OGLStreamBuffer::~OGLStreamBuffer() { - glUnmapNamedBuffer(gl_buffer.handle); - gl_buffer.Release(); -} - -std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr alignment) { - ASSERT(size <= buffer_size); - ASSERT(alignment <= buffer_size); - mapped_size = size; - - if (alignment > 0) { - buffer_pos = Common::AlignUp<std::size_t>(buffer_pos, alignment); +std::pair<std::span<u8>, size_t> StreamBuffer::Request(size_t size) noexcept { + ASSERT(size < REGION_SIZE); + for (size_t region = Region(used_iterator), region_end = Region(iterator); region < region_end; + ++region) { + fences[region].Create(); } + used_iterator = iterator; - bool invalidate = false; - if (buffer_pos + size > buffer_size) { - MICROPROFILE_SCOPE(OpenGL_StreamBuffer); - glInvalidateBufferData(gl_buffer.handle); - - buffer_pos = 0; - invalidate = true; + for (size_t region = Region(free_iterator) + 1, + region_end = std::min(Region(iterator + size) + 1, NUM_SYNCS); + region < region_end; ++region) { + glClientWaitSync(fences[region].handle, 0, GL_TIMEOUT_IGNORED); + fences[region].Release(); } - - return std::make_tuple(mapped_ptr + buffer_pos, buffer_pos, invalidate); -} - -void OGLStreamBuffer::Unmap(GLsizeiptr size) { - ASSERT(size <= mapped_size); - - if (size > 0) { - glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos, size); + if (iterator + size >= free_iterator) { + free_iterator = iterator + size; } - - buffer_pos += size; + if (iterator + size > STREAM_BUFFER_SIZE) { + for (size_t region = Region(used_iterator); region < NUM_SYNCS; ++region) { + fences[region].Create(); + } + used_iterator = 0; + iterator = 0; + free_iterator = size; + + for (size_t region = 0, region_end = Region(size); region <= region_end; ++region) { + glClientWaitSync(fences[region].handle, 0, GL_TIMEOUT_IGNORED); + fences[region].Release(); + } + } + const size_t offset = iterator; + iterator = Common::AlignUp(iterator + size, MAX_ALIGNMENT); + return {std::span(mapped_pointer + offset, size), offset}; } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h index 307a67113..6dbb6bfba 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.h +++ b/src/video_core/renderer_opengl/gl_stream_buffer.h @@ -1,55 +1,50 @@ -// Copyright 2018 Citra Emulator Project +// Copyright 2021 yuzu Emulator Project // Licensed under GPLv2 or any later version // Refer to the license.txt file included. #pragma once -#include <tuple> +#include <array> +#include <memory> +#include <span> +#include <utility> + #include <glad/glad.h> + #include "common/common_types.h" #include "video_core/renderer_opengl/gl_resource_manager.h" namespace OpenGL { -class Device; +class StreamBuffer { + static constexpr size_t STREAM_BUFFER_SIZE = 64 * 1024 * 1024; + static constexpr size_t NUM_SYNCS = 16; + static constexpr size_t REGION_SIZE = STREAM_BUFFER_SIZE / NUM_SYNCS; + static constexpr size_t MAX_ALIGNMENT = 256; + static_assert(STREAM_BUFFER_SIZE % MAX_ALIGNMENT == 0); + static_assert(STREAM_BUFFER_SIZE % NUM_SYNCS == 0); + static_assert(REGION_SIZE % MAX_ALIGNMENT == 0); -class OGLStreamBuffer : private NonCopyable { public: - explicit OGLStreamBuffer(const Device& device, GLsizeiptr size, bool vertex_data_usage); - ~OGLStreamBuffer(); - - /* - * Allocates a linear chunk of memory in the GPU buffer with at least "size" bytes - * and the optional alignment requirement. - * If the buffer is full, the whole buffer is reallocated which invalidates old chunks. - * The return values are the pointer to the new chunk, the offset within the buffer, - * and the invalidation flag for previous chunks. - * The actual used size must be specified on unmapping the chunk. - */ - std::tuple<u8*, GLintptr, bool> Map(GLsizeiptr size, GLintptr alignment = 0); - - void Unmap(GLsizeiptr size); - - GLuint Handle() const { - return gl_buffer.handle; - } + explicit StreamBuffer(); - u64 Address() const { - return gpu_address; - } + [[nodiscard]] std::pair<std::span<u8>, size_t> Request(size_t size) noexcept; - GLsizeiptr Size() const noexcept { - return buffer_size; + [[nodiscard]] GLuint Handle() const noexcept { + return buffer.handle; } private: - OGLBuffer gl_buffer; + [[nodiscard]] static size_t Region(size_t offset) noexcept { + return offset / REGION_SIZE; + } - GLuint64EXT gpu_address = 0; - GLintptr buffer_pos = 0; - GLsizeiptr buffer_size = 0; - GLsizeiptr mapped_size = 0; - u8* mapped_ptr = nullptr; + size_t iterator = 0; + size_t used_iterator = 0; + size_t free_iterator = 0; + u8* mapped_pointer = nullptr; + OGLBuffer buffer; + std::array<OGLSync, NUM_SYNCS> fences; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index f403f388a..12434db67 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -2,37 +2,60 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include "common/assert.h" -#include "common/bit_util.h" -#include "common/common_types.h" -#include "common/microprofile.h" -#include "common/scope_exit.h" -#include "core/core.h" -#include "video_core/morton.h" -#include "video_core/renderer_opengl/gl_resource_manager.h" +#include <algorithm> +#include <array> +#include <bit> +#include <string> + +#include <glad/glad.h> + +#include "video_core/renderer_opengl/gl_device.h" +#include "video_core/renderer_opengl/gl_shader_manager.h" #include "video_core/renderer_opengl/gl_state_tracker.h" #include "video_core/renderer_opengl/gl_texture_cache.h" -#include "video_core/renderer_opengl/utils.h" -#include "video_core/texture_cache/surface_base.h" +#include "video_core/renderer_opengl/maxwell_to_gl.h" +#include "video_core/renderer_opengl/util_shaders.h" +#include "video_core/surface.h" +#include "video_core/texture_cache/format_lookup_table.h" +#include "video_core/texture_cache/samples_helper.h" #include "video_core/texture_cache/texture_cache.h" -#include "video_core/textures/convert.h" -#include "video_core/textures/texture.h" +#include "video_core/textures/decoders.h" namespace OpenGL { -using Tegra::Texture::SwizzleSource; -using VideoCore::MortonSwizzleMode; +namespace { +using Tegra::Texture::SwizzleSource; +using Tegra::Texture::TextureMipmapFilter; +using Tegra::Texture::TextureType; +using Tegra::Texture::TICEntry; +using Tegra::Texture::TSCEntry; +using VideoCommon::CalculateLevelStrideAlignment; +using VideoCommon::ImageCopy; +using VideoCommon::ImageFlagBits; +using VideoCommon::ImageType; +using VideoCommon::NUM_RT; +using VideoCommon::SamplesLog2; +using VideoCommon::SwizzleParameters; +using VideoCore::Surface::BytesPerBlock; +using VideoCore::Surface::IsPixelFormatASTC; +using VideoCore::Surface::IsPixelFormatSRGB; +using VideoCore::Surface::MaxPixelFormat; using VideoCore::Surface::PixelFormat; -using VideoCore::Surface::SurfaceTarget; using VideoCore::Surface::SurfaceType; -MICROPROFILE_DEFINE(OpenGL_Texture_Upload, "OpenGL", "Texture Upload", MP_RGB(128, 192, 128)); -MICROPROFILE_DEFINE(OpenGL_Texture_Download, "OpenGL", "Texture Download", MP_RGB(128, 192, 128)); -MICROPROFILE_DEFINE(OpenGL_Texture_Buffer_Copy, "OpenGL", "Texture Buffer Copy", - MP_RGB(128, 192, 128)); +struct CopyOrigin { + GLint level; + GLint x; + GLint y; + GLint z; +}; -namespace { +struct CopyRegion { + GLsizei width; + GLsizei height; + GLsizei depth; +}; struct FormatTuple { GLenum internal_format; @@ -40,7 +63,7 @@ struct FormatTuple { GLenum type = GL_NONE; }; -constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format_tuples = {{ +constexpr std::array<FormatTuple, MaxPixelFormat> FORMAT_TABLE = {{ {GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV}, // A8B8G8R8_UNORM {GL_RGBA8_SNORM, GL_RGBA, GL_BYTE}, // A8B8G8R8_SNORM {GL_RGBA8I, GL_RGBA_INTEGER, GL_BYTE}, // A8B8G8R8_SINT @@ -103,72 +126,113 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format {GL_COMPRESSED_RGBA_ASTC_8x5_KHR}, // ASTC_2D_8X5_UNORM {GL_COMPRESSED_RGBA_ASTC_5x4_KHR}, // ASTC_2D_5X4_UNORM {GL_SRGB8_ALPHA8, GL_BGRA, GL_UNSIGNED_BYTE}, // B8G8R8A8_UNORM - // Compressed sRGB formats - {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT}, // BC1_RGBA_SRGB - {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT}, // BC2_SRGB - {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT}, // BC3_SRGB - {GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM}, // BC7_SRGB - {GL_RGBA4, GL_RGBA, GL_UNSIGNED_SHORT_4_4_4_4_REV}, // A4B4G4R4_UNORM - {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_4x4_KHR}, // ASTC_2D_4X4_SRGB - {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x8_KHR}, // ASTC_2D_8X8_SRGB - {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x5_KHR}, // ASTC_2D_8X5_SRGB - {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x4_KHR}, // ASTC_2D_5X4_SRGB - {GL_COMPRESSED_RGBA_ASTC_5x5_KHR}, // ASTC_2D_5X5_UNORM - {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x5_KHR}, // ASTC_2D_5X5_SRGB - {GL_COMPRESSED_RGBA_ASTC_10x8_KHR}, // ASTC_2D_10X8_UNORM - {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x8_KHR}, // ASTC_2D_10X8_SRGB - {GL_COMPRESSED_RGBA_ASTC_6x6_KHR}, // ASTC_2D_6X6_UNORM - {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x6_KHR}, // ASTC_2D_6X6_SRGB - {GL_COMPRESSED_RGBA_ASTC_10x10_KHR}, // ASTC_2D_10X10_UNORM - {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR}, // ASTC_2D_10X10_SRGB - {GL_COMPRESSED_RGBA_ASTC_12x12_KHR}, // ASTC_2D_12X12_UNORM - {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR}, // ASTC_2D_12X12_SRGB - {GL_COMPRESSED_RGBA_ASTC_8x6_KHR}, // ASTC_2D_8X6_UNORM - {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x6_KHR}, // ASTC_2D_8X6_SRGB - {GL_COMPRESSED_RGBA_ASTC_6x5_KHR}, // ASTC_2D_6X5_UNORM - {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x5_KHR}, // ASTC_2D_6X5_SRGB - {GL_RGB9_E5, GL_RGB, GL_UNSIGNED_INT_5_9_9_9_REV}, // E5B9G9R9_FLOAT - - // Depth formats - {GL_DEPTH_COMPONENT32F, GL_DEPTH_COMPONENT, GL_FLOAT}, // D32_FLOAT - {GL_DEPTH_COMPONENT16, GL_DEPTH_COMPONENT, GL_UNSIGNED_SHORT}, // D16_UNORM - - // DepthStencil formats - {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8}, // D24_UNORM_S8_UINT - {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8}, // S8_UINT_D24_UNORM + {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT}, // BC1_RGBA_SRGB + {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT}, // BC2_SRGB + {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT}, // BC3_SRGB + {GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM}, // BC7_SRGB + {GL_RGBA4, GL_RGBA, GL_UNSIGNED_SHORT_4_4_4_4_REV}, // A4B4G4R4_UNORM + {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_4x4_KHR}, // ASTC_2D_4X4_SRGB + {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x8_KHR}, // ASTC_2D_8X8_SRGB + {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x5_KHR}, // ASTC_2D_8X5_SRGB + {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x4_KHR}, // ASTC_2D_5X4_SRGB + {GL_COMPRESSED_RGBA_ASTC_5x5_KHR}, // ASTC_2D_5X5_UNORM + {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x5_KHR}, // ASTC_2D_5X5_SRGB + {GL_COMPRESSED_RGBA_ASTC_10x8_KHR}, // ASTC_2D_10X8_UNORM + {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x8_KHR}, // ASTC_2D_10X8_SRGB + {GL_COMPRESSED_RGBA_ASTC_6x6_KHR}, // ASTC_2D_6X6_UNORM + {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x6_KHR}, // ASTC_2D_6X6_SRGB + {GL_COMPRESSED_RGBA_ASTC_10x10_KHR}, // ASTC_2D_10X10_UNORM + {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR}, // ASTC_2D_10X10_SRGB + {GL_COMPRESSED_RGBA_ASTC_12x12_KHR}, // ASTC_2D_12X12_UNORM + {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR}, // ASTC_2D_12X12_SRGB + {GL_COMPRESSED_RGBA_ASTC_8x6_KHR}, // ASTC_2D_8X6_UNORM + {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x6_KHR}, // ASTC_2D_8X6_SRGB + {GL_COMPRESSED_RGBA_ASTC_6x5_KHR}, // ASTC_2D_6X5_UNORM + {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x5_KHR}, // ASTC_2D_6X5_SRGB + {GL_RGB9_E5, GL_RGB, GL_UNSIGNED_INT_5_9_9_9_REV}, // E5B9G9R9_FLOAT + {GL_DEPTH_COMPONENT32F, GL_DEPTH_COMPONENT, GL_FLOAT}, // D32_FLOAT + {GL_DEPTH_COMPONENT16, GL_DEPTH_COMPONENT, GL_UNSIGNED_SHORT}, // D16_UNORM + {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8}, // D24_UNORM_S8_UINT + {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8}, // S8_UINT_D24_UNORM {GL_DEPTH32F_STENCIL8, GL_DEPTH_STENCIL, GL_FLOAT_32_UNSIGNED_INT_24_8_REV}, // D32_FLOAT_S8_UINT }}; +constexpr std::array ACCELERATED_FORMATS{ + GL_RGBA32F, GL_RGBA16F, GL_RG32F, GL_RG16F, GL_R11F_G11F_B10F, GL_R32F, + GL_R16F, GL_RGBA32UI, GL_RGBA16UI, GL_RGB10_A2UI, GL_RGBA8UI, GL_RG32UI, + GL_RG16UI, GL_RG8UI, GL_R32UI, GL_R16UI, GL_R8UI, GL_RGBA32I, + GL_RGBA16I, GL_RGBA8I, GL_RG32I, GL_RG16I, GL_RG8I, GL_R32I, + GL_R16I, GL_R8I, GL_RGBA16, GL_RGB10_A2, GL_RGBA8, GL_RG16, + GL_RG8, GL_R16, GL_R8, GL_RGBA16_SNORM, GL_RGBA8_SNORM, GL_RG16_SNORM, + GL_RG8_SNORM, GL_R16_SNORM, GL_R8_SNORM, +}; + const FormatTuple& GetFormatTuple(PixelFormat pixel_format) { - ASSERT(static_cast<std::size_t>(pixel_format) < tex_format_tuples.size()); - return tex_format_tuples[static_cast<std::size_t>(pixel_format)]; + ASSERT(static_cast<size_t>(pixel_format) < FORMAT_TABLE.size()); + return FORMAT_TABLE[static_cast<size_t>(pixel_format)]; } -GLenum GetTextureTarget(const SurfaceTarget& target) { - switch (target) { - case SurfaceTarget::TextureBuffer: +GLenum ImageTarget(const VideoCommon::ImageInfo& info) { + switch (info.type) { + case ImageType::e1D: + return GL_TEXTURE_1D_ARRAY; + case ImageType::e2D: + if (info.num_samples > 1) { + return GL_TEXTURE_2D_MULTISAMPLE_ARRAY; + } + return GL_TEXTURE_2D_ARRAY; + case ImageType::e3D: + return GL_TEXTURE_3D; + case ImageType::Linear: + return GL_TEXTURE_2D_ARRAY; + case ImageType::Buffer: return GL_TEXTURE_BUFFER; - case SurfaceTarget::Texture1D: + } + UNREACHABLE_MSG("Invalid image type={}", info.type); + return GL_NONE; +} + +GLenum ImageTarget(ImageViewType type, int num_samples = 1) { + const bool is_multisampled = num_samples > 1; + switch (type) { + case ImageViewType::e1D: return GL_TEXTURE_1D; - case SurfaceTarget::Texture2D: - return GL_TEXTURE_2D; - case SurfaceTarget::Texture3D: + case ImageViewType::e2D: + return is_multisampled ? GL_TEXTURE_2D_MULTISAMPLE : GL_TEXTURE_2D; + case ImageViewType::Cube: + return GL_TEXTURE_CUBE_MAP; + case ImageViewType::e3D: return GL_TEXTURE_3D; - case SurfaceTarget::Texture1DArray: + case ImageViewType::e1DArray: return GL_TEXTURE_1D_ARRAY; - case SurfaceTarget::Texture2DArray: - return GL_TEXTURE_2D_ARRAY; - case SurfaceTarget::TextureCubemap: - return GL_TEXTURE_CUBE_MAP; - case SurfaceTarget::TextureCubeArray: + case ImageViewType::e2DArray: + return is_multisampled ? GL_TEXTURE_2D_MULTISAMPLE_ARRAY : GL_TEXTURE_2D_ARRAY; + case ImageViewType::CubeArray: return GL_TEXTURE_CUBE_MAP_ARRAY; + case ImageViewType::Rect: + return GL_TEXTURE_RECTANGLE; + case ImageViewType::Buffer: + return GL_TEXTURE_BUFFER; } - UNREACHABLE(); - return {}; + UNREACHABLE_MSG("Invalid image view type={}", type); + return GL_NONE; } -GLint GetSwizzleSource(SwizzleSource source) { +GLenum TextureMode(PixelFormat format, bool is_first) { + switch (format) { + case PixelFormat::D24_UNORM_S8_UINT: + case PixelFormat::D32_FLOAT_S8_UINT: + return is_first ? GL_DEPTH_COMPONENT : GL_STENCIL_INDEX; + case PixelFormat::S8_UINT_D24_UNORM: + return is_first ? GL_STENCIL_INDEX : GL_DEPTH_COMPONENT; + default: + UNREACHABLE(); + return GL_DEPTH_COMPONENT; + } +} + +GLint Swizzle(SwizzleSource source) { switch (source) { case SwizzleSource::Zero: return GL_ZERO; @@ -184,529 +248,844 @@ GLint GetSwizzleSource(SwizzleSource source) { case SwizzleSource::OneFloat: return GL_ONE; } - UNREACHABLE(); + UNREACHABLE_MSG("Invalid swizzle source={}", source); return GL_NONE; } -GLenum GetComponent(PixelFormat format, bool is_first) { - switch (format) { - case PixelFormat::D24_UNORM_S8_UINT: - case PixelFormat::D32_FLOAT_S8_UINT: - return is_first ? GL_DEPTH_COMPONENT : GL_STENCIL_INDEX; - case PixelFormat::S8_UINT_D24_UNORM: - return is_first ? GL_STENCIL_INDEX : GL_DEPTH_COMPONENT; +GLenum AttachmentType(PixelFormat format) { + switch (const SurfaceType type = VideoCore::Surface::GetFormatType(format); type) { + case SurfaceType::Depth: + return GL_DEPTH_ATTACHMENT; + case SurfaceType::DepthStencil: + return GL_DEPTH_STENCIL_ATTACHMENT; default: - UNREACHABLE(); - return GL_DEPTH_COMPONENT; + UNIMPLEMENTED_MSG("Unimplemented type={}", type); + return GL_NONE; } } -void ApplyTextureDefaults(const SurfaceParams& params, GLuint texture) { - if (params.IsBuffer()) { - return; +[[nodiscard]] bool IsConverted(const Device& device, PixelFormat format, ImageType type) { + if (!device.HasASTC() && IsPixelFormatASTC(format)) { + return true; } - glTextureParameteri(texture, GL_TEXTURE_MIN_FILTER, GL_LINEAR); - glTextureParameteri(texture, GL_TEXTURE_MAG_FILTER, GL_LINEAR); - glTextureParameteri(texture, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); - glTextureParameteri(texture, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); - glTextureParameteri(texture, GL_TEXTURE_MAX_LEVEL, static_cast<GLint>(params.num_levels - 1)); - if (params.num_levels == 1) { - glTextureParameterf(texture, GL_TEXTURE_LOD_BIAS, 1000.0f); + switch (format) { + case PixelFormat::BC4_UNORM: + case PixelFormat::BC5_UNORM: + return type == ImageType::e3D; + default: + break; } + return false; } -OGLTexture CreateTexture(const SurfaceParams& params, GLenum target, GLenum internal_format, - OGLBuffer& texture_buffer) { - OGLTexture texture; - texture.Create(target); +[[nodiscard]] constexpr SwizzleSource ConvertGreenRed(SwizzleSource value) { + switch (value) { + case SwizzleSource::G: + return SwizzleSource::R; + default: + return value; + } +} - switch (params.target) { - case SurfaceTarget::Texture1D: - glTextureStorage1D(texture.handle, params.emulated_levels, internal_format, params.width); - break; - case SurfaceTarget::TextureBuffer: - texture_buffer.Create(); - glNamedBufferStorage(texture_buffer.handle, params.width * params.GetBytesPerPixel(), - nullptr, GL_DYNAMIC_STORAGE_BIT); - glTextureBuffer(texture.handle, internal_format, texture_buffer.handle); +void ApplySwizzle(GLuint handle, PixelFormat format, std::array<SwizzleSource, 4> swizzle) { + switch (format) { + case PixelFormat::D24_UNORM_S8_UINT: + case PixelFormat::D32_FLOAT_S8_UINT: + case PixelFormat::S8_UINT_D24_UNORM: + UNIMPLEMENTED_IF(swizzle[0] != SwizzleSource::R && swizzle[0] != SwizzleSource::G); + glTextureParameteri(handle, GL_DEPTH_STENCIL_TEXTURE_MODE, + TextureMode(format, swizzle[0] == SwizzleSource::R)); + std::ranges::transform(swizzle, swizzle.begin(), ConvertGreenRed); break; - case SurfaceTarget::Texture2D: - case SurfaceTarget::TextureCubemap: - glTextureStorage2D(texture.handle, params.emulated_levels, internal_format, params.width, - params.height); + default: break; - case SurfaceTarget::Texture3D: - case SurfaceTarget::Texture2DArray: - case SurfaceTarget::TextureCubeArray: - glTextureStorage3D(texture.handle, params.emulated_levels, internal_format, params.width, - params.height, params.depth); + } + std::array<GLint, 4> gl_swizzle; + std::ranges::transform(swizzle, gl_swizzle.begin(), Swizzle); + glTextureParameteriv(handle, GL_TEXTURE_SWIZZLE_RGBA, gl_swizzle.data()); +} + +[[nodiscard]] bool CanBeAccelerated(const TextureCacheRuntime& runtime, + const VideoCommon::ImageInfo& info) { + // Disable accelerated uploads for now as they don't implement swizzled uploads + return false; + switch (info.type) { + case ImageType::e2D: + case ImageType::e3D: + case ImageType::Linear: break; default: - UNREACHABLE(); + return false; + } + const GLenum internal_format = GetFormatTuple(info.format).internal_format; + const auto& format_info = runtime.FormatInfo(info.type, internal_format); + if (format_info.is_compressed) { + return false; + } + if (std::ranges::find(ACCELERATED_FORMATS, internal_format) == ACCELERATED_FORMATS.end()) { + return false; } + if (format_info.compatibility_by_size) { + return true; + } + const GLenum store_format = StoreFormat(BytesPerBlock(info.format)); + const GLenum store_class = runtime.FormatInfo(info.type, store_format).compatibility_class; + return format_info.compatibility_class == store_class; +} - ApplyTextureDefaults(params, texture.handle); +[[nodiscard]] CopyOrigin MakeCopyOrigin(VideoCommon::Offset3D offset, + VideoCommon::SubresourceLayers subresource, GLenum target) { + switch (target) { + case GL_TEXTURE_2D_ARRAY: + case GL_TEXTURE_2D_MULTISAMPLE_ARRAY: + return CopyOrigin{ + .level = static_cast<GLint>(subresource.base_level), + .x = static_cast<GLint>(offset.x), + .y = static_cast<GLint>(offset.y), + .z = static_cast<GLint>(subresource.base_layer), + }; + case GL_TEXTURE_3D: + return CopyOrigin{ + .level = static_cast<GLint>(subresource.base_level), + .x = static_cast<GLint>(offset.x), + .y = static_cast<GLint>(offset.y), + .z = static_cast<GLint>(offset.z), + }; + default: + UNIMPLEMENTED_MSG("Unimplemented copy target={}", target); + return CopyOrigin{.level = 0, .x = 0, .y = 0, .z = 0}; + } +} - return texture; +[[nodiscard]] CopyRegion MakeCopyRegion(VideoCommon::Extent3D extent, + VideoCommon::SubresourceLayers dst_subresource, + GLenum target) { + switch (target) { + case GL_TEXTURE_2D_ARRAY: + case GL_TEXTURE_2D_MULTISAMPLE_ARRAY: + return CopyRegion{ + .width = static_cast<GLsizei>(extent.width), + .height = static_cast<GLsizei>(extent.height), + .depth = static_cast<GLsizei>(dst_subresource.num_layers), + }; + case GL_TEXTURE_3D: + return CopyRegion{ + .width = static_cast<GLsizei>(extent.width), + .height = static_cast<GLsizei>(extent.height), + .depth = static_cast<GLsizei>(extent.depth), + }; + default: + UNIMPLEMENTED_MSG("Unimplemented copy target={}", target); + return CopyRegion{.width = 0, .height = 0, .depth = 0}; + } } -constexpr u32 EncodeSwizzle(SwizzleSource x_source, SwizzleSource y_source, SwizzleSource z_source, - SwizzleSource w_source) { - return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) | - (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source); +void AttachTexture(GLuint fbo, GLenum attachment, const ImageView* image_view) { + if (False(image_view->flags & VideoCommon::ImageViewFlagBits::Slice)) { + const GLuint texture = image_view->DefaultHandle(); + glNamedFramebufferTexture(fbo, attachment, texture, 0); + return; + } + const GLuint texture = image_view->Handle(ImageViewType::e3D); + if (image_view->range.extent.layers > 1) { + // TODO: OpenGL doesn't support rendering to a fixed number of slices + glNamedFramebufferTexture(fbo, attachment, texture, 0); + } else { + const u32 slice = image_view->range.base.layer; + glNamedFramebufferTextureLayer(fbo, attachment, texture, 0, slice); + } } } // Anonymous namespace -CachedSurface::CachedSurface(const GPUVAddr gpu_addr, const SurfaceParams& params, - bool is_astc_supported) - : VideoCommon::SurfaceBase<View>(gpu_addr, params, is_astc_supported) { - if (is_converted) { - internal_format = params.srgb_conversion ? GL_SRGB8_ALPHA8 : GL_RGBA8; - format = GL_RGBA; - type = GL_UNSIGNED_BYTE; - } else { - const auto& tuple{GetFormatTuple(params.pixel_format)}; - internal_format = tuple.internal_format; - format = tuple.format; - type = tuple.type; - is_compressed = params.IsCompressed(); +ImageBufferMap::~ImageBufferMap() { + if (sync) { + sync->Create(); } - target = GetTextureTarget(params.target); - texture = CreateTexture(params, target, internal_format, texture_buffer); - DecorateSurfaceName(); +} - u32 num_layers = 1; - if (params.is_layered || params.target == SurfaceTarget::Texture3D) { - num_layers = params.depth; +TextureCacheRuntime::TextureCacheRuntime(const Device& device_, ProgramManager& program_manager, + StateTracker& state_tracker_) + : device{device_}, state_tracker{state_tracker_}, util_shaders(program_manager) { + static constexpr std::array TARGETS{GL_TEXTURE_1D_ARRAY, GL_TEXTURE_2D_ARRAY, GL_TEXTURE_3D}; + for (size_t i = 0; i < TARGETS.size(); ++i) { + const GLenum target = TARGETS[i]; + for (const FormatTuple& tuple : FORMAT_TABLE) { + const GLenum format = tuple.internal_format; + GLint compat_class; + GLint compat_type; + GLint is_compressed; + glGetInternalformativ(target, format, GL_IMAGE_COMPATIBILITY_CLASS, 1, &compat_class); + glGetInternalformativ(target, format, GL_IMAGE_FORMAT_COMPATIBILITY_TYPE, 1, + &compat_type); + glGetInternalformativ(target, format, GL_TEXTURE_COMPRESSED, 1, &is_compressed); + const FormatProperties properties{ + .compatibility_class = static_cast<GLenum>(compat_class), + .compatibility_by_size = compat_type == GL_IMAGE_FORMAT_COMPATIBILITY_BY_SIZE, + .is_compressed = is_compressed == GL_TRUE, + }; + format_properties[i].emplace(format, properties); + } } - - main_view = - CreateViewInner(ViewParams(params.target, 0, num_layers, 0, params.num_levels), true); + has_broken_texture_view_formats = device.HasBrokenTextureViewFormats(); + + null_image_1d_array.Create(GL_TEXTURE_1D_ARRAY); + null_image_cube_array.Create(GL_TEXTURE_CUBE_MAP_ARRAY); + null_image_3d.Create(GL_TEXTURE_3D); + null_image_rect.Create(GL_TEXTURE_RECTANGLE); + glTextureStorage2D(null_image_1d_array.handle, 1, GL_R8, 1, 1); + glTextureStorage3D(null_image_cube_array.handle, 1, GL_R8, 1, 1, 6); + glTextureStorage3D(null_image_3d.handle, 1, GL_R8, 1, 1, 1); + glTextureStorage2D(null_image_rect.handle, 1, GL_R8, 1, 1); + + std::array<GLuint, 4> new_handles; + glGenTextures(static_cast<GLsizei>(new_handles.size()), new_handles.data()); + null_image_view_1d.handle = new_handles[0]; + null_image_view_2d.handle = new_handles[1]; + null_image_view_2d_array.handle = new_handles[2]; + null_image_view_cube.handle = new_handles[3]; + glTextureView(null_image_view_1d.handle, GL_TEXTURE_1D, null_image_1d_array.handle, GL_R8, 0, 1, + 0, 1); + glTextureView(null_image_view_2d.handle, GL_TEXTURE_2D, null_image_cube_array.handle, GL_R8, 0, + 1, 0, 1); + glTextureView(null_image_view_2d_array.handle, GL_TEXTURE_2D_ARRAY, + null_image_cube_array.handle, GL_R8, 0, 1, 0, 1); + glTextureView(null_image_view_cube.handle, GL_TEXTURE_CUBE_MAP, null_image_cube_array.handle, + GL_R8, 0, 1, 0, 6); + const std::array texture_handles{ + null_image_1d_array.handle, null_image_cube_array.handle, null_image_3d.handle, + null_image_rect.handle, null_image_view_1d.handle, null_image_view_2d.handle, + null_image_view_2d_array.handle, null_image_view_cube.handle, + }; + for (const GLuint handle : texture_handles) { + static constexpr std::array NULL_SWIZZLE{GL_ZERO, GL_ZERO, GL_ZERO, GL_ZERO}; + glTextureParameteriv(handle, GL_TEXTURE_SWIZZLE_RGBA, NULL_SWIZZLE.data()); + } + const auto set_view = [this](ImageViewType type, GLuint handle) { + if (device.HasDebuggingToolAttached()) { + const std::string name = fmt::format("NullImage {}", type); + glObjectLabel(GL_TEXTURE, handle, static_cast<GLsizei>(name.size()), name.data()); + } + null_image_views[static_cast<size_t>(type)] = handle; + }; + set_view(ImageViewType::e1D, null_image_view_1d.handle); + set_view(ImageViewType::e2D, null_image_view_2d.handle); + set_view(ImageViewType::Cube, null_image_view_cube.handle); + set_view(ImageViewType::e3D, null_image_3d.handle); + set_view(ImageViewType::e1DArray, null_image_1d_array.handle); + set_view(ImageViewType::e2DArray, null_image_view_2d_array.handle); + set_view(ImageViewType::CubeArray, null_image_cube_array.handle); + set_view(ImageViewType::Rect, null_image_rect.handle); } -CachedSurface::~CachedSurface() = default; - -void CachedSurface::DownloadTexture(std::vector<u8>& staging_buffer) { - MICROPROFILE_SCOPE(OpenGL_Texture_Download); +TextureCacheRuntime::~TextureCacheRuntime() = default; - if (params.IsBuffer()) { - glGetNamedBufferSubData(texture_buffer.handle, 0, - static_cast<GLsizeiptr>(params.GetHostSizeInBytes(false)), - staging_buffer.data()); - return; - } +void TextureCacheRuntime::Finish() { + glFinish(); +} - SCOPE_EXIT({ glPixelStorei(GL_PACK_ROW_LENGTH, 0); }); +ImageBufferMap TextureCacheRuntime::UploadStagingBuffer(size_t size) { + return upload_buffers.RequestMap(size, true); +} - for (u32 level = 0; level < params.emulated_levels; ++level) { - glPixelStorei(GL_PACK_ALIGNMENT, std::min(8U, params.GetRowAlignment(level, is_converted))); - glPixelStorei(GL_PACK_ROW_LENGTH, static_cast<GLint>(params.GetMipWidth(level))); - const std::size_t mip_offset = params.GetHostMipmapLevelOffset(level, is_converted); +ImageBufferMap TextureCacheRuntime::DownloadStagingBuffer(size_t size) { + return download_buffers.RequestMap(size, false); +} - u8* const mip_data = staging_buffer.data() + mip_offset; - const GLsizei size = static_cast<GLsizei>(params.GetHostMipmapSize(level)); - if (is_compressed) { - glGetCompressedTextureImage(texture.handle, level, size, mip_data); - } else { - glGetTextureImage(texture.handle, level, format, type, size, mip_data); - } +void TextureCacheRuntime::CopyImage(Image& dst_image, Image& src_image, + std::span<const ImageCopy> copies) { + const GLuint dst_name = dst_image.Handle(); + const GLuint src_name = src_image.Handle(); + const GLenum dst_target = ImageTarget(dst_image.info); + const GLenum src_target = ImageTarget(src_image.info); + for (const ImageCopy& copy : copies) { + const auto src_origin = MakeCopyOrigin(copy.src_offset, copy.src_subresource, src_target); + const auto dst_origin = MakeCopyOrigin(copy.dst_offset, copy.dst_subresource, dst_target); + const auto region = MakeCopyRegion(copy.extent, copy.dst_subresource, dst_target); + glCopyImageSubData(src_name, src_target, src_origin.level, src_origin.x, src_origin.y, + src_origin.z, dst_name, dst_target, dst_origin.level, dst_origin.x, + dst_origin.y, dst_origin.z, region.width, region.height, region.depth); } } -void CachedSurface::UploadTexture(const std::vector<u8>& staging_buffer) { - MICROPROFILE_SCOPE(OpenGL_Texture_Upload); - SCOPE_EXIT({ glPixelStorei(GL_UNPACK_ROW_LENGTH, 0); }); - for (u32 level = 0; level < params.emulated_levels; ++level) { - UploadTextureMipmap(level, staging_buffer); +bool TextureCacheRuntime::CanImageBeCopied(const Image& dst, const Image& src) { + if (dst.info.type == ImageType::e3D && dst.info.format == PixelFormat::BC4_UNORM) { + return false; } + return true; } -void CachedSurface::UploadTextureMipmap(u32 level, const std::vector<u8>& staging_buffer) { - glPixelStorei(GL_UNPACK_ALIGNMENT, std::min(8U, params.GetRowAlignment(level, is_converted))); - glPixelStorei(GL_UNPACK_ROW_LENGTH, static_cast<GLint>(params.GetMipWidth(level))); - - const std::size_t mip_offset = params.GetHostMipmapLevelOffset(level, is_converted); - const u8* buffer{staging_buffer.data() + mip_offset}; - if (is_compressed) { - const auto image_size{static_cast<GLsizei>(params.GetHostMipmapSize(level))}; - switch (params.target) { - case SurfaceTarget::Texture2D: - glCompressedTextureSubImage2D(texture.handle, level, 0, 0, - static_cast<GLsizei>(params.GetMipWidth(level)), - static_cast<GLsizei>(params.GetMipHeight(level)), - internal_format, image_size, buffer); - break; - case SurfaceTarget::Texture3D: - case SurfaceTarget::Texture2DArray: - case SurfaceTarget::TextureCubeArray: - glCompressedTextureSubImage3D(texture.handle, level, 0, 0, 0, - static_cast<GLsizei>(params.GetMipWidth(level)), - static_cast<GLsizei>(params.GetMipHeight(level)), - static_cast<GLsizei>(params.GetMipDepth(level)), - internal_format, image_size, buffer); - break; - case SurfaceTarget::TextureCubemap: { - const std::size_t layer_size{params.GetHostLayerSize(level)}; - for (std::size_t face = 0; face < params.depth; ++face) { - glCompressedTextureSubImage3D(texture.handle, level, 0, 0, static_cast<GLint>(face), - static_cast<GLsizei>(params.GetMipWidth(level)), - static_cast<GLsizei>(params.GetMipHeight(level)), 1, - internal_format, static_cast<GLsizei>(layer_size), - buffer); - buffer += layer_size; - } - break; - } - default: - UNREACHABLE(); - } +void TextureCacheRuntime::EmulateCopyImage(Image& dst, Image& src, + std::span<const ImageCopy> copies) { + if (dst.info.type == ImageType::e3D && dst.info.format == PixelFormat::BC4_UNORM) { + ASSERT(src.info.type == ImageType::e3D); + util_shaders.CopyBC4(dst, src, copies); } else { - switch (params.target) { - case SurfaceTarget::Texture1D: - glTextureSubImage1D(texture.handle, level, 0, params.GetMipWidth(level), format, type, - buffer); - break; - case SurfaceTarget::TextureBuffer: - ASSERT(level == 0); - glNamedBufferSubData(texture_buffer.handle, 0, - params.GetMipWidth(level) * params.GetBytesPerPixel(), buffer); - break; - case SurfaceTarget::Texture1DArray: - case SurfaceTarget::Texture2D: - glTextureSubImage2D(texture.handle, level, 0, 0, params.GetMipWidth(level), - params.GetMipHeight(level), format, type, buffer); - break; - case SurfaceTarget::Texture3D: - case SurfaceTarget::Texture2DArray: - case SurfaceTarget::TextureCubeArray: - glTextureSubImage3D( - texture.handle, level, 0, 0, 0, static_cast<GLsizei>(params.GetMipWidth(level)), - static_cast<GLsizei>(params.GetMipHeight(level)), - static_cast<GLsizei>(params.GetMipDepth(level)), format, type, buffer); - break; - case SurfaceTarget::TextureCubemap: - for (std::size_t face = 0; face < params.depth; ++face) { - glTextureSubImage3D(texture.handle, level, 0, 0, static_cast<GLint>(face), - params.GetMipWidth(level), params.GetMipHeight(level), 1, - format, type, buffer); - buffer += params.GetHostLayerSize(level); - } - break; - default: - UNREACHABLE(); - } + UNREACHABLE(); } } -void CachedSurface::DecorateSurfaceName() { - LabelGLObject(GL_TEXTURE, texture.handle, GetGpuAddr(), params.TargetName()); -} +void TextureCacheRuntime::BlitFramebuffer(Framebuffer* dst, Framebuffer* src, + const std::array<Offset2D, 2>& dst_region, + const std::array<Offset2D, 2>& src_region, + Tegra::Engines::Fermi2D::Filter filter, + Tegra::Engines::Fermi2D::Operation operation) { + state_tracker.NotifyScissor0(); + state_tracker.NotifyRasterizeEnable(); + state_tracker.NotifyFramebufferSRGB(); -void CachedSurfaceView::DecorateViewName(GPUVAddr gpu_addr, const std::string& prefix) { - LabelGLObject(GL_TEXTURE, main_view.handle, gpu_addr, prefix); + ASSERT(dst->BufferBits() == src->BufferBits()); + + glEnable(GL_FRAMEBUFFER_SRGB); + glDisable(GL_RASTERIZER_DISCARD); + glDisablei(GL_SCISSOR_TEST, 0); + + const GLbitfield buffer_bits = dst->BufferBits(); + const bool has_depth = (buffer_bits & ~GL_COLOR_BUFFER_BIT) != 0; + const bool is_linear = !has_depth && filter == Tegra::Engines::Fermi2D::Filter::Bilinear; + glBlitNamedFramebuffer(src->Handle(), dst->Handle(), src_region[0].x, src_region[0].y, + src_region[1].x, src_region[1].y, dst_region[0].x, dst_region[0].y, + dst_region[1].x, dst_region[1].y, buffer_bits, + is_linear ? GL_LINEAR : GL_NEAREST); } -View CachedSurface::CreateView(const ViewParams& view_key) { - return CreateViewInner(view_key, false); +void TextureCacheRuntime::AccelerateImageUpload(Image& image, const ImageBufferMap& map, + std::span<const SwizzleParameters> swizzles) { + switch (image.info.type) { + case ImageType::e2D: + return util_shaders.BlockLinearUpload2D(image, map, swizzles); + case ImageType::e3D: + return util_shaders.BlockLinearUpload3D(image, map, swizzles); + case ImageType::Linear: + return util_shaders.PitchUpload(image, map, swizzles); + default: + UNREACHABLE(); + break; + } } -View CachedSurface::CreateViewInner(const ViewParams& view_key, const bool is_proxy) { - auto view = std::make_shared<CachedSurfaceView>(*this, view_key, is_proxy); - views[view_key] = view; - if (!is_proxy) - view->DecorateViewName(gpu_addr, params.TargetName() + "V:" + std::to_string(view_count++)); - return view; +void TextureCacheRuntime::InsertUploadMemoryBarrier() { + glMemoryBarrier(GL_TEXTURE_FETCH_BARRIER_BIT | GL_SHADER_IMAGE_ACCESS_BARRIER_BIT); } -CachedSurfaceView::CachedSurfaceView(CachedSurface& surface, const ViewParams& params, - bool is_proxy) - : VideoCommon::ViewBase(params), surface{surface}, format{surface.internal_format}, - target{GetTextureTarget(params.target)}, is_proxy{is_proxy} { - if (!is_proxy) { - main_view = CreateTextureView(); +FormatProperties TextureCacheRuntime::FormatInfo(ImageType type, GLenum internal_format) const { + switch (type) { + case ImageType::e1D: + return format_properties[0].at(internal_format); + case ImageType::e2D: + case ImageType::Linear: + return format_properties[1].at(internal_format); + case ImageType::e3D: + return format_properties[2].at(internal_format); + default: + UNREACHABLE(); + return FormatProperties{}; } } -CachedSurfaceView::~CachedSurfaceView() = default; +TextureCacheRuntime::StagingBuffers::StagingBuffers(GLenum storage_flags_, GLenum map_flags_) + : storage_flags{storage_flags_}, map_flags{map_flags_} {} -void CachedSurfaceView::Attach(GLenum attachment, GLenum fb_target) const { - ASSERT(params.num_levels == 1); +TextureCacheRuntime::StagingBuffers::~StagingBuffers() = default; - if (params.target == SurfaceTarget::Texture3D) { - if (params.num_layers > 1) { - ASSERT(params.base_layer == 0); - glFramebufferTexture(fb_target, attachment, surface.texture.handle, params.base_level); - } else { - glFramebufferTexture3D(fb_target, attachment, target, surface.texture.handle, - params.base_level, params.base_layer); - } - return; +ImageBufferMap TextureCacheRuntime::StagingBuffers::RequestMap(size_t requested_size, + bool insert_fence) { + const size_t index = RequestBuffer(requested_size); + OGLSync* const sync = insert_fence ? &syncs[index] : nullptr; + return ImageBufferMap{ + .mapped_span = std::span(maps[index], requested_size), + .sync = sync, + .buffer = buffers[index].handle, + }; +} + +size_t TextureCacheRuntime::StagingBuffers::RequestBuffer(size_t requested_size) { + if (const std::optional<size_t> index = FindBuffer(requested_size); index) { + return *index; } - if (params.num_layers > 1) { - UNIMPLEMENTED_IF(params.base_layer != 0); - glFramebufferTexture(fb_target, attachment, GetTexture(), 0); - return; + OGLBuffer& buffer = buffers.emplace_back(); + buffer.Create(); + glNamedBufferStorage(buffer.handle, requested_size, nullptr, + storage_flags | GL_MAP_PERSISTENT_BIT); + maps.push_back(static_cast<u8*>(glMapNamedBufferRange(buffer.handle, 0, requested_size, + map_flags | GL_MAP_PERSISTENT_BIT))); + + syncs.emplace_back(); + sizes.push_back(requested_size); + + ASSERT(syncs.size() == buffers.size() && buffers.size() == maps.size() && + maps.size() == sizes.size()); + + return buffers.size() - 1; +} + +std::optional<size_t> TextureCacheRuntime::StagingBuffers::FindBuffer(size_t requested_size) { + size_t smallest_buffer = std::numeric_limits<size_t>::max(); + std::optional<size_t> found; + const size_t num_buffers = sizes.size(); + for (size_t index = 0; index < num_buffers; ++index) { + const size_t buffer_size = sizes[index]; + if (buffer_size < requested_size || buffer_size >= smallest_buffer) { + continue; + } + if (syncs[index].handle != 0) { + GLint status; + glGetSynciv(syncs[index].handle, GL_SYNC_STATUS, 1, nullptr, &status); + if (status != GL_SIGNALED) { + continue; + } + syncs[index].Release(); + } + smallest_buffer = buffer_size; + found = index; } + return found; +} - const GLenum view_target = surface.GetTarget(); - const GLuint texture = surface.GetTexture(); - switch (surface.GetSurfaceParams().target) { - case SurfaceTarget::Texture1D: - glFramebufferTexture1D(fb_target, attachment, view_target, texture, params.base_level); +Image::Image(TextureCacheRuntime& runtime, const VideoCommon::ImageInfo& info_, GPUVAddr gpu_addr_, + VAddr cpu_addr_) + : VideoCommon::ImageBase(info_, gpu_addr_, cpu_addr_) { + if (CanBeAccelerated(runtime, info)) { + flags |= ImageFlagBits::AcceleratedUpload; + } + if (IsConverted(runtime.device, info.format, info.type)) { + flags |= ImageFlagBits::Converted; + gl_internal_format = IsPixelFormatSRGB(info.format) ? GL_SRGB8_ALPHA8 : GL_RGBA8; + gl_format = GL_RGBA; + gl_type = GL_UNSIGNED_INT_8_8_8_8_REV; + } else { + const auto& tuple = GetFormatTuple(info.format); + gl_internal_format = tuple.internal_format; + gl_format = tuple.format; + gl_type = tuple.type; + } + const GLenum target = ImageTarget(info); + const GLsizei width = info.size.width; + const GLsizei height = info.size.height; + const GLsizei depth = info.size.depth; + const int max_host_mip_levels = std::bit_width(info.size.width); + const GLsizei num_levels = std::min(info.resources.levels, max_host_mip_levels); + const GLsizei num_layers = info.resources.layers; + const GLsizei num_samples = info.num_samples; + + GLuint handle = 0; + if (target != GL_TEXTURE_BUFFER) { + texture.Create(target); + handle = texture.handle; + } + switch (target) { + case GL_TEXTURE_1D_ARRAY: + glTextureStorage2D(handle, num_levels, gl_internal_format, width, num_layers); + break; + case GL_TEXTURE_2D_ARRAY: + glTextureStorage3D(handle, num_levels, gl_internal_format, width, height, num_layers); break; - case SurfaceTarget::Texture2D: - glFramebufferTexture2D(fb_target, attachment, view_target, texture, params.base_level); + case GL_TEXTURE_2D_MULTISAMPLE_ARRAY: { + // TODO: Where should 'fixedsamplelocations' come from? + const auto [samples_x, samples_y] = SamplesLog2(info.num_samples); + glTextureStorage3DMultisample(handle, num_samples, gl_internal_format, width >> samples_x, + height >> samples_y, num_layers, GL_FALSE); + break; + } + case GL_TEXTURE_RECTANGLE: + glTextureStorage2D(handle, num_levels, gl_internal_format, width, height); break; - case SurfaceTarget::Texture1DArray: - case SurfaceTarget::Texture2DArray: - case SurfaceTarget::TextureCubemap: - case SurfaceTarget::TextureCubeArray: - glFramebufferTextureLayer(fb_target, attachment, texture, params.base_level, - params.base_layer); + case GL_TEXTURE_3D: + glTextureStorage3D(handle, num_levels, gl_internal_format, width, height, depth); + break; + case GL_TEXTURE_BUFFER: + buffer.Create(); + glNamedBufferStorage(buffer.handle, guest_size_bytes, nullptr, 0); break; default: - UNIMPLEMENTED(); + UNREACHABLE_MSG("Invalid target=0x{:x}", target); + break; + } + if (runtime.device.HasDebuggingToolAttached()) { + const std::string name = VideoCommon::Name(*this); + glObjectLabel(target == GL_TEXTURE_BUFFER ? GL_BUFFER : GL_TEXTURE, handle, + static_cast<GLsizei>(name.size()), name.data()); } } -GLuint CachedSurfaceView::GetTexture(SwizzleSource x_source, SwizzleSource y_source, - SwizzleSource z_source, SwizzleSource w_source) { - if (GetSurfaceParams().IsBuffer()) { - return GetTexture(); - } - const u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source); - if (current_swizzle == new_swizzle) { - return current_view; - } - current_swizzle = new_swizzle; +void Image::UploadMemory(const ImageBufferMap& map, + std::span<const VideoCommon::BufferImageCopy> copies) { + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, map.buffer); + glFlushMappedBufferRange(GL_PIXEL_UNPACK_BUFFER, map.offset, unswizzled_size_bytes); - const auto [entry, is_cache_miss] = view_cache.try_emplace(new_swizzle); - OGLTextureView& view = entry->second; - if (!is_cache_miss) { - current_view = view.handle; - return view.handle; - } - view = CreateTextureView(); - current_view = view.handle; + glPixelStorei(GL_UNPACK_ALIGNMENT, 1); - std::array swizzle{x_source, y_source, z_source, w_source}; + u32 current_row_length = std::numeric_limits<u32>::max(); + u32 current_image_height = std::numeric_limits<u32>::max(); - switch (const PixelFormat format = GetSurfaceParams().pixel_format) { - case PixelFormat::D24_UNORM_S8_UINT: - case PixelFormat::D32_FLOAT_S8_UINT: - case PixelFormat::S8_UINT_D24_UNORM: - UNIMPLEMENTED_IF(x_source != SwizzleSource::R && x_source != SwizzleSource::G); - glTextureParameteri(view.handle, GL_DEPTH_STENCIL_TEXTURE_MODE, - GetComponent(format, x_source == SwizzleSource::R)); - - // Make sure we sample the first component - std::transform(swizzle.begin(), swizzle.end(), swizzle.begin(), [](SwizzleSource value) { - return value == SwizzleSource::G ? SwizzleSource::R : value; - }); - [[fallthrough]]; - default: { - const std::array gl_swizzle = {GetSwizzleSource(swizzle[0]), GetSwizzleSource(swizzle[1]), - GetSwizzleSource(swizzle[2]), GetSwizzleSource(swizzle[3])}; - glTextureParameteriv(view.handle, GL_TEXTURE_SWIZZLE_RGBA, gl_swizzle.data()); - break; - } + for (const VideoCommon::BufferImageCopy& copy : copies) { + if (current_row_length != copy.buffer_row_length) { + current_row_length = copy.buffer_row_length; + glPixelStorei(GL_UNPACK_ROW_LENGTH, current_row_length); + } + if (current_image_height != copy.buffer_image_height) { + current_image_height = copy.buffer_image_height; + glPixelStorei(GL_UNPACK_IMAGE_HEIGHT, current_image_height); + } + CopyBufferToImage(copy, map.offset); } - return view.handle; } -OGLTextureView CachedSurfaceView::CreateTextureView() const { - OGLTextureView texture_view; - texture_view.Create(); - - if (target == GL_TEXTURE_3D) { - glTextureView(texture_view.handle, target, surface.texture.handle, format, - params.base_level, params.num_levels, 0, 1); - } else { - glTextureView(texture_view.handle, target, surface.texture.handle, format, - params.base_level, params.num_levels, params.base_layer, params.num_layers); +void Image::UploadMemory(const ImageBufferMap& map, + std::span<const VideoCommon::BufferCopy> copies) { + for (const VideoCommon::BufferCopy& copy : copies) { + glCopyNamedBufferSubData(map.buffer, buffer.handle, copy.src_offset + map.offset, + copy.dst_offset, copy.size); } - ApplyTextureDefaults(surface.GetSurfaceParams(), texture_view.handle); - - return texture_view; } -TextureCacheOpenGL::TextureCacheOpenGL(Core::System& system, - VideoCore::RasterizerInterface& rasterizer, - const Device& device, StateTracker& state_tracker) - : TextureCacheBase{system, rasterizer, device.HasASTC()}, state_tracker{state_tracker} { - src_framebuffer.Create(); - dst_framebuffer.Create(); -} +void Image::DownloadMemory(ImageBufferMap& map, + std::span<const VideoCommon::BufferImageCopy> copies) { + glMemoryBarrier(GL_PIXEL_BUFFER_BARRIER_BIT); // TODO: Move this to its own API -TextureCacheOpenGL::~TextureCacheOpenGL() = default; + glBindBuffer(GL_PIXEL_PACK_BUFFER, map.buffer); + glPixelStorei(GL_PACK_ALIGNMENT, 1); -Surface TextureCacheOpenGL::CreateSurface(GPUVAddr gpu_addr, const SurfaceParams& params) { - return std::make_shared<CachedSurface>(gpu_addr, params, is_astc_supported); -} + u32 current_row_length = std::numeric_limits<u32>::max(); + u32 current_image_height = std::numeric_limits<u32>::max(); -void TextureCacheOpenGL::ImageCopy(Surface& src_surface, Surface& dst_surface, - const VideoCommon::CopyParams& copy_params) { - const auto& src_params = src_surface->GetSurfaceParams(); - const auto& dst_params = dst_surface->GetSurfaceParams(); - if (src_params.type != dst_params.type) { - // A fallback is needed - return; + for (const VideoCommon::BufferImageCopy& copy : copies) { + if (current_row_length != copy.buffer_row_length) { + current_row_length = copy.buffer_row_length; + glPixelStorei(GL_PACK_ROW_LENGTH, current_row_length); + } + if (current_image_height != copy.buffer_image_height) { + current_image_height = copy.buffer_image_height; + glPixelStorei(GL_PACK_IMAGE_HEIGHT, current_image_height); + } + CopyImageToBuffer(copy, map.offset); } - const auto src_handle = src_surface->GetTexture(); - const auto src_target = src_surface->GetTarget(); - const auto dst_handle = dst_surface->GetTexture(); - const auto dst_target = dst_surface->GetTarget(); - glCopyImageSubData(src_handle, src_target, copy_params.source_level, copy_params.source_x, - copy_params.source_y, copy_params.source_z, dst_handle, dst_target, - copy_params.dest_level, copy_params.dest_x, copy_params.dest_y, - copy_params.dest_z, copy_params.width, copy_params.height, - copy_params.depth); } -void TextureCacheOpenGL::ImageBlit(View& src_view, View& dst_view, - const Tegra::Engines::Fermi2D::Config& copy_config) { - const auto& src_params{src_view->GetSurfaceParams()}; - const auto& dst_params{dst_view->GetSurfaceParams()}; - UNIMPLEMENTED_IF(src_params.depth != 1); - UNIMPLEMENTED_IF(dst_params.depth != 1); - - state_tracker.NotifyScissor0(); - state_tracker.NotifyFramebuffer(); - state_tracker.NotifyRasterizeEnable(); - state_tracker.NotifyFramebufferSRGB(); - - if (dst_params.srgb_conversion) { - glEnable(GL_FRAMEBUFFER_SRGB); - } else { - glDisable(GL_FRAMEBUFFER_SRGB); +GLuint Image::StorageHandle() noexcept { + switch (info.format) { + case PixelFormat::A8B8G8R8_SRGB: + case PixelFormat::B8G8R8A8_SRGB: + case PixelFormat::BC1_RGBA_SRGB: + case PixelFormat::BC2_SRGB: + case PixelFormat::BC3_SRGB: + case PixelFormat::BC7_SRGB: + case PixelFormat::ASTC_2D_4X4_SRGB: + case PixelFormat::ASTC_2D_8X8_SRGB: + case PixelFormat::ASTC_2D_8X5_SRGB: + case PixelFormat::ASTC_2D_5X4_SRGB: + case PixelFormat::ASTC_2D_5X5_SRGB: + case PixelFormat::ASTC_2D_10X8_SRGB: + case PixelFormat::ASTC_2D_6X6_SRGB: + case PixelFormat::ASTC_2D_10X10_SRGB: + case PixelFormat::ASTC_2D_12X12_SRGB: + case PixelFormat::ASTC_2D_8X6_SRGB: + case PixelFormat::ASTC_2D_6X5_SRGB: + if (store_view.handle != 0) { + return store_view.handle; + } + store_view.Create(); + glTextureView(store_view.handle, ImageTarget(info), texture.handle, GL_RGBA8, 0, + info.resources.levels, 0, info.resources.layers); + return store_view.handle; + default: + return texture.handle; } - glDisable(GL_RASTERIZER_DISCARD); - glDisablei(GL_SCISSOR_TEST, 0); - - glBindFramebuffer(GL_READ_FRAMEBUFFER, src_framebuffer.handle); - glBindFramebuffer(GL_DRAW_FRAMEBUFFER, dst_framebuffer.handle); - - GLenum buffers = 0; - if (src_params.type == SurfaceType::ColorTexture) { - src_view->Attach(GL_COLOR_ATTACHMENT0, GL_READ_FRAMEBUFFER); - glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, - 0); +} - dst_view->Attach(GL_COLOR_ATTACHMENT0, GL_DRAW_FRAMEBUFFER); - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, - 0); +void Image::CopyBufferToImage(const VideoCommon::BufferImageCopy& copy, size_t buffer_offset) { + // Compressed formats don't have a pixel format or type + const bool is_compressed = gl_format == GL_NONE; + const void* const offset = reinterpret_cast<const void*>(copy.buffer_offset + buffer_offset); - buffers = GL_COLOR_BUFFER_BIT; - } else if (src_params.type == SurfaceType::Depth) { - glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0); - src_view->Attach(GL_DEPTH_ATTACHMENT, GL_READ_FRAMEBUFFER); - glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0); + switch (info.type) { + case ImageType::e1D: + if (is_compressed) { + glCompressedTextureSubImage2D(texture.handle, copy.image_subresource.base_level, + copy.image_offset.x, copy.image_subresource.base_layer, + copy.image_extent.width, + copy.image_subresource.num_layers, gl_internal_format, + static_cast<GLsizei>(copy.buffer_size), offset); + } else { + glTextureSubImage2D(texture.handle, copy.image_subresource.base_level, + copy.image_offset.x, copy.image_subresource.base_layer, + copy.image_extent.width, copy.image_subresource.num_layers, + gl_format, gl_type, offset); + } + break; + case ImageType::e2D: + case ImageType::Linear: + if (is_compressed) { + glCompressedTextureSubImage3D( + texture.handle, copy.image_subresource.base_level, copy.image_offset.x, + copy.image_offset.y, copy.image_subresource.base_layer, copy.image_extent.width, + copy.image_extent.height, copy.image_subresource.num_layers, gl_internal_format, + static_cast<GLsizei>(copy.buffer_size), offset); + } else { + glTextureSubImage3D(texture.handle, copy.image_subresource.base_level, + copy.image_offset.x, copy.image_offset.y, + copy.image_subresource.base_layer, copy.image_extent.width, + copy.image_extent.height, copy.image_subresource.num_layers, + gl_format, gl_type, offset); + } + break; + case ImageType::e3D: + if (is_compressed) { + glCompressedTextureSubImage3D( + texture.handle, copy.image_subresource.base_level, copy.image_offset.x, + copy.image_offset.y, copy.image_offset.z, copy.image_extent.width, + copy.image_extent.height, copy.image_extent.depth, gl_internal_format, + static_cast<GLsizei>(copy.buffer_size), offset); + } else { + glTextureSubImage3D(texture.handle, copy.image_subresource.base_level, + copy.image_offset.x, copy.image_offset.y, copy.image_offset.z, + copy.image_extent.width, copy.image_extent.height, + copy.image_extent.depth, gl_format, gl_type, offset); + } + break; + default: + UNREACHABLE(); + } +} - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0); - dst_view->Attach(GL_DEPTH_ATTACHMENT, GL_DRAW_FRAMEBUFFER); - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0); +void Image::CopyImageToBuffer(const VideoCommon::BufferImageCopy& copy, size_t buffer_offset) { + const GLint x_offset = copy.image_offset.x; + const GLsizei width = copy.image_extent.width; - buffers = GL_DEPTH_BUFFER_BIT; - } else if (src_params.type == SurfaceType::DepthStencil) { - glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0); - src_view->Attach(GL_DEPTH_STENCIL_ATTACHMENT, GL_READ_FRAMEBUFFER); + const GLint level = copy.image_subresource.base_level; + const GLsizei buffer_size = static_cast<GLsizei>(copy.buffer_size); + void* const offset = reinterpret_cast<void*>(copy.buffer_offset + buffer_offset); - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0); - dst_view->Attach(GL_DEPTH_STENCIL_ATTACHMENT, GL_DRAW_FRAMEBUFFER); + GLint y_offset = 0; + GLint z_offset = 0; + GLsizei height = 1; + GLsizei depth = 1; - buffers = GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT; + switch (info.type) { + case ImageType::e1D: + y_offset = copy.image_subresource.base_layer; + height = copy.image_subresource.num_layers; + break; + case ImageType::e2D: + case ImageType::Linear: + y_offset = copy.image_offset.y; + z_offset = copy.image_subresource.base_layer; + height = copy.image_extent.height; + depth = copy.image_subresource.num_layers; + break; + case ImageType::e3D: + y_offset = copy.image_offset.y; + z_offset = copy.image_offset.z; + height = copy.image_extent.height; + depth = copy.image_extent.depth; + break; + default: + UNREACHABLE(); + } + // Compressed formats don't have a pixel format or type + const bool is_compressed = gl_format == GL_NONE; + if (is_compressed) { + glGetCompressedTextureSubImage(texture.handle, level, x_offset, y_offset, z_offset, width, + height, depth, buffer_size, offset); + } else { + glGetTextureSubImage(texture.handle, level, x_offset, y_offset, z_offset, width, height, + depth, gl_format, gl_type, buffer_size, offset); } - - const Common::Rectangle<u32>& src_rect = copy_config.src_rect; - const Common::Rectangle<u32>& dst_rect = copy_config.dst_rect; - const bool is_linear = copy_config.filter == Tegra::Engines::Fermi2D::Filter::Linear; - - glBlitFramebuffer(static_cast<GLint>(src_rect.left), static_cast<GLint>(src_rect.top), - static_cast<GLint>(src_rect.right), static_cast<GLint>(src_rect.bottom), - static_cast<GLint>(dst_rect.left), static_cast<GLint>(dst_rect.top), - static_cast<GLint>(dst_rect.right), static_cast<GLint>(dst_rect.bottom), - buffers, - is_linear && (buffers == GL_COLOR_BUFFER_BIT) ? GL_LINEAR : GL_NEAREST); } -void TextureCacheOpenGL::BufferCopy(Surface& src_surface, Surface& dst_surface) { - MICROPROFILE_SCOPE(OpenGL_Texture_Buffer_Copy); - const auto& src_params = src_surface->GetSurfaceParams(); - const auto& dst_params = dst_surface->GetSurfaceParams(); - UNIMPLEMENTED_IF(src_params.num_levels > 1 || dst_params.num_levels > 1); +ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewInfo& info, + ImageId image_id_, Image& image) + : VideoCommon::ImageViewBase{info, image.info, image_id_}, views{runtime.null_image_views} { + const Device& device = runtime.device; + if (True(image.flags & ImageFlagBits::Converted)) { + internal_format = IsPixelFormatSRGB(info.format) ? GL_SRGB8_ALPHA8 : GL_RGBA8; + } else { + internal_format = GetFormatTuple(format).internal_format; + } + VideoCommon::SubresourceRange flatten_range = info.range; + std::array<GLuint, 2> handles; + stored_views.reserve(2); - const auto source_format = GetFormatTuple(src_params.pixel_format); - const auto dest_format = GetFormatTuple(dst_params.pixel_format); + switch (info.type) { + case ImageViewType::e1DArray: + flatten_range.extent.layers = 1; + [[fallthrough]]; + case ImageViewType::e1D: + glGenTextures(2, handles.data()); + SetupView(device, image, ImageViewType::e1D, handles[0], info, flatten_range); + SetupView(device, image, ImageViewType::e1DArray, handles[1], info, info.range); + break; + case ImageViewType::e2DArray: + flatten_range.extent.layers = 1; + [[fallthrough]]; + case ImageViewType::e2D: + if (True(flags & VideoCommon::ImageViewFlagBits::Slice)) { + // 2D and 2D array views on a 3D textures are used exclusively for render targets + ASSERT(info.range.extent.levels == 1); + const VideoCommon::SubresourceRange slice_range{ + .base = {.level = info.range.base.level, .layer = 0}, + .extent = {.levels = 1, .layers = 1}, + }; + glGenTextures(1, handles.data()); + SetupView(device, image, ImageViewType::e3D, handles[0], info, slice_range); + break; + } + glGenTextures(2, handles.data()); + SetupView(device, image, ImageViewType::e2D, handles[0], info, flatten_range); + SetupView(device, image, ImageViewType::e2DArray, handles[1], info, info.range); + break; + case ImageViewType::e3D: + glGenTextures(1, handles.data()); + SetupView(device, image, ImageViewType::e3D, handles[0], info, info.range); + break; + case ImageViewType::CubeArray: + flatten_range.extent.layers = 6; + [[fallthrough]]; + case ImageViewType::Cube: + glGenTextures(2, handles.data()); + SetupView(device, image, ImageViewType::Cube, handles[0], info, flatten_range); + SetupView(device, image, ImageViewType::CubeArray, handles[1], info, info.range); + break; + case ImageViewType::Rect: + glGenTextures(1, handles.data()); + SetupView(device, image, ImageViewType::Rect, handles[0], info, info.range); + break; + case ImageViewType::Buffer: + glCreateTextures(GL_TEXTURE_BUFFER, 1, handles.data()); + SetupView(device, image, ImageViewType::Buffer, handles[0], info, info.range); + break; + } + default_handle = Handle(info.type); +} - const std::size_t source_size = src_surface->GetHostSizeInBytes(); - const std::size_t dest_size = dst_surface->GetHostSizeInBytes(); +ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::NullImageParams& params) + : VideoCommon::ImageViewBase{params}, views{runtime.null_image_views} {} - const std::size_t buffer_size = std::max(source_size, dest_size); +void ImageView::SetupView(const Device& device, Image& image, ImageViewType view_type, + GLuint handle, const VideoCommon::ImageViewInfo& info, + VideoCommon::SubresourceRange view_range) { + if (info.type == ImageViewType::Buffer) { + // TODO: Take offset from buffer cache + glTextureBufferRange(handle, internal_format, image.buffer.handle, 0, + image.guest_size_bytes); + } else { + const GLuint parent = image.texture.handle; + const GLenum target = ImageTarget(view_type, image.info.num_samples); + glTextureView(handle, target, parent, internal_format, view_range.base.level, + view_range.extent.levels, view_range.base.layer, view_range.extent.layers); + if (!info.IsRenderTarget()) { + ApplySwizzle(handle, format, info.Swizzle()); + } + } + if (device.HasDebuggingToolAttached()) { + const std::string name = VideoCommon::Name(*this, view_type); + glObjectLabel(GL_TEXTURE, handle, static_cast<GLsizei>(name.size()), name.data()); + } + stored_views.emplace_back().handle = handle; + views[static_cast<size_t>(view_type)] = handle; +} - GLuint copy_pbo_handle = FetchPBO(buffer_size); +Sampler::Sampler(TextureCacheRuntime& runtime, const TSCEntry& config) { + const GLenum compare_mode = config.depth_compare_enabled ? GL_COMPARE_REF_TO_TEXTURE : GL_NONE; + const GLenum compare_func = MaxwellToGL::DepthCompareFunc(config.depth_compare_func); + const GLenum mag = MaxwellToGL::TextureFilterMode(config.mag_filter, TextureMipmapFilter::None); + const GLenum min = MaxwellToGL::TextureFilterMode(config.min_filter, config.mipmap_filter); + const GLenum reduction_filter = MaxwellToGL::ReductionFilter(config.reduction_filter); + const GLint seamless = config.cubemap_interface_filtering ? GL_TRUE : GL_FALSE; + + UNIMPLEMENTED_IF(config.cubemap_anisotropy != 1); + UNIMPLEMENTED_IF(config.float_coord_normalization != 0); + + sampler.Create(); + const GLuint handle = sampler.handle; + glSamplerParameteri(handle, GL_TEXTURE_WRAP_S, MaxwellToGL::WrapMode(config.wrap_u)); + glSamplerParameteri(handle, GL_TEXTURE_WRAP_T, MaxwellToGL::WrapMode(config.wrap_v)); + glSamplerParameteri(handle, GL_TEXTURE_WRAP_R, MaxwellToGL::WrapMode(config.wrap_p)); + glSamplerParameteri(handle, GL_TEXTURE_COMPARE_MODE, compare_mode); + glSamplerParameteri(handle, GL_TEXTURE_COMPARE_FUNC, compare_func); + glSamplerParameteri(handle, GL_TEXTURE_MAG_FILTER, mag); + glSamplerParameteri(handle, GL_TEXTURE_MIN_FILTER, min); + glSamplerParameterf(handle, GL_TEXTURE_LOD_BIAS, config.LodBias()); + glSamplerParameterf(handle, GL_TEXTURE_MIN_LOD, config.MinLod()); + glSamplerParameterf(handle, GL_TEXTURE_MAX_LOD, config.MaxLod()); + glSamplerParameterfv(handle, GL_TEXTURE_BORDER_COLOR, config.BorderColor().data()); + + if (GLAD_GL_ARB_texture_filter_anisotropic || GLAD_GL_EXT_texture_filter_anisotropic) { + glSamplerParameterf(handle, GL_TEXTURE_MAX_ANISOTROPY, config.MaxAnisotropy()); + } else { + LOG_WARNING(Render_OpenGL, "GL_ARB_texture_filter_anisotropic is required"); + } + if (GLAD_GL_ARB_texture_filter_minmax || GLAD_GL_EXT_texture_filter_minmax) { + glSamplerParameteri(handle, GL_TEXTURE_REDUCTION_MODE_ARB, reduction_filter); + } else if (reduction_filter != GL_WEIGHTED_AVERAGE_ARB) { + LOG_WARNING(Render_OpenGL, "GL_ARB_texture_filter_minmax is required"); + } + if (GLAD_GL_ARB_seamless_cubemap_per_texture || GLAD_GL_AMD_seamless_cubemap_per_texture) { + glSamplerParameteri(handle, GL_TEXTURE_CUBE_MAP_SEAMLESS, seamless); + } else if (seamless == GL_FALSE) { + // We default to false because it's more common + LOG_WARNING(Render_OpenGL, "GL_ARB_seamless_cubemap_per_texture is required"); + } +} - glBindBuffer(GL_PIXEL_PACK_BUFFER, copy_pbo_handle); +Framebuffer::Framebuffer(TextureCacheRuntime& runtime, std::span<ImageView*, NUM_RT> color_buffers, + ImageView* depth_buffer, const VideoCommon::RenderTargets& key) { + // Bind to READ_FRAMEBUFFER to stop Nvidia's driver from creating an EXT_framebuffer instead of + // a core framebuffer. EXT framebuffer attachments have to match in size and can be shared + // across contexts. yuzu doesn't share framebuffers across contexts and we need attachments with + // mismatching size, this is why core framebuffers are preferred. + GLuint handle; + glGenFramebuffers(1, &handle); + glBindFramebuffer(GL_READ_FRAMEBUFFER, handle); + + GLsizei num_buffers = 0; + std::array<GLenum, NUM_RT> gl_draw_buffers; + gl_draw_buffers.fill(GL_NONE); + + for (size_t index = 0; index < color_buffers.size(); ++index) { + const ImageView* const image_view = color_buffers[index]; + if (!image_view) { + continue; + } + buffer_bits |= GL_COLOR_BUFFER_BIT; + gl_draw_buffers[index] = GL_COLOR_ATTACHMENT0 + key.draw_buffers[index]; + num_buffers = static_cast<GLsizei>(index + 1); - if (src_surface->IsCompressed()) { - glGetCompressedTextureImage(src_surface->GetTexture(), 0, static_cast<GLsizei>(source_size), - nullptr); - } else { - glGetTextureImage(src_surface->GetTexture(), 0, source_format.format, source_format.type, - static_cast<GLsizei>(source_size), nullptr); + const GLenum attachment = static_cast<GLenum>(GL_COLOR_ATTACHMENT0 + index); + AttachTexture(handle, attachment, image_view); } - glBindBuffer(GL_PIXEL_PACK_BUFFER, 0); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, copy_pbo_handle); + if (const ImageView* const image_view = depth_buffer; image_view) { + if (GetFormatType(image_view->format) == SurfaceType::DepthStencil) { + buffer_bits |= GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT; + } else { + buffer_bits |= GL_DEPTH_BUFFER_BIT; + } + const GLenum attachment = AttachmentType(image_view->format); + AttachTexture(handle, attachment, image_view); + } - const GLsizei width = static_cast<GLsizei>(dst_params.width); - const GLsizei height = static_cast<GLsizei>(dst_params.height); - const GLsizei depth = static_cast<GLsizei>(dst_params.depth); - if (dst_surface->IsCompressed()) { - LOG_CRITICAL(HW_GPU, "Compressed buffer copy is unimplemented!"); - UNREACHABLE(); + if (num_buffers > 1) { + glNamedFramebufferDrawBuffers(handle, num_buffers, gl_draw_buffers.data()); + } else if (num_buffers > 0) { + glNamedFramebufferDrawBuffer(handle, gl_draw_buffers[0]); } else { - switch (dst_params.target) { - case SurfaceTarget::Texture1D: - glTextureSubImage1D(dst_surface->GetTexture(), 0, 0, width, dest_format.format, - dest_format.type, nullptr); - break; - case SurfaceTarget::Texture2D: - glTextureSubImage2D(dst_surface->GetTexture(), 0, 0, 0, width, height, - dest_format.format, dest_format.type, nullptr); - break; - case SurfaceTarget::Texture3D: - case SurfaceTarget::Texture2DArray: - case SurfaceTarget::TextureCubeArray: - glTextureSubImage3D(dst_surface->GetTexture(), 0, 0, 0, 0, width, height, depth, - dest_format.format, dest_format.type, nullptr); - break; - case SurfaceTarget::TextureCubemap: - glTextureSubImage3D(dst_surface->GetTexture(), 0, 0, 0, 0, width, height, depth, - dest_format.format, dest_format.type, nullptr); - break; - default: - LOG_CRITICAL(Render_OpenGL, "Unimplemented surface target={}", - static_cast<u32>(dst_params.target)); - UNREACHABLE(); - } + glNamedFramebufferDrawBuffer(handle, GL_NONE); } - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - glTextureBarrier(); -} + glNamedFramebufferParameteri(handle, GL_FRAMEBUFFER_DEFAULT_WIDTH, key.size.width); + glNamedFramebufferParameteri(handle, GL_FRAMEBUFFER_DEFAULT_HEIGHT, key.size.height); + // TODO + // glNamedFramebufferParameteri(handle, GL_FRAMEBUFFER_DEFAULT_LAYERS, ...); + // glNamedFramebufferParameteri(handle, GL_FRAMEBUFFER_DEFAULT_SAMPLES, ...); + // glNamedFramebufferParameteri(handle, GL_FRAMEBUFFER_DEFAULT_FIXED_SAMPLE_LOCATIONS, ...); -GLuint TextureCacheOpenGL::FetchPBO(std::size_t buffer_size) { - ASSERT_OR_EXECUTE(buffer_size > 0, { return 0; }); - const u32 l2 = Common::Log2Ceil64(static_cast<u64>(buffer_size)); - OGLBuffer& cp = copy_pbo_cache[l2]; - if (cp.handle == 0) { - const std::size_t ceil_size = 1ULL << l2; - cp.Create(); - cp.MakeStreamCopy(ceil_size); + if (runtime.device.HasDebuggingToolAttached()) { + const std::string name = VideoCommon::Name(key); + glObjectLabel(GL_FRAMEBUFFER, handle, static_cast<GLsizei>(name.size()), name.data()); } - return cp.handle; + framebuffer.handle = handle; } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index de8f18489..a6172f009 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -4,154 +4,241 @@ #pragma once -#include <array> -#include <functional> #include <memory> -#include <unordered_map> -#include <utility> -#include <vector> +#include <span> #include <glad/glad.h> -#include "common/common_types.h" -#include "video_core/engines/shader_bytecode.h" -#include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_resource_manager.h" +#include "video_core/renderer_opengl/util_shaders.h" #include "video_core/texture_cache/texture_cache.h" namespace OpenGL { -using VideoCommon::SurfaceParams; -using VideoCommon::ViewParams; - -class CachedSurfaceView; -class CachedSurface; -class TextureCacheOpenGL; +class Device; +class ProgramManager; class StateTracker; -using Surface = std::shared_ptr<CachedSurface>; -using View = std::shared_ptr<CachedSurfaceView>; -using TextureCacheBase = VideoCommon::TextureCache<Surface, View>; +class Framebuffer; +class Image; +class ImageView; +class Sampler; + +using VideoCommon::ImageId; +using VideoCommon::ImageViewId; +using VideoCommon::ImageViewType; +using VideoCommon::NUM_RT; +using VideoCommon::Offset2D; +using VideoCommon::RenderTargets; + +struct ImageBufferMap { + ~ImageBufferMap(); + + std::span<u8> mapped_span; + size_t offset = 0; + OGLSync* sync; + GLuint buffer; +}; + +struct FormatProperties { + GLenum compatibility_class; + bool compatibility_by_size; + bool is_compressed; +}; -class CachedSurface final : public VideoCommon::SurfaceBase<View> { - friend CachedSurfaceView; +class TextureCacheRuntime { + friend Framebuffer; + friend Image; + friend ImageView; + friend Sampler; public: - explicit CachedSurface(GPUVAddr gpu_addr, const SurfaceParams& params, bool is_astc_supported); - ~CachedSurface(); + explicit TextureCacheRuntime(const Device& device, ProgramManager& program_manager, + StateTracker& state_tracker); + ~TextureCacheRuntime(); - void UploadTexture(const std::vector<u8>& staging_buffer) override; - void DownloadTexture(std::vector<u8>& staging_buffer) override; + void Finish(); - GLenum GetTarget() const { - return target; - } + ImageBufferMap UploadStagingBuffer(size_t size); - GLuint GetTexture() const { - return texture.handle; - } + ImageBufferMap DownloadStagingBuffer(size_t size); + + void CopyImage(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies); - bool IsCompressed() const { - return is_compressed; + void ConvertImage(Framebuffer* dst, ImageView& dst_view, ImageView& src_view) { + UNIMPLEMENTED(); } -protected: - void DecorateSurfaceName() override; + bool CanImageBeCopied(const Image& dst, const Image& src); + + void EmulateCopyImage(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies); + + void BlitFramebuffer(Framebuffer* dst, Framebuffer* src, + const std::array<Offset2D, 2>& dst_region, + const std::array<Offset2D, 2>& src_region, + Tegra::Engines::Fermi2D::Filter filter, + Tegra::Engines::Fermi2D::Operation operation); + + void AccelerateImageUpload(Image& image, const ImageBufferMap& map, + std::span<const VideoCommon::SwizzleParameters> swizzles); + + void InsertUploadMemoryBarrier(); - View CreateView(const ViewParams& view_key) override; - View CreateViewInner(const ViewParams& view_key, bool is_proxy); + FormatProperties FormatInfo(VideoCommon::ImageType type, GLenum internal_format) const; + + bool HasBrokenTextureViewFormats() const noexcept { + return has_broken_texture_view_formats; + } private: - void UploadTextureMipmap(u32 level, const std::vector<u8>& staging_buffer); + struct StagingBuffers { + explicit StagingBuffers(GLenum storage_flags_, GLenum map_flags_); + ~StagingBuffers(); - GLenum internal_format{}; - GLenum format{}; - GLenum type{}; - bool is_compressed{}; - GLenum target{}; - u32 view_count{}; + ImageBufferMap RequestMap(size_t requested_size, bool insert_fence); - OGLTexture texture; - OGLBuffer texture_buffer; + size_t RequestBuffer(size_t requested_size); + + std::optional<size_t> FindBuffer(size_t requested_size); + + std::vector<OGLSync> syncs; + std::vector<OGLBuffer> buffers; + std::vector<u8*> maps; + std::vector<size_t> sizes; + GLenum storage_flags; + GLenum map_flags; + }; + + const Device& device; + StateTracker& state_tracker; + UtilShaders util_shaders; + + std::array<std::unordered_map<GLenum, FormatProperties>, 3> format_properties; + bool has_broken_texture_view_formats = false; + + StagingBuffers upload_buffers{GL_MAP_WRITE_BIT, GL_MAP_WRITE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT}; + StagingBuffers download_buffers{GL_MAP_READ_BIT, GL_MAP_READ_BIT}; + + OGLTexture null_image_1d_array; + OGLTexture null_image_cube_array; + OGLTexture null_image_3d; + OGLTexture null_image_rect; + OGLTextureView null_image_view_1d; + OGLTextureView null_image_view_2d; + OGLTextureView null_image_view_2d_array; + OGLTextureView null_image_view_cube; + + std::array<GLuint, VideoCommon::NUM_IMAGE_VIEW_TYPES> null_image_views; }; -class CachedSurfaceView final : public VideoCommon::ViewBase { +class Image : public VideoCommon::ImageBase { + friend ImageView; + public: - explicit CachedSurfaceView(CachedSurface& surface, const ViewParams& params, bool is_proxy); - ~CachedSurfaceView(); + explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr, + VAddr cpu_addr); - /// @brief Attaches this texture view to the currently bound fb_target framebuffer - /// @param attachment Attachment to bind textures to - /// @param fb_target Framebuffer target to attach to (e.g. DRAW_FRAMEBUFFER) - void Attach(GLenum attachment, GLenum fb_target) const; + void UploadMemory(const ImageBufferMap& map, + std::span<const VideoCommon::BufferImageCopy> copies); - GLuint GetTexture(Tegra::Texture::SwizzleSource x_source, - Tegra::Texture::SwizzleSource y_source, - Tegra::Texture::SwizzleSource z_source, - Tegra::Texture::SwizzleSource w_source); + void UploadMemory(const ImageBufferMap& map, std::span<const VideoCommon::BufferCopy> copies); - void DecorateViewName(GPUVAddr gpu_addr, const std::string& prefix); + void DownloadMemory(ImageBufferMap& map, std::span<const VideoCommon::BufferImageCopy> copies); - void MarkAsModified(u64 tick) { - surface.MarkAsModified(true, tick); + GLuint StorageHandle() noexcept; + + GLuint Handle() const noexcept { + return texture.handle; } - GLuint GetTexture() const { - if (is_proxy) { - return surface.GetTexture(); - } - return main_view.handle; +private: + void CopyBufferToImage(const VideoCommon::BufferImageCopy& copy, size_t buffer_offset); + + void CopyImageToBuffer(const VideoCommon::BufferImageCopy& copy, size_t buffer_offset); + + OGLTexture texture; + OGLBuffer buffer; + OGLTextureView store_view; + GLenum gl_internal_format = GL_NONE; + GLenum gl_format = GL_NONE; + GLenum gl_type = GL_NONE; +}; + +class ImageView : public VideoCommon::ImageViewBase { + friend Image; + +public: + explicit ImageView(TextureCacheRuntime&, const VideoCommon::ImageViewInfo&, ImageId, Image&); + explicit ImageView(TextureCacheRuntime&, const VideoCommon::NullImageParams&); + + [[nodiscard]] GLuint Handle(ImageViewType query_type) const noexcept { + return views[static_cast<size_t>(query_type)]; } - GLenum GetFormat() const { - return format; + [[nodiscard]] GLuint DefaultHandle() const noexcept { + return default_handle; } - const SurfaceParams& GetSurfaceParams() const { - return surface.GetSurfaceParams(); + [[nodiscard]] GLenum Format() const noexcept { + return internal_format; } private: - OGLTextureView CreateTextureView() const; + void SetupView(const Device& device, Image& image, ImageViewType view_type, GLuint handle, + const VideoCommon::ImageViewInfo& info, + VideoCommon::SubresourceRange view_range); + + std::array<GLuint, VideoCommon::NUM_IMAGE_VIEW_TYPES> views{}; + std::vector<OGLTextureView> stored_views; + GLuint default_handle = 0; + GLenum internal_format = GL_NONE; +}; - CachedSurface& surface; - const GLenum format; - const GLenum target; - const bool is_proxy; +class ImageAlloc : public VideoCommon::ImageAllocBase {}; - std::unordered_map<u32, OGLTextureView> view_cache; - OGLTextureView main_view; +class Sampler { +public: + explicit Sampler(TextureCacheRuntime&, const Tegra::Texture::TSCEntry&); + + GLuint Handle() const noexcept { + return sampler.handle; + } - // Use an invalid default so it always fails the comparison test - u32 current_swizzle = 0xffffffff; - GLuint current_view = 0; +private: + OGLSampler sampler; }; -class TextureCacheOpenGL final : public TextureCacheBase { +class Framebuffer { public: - explicit TextureCacheOpenGL(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - const Device& device, StateTracker& state_tracker); - ~TextureCacheOpenGL(); - -protected: - Surface CreateSurface(GPUVAddr gpu_addr, const SurfaceParams& params) override; - - void ImageCopy(Surface& src_surface, Surface& dst_surface, - const VideoCommon::CopyParams& copy_params) override; + explicit Framebuffer(TextureCacheRuntime&, std::span<ImageView*, NUM_RT> color_buffers, + ImageView* depth_buffer, const VideoCommon::RenderTargets& key); - void ImageBlit(View& src_view, View& dst_view, - const Tegra::Engines::Fermi2D::Config& copy_config) override; + [[nodiscard]] GLuint Handle() const noexcept { + return framebuffer.handle; + } - void BufferCopy(Surface& src_surface, Surface& dst_surface) override; + [[nodiscard]] GLbitfield BufferBits() const noexcept { + return buffer_bits; + } private: - GLuint FetchPBO(std::size_t buffer_size); - - StateTracker& state_tracker; + OGLFramebuffer framebuffer; + GLbitfield buffer_bits = GL_NONE; +}; - OGLFramebuffer src_framebuffer; - OGLFramebuffer dst_framebuffer; - std::unordered_map<u32, OGLBuffer> copy_pbo_cache; +struct TextureCacheParams { + static constexpr bool ENABLE_VALIDATION = true; + static constexpr bool FRAMEBUFFER_BLITS = true; + static constexpr bool HAS_EMULATED_COPIES = true; + + using Runtime = OpenGL::TextureCacheRuntime; + using Image = OpenGL::Image; + using ImageAlloc = OpenGL::ImageAlloc; + using ImageView = OpenGL::ImageView; + using Sampler = OpenGL::Sampler; + using Framebuffer = OpenGL::Framebuffer; }; +using TextureCache = VideoCommon::TextureCache<TextureCacheParams>; + } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h index fe9bd4b5a..f7ad8f370 100644 --- a/src/video_core/renderer_opengl/maxwell_to_gl.h +++ b/src/video_core/renderer_opengl/maxwell_to_gl.h @@ -4,23 +4,10 @@ #pragma once -#include <array> #include <glad/glad.h> -#include "common/common_types.h" -#include "common/logging/log.h" #include "video_core/engines/maxwell_3d.h" -namespace OpenGL { - -using GLvec2 = std::array<GLfloat, 2>; -using GLvec3 = std::array<GLfloat, 3>; -using GLvec4 = std::array<GLfloat, 4>; - -using GLuvec2 = std::array<GLuint, 2>; -using GLuvec3 = std::array<GLuint, 3>; -using GLuvec4 = std::array<GLuint, 4>; - -namespace MaxwellToGL { +namespace OpenGL::MaxwellToGL { using Maxwell = Tegra::Engines::Maxwell3D::Regs; @@ -47,6 +34,8 @@ inline GLenum VertexFormat(Maxwell::VertexAttribute attrib) { return GL_UNSIGNED_INT; case Maxwell::VertexAttribute::Size::Size_10_10_10_2: return GL_UNSIGNED_INT_2_10_10_10_REV; + default: + break; } break; case Maxwell::VertexAttribute::Type::SignedNorm: @@ -70,6 +59,8 @@ inline GLenum VertexFormat(Maxwell::VertexAttribute attrib) { return GL_INT; case Maxwell::VertexAttribute::Size::Size_10_10_10_2: return GL_INT_2_10_10_10_REV; + default: + break; } break; case Maxwell::VertexAttribute::Type::Float: @@ -84,6 +75,8 @@ inline GLenum VertexFormat(Maxwell::VertexAttribute attrib) { case Maxwell::VertexAttribute::Size::Size_32_32_32: case Maxwell::VertexAttribute::Size::Size_32_32_32_32: return GL_FLOAT; + default: + break; } break; } @@ -101,7 +94,7 @@ inline GLenum IndexFormat(Maxwell::IndexFormat index_format) { case Maxwell::IndexFormat::UnsignedInt: return GL_UNSIGNED_INT; } - UNREACHABLE_MSG("Invalid index_format={}", static_cast<u32>(index_format)); + UNREACHABLE_MSG("Invalid index_format={}", index_format); return {}; } @@ -138,7 +131,7 @@ inline GLenum PrimitiveTopology(Maxwell::PrimitiveTopology topology) { case Maxwell::PrimitiveTopology::Patches: return GL_PATCHES; } - UNREACHABLE_MSG("Invalid topology={}", static_cast<int>(topology)); + UNREACHABLE_MSG("Invalid topology={}", topology); return GL_POINTS; } @@ -166,8 +159,8 @@ inline GLenum TextureFilterMode(Tegra::Texture::TextureFilter filter_mode, } break; } - UNREACHABLE_MSG("Invalid texture filter mode={} and mipmap filter mode={}", - static_cast<u32>(filter_mode), static_cast<u32>(mipmap_filter_mode)); + UNREACHABLE_MSG("Invalid texture filter mode={} and mipmap filter mode={}", filter_mode, + mipmap_filter_mode); return GL_NEAREST; } @@ -198,7 +191,7 @@ inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) { return GL_MIRROR_CLAMP_TO_EDGE; } } - UNIMPLEMENTED_MSG("Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode)); + UNIMPLEMENTED_MSG("Unimplemented texture wrap mode={}", wrap_mode); return GL_REPEAT; } @@ -221,7 +214,7 @@ inline GLenum DepthCompareFunc(Tegra::Texture::DepthCompareFunc func) { case Tegra::Texture::DepthCompareFunc::Always: return GL_ALWAYS; } - UNIMPLEMENTED_MSG("Unimplemented texture depth compare function={}", static_cast<u32>(func)); + UNIMPLEMENTED_MSG("Unimplemented texture depth compare function={}", func); return GL_GREATER; } @@ -243,7 +236,7 @@ inline GLenum BlendEquation(Maxwell::Blend::Equation equation) { case Maxwell::Blend::Equation::MaxGL: return GL_MAX; } - UNIMPLEMENTED_MSG("Unimplemented blend equation={}", static_cast<u32>(equation)); + UNIMPLEMENTED_MSG("Unimplemented blend equation={}", equation); return GL_FUNC_ADD; } @@ -307,27 +300,7 @@ inline GLenum BlendFunc(Maxwell::Blend::Factor factor) { case Maxwell::Blend::Factor::OneMinusConstantAlphaGL: return GL_ONE_MINUS_CONSTANT_ALPHA; } - UNIMPLEMENTED_MSG("Unimplemented blend factor={}", static_cast<u32>(factor)); - return GL_ZERO; -} - -inline GLenum SwizzleSource(Tegra::Texture::SwizzleSource source) { - switch (source) { - case Tegra::Texture::SwizzleSource::Zero: - return GL_ZERO; - case Tegra::Texture::SwizzleSource::R: - return GL_RED; - case Tegra::Texture::SwizzleSource::G: - return GL_GREEN; - case Tegra::Texture::SwizzleSource::B: - return GL_BLUE; - case Tegra::Texture::SwizzleSource::A: - return GL_ALPHA; - case Tegra::Texture::SwizzleSource::OneInt: - case Tegra::Texture::SwizzleSource::OneFloat: - return GL_ONE; - } - UNIMPLEMENTED_MSG("Unimplemented swizzle source={}", static_cast<u32>(source)); + UNIMPLEMENTED_MSG("Unimplemented blend factor={}", factor); return GL_ZERO; } @@ -358,7 +331,7 @@ inline GLenum ComparisonOp(Maxwell::ComparisonOp comparison) { case Maxwell::ComparisonOp::AlwaysOld: return GL_ALWAYS; } - UNIMPLEMENTED_MSG("Unimplemented comparison op={}", static_cast<u32>(comparison)); + UNIMPLEMENTED_MSG("Unimplemented comparison op={}", comparison); return GL_ALWAYS; } @@ -389,7 +362,7 @@ inline GLenum StencilOp(Maxwell::StencilOp stencil) { case Maxwell::StencilOp::DecrWrapOGL: return GL_DECR_WRAP; } - UNIMPLEMENTED_MSG("Unimplemented stencil op={}", static_cast<u32>(stencil)); + UNIMPLEMENTED_MSG("Unimplemented stencil op={}", stencil); return GL_KEEP; } @@ -400,7 +373,7 @@ inline GLenum FrontFace(Maxwell::FrontFace front_face) { case Maxwell::FrontFace::CounterClockWise: return GL_CCW; } - UNIMPLEMENTED_MSG("Unimplemented front face cull={}", static_cast<u32>(front_face)); + UNIMPLEMENTED_MSG("Unimplemented front face cull={}", front_face); return GL_CCW; } @@ -413,7 +386,7 @@ inline GLenum CullFace(Maxwell::CullFace cull_face) { case Maxwell::CullFace::FrontAndBack: return GL_FRONT_AND_BACK; } - UNIMPLEMENTED_MSG("Unimplemented cull face={}", static_cast<u32>(cull_face)); + UNIMPLEMENTED_MSG("Unimplemented cull face={}", cull_face); return GL_BACK; } @@ -452,7 +425,7 @@ inline GLenum LogicOp(Maxwell::LogicOperation operation) { case Maxwell::LogicOperation::Set: return GL_SET; } - UNIMPLEMENTED_MSG("Unimplemented logic operation={}", static_cast<u32>(operation)); + UNIMPLEMENTED_MSG("Unimplemented logic operation={}", operation); return GL_COPY; } @@ -465,14 +438,26 @@ inline GLenum PolygonMode(Maxwell::PolygonMode polygon_mode) { case Maxwell::PolygonMode::Fill: return GL_FILL; } - UNREACHABLE_MSG("Invalid polygon mode={}", static_cast<int>(polygon_mode)); + UNREACHABLE_MSG("Invalid polygon mode={}", polygon_mode); return GL_FILL; } +inline GLenum ReductionFilter(Tegra::Texture::SamplerReduction filter) { + switch (filter) { + case Tegra::Texture::SamplerReduction::WeightedAverage: + return GL_WEIGHTED_AVERAGE_ARB; + case Tegra::Texture::SamplerReduction::Min: + return GL_MIN; + case Tegra::Texture::SamplerReduction::Max: + return GL_MAX; + } + UNREACHABLE_MSG("Invalid reduction filter={}", static_cast<int>(filter)); + return GL_WEIGHTED_AVERAGE_ARB; +} + inline GLenum ViewportSwizzle(Maxwell::ViewportSwizzle swizzle) { // Enumeration order matches register order. We can convert it arithmetically. return GL_VIEWPORT_SWIZZLE_POSITIVE_X_NV + static_cast<GLenum>(swizzle); } -} // namespace MaxwellToGL -} // namespace OpenGL +} // namespace OpenGL::MaxwellToGL diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index b759c2dba..9d2acd4d9 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -23,29 +23,13 @@ #include "core/telemetry_session.h" #include "video_core/host_shaders/opengl_present_frag.h" #include "video_core/host_shaders/opengl_present_vert.h" -#include "video_core/morton.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_shader_manager.h" #include "video_core/renderer_opengl/renderer_opengl.h" +#include "video_core/textures/decoders.h" namespace OpenGL { - namespace { - -constexpr std::size_t SWAP_CHAIN_SIZE = 3; - -struct Frame { - u32 width{}; /// Width of the frame (to detect resize) - u32 height{}; /// Height of the frame - bool color_reloaded{}; /// Texture attachment was recreated (ie: resized) - OpenGL::OGLRenderbuffer color{}; /// Buffer shared between the render/present FBO - OpenGL::OGLFramebuffer render{}; /// FBO created on the render thread - OpenGL::OGLFramebuffer present{}; /// FBO created on the present thread - GLsync render_fence{}; /// Fence created on the render thread - GLsync present_fence{}; /// Fence created on the presentation thread - bool is_srgb{}; /// Framebuffer is sRGB or RGB -}; - constexpr GLint PositionLocation = 0; constexpr GLint TexCoordLocation = 1; constexpr GLint ModelViewMatrixLocation = 0; @@ -58,24 +42,6 @@ struct ScreenRectVertex { std::array<GLfloat, 2> tex_coord; }; -/// Returns true if any debug tool is attached -bool HasDebugTool() { - const bool nsight = std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED"); - if (nsight) { - return true; - } - - GLint num_extensions; - glGetIntegerv(GL_NUM_EXTENSIONS, &num_extensions); - for (GLuint index = 0; index < static_cast<GLuint>(num_extensions); ++index) { - const auto name = reinterpret_cast<const char*>(glGetStringi(GL_EXTENSIONS, index)); - if (!std::strcmp(name, "GL_EXT_debug_tool")) { - return true; - } - } - return false; -} - /** * Defines a 1:1 pixel ortographic projection matrix with (0,0) on the top-left * corner and (width, height) on the lower-bottom. @@ -156,217 +122,62 @@ void APIENTRY DebugHandler(GLenum source, GLenum type, GLuint id, GLenum severit break; } } - } // Anonymous namespace -/** - * For smooth Vsync rendering, we want to always present the latest frame that the core generates, - * but also make sure that rendering happens at the pace that the frontend dictates. This is a - * helper class that the renderer uses to sync frames between the render thread and the presentation - * thread - */ -class FrameMailbox { -public: - std::mutex swap_chain_lock; - std::condition_variable present_cv; - std::array<Frame, SWAP_CHAIN_SIZE> swap_chain{}; - std::queue<Frame*> free_queue; - std::deque<Frame*> present_queue; - Frame* previous_frame{}; - - FrameMailbox() { - for (auto& frame : swap_chain) { - free_queue.push(&frame); - } - } - - ~FrameMailbox() { - // lock the mutex and clear out the present and free_queues and notify any people who are - // blocked to prevent deadlock on shutdown - std::scoped_lock lock{swap_chain_lock}; - std::queue<Frame*>().swap(free_queue); - present_queue.clear(); - present_cv.notify_all(); - } - - void ReloadPresentFrame(Frame* frame, u32 height, u32 width) { - frame->present.Release(); - frame->present.Create(); - GLint previous_draw_fbo{}; - glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &previous_draw_fbo); - glBindFramebuffer(GL_FRAMEBUFFER, frame->present.handle); - glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, - frame->color.handle); - if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) { - LOG_CRITICAL(Render_OpenGL, "Failed to recreate present FBO!"); - } - glBindFramebuffer(GL_DRAW_FRAMEBUFFER, previous_draw_fbo); - frame->color_reloaded = false; - } - - void ReloadRenderFrame(Frame* frame, u32 width, u32 height) { - // Recreate the color texture attachment - frame->color.Release(); - frame->color.Create(); - const GLenum internal_format = frame->is_srgb ? GL_SRGB8 : GL_RGB8; - glNamedRenderbufferStorage(frame->color.handle, internal_format, width, height); - - // Recreate the FBO for the render target - frame->render.Release(); - frame->render.Create(); - glBindFramebuffer(GL_FRAMEBUFFER, frame->render.handle); - glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, - frame->color.handle); - if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) { - LOG_CRITICAL(Render_OpenGL, "Failed to recreate render FBO!"); - } - - frame->width = width; - frame->height = height; - frame->color_reloaded = true; - } - - Frame* GetRenderFrame() { - std::unique_lock lock{swap_chain_lock}; - - // If theres no free frames, we will reuse the oldest render frame - if (free_queue.empty()) { - auto frame = present_queue.back(); - present_queue.pop_back(); - return frame; - } - - Frame* frame = free_queue.front(); - free_queue.pop(); - return frame; - } - - void ReleaseRenderFrame(Frame* frame) { - std::unique_lock lock{swap_chain_lock}; - present_queue.push_front(frame); - present_cv.notify_one(); - } - - Frame* TryGetPresentFrame(int timeout_ms) { - std::unique_lock lock{swap_chain_lock}; - // wait for new entries in the present_queue - present_cv.wait_for(lock, std::chrono::milliseconds(timeout_ms), - [&] { return !present_queue.empty(); }); - if (present_queue.empty()) { - // timed out waiting for a frame to draw so return the previous frame - return previous_frame; - } - - // free the previous frame and add it back to the free queue - if (previous_frame) { - free_queue.push(previous_frame); - } - - // the newest entries are pushed to the front of the queue - Frame* frame = present_queue.front(); - present_queue.pop_front(); - // remove all old entries from the present queue and move them back to the free_queue - for (auto f : present_queue) { - free_queue.push(f); - } - present_queue.clear(); - previous_frame = frame; - return frame; - } -}; - -RendererOpenGL::RendererOpenGL(Core::System& system_, Core::Frontend::EmuWindow& emu_window_, - Tegra::GPU& gpu_, +RendererOpenGL::RendererOpenGL(Core::TelemetrySession& telemetry_session_, + Core::Frontend::EmuWindow& emu_window_, + Core::Memory::Memory& cpu_memory_, Tegra::GPU& gpu_, std::unique_ptr<Core::Frontend::GraphicsContext> context_) - : RendererBase{emu_window_, std::move(context_)}, system{system_}, - emu_window{emu_window_}, gpu{gpu_}, program_manager{device}, has_debug_tool{HasDebugTool()} {} + : RendererBase{emu_window_, std::move(context_)}, telemetry_session{telemetry_session_}, + emu_window{emu_window_}, cpu_memory{cpu_memory_}, gpu{gpu_}, state_tracker{gpu}, + program_manager{device}, + rasterizer(emu_window, gpu, cpu_memory, device, screen_info, program_manager, state_tracker) { + if (Settings::values.renderer_debug && GLAD_GL_KHR_debug) { + glEnable(GL_DEBUG_OUTPUT); + glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS); + glDebugMessageCallback(DebugHandler, nullptr); + } + AddTelemetryFields(); + InitOpenGLObjects(); +} RendererOpenGL::~RendererOpenGL() = default; -MICROPROFILE_DEFINE(OpenGL_RenderFrame, "OpenGL", "Render Frame", MP_RGB(128, 128, 64)); -MICROPROFILE_DEFINE(OpenGL_WaitPresent, "OpenGL", "Wait For Present", MP_RGB(128, 128, 128)); - void RendererOpenGL::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { if (!framebuffer) { return; } - PrepareRendertarget(framebuffer); RenderScreenshot(); - Frame* frame; - { - MICROPROFILE_SCOPE(OpenGL_WaitPresent); - - frame = frame_mailbox->GetRenderFrame(); + state_tracker.BindFramebuffer(0); + DrawScreen(emu_window.GetFramebufferLayout()); - // Clean up sync objects before drawing + ++m_current_frame; - // INTEL driver workaround. We can't delete the previous render sync object until we are - // sure that the presentation is done - if (frame->present_fence) { - glClientWaitSync(frame->present_fence, 0, GL_TIMEOUT_IGNORED); - } - - // delete the draw fence if the frame wasn't presented - if (frame->render_fence) { - glDeleteSync(frame->render_fence); - frame->render_fence = 0; - } + rasterizer.TickFrame(); - // wait for the presentation to be done - if (frame->present_fence) { - glWaitSync(frame->present_fence, 0, GL_TIMEOUT_IGNORED); - glDeleteSync(frame->present_fence); - frame->present_fence = 0; - } - } - - { - MICROPROFILE_SCOPE(OpenGL_RenderFrame); - const auto& layout = render_window.GetFramebufferLayout(); - - // Recreate the frame if the size of the window has changed - if (layout.width != frame->width || layout.height != frame->height || - screen_info.display_srgb != frame->is_srgb) { - LOG_DEBUG(Render_OpenGL, "Reloading render frame"); - frame->is_srgb = screen_info.display_srgb; - frame_mailbox->ReloadRenderFrame(frame, layout.width, layout.height); - } - glBindFramebuffer(GL_DRAW_FRAMEBUFFER, frame->render.handle); - DrawScreen(layout); - // Create a fence for the frontend to wait on and swap this frame to OffTex - frame->render_fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); - glFlush(); - frame_mailbox->ReleaseRenderFrame(frame); - m_current_frame++; - rasterizer->TickFrame(); - } - - render_window.PollEvents(); - if (has_debug_tool) { - glBindFramebuffer(GL_DRAW_FRAMEBUFFER, 0); - Present(0); - context->SwapBuffers(); - } + context->SwapBuffers(); + render_window.OnFrameDisplayed(); } void RendererOpenGL::PrepareRendertarget(const Tegra::FramebufferConfig* framebuffer) { - if (framebuffer) { - // If framebuffer is provided, reload it from memory to a texture - if (screen_info.texture.width != static_cast<GLsizei>(framebuffer->width) || - screen_info.texture.height != static_cast<GLsizei>(framebuffer->height) || - screen_info.texture.pixel_format != framebuffer->pixel_format || - gl_framebuffer_data.empty()) { - // Reallocate texture if the framebuffer size has changed. - // This is expected to not happen very often and hence should not be a - // performance problem. - ConfigureFramebufferTexture(screen_info.texture, *framebuffer); - } - - // Load the framebuffer from memory, draw it to the screen, and swap buffers - LoadFBToScreenInfo(*framebuffer); + if (!framebuffer) { + return; + } + // If framebuffer is provided, reload it from memory to a texture + if (screen_info.texture.width != static_cast<GLsizei>(framebuffer->width) || + screen_info.texture.height != static_cast<GLsizei>(framebuffer->height) || + screen_info.texture.pixel_format != framebuffer->pixel_format || + gl_framebuffer_data.empty()) { + // Reallocate texture if the framebuffer size has changed. + // This is expected to not happen very often and hence should not be a + // performance problem. + ConfigureFramebufferTexture(screen_info.texture, *framebuffer); } + + // Load the framebuffer from memory, draw it to the screen, and swap buffers + LoadFBToScreenInfo(*framebuffer); } void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuffer) { @@ -375,26 +186,27 @@ void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuf framebuffer_crop_rect = framebuffer.crop_rect; const VAddr framebuffer_addr{framebuffer.address + framebuffer.offset}; - if (rasterizer->AccelerateDisplay(framebuffer, framebuffer_addr, framebuffer.stride)) { + if (rasterizer.AccelerateDisplay(framebuffer, framebuffer_addr, framebuffer.stride)) { return; } // Reset the screen info's display texture to its own permanent texture screen_info.display_texture = screen_info.texture.resource.handle; - const auto pixel_format{ - VideoCore::Surface::PixelFormatFromGPUPixelFormat(framebuffer.pixel_format)}; - const u32 bytes_per_pixel{VideoCore::Surface::GetBytesPerPixel(pixel_format)}; - const u64 size_in_bytes{framebuffer.stride * framebuffer.height * bytes_per_pixel}; - u8* const host_ptr{system.Memory().GetPointer(framebuffer_addr)}; - rasterizer->FlushRegion(ToCacheAddr(host_ptr), size_in_bytes); - // TODO(Rodrigo): Read this from HLE constexpr u32 block_height_log2 = 4; - VideoCore::MortonSwizzle(VideoCore::MortonSwizzleMode::MortonToLinear, pixel_format, - framebuffer.stride, block_height_log2, framebuffer.height, 0, 1, 1, - gl_framebuffer_data.data(), host_ptr); - + const auto pixel_format{ + VideoCore::Surface::PixelFormatFromGPUPixelFormat(framebuffer.pixel_format)}; + const u32 bytes_per_pixel{VideoCore::Surface::BytesPerBlock(pixel_format)}; + const u64 size_in_bytes{Tegra::Texture::CalculateSize( + true, bytes_per_pixel, framebuffer.stride, framebuffer.height, 1, block_height_log2, 0)}; + const u8* const host_ptr{cpu_memory.GetPointer(framebuffer_addr)}; + const std::span<const u8> input_data(host_ptr, size_in_bytes); + Tegra::Texture::UnswizzleTexture(gl_framebuffer_data, input_data, bytes_per_pixel, + framebuffer.width, framebuffer.height, 1, block_height_log2, + 0); + + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); glPixelStorei(GL_UNPACK_ROW_LENGTH, static_cast<GLint>(framebuffer.stride)); // Update existing texture @@ -416,8 +228,6 @@ void RendererOpenGL::LoadColorToActiveGLTexture(u8 color_r, u8 color_g, u8 color } void RendererOpenGL::InitOpenGLObjects() { - frame_mailbox = std::make_unique<FrameMailbox>(); - glClearColor(Settings::values.bg_red.GetValue(), Settings::values.bg_green.GetValue(), Settings::values.bg_blue.GetValue(), 0.0f); @@ -435,6 +245,10 @@ void RendererOpenGL::InitOpenGLObjects() { glUseProgramStages(pipeline.handle, GL_VERTEX_SHADER_BIT, vertex_program.handle); glUseProgramStages(pipeline.handle, GL_FRAGMENT_SHADER_BIT, fragment_program.handle); + // Generate presentation sampler + present_sampler.Create(); + glSamplerParameteri(present_sampler.handle, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + // Generate VBO handle for drawing vertex_buffer.Create(); @@ -452,9 +266,15 @@ void RendererOpenGL::InitOpenGLObjects() { // Clear screen to black LoadColorToActiveGLTexture(0, 0, 0, 0, screen_info.texture); + // Enable seamless cubemaps when per texture parameters are not available + if (!GLAD_GL_ARB_seamless_cubemap_per_texture && !GLAD_GL_AMD_seamless_cubemap_per_texture) { + glEnable(GL_TEXTURE_CUBE_MAP_SEAMLESS); + } + // Enable unified vertex attributes and query vertex buffer address when the driver supports it if (device.HasVertexBufferUnifiedMemory()) { glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV); + glEnableClientState(GL_ELEMENT_ARRAY_UNIFIED_NV); glMakeNamedBufferResidentNV(vertex_buffer.handle, GL_READ_ONLY); glGetNamedBufferParameterui64vNV(vertex_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, @@ -471,19 +291,10 @@ void RendererOpenGL::AddTelemetryFields() { LOG_INFO(Render_OpenGL, "GL_VENDOR: {}", gpu_vendor); LOG_INFO(Render_OpenGL, "GL_RENDERER: {}", gpu_model); - auto& telemetry_session = system.TelemetrySession(); constexpr auto user_system = Common::Telemetry::FieldType::UserSystem; - telemetry_session.AddField(user_system, "GPU_Vendor", gpu_vendor); - telemetry_session.AddField(user_system, "GPU_Model", gpu_model); - telemetry_session.AddField(user_system, "GPU_OpenGL_Version", gl_version); -} - -void RendererOpenGL::CreateRasterizer() { - if (rasterizer) { - return; - } - rasterizer = std::make_unique<RasterizerOpenGL>(system, emu_window, device, screen_info, - program_manager, state_tracker); + telemetry_session.AddField(user_system, "GPU_Vendor", std::string(gpu_vendor)); + telemetry_session.AddField(user_system, "GPU_Model", std::string(gpu_model)); + telemetry_session.AddField(user_system, "GPU_OpenGL_Version", std::string(gl_version)); } void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture, @@ -494,7 +305,7 @@ void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture, const auto pixel_format{ VideoCore::Surface::PixelFormatFromGPUPixelFormat(framebuffer.pixel_format)}; - const u32 bytes_per_pixel{VideoCore::Surface::GetBytesPerPixel(pixel_format)}; + const u32 bytes_per_pixel{VideoCore::Surface::BytesPerBlock(pixel_format)}; gl_framebuffer_data.resize(texture.width * texture.height * bytes_per_pixel); GLint internal_format; @@ -513,8 +324,8 @@ void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture, internal_format = GL_RGBA8; texture.gl_format = GL_RGBA; texture.gl_type = GL_UNSIGNED_INT_8_8_8_8_REV; - UNIMPLEMENTED_MSG("Unknown framebuffer pixel format: {}", - static_cast<u32>(framebuffer.pixel_format)); + // UNIMPLEMENTED_MSG("Unknown framebuffer pixel format: {}", + // static_cast<u32>(framebuffer.pixel_format)); } texture.resource.Release(); @@ -546,7 +357,7 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { } else { // Other transformations are unsupported LOG_CRITICAL(Render_OpenGL, "Unsupported framebuffer_transform_flags={}", - static_cast<u32>(framebuffer_transform_flags)); + framebuffer_transform_flags); UNIMPLEMENTED(); } } @@ -580,7 +391,7 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { state_tracker.NotifyPolygonModes(); state_tracker.NotifyViewport0(); state_tracker.NotifyScissor0(); - state_tracker.NotifyColorMask0(); + state_tracker.NotifyColorMask(0); state_tracker.NotifyBlend0(); state_tracker.NotifyFramebuffer(); state_tracker.NotifyFrontFace(); @@ -596,6 +407,7 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { program_manager.BindHostPipeline(pipeline.handle); + state_tracker.ClipControl(GL_LOWER_LEFT, GL_ZERO_TO_ONE); glEnable(GL_CULL_FACE); if (screen_info.display_srgb) { glEnable(GL_FRAMEBUFFER_SRGB); @@ -614,7 +426,6 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { glCullFace(GL_BACK); glFrontFace(GL_CW); glColorMaski(0, GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE); - glClipControl(GL_LOWER_LEFT, GL_ZERO_TO_ONE); glViewportIndexedf(0, 0.0f, 0.0f, static_cast<GLfloat>(layout.width), static_cast<GLfloat>(layout.height)); glDepthRangeIndexed(0, 0.0, 0.0); @@ -638,7 +449,7 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { } glBindTextureUnit(0, screen_info.display_texture); - glBindSampler(0, 0); + glBindSampler(0, present_sampler.handle); glClear(GL_COLOR_BUFFER_BIT); glDrawArrays(GL_TRIANGLE_STRIP, 0, 4); @@ -646,51 +457,6 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { program_manager.RestoreGuestPipeline(); } -bool RendererOpenGL::TryPresent(int timeout_ms) { - if (has_debug_tool) { - LOG_DEBUG(Render_OpenGL, - "Skipping presentation because we are presenting on the main context"); - return false; - } - return Present(timeout_ms); -} - -bool RendererOpenGL::Present(int timeout_ms) { - const auto& layout = render_window.GetFramebufferLayout(); - auto frame = frame_mailbox->TryGetPresentFrame(timeout_ms); - if (!frame) { - LOG_DEBUG(Render_OpenGL, "TryGetPresentFrame returned no frame to present"); - return false; - } - - // Clearing before a full overwrite of a fbo can signal to drivers that they can avoid a - // readback since we won't be doing any blending - glClear(GL_COLOR_BUFFER_BIT); - - // Recreate the presentation FBO if the color attachment was changed - if (frame->color_reloaded) { - LOG_DEBUG(Render_OpenGL, "Reloading present frame"); - frame_mailbox->ReloadPresentFrame(frame, layout.width, layout.height); - } - glWaitSync(frame->render_fence, 0, GL_TIMEOUT_IGNORED); - // INTEL workaround. - // Normally we could just delete the draw fence here, but due to driver bugs, we can just delete - // it on the emulation thread without too much penalty - // glDeleteSync(frame.render_sync); - // frame.render_sync = 0; - - glBindFramebuffer(GL_READ_FRAMEBUFFER, frame->present.handle); - glBlitFramebuffer(0, 0, frame->width, frame->height, 0, 0, layout.width, layout.height, - GL_COLOR_BUFFER_BIT, GL_LINEAR); - - // Insert fence for the main thread to block on - frame->present_fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); - glFlush(); - - glBindFramebuffer(GL_READ_FRAMEBUFFER, 0); - return true; -} - void RendererOpenGL::RenderScreenshot() { if (!renderer_settings.screenshot_requested) { return; @@ -705,7 +471,7 @@ void RendererOpenGL::RenderScreenshot() { screenshot_framebuffer.Create(); glBindFramebuffer(GL_FRAMEBUFFER, screenshot_framebuffer.handle); - Layout::FramebufferLayout layout{renderer_settings.screenshot_framebuffer_layout}; + const Layout::FramebufferLayout layout{renderer_settings.screenshot_framebuffer_layout}; GLuint renderbuffer; glGenRenderbuffers(1, &renderbuffer); @@ -716,6 +482,8 @@ void RendererOpenGL::RenderScreenshot() { DrawScreen(layout); + glBindBuffer(GL_PIXEL_PACK_BUFFER, 0); + glPixelStorei(GL_PACK_ROW_LENGTH, 0); glReadPixels(0, 0, layout.width, layout.height, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV, renderer_settings.screenshot_bits); @@ -729,25 +497,4 @@ void RendererOpenGL::RenderScreenshot() { renderer_settings.screenshot_requested = false; } -bool RendererOpenGL::Init() { - if (Settings::values.renderer_debug && GLAD_GL_KHR_debug) { - glEnable(GL_DEBUG_OUTPUT); - glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS); - glDebugMessageCallback(DebugHandler, nullptr); - } - - AddTelemetryFields(); - - if (!GLAD_GL_VERSION_4_3) { - return false; - } - - InitOpenGLObjects(); - CreateRasterizer(); - - return true; -} - -void RendererOpenGL::ShutDown() {} - } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h index 52ea76b7d..cc19a110f 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.h +++ b/src/video_core/renderer_opengl/renderer_opengl.h @@ -10,22 +10,32 @@ #include "common/math_util.h" #include "video_core/renderer_base.h" #include "video_core/renderer_opengl/gl_device.h" +#include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_shader_manager.h" #include "video_core/renderer_opengl/gl_state_tracker.h" namespace Core { class System; -} +class TelemetrySession; +} // namespace Core namespace Core::Frontend { class EmuWindow; } +namespace Core::Memory { +class Memory; +} + namespace Layout { struct FramebufferLayout; } +namespace Tegra { +class GPU; +} + namespace OpenGL { /// Structure used for storing information about the textures for the Switch screen @@ -46,25 +56,19 @@ struct ScreenInfo { TextureInfo texture; }; -struct PresentationTexture { - u32 width = 0; - u32 height = 0; - OGLTexture texture; -}; - -class FrameMailbox; - class RendererOpenGL final : public VideoCore::RendererBase { public: - explicit RendererOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window, - Tegra::GPU& gpu, - std::unique_ptr<Core::Frontend::GraphicsContext> context); + explicit RendererOpenGL(Core::TelemetrySession& telemetry_session_, + Core::Frontend::EmuWindow& emu_window_, + Core::Memory::Memory& cpu_memory_, Tegra::GPU& gpu_, + std::unique_ptr<Core::Frontend::GraphicsContext> context_); ~RendererOpenGL() override; - bool Init() override; - void ShutDown() override; void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override; - bool TryPresent(int timeout_ms) override; + + VideoCore::RasterizerInterface* ReadRasterizer() override { + return &rasterizer; + } private: /// Initializes the OpenGL state and creates persistent objects. @@ -72,8 +76,6 @@ private: void AddTelemetryFields(); - void CreateRasterizer(); - void ConfigureFramebufferTexture(TextureInfo& texture, const Tegra::FramebufferConfig& framebuffer); @@ -92,16 +94,18 @@ private: void PrepareRendertarget(const Tegra::FramebufferConfig* framebuffer); - bool Present(int timeout_ms); - - Core::System& system; + Core::TelemetrySession& telemetry_session; Core::Frontend::EmuWindow& emu_window; + Core::Memory::Memory& cpu_memory; Tegra::GPU& gpu; - const Device device; - StateTracker state_tracker{system}; + Device device; + StateTracker state_tracker; + ProgramManager program_manager; + RasterizerOpenGL rasterizer; // OpenGL object IDs + OGLSampler present_sampler; OGLBuffer vertex_buffer; OGLProgram vertex_program; OGLProgram fragment_program; @@ -114,20 +118,12 @@ private: /// Display information for Switch screen ScreenInfo screen_info; - /// Global dummy shader pipeline - ProgramManager program_manager; - /// OpenGL framebuffer data std::vector<u8> gl_framebuffer_data; /// Used for transforming the framebuffer orientation Tegra::FramebufferConfig::TransformFlags framebuffer_transform_flags{}; Common::Rectangle<int> framebuffer_crop_rect; - - /// Frame presentation mailbox - std::unique_ptr<FrameMailbox> frame_mailbox; - - bool has_debug_tool = false; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp new file mode 100644 index 000000000..31ec68505 --- /dev/null +++ b/src/video_core/renderer_opengl/util_shaders.cpp @@ -0,0 +1,225 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <bit> +#include <span> +#include <string_view> + +#include <glad/glad.h> + +#include "common/assert.h" +#include "common/common_types.h" +#include "common/div_ceil.h" +#include "video_core/host_shaders/block_linear_unswizzle_2d_comp.h" +#include "video_core/host_shaders/block_linear_unswizzle_3d_comp.h" +#include "video_core/host_shaders/opengl_copy_bc4_comp.h" +#include "video_core/host_shaders/pitch_unswizzle_comp.h" +#include "video_core/renderer_opengl/gl_resource_manager.h" +#include "video_core/renderer_opengl/gl_shader_manager.h" +#include "video_core/renderer_opengl/gl_texture_cache.h" +#include "video_core/renderer_opengl/util_shaders.h" +#include "video_core/surface.h" +#include "video_core/texture_cache/accelerated_swizzle.h" +#include "video_core/texture_cache/types.h" +#include "video_core/texture_cache/util.h" +#include "video_core/textures/decoders.h" + +namespace OpenGL { + +using namespace HostShaders; + +using VideoCommon::Extent3D; +using VideoCommon::ImageCopy; +using VideoCommon::ImageType; +using VideoCommon::SwizzleParameters; +using VideoCommon::Accelerated::MakeBlockLinearSwizzle2DParams; +using VideoCommon::Accelerated::MakeBlockLinearSwizzle3DParams; +using VideoCore::Surface::BytesPerBlock; + +namespace { + +OGLProgram MakeProgram(std::string_view source) { + OGLShader shader; + shader.Create(source, GL_COMPUTE_SHADER); + + OGLProgram program; + program.Create(true, false, shader.handle); + return program; +} + +} // Anonymous namespace + +UtilShaders::UtilShaders(ProgramManager& program_manager_) + : program_manager{program_manager_}, + block_linear_unswizzle_2d_program(MakeProgram(BLOCK_LINEAR_UNSWIZZLE_2D_COMP)), + block_linear_unswizzle_3d_program(MakeProgram(BLOCK_LINEAR_UNSWIZZLE_3D_COMP)), + pitch_unswizzle_program(MakeProgram(PITCH_UNSWIZZLE_COMP)), + copy_bc4_program(MakeProgram(OPENGL_COPY_BC4_COMP)) { + const auto swizzle_table = Tegra::Texture::MakeSwizzleTable(); + swizzle_table_buffer.Create(); + glNamedBufferStorage(swizzle_table_buffer.handle, sizeof(swizzle_table), &swizzle_table, 0); +} + +UtilShaders::~UtilShaders() = default; + +void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map, + std::span<const SwizzleParameters> swizzles) { + static constexpr Extent3D WORKGROUP_SIZE{32, 32, 1}; + static constexpr GLuint BINDING_SWIZZLE_BUFFER = 0; + static constexpr GLuint BINDING_INPUT_BUFFER = 1; + static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; + + program_manager.BindHostCompute(block_linear_unswizzle_2d_program.handle); + glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle); + + const GLenum store_format = StoreFormat(BytesPerBlock(image.info.format)); + for (const SwizzleParameters& swizzle : swizzles) { + const Extent3D num_tiles = swizzle.num_tiles; + const size_t input_offset = swizzle.buffer_offset + map.offset; + + const u32 num_dispatches_x = Common::DivCeil(num_tiles.width, WORKGROUP_SIZE.width); + const u32 num_dispatches_y = Common::DivCeil(num_tiles.height, WORKGROUP_SIZE.height); + + const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info); + glUniform3uiv(0, 1, params.origin.data()); + glUniform3iv(1, 1, params.destination.data()); + glUniform1ui(2, params.bytes_per_block_log2); + glUniform1ui(3, params.layer_stride); + glUniform1ui(4, params.block_size); + glUniform1ui(5, params.x_shift); + glUniform1ui(6, params.block_height); + glUniform1ui(7, params.block_height_mask); + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset, + image.guest_size_bytes - swizzle.buffer_offset); + glBindImageTexture(BINDING_OUTPUT_IMAGE, image.StorageHandle(), swizzle.level, GL_TRUE, 0, + GL_WRITE_ONLY, store_format); + glDispatchCompute(num_dispatches_x, num_dispatches_y, image.info.resources.layers); + } + program_manager.RestoreGuestCompute(); +} + +void UtilShaders::BlockLinearUpload3D(Image& image, const ImageBufferMap& map, + std::span<const SwizzleParameters> swizzles) { + static constexpr Extent3D WORKGROUP_SIZE{16, 8, 8}; + + static constexpr GLuint BINDING_SWIZZLE_BUFFER = 0; + static constexpr GLuint BINDING_INPUT_BUFFER = 1; + static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; + + glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes); + program_manager.BindHostCompute(block_linear_unswizzle_3d_program.handle); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle); + + const GLenum store_format = StoreFormat(BytesPerBlock(image.info.format)); + for (const SwizzleParameters& swizzle : swizzles) { + const Extent3D num_tiles = swizzle.num_tiles; + const size_t input_offset = swizzle.buffer_offset + map.offset; + + const u32 num_dispatches_x = Common::DivCeil(num_tiles.width, WORKGROUP_SIZE.width); + const u32 num_dispatches_y = Common::DivCeil(num_tiles.height, WORKGROUP_SIZE.height); + const u32 num_dispatches_z = Common::DivCeil(num_tiles.depth, WORKGROUP_SIZE.depth); + + const auto params = MakeBlockLinearSwizzle3DParams(swizzle, image.info); + glUniform3uiv(0, 1, params.origin.data()); + glUniform3iv(1, 1, params.destination.data()); + glUniform1ui(2, params.bytes_per_block_log2); + glUniform1ui(3, params.slice_size); + glUniform1ui(4, params.block_size); + glUniform1ui(5, params.x_shift); + glUniform1ui(6, params.block_height); + glUniform1ui(7, params.block_height_mask); + glUniform1ui(8, params.block_depth); + glUniform1ui(9, params.block_depth_mask); + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset, + image.guest_size_bytes - swizzle.buffer_offset); + glBindImageTexture(BINDING_OUTPUT_IMAGE, image.StorageHandle(), swizzle.level, GL_TRUE, 0, + GL_WRITE_ONLY, store_format); + glDispatchCompute(num_dispatches_x, num_dispatches_y, num_dispatches_z); + } + program_manager.RestoreGuestCompute(); +} + +void UtilShaders::PitchUpload(Image& image, const ImageBufferMap& map, + std::span<const SwizzleParameters> swizzles) { + static constexpr Extent3D WORKGROUP_SIZE{32, 32, 1}; + static constexpr GLuint BINDING_INPUT_BUFFER = 0; + static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; + static constexpr GLuint LOC_ORIGIN = 0; + static constexpr GLuint LOC_DESTINATION = 1; + static constexpr GLuint LOC_BYTES_PER_BLOCK = 2; + static constexpr GLuint LOC_PITCH = 3; + + const u32 bytes_per_block = BytesPerBlock(image.info.format); + const GLenum format = StoreFormat(bytes_per_block); + const u32 pitch = image.info.pitch; + + UNIMPLEMENTED_IF_MSG(!std::has_single_bit(bytes_per_block), + "Non-power of two images are not implemented"); + + program_manager.BindHostCompute(pitch_unswizzle_program.handle); + glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes); + glUniform2ui(LOC_ORIGIN, 0, 0); + glUniform2i(LOC_DESTINATION, 0, 0); + glUniform1ui(LOC_BYTES_PER_BLOCK, bytes_per_block); + glUniform1ui(LOC_PITCH, pitch); + glBindImageTexture(BINDING_OUTPUT_IMAGE, image.StorageHandle(), 0, GL_FALSE, 0, GL_WRITE_ONLY, + format); + for (const SwizzleParameters& swizzle : swizzles) { + const Extent3D num_tiles = swizzle.num_tiles; + const size_t input_offset = swizzle.buffer_offset + map.offset; + + const u32 num_dispatches_x = Common::DivCeil(num_tiles.width, WORKGROUP_SIZE.width); + const u32 num_dispatches_y = Common::DivCeil(num_tiles.height, WORKGROUP_SIZE.height); + + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset, + image.guest_size_bytes - swizzle.buffer_offset); + glDispatchCompute(num_dispatches_x, num_dispatches_y, 1); + } + program_manager.RestoreGuestCompute(); +} + +void UtilShaders::CopyBC4(Image& dst_image, Image& src_image, std::span<const ImageCopy> copies) { + static constexpr GLuint BINDING_INPUT_IMAGE = 0; + static constexpr GLuint BINDING_OUTPUT_IMAGE = 1; + static constexpr GLuint LOC_SRC_OFFSET = 0; + static constexpr GLuint LOC_DST_OFFSET = 1; + + program_manager.BindHostCompute(copy_bc4_program.handle); + + for (const ImageCopy& copy : copies) { + ASSERT(copy.src_subresource.base_layer == 0); + ASSERT(copy.src_subresource.num_layers == 1); + ASSERT(copy.dst_subresource.base_layer == 0); + ASSERT(copy.dst_subresource.num_layers == 1); + + glUniform3ui(LOC_SRC_OFFSET, copy.src_offset.x, copy.src_offset.y, copy.src_offset.z); + glUniform3ui(LOC_DST_OFFSET, copy.dst_offset.x, copy.dst_offset.y, copy.dst_offset.z); + glBindImageTexture(BINDING_INPUT_IMAGE, src_image.StorageHandle(), + copy.src_subresource.base_level, GL_FALSE, 0, GL_READ_ONLY, GL_RG32UI); + glBindImageTexture(BINDING_OUTPUT_IMAGE, dst_image.StorageHandle(), + copy.dst_subresource.base_level, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGBA8UI); + glDispatchCompute(copy.extent.width, copy.extent.height, copy.extent.depth); + } + program_manager.RestoreGuestCompute(); +} + +GLenum StoreFormat(u32 bytes_per_block) { + switch (bytes_per_block) { + case 1: + return GL_R8UI; + case 2: + return GL_R16UI; + case 4: + return GL_R32UI; + case 8: + return GL_RG32UI; + case 16: + return GL_RGBA32UI; + } + UNREACHABLE(); + return GL_R8UI; +} + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/util_shaders.h b/src/video_core/renderer_opengl/util_shaders.h new file mode 100644 index 000000000..7b1d16b09 --- /dev/null +++ b/src/video_core/renderer_opengl/util_shaders.h @@ -0,0 +1,52 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <span> + +#include <glad/glad.h> + +#include "common/common_types.h" +#include "video_core/renderer_opengl/gl_resource_manager.h" +#include "video_core/texture_cache/types.h" + +namespace OpenGL { + +class Image; +class ProgramManager; + +struct ImageBufferMap; + +class UtilShaders { +public: + explicit UtilShaders(ProgramManager& program_manager); + ~UtilShaders(); + + void BlockLinearUpload2D(Image& image, const ImageBufferMap& map, + std::span<const VideoCommon::SwizzleParameters> swizzles); + + void BlockLinearUpload3D(Image& image, const ImageBufferMap& map, + std::span<const VideoCommon::SwizzleParameters> swizzles); + + void PitchUpload(Image& image, const ImageBufferMap& map, + std::span<const VideoCommon::SwizzleParameters> swizzles); + + void CopyBC4(Image& dst_image, Image& src_image, + std::span<const VideoCommon::ImageCopy> copies); + +private: + ProgramManager& program_manager; + + OGLBuffer swizzle_table_buffer; + + OGLProgram block_linear_unswizzle_2d_program; + OGLProgram block_linear_unswizzle_3d_program; + OGLProgram pitch_unswizzle_program; + OGLProgram copy_bc4_program; +}; + +GLenum StoreFormat(u32 bytes_per_block); + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/utils.cpp b/src/video_core/renderer_opengl/utils.cpp deleted file mode 100644 index 6d7bb16b2..000000000 --- a/src/video_core/renderer_opengl/utils.cpp +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2014 Citra Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include <string> -#include <vector> - -#include <fmt/format.h> -#include <glad/glad.h> - -#include "common/common_types.h" -#include "video_core/renderer_opengl/gl_state_tracker.h" -#include "video_core/renderer_opengl/utils.h" - -namespace OpenGL { - -void LabelGLObject(GLenum identifier, GLuint handle, VAddr addr, std::string_view extra_info) { - if (!GLAD_GL_KHR_debug) { - // We don't need to throw an error as this is just for debugging - return; - } - - std::string object_label; - if (extra_info.empty()) { - switch (identifier) { - case GL_TEXTURE: - object_label = fmt::format("Texture@0x{:016X}", addr); - break; - case GL_PROGRAM: - object_label = fmt::format("Shader@0x{:016X}", addr); - break; - default: - object_label = fmt::format("Object(0x{:X})@0x{:016X}", identifier, addr); - break; - } - } else { - object_label = fmt::format("{}@0x{:016X}", extra_info, addr); - } - glObjectLabel(identifier, handle, -1, static_cast<const GLchar*>(object_label.c_str())); -} - -} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/utils.h b/src/video_core/renderer_opengl/utils.h deleted file mode 100644 index 9c09ee12c..000000000 --- a/src/video_core/renderer_opengl/utils.h +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright 2014 Citra Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <string_view> -#include <vector> -#include <glad/glad.h> -#include "common/common_types.h" - -namespace OpenGL { - -void LabelGLObject(GLenum identifier, GLuint handle, VAddr addr, std::string_view extra_info = {}); - -} // namespace OpenGL diff --git a/src/video_core/renderer_vulkan/blit_image.cpp b/src/video_core/renderer_vulkan/blit_image.cpp new file mode 100644 index 000000000..1f6a169ae --- /dev/null +++ b/src/video_core/renderer_vulkan/blit_image.cpp @@ -0,0 +1,624 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <algorithm> + +#include "video_core/host_shaders/convert_depth_to_float_frag_spv.h" +#include "video_core/host_shaders/convert_float_to_depth_frag_spv.h" +#include "video_core/host_shaders/full_screen_triangle_vert_spv.h" +#include "video_core/host_shaders/vulkan_blit_color_float_frag_spv.h" +#include "video_core/host_shaders/vulkan_blit_depth_stencil_frag_spv.h" +#include "video_core/renderer_vulkan/blit_image.h" +#include "video_core/renderer_vulkan/maxwell_to_vk.h" +#include "video_core/renderer_vulkan/vk_scheduler.h" +#include "video_core/renderer_vulkan/vk_shader_util.h" +#include "video_core/renderer_vulkan/vk_state_tracker.h" +#include "video_core/renderer_vulkan/vk_texture_cache.h" +#include "video_core/renderer_vulkan/vk_update_descriptor.h" +#include "video_core/surface.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" + +namespace Vulkan { + +using VideoCommon::ImageViewType; + +namespace { +struct PushConstants { + std::array<float, 2> tex_scale; + std::array<float, 2> tex_offset; +}; + +template <u32 binding> +inline constexpr VkDescriptorSetLayoutBinding TEXTURE_DESCRIPTOR_SET_LAYOUT_BINDING{ + .binding = binding, + .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT, + .pImmutableSamplers = nullptr, +}; +constexpr std::array TWO_TEXTURES_DESCRIPTOR_SET_LAYOUT_BINDINGS{ + TEXTURE_DESCRIPTOR_SET_LAYOUT_BINDING<0>, + TEXTURE_DESCRIPTOR_SET_LAYOUT_BINDING<1>, +}; +constexpr VkDescriptorSetLayoutCreateInfo ONE_TEXTURE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO{ + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .bindingCount = 1, + .pBindings = &TEXTURE_DESCRIPTOR_SET_LAYOUT_BINDING<0>, +}; +constexpr VkDescriptorSetLayoutCreateInfo TWO_TEXTURES_DESCRIPTOR_SET_LAYOUT_CREATE_INFO{ + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .bindingCount = static_cast<u32>(TWO_TEXTURES_DESCRIPTOR_SET_LAYOUT_BINDINGS.size()), + .pBindings = TWO_TEXTURES_DESCRIPTOR_SET_LAYOUT_BINDINGS.data(), +}; +constexpr VkPushConstantRange PUSH_CONSTANT_RANGE{ + .stageFlags = VK_SHADER_STAGE_VERTEX_BIT, + .offset = 0, + .size = sizeof(PushConstants), +}; +constexpr VkPipelineVertexInputStateCreateInfo PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .vertexBindingDescriptionCount = 0, + .pVertexBindingDescriptions = nullptr, + .vertexAttributeDescriptionCount = 0, + .pVertexAttributeDescriptions = nullptr, +}; +constexpr VkPipelineInputAssemblyStateCreateInfo PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST, + .primitiveRestartEnable = VK_FALSE, +}; +constexpr VkPipelineViewportStateCreateInfo PIPELINE_VIEWPORT_STATE_CREATE_INFO{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .viewportCount = 1, + .pViewports = nullptr, + .scissorCount = 1, + .pScissors = nullptr, +}; +constexpr VkPipelineRasterizationStateCreateInfo PIPELINE_RASTERIZATION_STATE_CREATE_INFO{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .depthClampEnable = VK_FALSE, + .rasterizerDiscardEnable = VK_FALSE, + .polygonMode = VK_POLYGON_MODE_FILL, + .cullMode = VK_CULL_MODE_BACK_BIT, + .frontFace = VK_FRONT_FACE_CLOCKWISE, + .depthBiasEnable = VK_FALSE, + .depthBiasConstantFactor = 0.0f, + .depthBiasClamp = 0.0f, + .depthBiasSlopeFactor = 0.0f, + .lineWidth = 1.0f, +}; +constexpr VkPipelineMultisampleStateCreateInfo PIPELINE_MULTISAMPLE_STATE_CREATE_INFO{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT, + .sampleShadingEnable = VK_FALSE, + .minSampleShading = 0.0f, + .pSampleMask = nullptr, + .alphaToCoverageEnable = VK_FALSE, + .alphaToOneEnable = VK_FALSE, +}; +constexpr std::array DYNAMIC_STATES{ + VK_DYNAMIC_STATE_VIEWPORT, + VK_DYNAMIC_STATE_SCISSOR, +}; +constexpr VkPipelineDynamicStateCreateInfo PIPELINE_DYNAMIC_STATE_CREATE_INFO{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .dynamicStateCount = static_cast<u32>(DYNAMIC_STATES.size()), + .pDynamicStates = DYNAMIC_STATES.data(), +}; +constexpr VkPipelineColorBlendStateCreateInfo PIPELINE_COLOR_BLEND_STATE_EMPTY_CREATE_INFO{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .logicOpEnable = VK_FALSE, + .logicOp = VK_LOGIC_OP_CLEAR, + .attachmentCount = 0, + .pAttachments = nullptr, + .blendConstants = {0.0f, 0.0f, 0.0f, 0.0f}, +}; +constexpr VkPipelineColorBlendAttachmentState PIPELINE_COLOR_BLEND_ATTACHMENT_STATE{ + .blendEnable = VK_FALSE, + .srcColorBlendFactor = VK_BLEND_FACTOR_ZERO, + .dstColorBlendFactor = VK_BLEND_FACTOR_ZERO, + .colorBlendOp = VK_BLEND_OP_ADD, + .srcAlphaBlendFactor = VK_BLEND_FACTOR_ZERO, + .dstAlphaBlendFactor = VK_BLEND_FACTOR_ZERO, + .alphaBlendOp = VK_BLEND_OP_ADD, + .colorWriteMask = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT, +}; +constexpr VkPipelineColorBlendStateCreateInfo PIPELINE_COLOR_BLEND_STATE_GENERIC_CREATE_INFO{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .logicOpEnable = VK_FALSE, + .logicOp = VK_LOGIC_OP_CLEAR, + .attachmentCount = 1, + .pAttachments = &PIPELINE_COLOR_BLEND_ATTACHMENT_STATE, + .blendConstants = {0.0f, 0.0f, 0.0f, 0.0f}, +}; +constexpr VkPipelineDepthStencilStateCreateInfo PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .depthTestEnable = VK_TRUE, + .depthWriteEnable = VK_TRUE, + .depthCompareOp = VK_COMPARE_OP_ALWAYS, + .depthBoundsTestEnable = VK_FALSE, + .stencilTestEnable = VK_FALSE, + .front = VkStencilOpState{}, + .back = VkStencilOpState{}, + .minDepthBounds = 0.0f, + .maxDepthBounds = 0.0f, +}; + +template <VkFilter filter> +inline constexpr VkSamplerCreateInfo SAMPLER_CREATE_INFO{ + .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .magFilter = filter, + .minFilter = filter, + .mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST, + .addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER, + .addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER, + .addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER, + .mipLodBias = 0.0f, + .anisotropyEnable = VK_FALSE, + .maxAnisotropy = 0.0f, + .compareEnable = VK_FALSE, + .compareOp = VK_COMPARE_OP_NEVER, + .minLod = 0.0f, + .maxLod = 0.0f, + .borderColor = VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE, + .unnormalizedCoordinates = VK_TRUE, +}; + +constexpr VkPipelineLayoutCreateInfo PipelineLayoutCreateInfo( + const VkDescriptorSetLayout* set_layout) { + return VkPipelineLayoutCreateInfo{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .setLayoutCount = 1, + .pSetLayouts = set_layout, + .pushConstantRangeCount = 1, + .pPushConstantRanges = &PUSH_CONSTANT_RANGE, + }; +} + +constexpr VkPipelineShaderStageCreateInfo PipelineShaderStageCreateInfo(VkShaderStageFlagBits stage, + VkShaderModule shader) { + return VkPipelineShaderStageCreateInfo{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stage = stage, + .module = shader, + .pName = "main", + .pSpecializationInfo = nullptr, + }; +} + +constexpr std::array<VkPipelineShaderStageCreateInfo, 2> MakeStages( + VkShaderModule vertex_shader, VkShaderModule fragment_shader) { + return std::array{ + PipelineShaderStageCreateInfo(VK_SHADER_STAGE_VERTEX_BIT, vertex_shader), + PipelineShaderStageCreateInfo(VK_SHADER_STAGE_FRAGMENT_BIT, fragment_shader), + }; +} + +void UpdateOneTextureDescriptorSet(const Device& device, VkDescriptorSet descriptor_set, + VkSampler sampler, VkImageView image_view) { + const VkDescriptorImageInfo image_info{ + .sampler = sampler, + .imageView = image_view, + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + }; + const VkWriteDescriptorSet write_descriptor_set{ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .pNext = nullptr, + .dstSet = descriptor_set, + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + .pImageInfo = &image_info, + .pBufferInfo = nullptr, + .pTexelBufferView = nullptr, + }; + device.GetLogical().UpdateDescriptorSets(write_descriptor_set, nullptr); +} + +void UpdateTwoTexturesDescriptorSet(const Device& device, VkDescriptorSet descriptor_set, + VkSampler sampler, VkImageView image_view_0, + VkImageView image_view_1) { + const VkDescriptorImageInfo image_info_0{ + .sampler = sampler, + .imageView = image_view_0, + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + }; + const VkDescriptorImageInfo image_info_1{ + .sampler = sampler, + .imageView = image_view_1, + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + }; + const std::array write_descriptor_sets{ + VkWriteDescriptorSet{ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .pNext = nullptr, + .dstSet = descriptor_set, + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + .pImageInfo = &image_info_0, + .pBufferInfo = nullptr, + .pTexelBufferView = nullptr, + }, + VkWriteDescriptorSet{ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .pNext = nullptr, + .dstSet = descriptor_set, + .dstBinding = 1, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + .pImageInfo = &image_info_1, + .pBufferInfo = nullptr, + .pTexelBufferView = nullptr, + }, + }; + device.GetLogical().UpdateDescriptorSets(write_descriptor_sets, nullptr); +} + +void BindBlitState(vk::CommandBuffer cmdbuf, VkPipelineLayout layout, + const std::array<Offset2D, 2>& dst_region, + const std::array<Offset2D, 2>& src_region) { + const VkOffset2D offset{ + .x = std::min(dst_region[0].x, dst_region[1].x), + .y = std::min(dst_region[0].y, dst_region[1].y), + }; + const VkExtent2D extent{ + .width = static_cast<u32>(std::abs(dst_region[1].x - dst_region[0].x)), + .height = static_cast<u32>(std::abs(dst_region[1].y - dst_region[0].y)), + }; + const VkViewport viewport{ + .x = static_cast<float>(offset.x), + .y = static_cast<float>(offset.y), + .width = static_cast<float>(extent.width), + .height = static_cast<float>(extent.height), + .minDepth = 0.0f, + .maxDepth = 1.0f, + }; + // TODO: Support scissored blits + const VkRect2D scissor{ + .offset = offset, + .extent = extent, + }; + const float scale_x = static_cast<float>(src_region[1].x - src_region[0].x); + const float scale_y = static_cast<float>(src_region[1].y - src_region[0].y); + const PushConstants push_constants{ + .tex_scale = {scale_x, scale_y}, + .tex_offset = {static_cast<float>(src_region[0].x), static_cast<float>(src_region[0].y)}, + }; + cmdbuf.SetViewport(0, viewport); + cmdbuf.SetScissor(0, scissor); + cmdbuf.PushConstants(layout, VK_SHADER_STAGE_VERTEX_BIT, push_constants); +} + +} // Anonymous namespace + +BlitImageHelper::BlitImageHelper(const Device& device_, VKScheduler& scheduler_, + StateTracker& state_tracker_, VKDescriptorPool& descriptor_pool) + : device{device_}, scheduler{scheduler_}, state_tracker{state_tracker_}, + one_texture_set_layout(device.GetLogical().CreateDescriptorSetLayout( + ONE_TEXTURE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO)), + two_textures_set_layout(device.GetLogical().CreateDescriptorSetLayout( + TWO_TEXTURES_DESCRIPTOR_SET_LAYOUT_CREATE_INFO)), + one_texture_descriptor_allocator(descriptor_pool, *one_texture_set_layout), + two_textures_descriptor_allocator(descriptor_pool, *two_textures_set_layout), + one_texture_pipeline_layout(device.GetLogical().CreatePipelineLayout( + PipelineLayoutCreateInfo(one_texture_set_layout.address()))), + two_textures_pipeline_layout(device.GetLogical().CreatePipelineLayout( + PipelineLayoutCreateInfo(two_textures_set_layout.address()))), + full_screen_vert(BuildShader(device, FULL_SCREEN_TRIANGLE_VERT_SPV)), + blit_color_to_color_frag(BuildShader(device, VULKAN_BLIT_COLOR_FLOAT_FRAG_SPV)), + convert_depth_to_float_frag(BuildShader(device, CONVERT_DEPTH_TO_FLOAT_FRAG_SPV)), + convert_float_to_depth_frag(BuildShader(device, CONVERT_FLOAT_TO_DEPTH_FRAG_SPV)), + linear_sampler(device.GetLogical().CreateSampler(SAMPLER_CREATE_INFO<VK_FILTER_LINEAR>)), + nearest_sampler(device.GetLogical().CreateSampler(SAMPLER_CREATE_INFO<VK_FILTER_NEAREST>)) { + if (device.IsExtShaderStencilExportSupported()) { + blit_depth_stencil_frag = BuildShader(device, VULKAN_BLIT_DEPTH_STENCIL_FRAG_SPV); + } +} + +BlitImageHelper::~BlitImageHelper() = default; + +void BlitImageHelper::BlitColor(const Framebuffer* dst_framebuffer, const ImageView& src_image_view, + const std::array<Offset2D, 2>& dst_region, + const std::array<Offset2D, 2>& src_region, + Tegra::Engines::Fermi2D::Filter filter, + Tegra::Engines::Fermi2D::Operation operation) { + const bool is_linear = filter == Tegra::Engines::Fermi2D::Filter::Bilinear; + const BlitImagePipelineKey key{ + .renderpass = dst_framebuffer->RenderPass(), + .operation = operation, + }; + const VkPipelineLayout layout = *one_texture_pipeline_layout; + const VkImageView src_view = src_image_view.Handle(ImageViewType::e2D); + const VkSampler sampler = is_linear ? *linear_sampler : *nearest_sampler; + const VkPipeline pipeline = FindOrEmplacePipeline(key); + const VkDescriptorSet descriptor_set = one_texture_descriptor_allocator.Commit(); + scheduler.RequestRenderpass(dst_framebuffer); + scheduler.Record([dst_region, src_region, pipeline, layout, sampler, src_view, descriptor_set, + &device = device](vk::CommandBuffer cmdbuf) { + // TODO: Barriers + UpdateOneTextureDescriptorSet(device, descriptor_set, sampler, src_view); + cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline); + cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_GRAPHICS, layout, 0, descriptor_set, + nullptr); + BindBlitState(cmdbuf, layout, dst_region, src_region); + cmdbuf.Draw(3, 1, 0, 0); + }); + scheduler.InvalidateState(); +} + +void BlitImageHelper::BlitDepthStencil(const Framebuffer* dst_framebuffer, + VkImageView src_depth_view, VkImageView src_stencil_view, + const std::array<Offset2D, 2>& dst_region, + const std::array<Offset2D, 2>& src_region, + Tegra::Engines::Fermi2D::Filter filter, + Tegra::Engines::Fermi2D::Operation operation) { + ASSERT(filter == Tegra::Engines::Fermi2D::Filter::Point); + ASSERT(operation == Tegra::Engines::Fermi2D::Operation::SrcCopy); + + const VkPipelineLayout layout = *two_textures_pipeline_layout; + const VkSampler sampler = *nearest_sampler; + const VkPipeline pipeline = BlitDepthStencilPipeline(dst_framebuffer->RenderPass()); + const VkDescriptorSet descriptor_set = two_textures_descriptor_allocator.Commit(); + scheduler.RequestRenderpass(dst_framebuffer); + scheduler.Record([dst_region, src_region, pipeline, layout, sampler, src_depth_view, + src_stencil_view, descriptor_set, + &device = device](vk::CommandBuffer cmdbuf) { + // TODO: Barriers + UpdateTwoTexturesDescriptorSet(device, descriptor_set, sampler, src_depth_view, + src_stencil_view); + cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline); + cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_GRAPHICS, layout, 0, descriptor_set, + nullptr); + BindBlitState(cmdbuf, layout, dst_region, src_region); + cmdbuf.Draw(3, 1, 0, 0); + }); + scheduler.InvalidateState(); +} + +void BlitImageHelper::ConvertD32ToR32(const Framebuffer* dst_framebuffer, + const ImageView& src_image_view) { + ConvertDepthToColorPipeline(convert_d32_to_r32_pipeline, dst_framebuffer->RenderPass()); + Convert(*convert_d32_to_r32_pipeline, dst_framebuffer, src_image_view); +} + +void BlitImageHelper::ConvertR32ToD32(const Framebuffer* dst_framebuffer, + const ImageView& src_image_view) { + + ConvertColorToDepthPipeline(convert_r32_to_d32_pipeline, dst_framebuffer->RenderPass()); + Convert(*convert_r32_to_d32_pipeline, dst_framebuffer, src_image_view); +} + +void BlitImageHelper::ConvertD16ToR16(const Framebuffer* dst_framebuffer, + const ImageView& src_image_view) { + ConvertDepthToColorPipeline(convert_d16_to_r16_pipeline, dst_framebuffer->RenderPass()); + Convert(*convert_d16_to_r16_pipeline, dst_framebuffer, src_image_view); +} + +void BlitImageHelper::ConvertR16ToD16(const Framebuffer* dst_framebuffer, + const ImageView& src_image_view) { + ConvertColorToDepthPipeline(convert_r16_to_d16_pipeline, dst_framebuffer->RenderPass()); + Convert(*convert_r16_to_d16_pipeline, dst_framebuffer, src_image_view); +} + +void BlitImageHelper::Convert(VkPipeline pipeline, const Framebuffer* dst_framebuffer, + const ImageView& src_image_view) { + const VkPipelineLayout layout = *one_texture_pipeline_layout; + const VkImageView src_view = src_image_view.Handle(ImageViewType::e2D); + const VkSampler sampler = *nearest_sampler; + const VkDescriptorSet descriptor_set = one_texture_descriptor_allocator.Commit(); + const VkExtent2D extent{ + .width = src_image_view.size.width, + .height = src_image_view.size.height, + }; + scheduler.RequestRenderpass(dst_framebuffer); + scheduler.Record([pipeline, layout, sampler, src_view, descriptor_set, extent, + &device = device](vk::CommandBuffer cmdbuf) { + const VkOffset2D offset{ + .x = 0, + .y = 0, + }; + const VkViewport viewport{ + .x = 0.0f, + .y = 0.0f, + .width = static_cast<float>(extent.width), + .height = static_cast<float>(extent.height), + .minDepth = 0.0f, + .maxDepth = 0.0f, + }; + const VkRect2D scissor{ + .offset = offset, + .extent = extent, + }; + const PushConstants push_constants{ + .tex_scale = {viewport.width, viewport.height}, + .tex_offset = {0.0f, 0.0f}, + }; + UpdateOneTextureDescriptorSet(device, descriptor_set, sampler, src_view); + + // TODO: Barriers + cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline); + cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_GRAPHICS, layout, 0, descriptor_set, + nullptr); + cmdbuf.SetViewport(0, viewport); + cmdbuf.SetScissor(0, scissor); + cmdbuf.PushConstants(layout, VK_SHADER_STAGE_VERTEX_BIT, push_constants); + cmdbuf.Draw(3, 1, 0, 0); + }); + scheduler.InvalidateState(); +} + +VkPipeline BlitImageHelper::FindOrEmplacePipeline(const BlitImagePipelineKey& key) { + const auto it = std::ranges::find(blit_color_keys, key); + if (it != blit_color_keys.end()) { + return *blit_color_pipelines[std::distance(blit_color_keys.begin(), it)]; + } + blit_color_keys.push_back(key); + + const std::array stages = MakeStages(*full_screen_vert, *blit_color_to_color_frag); + const VkPipelineColorBlendAttachmentState blend_attachment{ + .blendEnable = VK_FALSE, + .srcColorBlendFactor = VK_BLEND_FACTOR_ZERO, + .dstColorBlendFactor = VK_BLEND_FACTOR_ZERO, + .colorBlendOp = VK_BLEND_OP_ADD, + .srcAlphaBlendFactor = VK_BLEND_FACTOR_ZERO, + .dstAlphaBlendFactor = VK_BLEND_FACTOR_ZERO, + .alphaBlendOp = VK_BLEND_OP_ADD, + .colorWriteMask = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT, + }; + // TODO: programmable blending + const VkPipelineColorBlendStateCreateInfo color_blend_create_info{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .logicOpEnable = VK_FALSE, + .logicOp = VK_LOGIC_OP_CLEAR, + .attachmentCount = 1, + .pAttachments = &blend_attachment, + .blendConstants = {0.0f, 0.0f, 0.0f, 0.0f}, + }; + blit_color_pipelines.push_back(device.GetLogical().CreateGraphicsPipeline({ + .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stageCount = static_cast<u32>(stages.size()), + .pStages = stages.data(), + .pVertexInputState = &PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, + .pInputAssemblyState = &PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO, + .pTessellationState = nullptr, + .pViewportState = &PIPELINE_VIEWPORT_STATE_CREATE_INFO, + .pRasterizationState = &PIPELINE_RASTERIZATION_STATE_CREATE_INFO, + .pMultisampleState = &PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, + .pDepthStencilState = nullptr, + .pColorBlendState = &color_blend_create_info, + .pDynamicState = &PIPELINE_DYNAMIC_STATE_CREATE_INFO, + .layout = *one_texture_pipeline_layout, + .renderPass = key.renderpass, + .subpass = 0, + .basePipelineHandle = VK_NULL_HANDLE, + .basePipelineIndex = 0, + })); + return *blit_color_pipelines.back(); +} + +VkPipeline BlitImageHelper::BlitDepthStencilPipeline(VkRenderPass renderpass) { + if (blit_depth_stencil_pipeline) { + return *blit_depth_stencil_pipeline; + } + const std::array stages = MakeStages(*full_screen_vert, *blit_depth_stencil_frag); + blit_depth_stencil_pipeline = device.GetLogical().CreateGraphicsPipeline({ + .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stageCount = static_cast<u32>(stages.size()), + .pStages = stages.data(), + .pVertexInputState = &PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, + .pInputAssemblyState = &PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO, + .pTessellationState = nullptr, + .pViewportState = &PIPELINE_VIEWPORT_STATE_CREATE_INFO, + .pRasterizationState = &PIPELINE_RASTERIZATION_STATE_CREATE_INFO, + .pMultisampleState = &PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, + .pDepthStencilState = &PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO, + .pColorBlendState = &PIPELINE_COLOR_BLEND_STATE_EMPTY_CREATE_INFO, + .pDynamicState = &PIPELINE_DYNAMIC_STATE_CREATE_INFO, + .layout = *two_textures_pipeline_layout, + .renderPass = renderpass, + .subpass = 0, + .basePipelineHandle = VK_NULL_HANDLE, + .basePipelineIndex = 0, + }); + return *blit_depth_stencil_pipeline; +} + +void BlitImageHelper::ConvertDepthToColorPipeline(vk::Pipeline& pipeline, VkRenderPass renderpass) { + if (pipeline) { + return; + } + const std::array stages = MakeStages(*full_screen_vert, *convert_depth_to_float_frag); + pipeline = device.GetLogical().CreateGraphicsPipeline({ + .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stageCount = static_cast<u32>(stages.size()), + .pStages = stages.data(), + .pVertexInputState = &PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, + .pInputAssemblyState = &PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO, + .pTessellationState = nullptr, + .pViewportState = &PIPELINE_VIEWPORT_STATE_CREATE_INFO, + .pRasterizationState = &PIPELINE_RASTERIZATION_STATE_CREATE_INFO, + .pMultisampleState = &PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, + .pDepthStencilState = nullptr, + .pColorBlendState = &PIPELINE_COLOR_BLEND_STATE_GENERIC_CREATE_INFO, + .pDynamicState = &PIPELINE_DYNAMIC_STATE_CREATE_INFO, + .layout = *one_texture_pipeline_layout, + .renderPass = renderpass, + .subpass = 0, + .basePipelineHandle = VK_NULL_HANDLE, + .basePipelineIndex = 0, + }); +} + +void BlitImageHelper::ConvertColorToDepthPipeline(vk::Pipeline& pipeline, VkRenderPass renderpass) { + if (pipeline) { + return; + } + const std::array stages = MakeStages(*full_screen_vert, *convert_float_to_depth_frag); + pipeline = device.GetLogical().CreateGraphicsPipeline({ + .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stageCount = static_cast<u32>(stages.size()), + .pStages = stages.data(), + .pVertexInputState = &PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, + .pInputAssemblyState = &PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO, + .pTessellationState = nullptr, + .pViewportState = &PIPELINE_VIEWPORT_STATE_CREATE_INFO, + .pRasterizationState = &PIPELINE_RASTERIZATION_STATE_CREATE_INFO, + .pMultisampleState = &PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, + .pDepthStencilState = &PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO, + .pColorBlendState = &PIPELINE_COLOR_BLEND_STATE_EMPTY_CREATE_INFO, + .pDynamicState = &PIPELINE_DYNAMIC_STATE_CREATE_INFO, + .layout = *one_texture_pipeline_layout, + .renderPass = renderpass, + .subpass = 0, + .basePipelineHandle = VK_NULL_HANDLE, + .basePipelineIndex = 0, + }); +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/blit_image.h b/src/video_core/renderer_vulkan/blit_image.h new file mode 100644 index 000000000..43fd3d737 --- /dev/null +++ b/src/video_core/renderer_vulkan/blit_image.h @@ -0,0 +1,96 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <compare> + +#include "video_core/engines/fermi_2d.h" +#include "video_core/renderer_vulkan/vk_descriptor_pool.h" +#include "video_core/texture_cache/types.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" + +namespace Vulkan { + +using VideoCommon::Offset2D; + +class Device; +class Framebuffer; +class ImageView; +class StateTracker; +class VKScheduler; + +struct BlitImagePipelineKey { + constexpr auto operator<=>(const BlitImagePipelineKey&) const noexcept = default; + + VkRenderPass renderpass; + Tegra::Engines::Fermi2D::Operation operation; +}; + +class BlitImageHelper { +public: + explicit BlitImageHelper(const Device& device, VKScheduler& scheduler, + StateTracker& state_tracker, VKDescriptorPool& descriptor_pool); + ~BlitImageHelper(); + + void BlitColor(const Framebuffer* dst_framebuffer, const ImageView& src_image_view, + const std::array<Offset2D, 2>& dst_region, + const std::array<Offset2D, 2>& src_region, + Tegra::Engines::Fermi2D::Filter filter, + Tegra::Engines::Fermi2D::Operation operation); + + void BlitDepthStencil(const Framebuffer* dst_framebuffer, VkImageView src_depth_view, + VkImageView src_stencil_view, const std::array<Offset2D, 2>& dst_region, + const std::array<Offset2D, 2>& src_region, + Tegra::Engines::Fermi2D::Filter filter, + Tegra::Engines::Fermi2D::Operation operation); + + void ConvertD32ToR32(const Framebuffer* dst_framebuffer, const ImageView& src_image_view); + + void ConvertR32ToD32(const Framebuffer* dst_framebuffer, const ImageView& src_image_view); + + void ConvertD16ToR16(const Framebuffer* dst_framebuffer, const ImageView& src_image_view); + + void ConvertR16ToD16(const Framebuffer* dst_framebuffer, const ImageView& src_image_view); + +private: + void Convert(VkPipeline pipeline, const Framebuffer* dst_framebuffer, + const ImageView& src_image_view); + + [[nodiscard]] VkPipeline FindOrEmplacePipeline(const BlitImagePipelineKey& key); + + [[nodiscard]] VkPipeline BlitDepthStencilPipeline(VkRenderPass renderpass); + + void ConvertDepthToColorPipeline(vk::Pipeline& pipeline, VkRenderPass renderpass); + + void ConvertColorToDepthPipeline(vk::Pipeline& pipeline, VkRenderPass renderpass); + + const Device& device; + VKScheduler& scheduler; + StateTracker& state_tracker; + + vk::DescriptorSetLayout one_texture_set_layout; + vk::DescriptorSetLayout two_textures_set_layout; + DescriptorAllocator one_texture_descriptor_allocator; + DescriptorAllocator two_textures_descriptor_allocator; + vk::PipelineLayout one_texture_pipeline_layout; + vk::PipelineLayout two_textures_pipeline_layout; + vk::ShaderModule full_screen_vert; + vk::ShaderModule blit_color_to_color_frag; + vk::ShaderModule blit_depth_stencil_frag; + vk::ShaderModule convert_depth_to_float_frag; + vk::ShaderModule convert_float_to_depth_frag; + vk::Sampler linear_sampler; + vk::Sampler nearest_sampler; + + std::vector<BlitImagePipelineKey> blit_color_keys; + std::vector<vk::Pipeline> blit_color_pipelines; + vk::Pipeline blit_depth_stencil_pipeline; + vk::Pipeline convert_d32_to_r32_pipeline; + vk::Pipeline convert_r32_to_d32_pipeline; + vk::Pipeline convert_d16_to_r16_pipeline; + vk::Pipeline convert_r16_to_d16_pipeline; +}; + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp b/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp index 81a39a3b8..362278f01 100644 --- a/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp +++ b/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp @@ -8,17 +8,19 @@ #include <boost/functional/hash.hpp> +#include "common/bit_cast.h" #include "common/cityhash.h" #include "common/common_types.h" #include "video_core/renderer_vulkan/fixed_pipeline_state.h" +#include "video_core/renderer_vulkan/vk_state_tracker.h" namespace Vulkan { namespace { -constexpr std::size_t POINT = 0; -constexpr std::size_t LINE = 1; -constexpr std::size_t POLYGON = 2; +constexpr size_t POINT = 0; +constexpr size_t LINE = 1; +constexpr size_t POLYGON = 2; constexpr std::array POLYGON_OFFSET_ENABLE_LUT = { POINT, // Points LINE, // Lines @@ -39,13 +41,17 @@ constexpr std::array POLYGON_OFFSET_ENABLE_LUT = { } // Anonymous namespace -void FixedPipelineState::Fill(const Maxwell& regs, bool has_extended_dynamic_state) { - const std::array enabled_lut = {regs.polygon_offset_point_enable, - regs.polygon_offset_line_enable, - regs.polygon_offset_fill_enable}; +void FixedPipelineState::Refresh(Tegra::Engines::Maxwell3D& maxwell3d, + bool has_extended_dynamic_state) { + const Maxwell& regs = maxwell3d.regs; + const std::array enabled_lut{ + regs.polygon_offset_point_enable, + regs.polygon_offset_line_enable, + regs.polygon_offset_fill_enable, + }; const u32 topology_index = static_cast<u32>(regs.draw.topology.Value()); - raw = 0; + raw1 = 0; primitive_restart_enable.Assign(regs.primitive_restart.enabled != 0 ? 1 : 0); depth_bias_enable.Assign(enabled_lut[POLYGON_OFFSET_ENABLE_LUT[topology_index]] != 0 ? 1 : 0); depth_clamp_disabled.Assign(regs.view_volume_clip_control.depth_clamp_disabled.Value()); @@ -58,40 +64,58 @@ void FixedPipelineState::Fill(const Maxwell& regs, bool has_extended_dynamic_sta logic_op_enable.Assign(regs.logic_op.enable != 0 ? 1 : 0); logic_op.Assign(PackLogicOp(regs.logic_op.operation)); rasterize_enable.Assign(regs.rasterize_enable != 0 ? 1 : 0); + topology.Assign(regs.draw.topology); + msaa_mode.Assign(regs.multisample_mode); - std::memcpy(&point_size, ®s.point_size, sizeof(point_size)); // TODO: C++20 std::bit_cast - - for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) { - binding_divisors[index] = - regs.instanced_arrays.IsInstancingEnabled(index) ? regs.vertex_array[index].divisor : 0; + raw2 = 0; + const auto test_func = + regs.alpha_test_enabled != 0 ? regs.alpha_test_func : Maxwell::ComparisonOp::Always; + alpha_test_func.Assign(PackComparisonOp(test_func)); + early_z.Assign(regs.force_early_fragment_tests != 0 ? 1 : 0); + + alpha_test_ref = Common::BitCast<u32>(regs.alpha_test_ref); + point_size = Common::BitCast<u32>(regs.point_size); + + if (maxwell3d.dirty.flags[Dirty::InstanceDivisors]) { + maxwell3d.dirty.flags[Dirty::InstanceDivisors] = false; + for (size_t index = 0; index < Maxwell::NumVertexArrays; ++index) { + const bool is_enabled = regs.instanced_arrays.IsInstancingEnabled(index); + binding_divisors[index] = is_enabled ? regs.vertex_array[index].divisor : 0; + } } - - for (std::size_t index = 0; index < Maxwell::NumVertexAttributes; ++index) { - const auto& input = regs.vertex_attrib_format[index]; - auto& attribute = attributes[index]; - attribute.raw = 0; - attribute.enabled.Assign(input.IsConstant() ? 0 : 1); - attribute.buffer.Assign(input.buffer); - attribute.offset.Assign(input.offset); - attribute.type.Assign(static_cast<u32>(input.type.Value())); - attribute.size.Assign(static_cast<u32>(input.size.Value())); + if (maxwell3d.dirty.flags[Dirty::VertexAttributes]) { + maxwell3d.dirty.flags[Dirty::VertexAttributes] = false; + for (size_t index = 0; index < Maxwell::NumVertexAttributes; ++index) { + const auto& input = regs.vertex_attrib_format[index]; + auto& attribute = attributes[index]; + attribute.raw = 0; + attribute.enabled.Assign(input.IsConstant() ? 0 : 1); + attribute.buffer.Assign(input.buffer); + attribute.offset.Assign(input.offset); + attribute.type.Assign(static_cast<u32>(input.type.Value())); + attribute.size.Assign(static_cast<u32>(input.size.Value())); + } } - - for (std::size_t index = 0; index < std::size(attachments); ++index) { - attachments[index].Fill(regs, index); + if (maxwell3d.dirty.flags[Dirty::Blending]) { + maxwell3d.dirty.flags[Dirty::Blending] = false; + for (size_t index = 0; index < attachments.size(); ++index) { + attachments[index].Refresh(regs, index); + } + } + if (maxwell3d.dirty.flags[Dirty::ViewportSwizzles]) { + maxwell3d.dirty.flags[Dirty::ViewportSwizzles] = false; + const auto& transform = regs.viewport_transform; + std::ranges::transform(transform, viewport_swizzles.begin(), [](const auto& viewport) { + return static_cast<u16>(viewport.swizzle.raw); + }); } - - const auto& transform = regs.viewport_transform; - std::transform(transform.begin(), transform.end(), viewport_swizzles.begin(), - [](const auto& viewport) { return static_cast<u16>(viewport.swizzle.raw); }); - if (!has_extended_dynamic_state) { no_extended_dynamic_state.Assign(1); - dynamic_state.Fill(regs); + dynamic_state.Refresh(regs); } } -void FixedPipelineState::BlendingAttachment::Fill(const Maxwell& regs, std::size_t index) { +void FixedPipelineState::BlendingAttachment::Refresh(const Maxwell& regs, size_t index) { const auto& mask = regs.color_mask[regs.color_mask_common ? 0 : index]; raw = 0; @@ -130,8 +154,7 @@ void FixedPipelineState::BlendingAttachment::Fill(const Maxwell& regs, std::size enable.Assign(1); } -void FixedPipelineState::DynamicState::Fill(const Maxwell& regs) { - const u32 topology_index = static_cast<u32>(regs.draw.topology.Value()); +void FixedPipelineState::DynamicState::Refresh(const Maxwell& regs) { u32 packed_front_face = PackFrontFace(regs.front_face); if (regs.screen_y_control.triangle_rast_flip != 0) { // Flip front face @@ -161,22 +184,16 @@ void FixedPipelineState::DynamicState::Fill(const Maxwell& regs) { depth_test_enable.Assign(regs.depth_test_enable); front_face.Assign(packed_front_face); depth_test_func.Assign(PackComparisonOp(regs.depth_test_func)); - topology.Assign(topology_index); cull_face.Assign(PackCullFace(regs.cull_face)); cull_enable.Assign(regs.cull_test_enabled != 0 ? 1 : 0); - - for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) { - const auto& input = regs.vertex_array[index]; - VertexBinding& binding = vertex_bindings[index]; - binding.raw = 0; - binding.enabled.Assign(input.IsEnabled() ? 1 : 0); - binding.stride.Assign(static_cast<u16>(input.stride.Value())); - } + std::ranges::transform(regs.vertex_array, vertex_strides.begin(), [](const auto& array) { + return static_cast<u16>(array.stride.Value()); + }); } -std::size_t FixedPipelineState::Hash() const noexcept { +size_t FixedPipelineState::Hash() const noexcept { const u64 hash = Common::CityHash64(reinterpret_cast<const char*>(this), Size()); - return static_cast<std::size_t>(hash); + return static_cast<size_t>(hash); } bool FixedPipelineState::operator==(const FixedPipelineState& rhs) const noexcept { diff --git a/src/video_core/renderer_vulkan/fixed_pipeline_state.h b/src/video_core/renderer_vulkan/fixed_pipeline_state.h index cdcbb65f5..a0eb83a68 100644 --- a/src/video_core/renderer_vulkan/fixed_pipeline_state.h +++ b/src/video_core/renderer_vulkan/fixed_pipeline_state.h @@ -58,7 +58,7 @@ struct FixedPipelineState { BitField<30, 1, u32> enable; }; - void Fill(const Maxwell& regs, std::size_t index); + void Refresh(const Maxwell& regs, size_t index); constexpr std::array<bool, 4> Mask() const noexcept { return {mask_r != 0, mask_g != 0, mask_b != 0, mask_a != 0}; @@ -106,7 +106,7 @@ struct FixedPipelineState { } }; - template <std::size_t Position> + template <size_t Position> union StencilFace { BitField<Position + 0, 3, u32> action_stencil_fail; BitField<Position + 3, 3, u32> action_depth_fail; @@ -130,12 +130,6 @@ struct FixedPipelineState { } }; - union VertexBinding { - u16 raw; - BitField<0, 12, u16> stride; - BitField<12, 1, u16> enabled; - }; - struct DynamicState { union { u32 raw1; @@ -150,13 +144,13 @@ struct FixedPipelineState { }; union { u32 raw2; - BitField<0, 4, u32> topology; - BitField<4, 2, u32> cull_face; - BitField<6, 1, u32> cull_enable; + BitField<0, 2, u32> cull_face; + BitField<2, 1, u32> cull_enable; }; - std::array<VertexBinding, Maxwell::NumVertexArrays> vertex_bindings; + // Vertex stride is a 12 bits value, we have 4 bits to spare per element + std::array<u16, Maxwell::NumVertexArrays> vertex_strides; - void Fill(const Maxwell& regs); + void Refresh(const Maxwell& regs); Maxwell::ComparisonOp DepthTestFunc() const noexcept { return UnpackComparisonOp(depth_test_func); @@ -169,14 +163,10 @@ struct FixedPipelineState { Maxwell::FrontFace FrontFace() const noexcept { return UnpackFrontFace(front_face.Value()); } - - constexpr Maxwell::PrimitiveTopology Topology() const noexcept { - return static_cast<Maxwell::PrimitiveTopology>(topology.Value()); - } }; union { - u32 raw; + u32 raw1; BitField<0, 1, u32> no_extended_dynamic_state; BitField<2, 1, u32> primitive_restart_enable; BitField<3, 1, u32> depth_bias_enable; @@ -190,7 +180,16 @@ struct FixedPipelineState { BitField<18, 1, u32> logic_op_enable; BitField<19, 4, u32> logic_op; BitField<23, 1, u32> rasterize_enable; + BitField<24, 4, Maxwell::PrimitiveTopology> topology; + BitField<28, 4, Tegra::Texture::MsaaMode> msaa_mode; + }; + union { + u32 raw2; + BitField<0, 3, u32> alpha_test_func; + BitField<3, 1, u32> early_z; }; + + u32 alpha_test_ref; u32 point_size; std::array<u32, Maxwell::NumVertexArrays> binding_divisors; std::array<VertexAttribute, Maxwell::NumVertexAttributes> attributes; @@ -198,9 +197,9 @@ struct FixedPipelineState { std::array<u16, Maxwell::NumViewports> viewport_swizzles; DynamicState dynamic_state; - void Fill(const Maxwell& regs, bool has_extended_dynamic_state); + void Refresh(Tegra::Engines::Maxwell3D& maxwell3d, bool has_extended_dynamic_state); - std::size_t Hash() const noexcept; + size_t Hash() const noexcept; bool operator==(const FixedPipelineState& rhs) const noexcept; @@ -208,8 +207,8 @@ struct FixedPipelineState { return !operator==(rhs); } - std::size_t Size() const noexcept { - const std::size_t total_size = sizeof *this; + size_t Size() const noexcept { + const size_t total_size = sizeof *this; return total_size - (no_extended_dynamic_state != 0 ? 0 : sizeof(DynamicState)); } }; @@ -223,7 +222,7 @@ namespace std { template <> struct hash<Vulkan::FixedPipelineState> { - std::size_t operator()(const Vulkan::FixedPipelineState& k) const noexcept { + size_t operator()(const Vulkan::FixedPipelineState& k) const noexcept { return k.Hash(); } }; diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp index f8c77f4fa..19aaf034f 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp @@ -9,9 +9,9 @@ #include "common/logging/log.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_vulkan/maxwell_to_vk.h" -#include "video_core/renderer_vulkan/vk_device.h" -#include "video_core/renderer_vulkan/wrapper.h" #include "video_core/surface.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan::MaxwellToVK { @@ -26,7 +26,7 @@ VkFilter Filter(Tegra::Texture::TextureFilter filter) { case Tegra::Texture::TextureFilter::Linear: return VK_FILTER_LINEAR; } - UNREACHABLE_MSG("Invalid sampler filter={}", static_cast<u32>(filter)); + UNREACHABLE_MSG("Invalid sampler filter={}", filter); return {}; } @@ -43,11 +43,11 @@ VkSamplerMipmapMode MipmapMode(Tegra::Texture::TextureMipmapFilter mipmap_filter case Tegra::Texture::TextureMipmapFilter::Linear: return VK_SAMPLER_MIPMAP_MODE_LINEAR; } - UNREACHABLE_MSG("Invalid sampler mipmap mode={}", static_cast<u32>(mipmap_filter)); + UNREACHABLE_MSG("Invalid sampler mipmap mode={}", mipmap_filter); return {}; } -VkSamplerAddressMode WrapMode(const VKDevice& device, Tegra::Texture::WrapMode wrap_mode, +VkSamplerAddressMode WrapMode(const Device& device, Tegra::Texture::WrapMode wrap_mode, Tegra::Texture::TextureFilter filter) { switch (wrap_mode) { case Tegra::Texture::WrapMode::Wrap: @@ -78,9 +78,10 @@ VkSamplerAddressMode WrapMode(const VKDevice& device, Tegra::Texture::WrapMode w case Tegra::Texture::WrapMode::MirrorOnceBorder: UNIMPLEMENTED(); return VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE; + default: + UNIMPLEMENTED_MSG("Unimplemented wrap mode={}", wrap_mode); + return {}; } - UNIMPLEMENTED_MSG("Unimplemented wrap mode={}", static_cast<u32>(wrap_mode)); - return {}; } VkCompareOp DepthCompareFunction(Tegra::Texture::DepthCompareFunc depth_compare_func) { @@ -102,16 +103,15 @@ VkCompareOp DepthCompareFunction(Tegra::Texture::DepthCompareFunc depth_compare_ case Tegra::Texture::DepthCompareFunc::Always: return VK_COMPARE_OP_ALWAYS; } - UNIMPLEMENTED_MSG("Unimplemented sampler depth compare function={}", - static_cast<u32>(depth_compare_func)); + UNIMPLEMENTED_MSG("Unimplemented sampler depth compare function={}", depth_compare_func); return {}; } } // namespace Sampler namespace { - -enum : u32 { Attachable = 1, Storage = 2 }; +constexpr u32 Attachable = 1 << 0; +constexpr u32 Storage = 1 << 1; struct FormatTuple { VkFormat format; ///< Vulkan format @@ -122,7 +122,7 @@ struct FormatTuple { {VK_FORMAT_A8B8G8R8_SINT_PACK32, Attachable | Storage}, // A8B8G8R8_SINT {VK_FORMAT_A8B8G8R8_UINT_PACK32, Attachable | Storage}, // A8B8G8R8_UINT {VK_FORMAT_R5G6B5_UNORM_PACK16, Attachable}, // R5G6B5_UNORM - {VK_FORMAT_B5G6R5_UNORM_PACK16, Attachable}, // B5G6R5_UNORM + {VK_FORMAT_B5G6R5_UNORM_PACK16}, // B5G6R5_UNORM {VK_FORMAT_A1R5G5B5_UNORM_PACK16, Attachable}, // A1R5G5B5_UNORM {VK_FORMAT_A2B10G10R10_UNORM_PACK32, Attachable | Storage}, // A2B10G10R10_UNORM {VK_FORMAT_A2B10G10R10_UINT_PACK32, Attachable | Storage}, // A2B10G10R10_UINT @@ -163,7 +163,7 @@ struct FormatTuple { {VK_FORMAT_R16G16_UNORM, Attachable | Storage}, // R16G16_UNORM {VK_FORMAT_R16G16_SFLOAT, Attachable | Storage}, // R16G16_FLOAT {VK_FORMAT_UNDEFINED}, // R16G16_UINT - {VK_FORMAT_UNDEFINED}, // R16G16_SINT + {VK_FORMAT_R16G16_SINT, Attachable | Storage}, // R16G16_SINT {VK_FORMAT_R16G16_SNORM, Attachable | Storage}, // R16G16_SNORM {VK_FORMAT_UNDEFINED}, // R32G32B32_FLOAT {VK_FORMAT_R8G8B8A8_SRGB, Attachable}, // A8B8G8R8_SRGB @@ -222,30 +222,36 @@ constexpr bool IsZetaFormat(PixelFormat pixel_format) { } // Anonymous namespace -FormatInfo SurfaceFormat(const VKDevice& device, FormatType format_type, PixelFormat pixel_format) { - ASSERT(static_cast<std::size_t>(pixel_format) < std::size(tex_format_tuples)); - - auto tuple = tex_format_tuples[static_cast<std::size_t>(pixel_format)]; +FormatInfo SurfaceFormat(const Device& device, FormatType format_type, bool with_srgb, + PixelFormat pixel_format) { + ASSERT(static_cast<size_t>(pixel_format) < std::size(tex_format_tuples)); + FormatTuple tuple = tex_format_tuples[static_cast<size_t>(pixel_format)]; if (tuple.format == VK_FORMAT_UNDEFINED) { - UNIMPLEMENTED_MSG("Unimplemented texture format with pixel format={}", - static_cast<u32>(pixel_format)); - return {VK_FORMAT_A8B8G8R8_UNORM_PACK32, true, true}; + UNIMPLEMENTED_MSG("Unimplemented texture format with pixel format={}", pixel_format); + return FormatInfo{VK_FORMAT_A8B8G8R8_UNORM_PACK32, true, true}; } // Use A8B8G8R8_UNORM on hardware that doesn't support ASTC natively if (!device.IsOptimalAstcSupported() && VideoCore::Surface::IsPixelFormatASTC(pixel_format)) { - tuple.format = VideoCore::Surface::IsPixelFormatSRGB(pixel_format) - ? VK_FORMAT_A8B8G8R8_SRGB_PACK32 - : VK_FORMAT_A8B8G8R8_UNORM_PACK32; + const bool is_srgb = with_srgb && VideoCore::Surface::IsPixelFormatSRGB(pixel_format); + if (is_srgb) { + tuple.format = VK_FORMAT_A8B8G8R8_SRGB_PACK32; + } else { + tuple.format = VK_FORMAT_A8B8G8R8_UNORM_PACK32; + tuple.usage |= Storage; + } } - const bool attachable = tuple.usage & Attachable; - const bool storage = tuple.usage & Storage; + const bool attachable = (tuple.usage & Attachable) != 0; + const bool storage = (tuple.usage & Storage) != 0; - VkFormatFeatureFlags usage; - if (format_type == FormatType::Buffer) { + VkFormatFeatureFlags usage{}; + switch (format_type) { + case FormatType::Buffer: usage = VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_BIT | VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT; - } else { + break; + case FormatType::Linear: + case FormatType::Optimal: usage = VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT | VK_FORMAT_FEATURE_TRANSFER_DST_BIT | VK_FORMAT_FEATURE_TRANSFER_SRC_BIT; if (attachable) { @@ -255,6 +261,7 @@ FormatInfo SurfaceFormat(const VKDevice& device, FormatType format_type, PixelFo if (storage) { usage |= VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT; } + break; } return {device.GetSupportedFormat(tuple.format, usage, format_type), attachable, storage}; } @@ -274,11 +281,11 @@ VkShaderStageFlagBits ShaderStage(Tegra::Engines::ShaderType stage) { case Tegra::Engines::ShaderType::Compute: return VK_SHADER_STAGE_COMPUTE_BIT; } - UNIMPLEMENTED_MSG("Unimplemented shader stage={}", static_cast<u32>(stage)); + UNIMPLEMENTED_MSG("Unimplemented shader stage={}", stage); return {}; } -VkPrimitiveTopology PrimitiveTopology([[maybe_unused]] const VKDevice& device, +VkPrimitiveTopology PrimitiveTopology([[maybe_unused]] const Device& device, Maxwell::PrimitiveTopology topology) { switch (topology) { case Maxwell::PrimitiveTopology::Points: @@ -298,9 +305,10 @@ VkPrimitiveTopology PrimitiveTopology([[maybe_unused]] const VKDevice& device, return VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; case Maxwell::PrimitiveTopology::Patches: return VK_PRIMITIVE_TOPOLOGY_PATCH_LIST; + default: + UNIMPLEMENTED_MSG("Unimplemented topology={}", topology); + return {}; } - UNIMPLEMENTED_MSG("Unimplemented topology={}", static_cast<u32>(topology)); - return {}; } VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttribute::Size size) { @@ -325,6 +333,8 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib return VK_FORMAT_R16G16B16A16_UNORM; case Maxwell::VertexAttribute::Size::Size_10_10_10_2: return VK_FORMAT_A2B10G10R10_UNORM_PACK32; + default: + break; } break; case Maxwell::VertexAttribute::Type::SignedNorm: @@ -347,6 +357,8 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib return VK_FORMAT_R16G16B16A16_SNORM; case Maxwell::VertexAttribute::Size::Size_10_10_10_2: return VK_FORMAT_A2B10G10R10_SNORM_PACK32; + default: + break; } break; case Maxwell::VertexAttribute::Type::UnsignedScaled: @@ -369,6 +381,8 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib return VK_FORMAT_R16G16B16A16_USCALED; case Maxwell::VertexAttribute::Size::Size_10_10_10_2: return VK_FORMAT_A2B10G10R10_USCALED_PACK32; + default: + break; } break; case Maxwell::VertexAttribute::Type::SignedScaled: @@ -391,6 +405,8 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib return VK_FORMAT_R16G16B16A16_SSCALED; case Maxwell::VertexAttribute::Size::Size_10_10_10_2: return VK_FORMAT_A2B10G10R10_SSCALED_PACK32; + default: + break; } break; case Maxwell::VertexAttribute::Type::UnsignedInt: @@ -421,6 +437,8 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib return VK_FORMAT_R32G32B32A32_UINT; case Maxwell::VertexAttribute::Size::Size_10_10_10_2: return VK_FORMAT_A2B10G10R10_UINT_PACK32; + default: + break; } break; case Maxwell::VertexAttribute::Type::SignedInt: @@ -451,6 +469,8 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib return VK_FORMAT_R32G32B32A32_SINT; case Maxwell::VertexAttribute::Size::Size_10_10_10_2: return VK_FORMAT_A2B10G10R10_SINT_PACK32; + default: + break; } break; case Maxwell::VertexAttribute::Type::Float: @@ -471,11 +491,12 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib return VK_FORMAT_R32G32B32_SFLOAT; case Maxwell::VertexAttribute::Size::Size_32_32_32_32: return VK_FORMAT_R32G32B32A32_SFLOAT; + default: + break; } break; } - UNIMPLEMENTED_MSG("Unimplemented vertex format of type={} and size={}", static_cast<u32>(type), - static_cast<u32>(size)); + UNIMPLEMENTED_MSG("Unimplemented vertex format of type={} and size={}", type, size); return {}; } @@ -506,24 +527,20 @@ VkCompareOp ComparisonOp(Maxwell::ComparisonOp comparison) { case Maxwell::ComparisonOp::AlwaysOld: return VK_COMPARE_OP_ALWAYS; } - UNIMPLEMENTED_MSG("Unimplemented comparison op={}", static_cast<u32>(comparison)); + UNIMPLEMENTED_MSG("Unimplemented comparison op={}", comparison); return {}; } -VkIndexType IndexFormat(const VKDevice& device, Maxwell::IndexFormat index_format) { +VkIndexType IndexFormat(Maxwell::IndexFormat index_format) { switch (index_format) { case Maxwell::IndexFormat::UnsignedByte: - if (!device.IsExtIndexTypeUint8Supported()) { - UNIMPLEMENTED_MSG("Native uint8 indices are not supported on this device"); - return VK_INDEX_TYPE_UINT16; - } return VK_INDEX_TYPE_UINT8_EXT; case Maxwell::IndexFormat::UnsignedShort: return VK_INDEX_TYPE_UINT16; case Maxwell::IndexFormat::UnsignedInt: return VK_INDEX_TYPE_UINT32; } - UNIMPLEMENTED_MSG("Unimplemented index_format={}", static_cast<u32>(index_format)); + UNIMPLEMENTED_MSG("Unimplemented index_format={}", index_format); return {}; } @@ -554,7 +571,7 @@ VkStencilOp StencilOp(Maxwell::StencilOp stencil_op) { case Maxwell::StencilOp::DecrWrapOGL: return VK_STENCIL_OP_DECREMENT_AND_WRAP; } - UNIMPLEMENTED_MSG("Unimplemented stencil op={}", static_cast<u32>(stencil_op)); + UNIMPLEMENTED_MSG("Unimplemented stencil op={}", stencil_op); return {}; } @@ -576,7 +593,7 @@ VkBlendOp BlendEquation(Maxwell::Blend::Equation equation) { case Maxwell::Blend::Equation::MaxGL: return VK_BLEND_OP_MAX; } - UNIMPLEMENTED_MSG("Unimplemented blend equation={}", static_cast<u32>(equation)); + UNIMPLEMENTED_MSG("Unimplemented blend equation={}", equation); return {}; } @@ -640,7 +657,7 @@ VkBlendFactor BlendFactor(Maxwell::Blend::Factor factor) { case Maxwell::Blend::Factor::OneMinusConstantAlphaGL: return VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA; } - UNIMPLEMENTED_MSG("Unimplemented blend factor={}", static_cast<u32>(factor)); + UNIMPLEMENTED_MSG("Unimplemented blend factor={}", factor); return {}; } @@ -651,11 +668,11 @@ VkFrontFace FrontFace(Maxwell::FrontFace front_face) { case Maxwell::FrontFace::CounterClockWise: return VK_FRONT_FACE_COUNTER_CLOCKWISE; } - UNIMPLEMENTED_MSG("Unimplemented front face={}", static_cast<u32>(front_face)); + UNIMPLEMENTED_MSG("Unimplemented front face={}", front_face); return {}; } -VkCullModeFlags CullFace(Maxwell::CullFace cull_face) { +VkCullModeFlagBits CullFace(Maxwell::CullFace cull_face) { switch (cull_face) { case Maxwell::CullFace::Front: return VK_CULL_MODE_FRONT_BIT; @@ -664,7 +681,7 @@ VkCullModeFlags CullFace(Maxwell::CullFace cull_face) { case Maxwell::CullFace::FrontAndBack: return VK_CULL_MODE_FRONT_AND_BACK; } - UNIMPLEMENTED_MSG("Unimplemented cull face={}", static_cast<u32>(cull_face)); + UNIMPLEMENTED_MSG("Unimplemented cull face={}", cull_face); return {}; } @@ -684,7 +701,7 @@ VkComponentSwizzle SwizzleSource(Tegra::Texture::SwizzleSource swizzle) { case Tegra::Texture::SwizzleSource::OneFloat: return VK_COMPONENT_SWIZZLE_ONE; } - UNIMPLEMENTED_MSG("Unimplemented swizzle source={}", static_cast<u32>(swizzle)); + UNIMPLEMENTED_MSG("Unimplemented swizzle source={}", swizzle); return {}; } @@ -707,8 +724,21 @@ VkViewportCoordinateSwizzleNV ViewportSwizzle(Maxwell::ViewportSwizzle swizzle) case Maxwell::ViewportSwizzle::NegativeW: return VK_VIEWPORT_COORDINATE_SWIZZLE_NEGATIVE_W_NV; } - UNREACHABLE_MSG("Invalid swizzle={}", static_cast<int>(swizzle)); + UNREACHABLE_MSG("Invalid swizzle={}", swizzle); return {}; } +VkSamplerReductionMode SamplerReduction(Tegra::Texture::SamplerReduction reduction) { + switch (reduction) { + case Tegra::Texture::SamplerReduction::WeightedAverage: + return VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE_EXT; + case Tegra::Texture::SamplerReduction::Min: + return VK_SAMPLER_REDUCTION_MODE_MIN_EXT; + case Tegra::Texture::SamplerReduction::Max: + return VK_SAMPLER_REDUCTION_MODE_MAX_EXT; + } + UNREACHABLE_MSG("Invalid sampler mode={}", static_cast<int>(reduction)); + return VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE_EXT; +} + } // namespace Vulkan::MaxwellToVK diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.h b/src/video_core/renderer_vulkan/maxwell_to_vk.h index 7e213452f..e3e06ba38 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.h +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.h @@ -6,10 +6,10 @@ #include "common/common_types.h" #include "video_core/engines/maxwell_3d.h" -#include "video_core/renderer_vulkan/vk_device.h" -#include "video_core/renderer_vulkan/wrapper.h" #include "video_core/surface.h" #include "video_core/textures/texture.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan::MaxwellToVK { @@ -22,7 +22,7 @@ VkFilter Filter(Tegra::Texture::TextureFilter filter); VkSamplerMipmapMode MipmapMode(Tegra::Texture::TextureMipmapFilter mipmap_filter); -VkSamplerAddressMode WrapMode(const VKDevice& device, Tegra::Texture::WrapMode wrap_mode, +VkSamplerAddressMode WrapMode(const Device& device, Tegra::Texture::WrapMode wrap_mode, Tegra::Texture::TextureFilter filter); VkCompareOp DepthCompareFunction(Tegra::Texture::DepthCompareFunc depth_compare_func); @@ -35,17 +35,25 @@ struct FormatInfo { bool storage; }; -FormatInfo SurfaceFormat(const VKDevice& device, FormatType format_type, PixelFormat pixel_format); +/** + * Returns format properties supported in the host + * @param device Host device + * @param format_type Type of image the buffer will use + * @param with_srgb True when the format can be sRGB when converted to another format (ASTC) + * @param pixel_format Guest pixel format to describe + */ +[[nodiscard]] FormatInfo SurfaceFormat(const Device& device, FormatType format_type, bool with_srgb, + PixelFormat pixel_format); VkShaderStageFlagBits ShaderStage(Tegra::Engines::ShaderType stage); -VkPrimitiveTopology PrimitiveTopology(const VKDevice& device, Maxwell::PrimitiveTopology topology); +VkPrimitiveTopology PrimitiveTopology(const Device& device, Maxwell::PrimitiveTopology topology); VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttribute::Size size); VkCompareOp ComparisonOp(Maxwell::ComparisonOp comparison); -VkIndexType IndexFormat(const VKDevice& device, Maxwell::IndexFormat index_format); +VkIndexType IndexFormat(Maxwell::IndexFormat index_format); VkStencilOp StencilOp(Maxwell::StencilOp stencil_op); @@ -55,10 +63,12 @@ VkBlendFactor BlendFactor(Maxwell::Blend::Factor factor); VkFrontFace FrontFace(Maxwell::FrontFace front_face); -VkCullModeFlags CullFace(Maxwell::CullFace cull_face); +VkCullModeFlagBits CullFace(Maxwell::CullFace cull_face); VkComponentSwizzle SwizzleSource(Tegra::Texture::SwizzleSource swizzle); VkViewportCoordinateSwizzleNV ViewportSwizzle(Maxwell::ViewportSwizzle swizzle); +VkSamplerReductionMode SamplerReduction(Tegra::Texture::SamplerReduction reduction); + } // namespace Vulkan::MaxwellToVK diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp index ae46e0444..1cc720ddd 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp +++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -12,8 +12,6 @@ #include <fmt/format.h> -#include "common/dynamic_library.h" -#include "common/file_util.h" #include "common/logging/log.h" #include "common/telemetry.h" #include "core/core.h" @@ -24,179 +22,27 @@ #include "video_core/gpu.h" #include "video_core/renderer_vulkan/renderer_vulkan.h" #include "video_core/renderer_vulkan/vk_blit_screen.h" -#include "video_core/renderer_vulkan/vk_device.h" -#include "video_core/renderer_vulkan/vk_memory_manager.h" +#include "video_core/renderer_vulkan/vk_master_semaphore.h" #include "video_core/renderer_vulkan/vk_rasterizer.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_state_tracker.h" #include "video_core/renderer_vulkan/vk_swapchain.h" -#include "video_core/renderer_vulkan/wrapper.h" - -// Include these late to avoid polluting previous headers -#ifdef _WIN32 -#include <windows.h> -// ensure include order -#include <vulkan/vulkan_win32.h> -#endif - -#if !defined(_WIN32) && !defined(__APPLE__) -#include <X11/Xlib.h> -#include <vulkan/vulkan_wayland.h> -#include <vulkan/vulkan_xlib.h> -#endif +#include "video_core/vulkan_common/vulkan_debug_callback.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_instance.h" +#include "video_core/vulkan_common/vulkan_library.h" +#include "video_core/vulkan_common/vulkan_memory_allocator.h" +#include "video_core/vulkan_common/vulkan_surface.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { - namespace { - -using Core::Frontend::WindowSystemType; - -VkBool32 DebugCallback(VkDebugUtilsMessageSeverityFlagBitsEXT severity, - VkDebugUtilsMessageTypeFlagsEXT type, - const VkDebugUtilsMessengerCallbackDataEXT* data, - [[maybe_unused]] void* user_data) { - const char* message{data->pMessage}; - - if (severity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT) { - LOG_CRITICAL(Render_Vulkan, "{}", message); - } else if (severity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT) { - LOG_WARNING(Render_Vulkan, "{}", message); - } else if (severity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT) { - LOG_INFO(Render_Vulkan, "{}", message); - } else if (severity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT) { - LOG_DEBUG(Render_Vulkan, "{}", message); - } - return VK_FALSE; -} - -Common::DynamicLibrary OpenVulkanLibrary() { - Common::DynamicLibrary library; -#ifdef __APPLE__ - // Check if a path to a specific Vulkan library has been specified. - char* libvulkan_env = getenv("LIBVULKAN_PATH"); - if (!libvulkan_env || !library.Open(libvulkan_env)) { - // Use the libvulkan.dylib from the application bundle. - const std::string filename = - Common::FS::GetBundleDirectory() + "/Contents/Frameworks/libvulkan.dylib"; - library.Open(filename.c_str()); - } -#else - std::string filename = Common::DynamicLibrary::GetVersionedFilename("vulkan", 1); - if (!library.Open(filename.c_str())) { - // Android devices may not have libvulkan.so.1, only libvulkan.so. - filename = Common::DynamicLibrary::GetVersionedFilename("vulkan"); - library.Open(filename.c_str()); - } -#endif - return library; -} - -vk::Instance CreateInstance(Common::DynamicLibrary& library, vk::InstanceDispatch& dld, - WindowSystemType window_type = WindowSystemType::Headless, - bool enable_layers = false) { - if (!library.IsOpen()) { - LOG_ERROR(Render_Vulkan, "Vulkan library not available"); - return {}; - } - if (!library.GetSymbol("vkGetInstanceProcAddr", &dld.vkGetInstanceProcAddr)) { - LOG_ERROR(Render_Vulkan, "vkGetInstanceProcAddr not present in Vulkan"); - return {}; - } - if (!vk::Load(dld)) { - LOG_ERROR(Render_Vulkan, "Failed to load Vulkan function pointers"); - return {}; - } - - std::vector<const char*> extensions; - extensions.reserve(6); - switch (window_type) { - case Core::Frontend::WindowSystemType::Headless: - break; -#ifdef _WIN32 - case Core::Frontend::WindowSystemType::Windows: - extensions.push_back(VK_KHR_WIN32_SURFACE_EXTENSION_NAME); - break; -#endif -#if !defined(_WIN32) && !defined(__APPLE__) - case Core::Frontend::WindowSystemType::X11: - extensions.push_back(VK_KHR_XLIB_SURFACE_EXTENSION_NAME); - break; - case Core::Frontend::WindowSystemType::Wayland: - extensions.push_back(VK_KHR_WAYLAND_SURFACE_EXTENSION_NAME); - break; -#endif - default: - LOG_ERROR(Render_Vulkan, "Presentation not supported on this platform"); - break; - } - if (window_type != Core::Frontend::WindowSystemType::Headless) { - extensions.push_back(VK_KHR_SURFACE_EXTENSION_NAME); - } - if (enable_layers) { - extensions.push_back(VK_EXT_DEBUG_UTILS_EXTENSION_NAME); - } - extensions.push_back(VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME); - - const std::optional properties = vk::EnumerateInstanceExtensionProperties(dld); - if (!properties) { - LOG_ERROR(Render_Vulkan, "Failed to query extension properties"); - return {}; - } - - for (const char* extension : extensions) { - const auto it = - std::find_if(properties->begin(), properties->end(), [extension](const auto& prop) { - return !std::strcmp(extension, prop.extensionName); - }); - if (it == properties->end()) { - LOG_ERROR(Render_Vulkan, "Required instance extension {} is not available", extension); - return {}; - } - } - - std::vector<const char*> layers; - layers.reserve(1); - if (enable_layers) { - layers.push_back("VK_LAYER_KHRONOS_validation"); - } - - const std::optional layer_properties = vk::EnumerateInstanceLayerProperties(dld); - if (!layer_properties) { - LOG_ERROR(Render_Vulkan, "Failed to query layer properties, disabling layers"); - layers.clear(); - } - - for (auto layer_it = layers.begin(); layer_it != layers.end();) { - const char* const layer = *layer_it; - const auto it = std::find_if( - layer_properties->begin(), layer_properties->end(), - [layer](const VkLayerProperties& prop) { return !std::strcmp(layer, prop.layerName); }); - if (it == layer_properties->end()) { - LOG_ERROR(Render_Vulkan, "Layer {} not available, removing it", layer); - layer_it = layers.erase(layer_it); - } else { - ++layer_it; - } - } - - vk::Instance instance = vk::Instance::Create(layers, extensions, dld); - if (!instance) { - LOG_ERROR(Render_Vulkan, "Failed to create Vulkan instance"); - return {}; - } - if (!vk::Load(*instance, dld)) { - LOG_ERROR(Render_Vulkan, "Failed to load Vulkan instance function pointers"); - } - return instance; -} - std::string GetReadableVersion(u32 version) { return fmt::format("{}.{}.{}", VK_VERSION_MAJOR(version), VK_VERSION_MINOR(version), VK_VERSION_PATCH(version)); } -std::string GetDriverVersion(const VKDevice& device) { +std::string GetDriverVersion(const Device& device) { // Extracted from // https://github.com/SaschaWillems/vulkan.gpuinfo.org/blob/5dddea46ea1120b0df14eef8f15ff8e318e35462/functions.php#L308-L314 const u32 version = device.GetDriverVersion(); @@ -213,7 +59,6 @@ std::string GetDriverVersion(const VKDevice& device) { const u32 minor = version & 0x3fff; return fmt::format("{}.{}", major, minor); } - return GetReadableVersion(version); } @@ -235,213 +80,98 @@ std::string BuildCommaSeparatedExtensions(std::vector<std::string> available_ext return separated_extensions; } +Device CreateDevice(const vk::Instance& instance, const vk::InstanceDispatch& dld, + VkSurfaceKHR surface) { + const std::vector<VkPhysicalDevice> devices = instance.EnumeratePhysicalDevices(); + const s32 device_index = Settings::values.vulkan_device.GetValue(); + if (device_index < 0 || device_index >= static_cast<s32>(devices.size())) { + LOG_ERROR(Render_Vulkan, "Invalid device index {}!", device_index); + throw vk::Exception(VK_ERROR_INITIALIZATION_FAILED); + } + const vk::PhysicalDevice physical_device(devices[device_index], dld); + return Device(*instance, physical_device, surface, dld); +} } // Anonymous namespace -RendererVulkan::RendererVulkan(Core::System& system_, Core::Frontend::EmuWindow& emu_window, - Tegra::GPU& gpu_, - std::unique_ptr<Core::Frontend::GraphicsContext> context) - : RendererBase{emu_window, std::move(context)}, system{system_}, gpu{gpu_} {} +RendererVulkan::RendererVulkan(Core::TelemetrySession& telemetry_session_, + Core::Frontend::EmuWindow& emu_window, + Core::Memory::Memory& cpu_memory_, Tegra::GPU& gpu_, + std::unique_ptr<Core::Frontend::GraphicsContext> context_) try + : RendererBase(emu_window, std::move(context_)), + telemetry_session(telemetry_session_), + cpu_memory(cpu_memory_), + gpu(gpu_), + library(OpenLibrary()), + instance(CreateInstance(library, dld, VK_API_VERSION_1_1, render_window.GetWindowInfo().type, + true, Settings::values.renderer_debug)), + debug_callback(Settings::values.renderer_debug ? CreateDebugCallback(instance) : nullptr), + surface(CreateSurface(instance, render_window)), + device(CreateDevice(instance, dld, *surface)), + memory_allocator(device, false), + state_tracker(gpu), + scheduler(device, state_tracker), + swapchain(*surface, device, scheduler, render_window.GetFramebufferLayout().width, + render_window.GetFramebufferLayout().height, false), + blit_screen(cpu_memory, render_window, device, memory_allocator, swapchain, scheduler, + screen_info), + rasterizer(render_window, gpu, gpu.MemoryManager(), cpu_memory, screen_info, device, + memory_allocator, state_tracker, scheduler) { + Report(); +} catch (const vk::Exception& exception) { + LOG_ERROR(Render_Vulkan, "Vulkan initialization failed with error: {}", exception.what()); + throw std::runtime_error{fmt::format("Vulkan initialization error {}", exception.what())}; +} RendererVulkan::~RendererVulkan() { - ShutDown(); + void(device.GetLogical().WaitIdle()); } void RendererVulkan::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { - render_window.PollEvents(); - if (!framebuffer) { return; } - const auto& layout = render_window.GetFramebufferLayout(); if (layout.width > 0 && layout.height > 0 && render_window.IsShown()) { const VAddr framebuffer_addr = framebuffer->address + framebuffer->offset; const bool use_accelerated = - rasterizer->AccelerateDisplay(*framebuffer, framebuffer_addr, framebuffer->stride); + rasterizer.AccelerateDisplay(*framebuffer, framebuffer_addr, framebuffer->stride); const bool is_srgb = use_accelerated && screen_info.is_srgb; - if (swapchain->HasFramebufferChanged(layout) || swapchain->GetSrgbState() != is_srgb) { - swapchain->Create(layout.width, layout.height, is_srgb); - blit_screen->Recreate(); - } - - scheduler->WaitWorker(); - - swapchain->AcquireNextImage(); - const auto [fence, render_semaphore] = blit_screen->Draw(*framebuffer, use_accelerated); - - scheduler->Flush(false, render_semaphore); - - if (swapchain->Present(render_semaphore, fence)) { - blit_screen->Recreate(); + if (swapchain.HasFramebufferChanged(layout) || swapchain.GetSrgbState() != is_srgb) { + swapchain.Create(layout.width, layout.height, is_srgb); + blit_screen.Recreate(); } - rasterizer->TickFrame(); - } - - render_window.PollEvents(); -} - -bool RendererVulkan::TryPresent(int /*timeout_ms*/) { - // TODO (bunnei): ImplementMe - return true; -} - -bool RendererVulkan::Init() { - library = OpenVulkanLibrary(); - instance = CreateInstance(library, dld, render_window.GetWindowInfo().type, - Settings::values.renderer_debug); - if (!instance || !CreateDebugCallback() || !CreateSurface() || !PickDevices()) { - return false; - } - - Report(); - - memory_manager = std::make_unique<VKMemoryManager>(*device); - - resource_manager = std::make_unique<VKResourceManager>(*device); - - const auto& framebuffer = render_window.GetFramebufferLayout(); - swapchain = std::make_unique<VKSwapchain>(*surface, *device); - swapchain->Create(framebuffer.width, framebuffer.height, false); - - state_tracker = std::make_unique<StateTracker>(system); - - scheduler = std::make_unique<VKScheduler>(*device, *resource_manager, *state_tracker); - - rasterizer = std::make_unique<RasterizerVulkan>(system, render_window, screen_info, *device, - *resource_manager, *memory_manager, - *state_tracker, *scheduler); - - blit_screen = std::make_unique<VKBlitScreen>(system, render_window, *rasterizer, *device, - *resource_manager, *memory_manager, *swapchain, - *scheduler, screen_info); - - return true; -} - -void RendererVulkan::ShutDown() { - if (!device) { - return; - } - if (const auto& dev = device->GetLogical()) { - dev.WaitIdle(); - } - - rasterizer.reset(); - blit_screen.reset(); - scheduler.reset(); - swapchain.reset(); - memory_manager.reset(); - resource_manager.reset(); - device.reset(); -} + scheduler.WaitWorker(); -bool RendererVulkan::CreateDebugCallback() { - if (!Settings::values.renderer_debug) { - return true; - } - debug_callback = instance.TryCreateDebugCallback(DebugCallback); - if (!debug_callback) { - LOG_ERROR(Render_Vulkan, "Failed to create debug callback"); - return false; - } - return true; -} + swapchain.AcquireNextImage(); + const VkSemaphore render_semaphore = blit_screen.Draw(*framebuffer, use_accelerated); -bool RendererVulkan::CreateSurface() { - [[maybe_unused]] const auto& window_info = render_window.GetWindowInfo(); - VkSurfaceKHR unsafe_surface = nullptr; + scheduler.Flush(render_semaphore); -#ifdef _WIN32 - if (window_info.type == Core::Frontend::WindowSystemType::Windows) { - const HWND hWnd = static_cast<HWND>(window_info.render_surface); - const VkWin32SurfaceCreateInfoKHR win32_ci{VK_STRUCTURE_TYPE_WIN32_SURFACE_CREATE_INFO_KHR, - nullptr, 0, nullptr, hWnd}; - const auto vkCreateWin32SurfaceKHR = reinterpret_cast<PFN_vkCreateWin32SurfaceKHR>( - dld.vkGetInstanceProcAddr(*instance, "vkCreateWin32SurfaceKHR")); - if (!vkCreateWin32SurfaceKHR || - vkCreateWin32SurfaceKHR(*instance, &win32_ci, nullptr, &unsafe_surface) != VK_SUCCESS) { - LOG_ERROR(Render_Vulkan, "Failed to initialize Win32 surface"); - return false; + if (swapchain.Present(render_semaphore)) { + blit_screen.Recreate(); } - } -#endif -#if !defined(_WIN32) && !defined(__APPLE__) - if (window_info.type == Core::Frontend::WindowSystemType::X11) { - const VkXlibSurfaceCreateInfoKHR xlib_ci{ - VK_STRUCTURE_TYPE_XLIB_SURFACE_CREATE_INFO_KHR, nullptr, 0, - static_cast<Display*>(window_info.display_connection), - reinterpret_cast<Window>(window_info.render_surface)}; - const auto vkCreateXlibSurfaceKHR = reinterpret_cast<PFN_vkCreateXlibSurfaceKHR>( - dld.vkGetInstanceProcAddr(*instance, "vkCreateXlibSurfaceKHR")); - if (!vkCreateXlibSurfaceKHR || - vkCreateXlibSurfaceKHR(*instance, &xlib_ci, nullptr, &unsafe_surface) != VK_SUCCESS) { - LOG_ERROR(Render_Vulkan, "Failed to initialize Xlib surface"); - return false; - } - } - if (window_info.type == Core::Frontend::WindowSystemType::Wayland) { - const VkWaylandSurfaceCreateInfoKHR wayland_ci{ - VK_STRUCTURE_TYPE_WAYLAND_SURFACE_CREATE_INFO_KHR, nullptr, 0, - static_cast<wl_display*>(window_info.display_connection), - static_cast<wl_surface*>(window_info.render_surface)}; - const auto vkCreateWaylandSurfaceKHR = reinterpret_cast<PFN_vkCreateWaylandSurfaceKHR>( - dld.vkGetInstanceProcAddr(*instance, "vkCreateWaylandSurfaceKHR")); - if (!vkCreateWaylandSurfaceKHR || - vkCreateWaylandSurfaceKHR(*instance, &wayland_ci, nullptr, &unsafe_surface) != - VK_SUCCESS) { - LOG_ERROR(Render_Vulkan, "Failed to initialize Wayland surface"); - return false; - } - } -#endif - if (!unsafe_surface) { - LOG_ERROR(Render_Vulkan, "Presentation not supported on this platform"); - return false; + rasterizer.TickFrame(); } - surface = vk::SurfaceKHR(unsafe_surface, *instance, dld); - return true; -} - -bool RendererVulkan::PickDevices() { - const auto devices = instance.EnumeratePhysicalDevices(); - if (!devices) { - LOG_ERROR(Render_Vulkan, "Failed to enumerate physical devices"); - return false; - } - - const s32 device_index = Settings::values.vulkan_device.GetValue(); - if (device_index < 0 || device_index >= static_cast<s32>(devices->size())) { - LOG_ERROR(Render_Vulkan, "Invalid device index {}!", device_index); - return false; - } - const vk::PhysicalDevice physical_device((*devices)[static_cast<std::size_t>(device_index)], - dld); - if (!VKDevice::IsSuitable(physical_device, *surface)) { - return false; - } - - device = std::make_unique<VKDevice>(*instance, physical_device, *surface, dld); - return device->Create(); + render_window.OnFrameDisplayed(); } void RendererVulkan::Report() const { - const std::string vendor_name{device->GetVendorName()}; - const std::string model_name{device->GetModelName()}; - const std::string driver_version = GetDriverVersion(*device); + const std::string vendor_name{device.GetVendorName()}; + const std::string model_name{device.GetModelName()}; + const std::string driver_version = GetDriverVersion(device); const std::string driver_name = fmt::format("{} {}", vendor_name, driver_version); - const std::string api_version = GetReadableVersion(device->GetApiVersion()); + const std::string api_version = GetReadableVersion(device.ApiVersion()); - const std::string extensions = BuildCommaSeparatedExtensions(device->GetAvailableExtensions()); + const std::string extensions = BuildCommaSeparatedExtensions(device.GetAvailableExtensions()); LOG_INFO(Render_Vulkan, "Driver: {}", driver_name); LOG_INFO(Render_Vulkan, "Device: {}", model_name); LOG_INFO(Render_Vulkan, "Vulkan: {}", api_version); - auto& telemetry_session = system.TelemetrySession(); - constexpr auto field = Common::Telemetry::FieldType::UserSystem; + static constexpr auto field = Common::Telemetry::FieldType::UserSystem; telemetry_session.AddField(field, "GPU_Vendor", vendor_name); telemetry_session.AddField(field, "GPU_Model", model_name); telemetry_session.AddField(field, "GPU_Vulkan_Driver", driver_name); @@ -449,25 +179,4 @@ void RendererVulkan::Report() const { telemetry_session.AddField(field, "GPU_Vulkan_Extensions", extensions); } -std::vector<std::string> RendererVulkan::EnumerateDevices() { - vk::InstanceDispatch dld; - Common::DynamicLibrary library = OpenVulkanLibrary(); - vk::Instance instance = CreateInstance(library, dld); - if (!instance) { - return {}; - } - - const std::optional physical_devices = instance.EnumeratePhysicalDevices(); - if (!physical_devices) { - return {}; - } - - std::vector<std::string> names; - names.reserve(physical_devices->size()); - for (const auto& device : *physical_devices) { - names.push_back(vk::PhysicalDevice(device, dld).GetProperties().deviceName); - } - return names; -} - } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.h b/src/video_core/renderer_vulkan/renderer_vulkan.h index 13debbbc0..72071316c 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.h +++ b/src/video_core/renderer_vulkan/renderer_vulkan.h @@ -9,75 +9,67 @@ #include <vector> #include "common/dynamic_library.h" - #include "video_core/renderer_base.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/renderer_vulkan/vk_blit_screen.h" +#include "video_core/renderer_vulkan/vk_rasterizer.h" +#include "video_core/renderer_vulkan/vk_scheduler.h" +#include "video_core/renderer_vulkan/vk_state_tracker.h" +#include "video_core/renderer_vulkan/vk_swapchain.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_memory_allocator.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Core { -class System; +class TelemetrySession; } -namespace Vulkan { +namespace Core::Memory { +class Memory; +} -class StateTracker; -class VKBlitScreen; -class VKDevice; -class VKFence; -class VKMemoryManager; -class VKResourceManager; -class VKSwapchain; -class VKScheduler; -class VKImage; - -struct VKScreenInfo { - VKImage* image{}; - u32 width{}; - u32 height{}; - bool is_srgb{}; -}; +namespace Tegra { +class GPU; +} + +namespace Vulkan { class RendererVulkan final : public VideoCore::RendererBase { public: - explicit RendererVulkan(Core::System& system, Core::Frontend::EmuWindow& emu_window, - Tegra::GPU& gpu, - std::unique_ptr<Core::Frontend::GraphicsContext> context); + explicit RendererVulkan(Core::TelemetrySession& telemtry_session, + Core::Frontend::EmuWindow& emu_window, + Core::Memory::Memory& cpu_memory_, Tegra::GPU& gpu_, + std::unique_ptr<Core::Frontend::GraphicsContext> context_); ~RendererVulkan() override; - bool Init() override; - void ShutDown() override; void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override; - bool TryPresent(int timeout_ms) override; - static std::vector<std::string> EnumerateDevices(); + VideoCore::RasterizerInterface* ReadRasterizer() override { + return &rasterizer; + } private: - bool CreateDebugCallback(); - - bool CreateSurface(); - - bool PickDevices(); - void Report() const; - Core::System& system; + Core::TelemetrySession& telemetry_session; + Core::Memory::Memory& cpu_memory; Tegra::GPU& gpu; Common::DynamicLibrary library; vk::InstanceDispatch dld; vk::Instance instance; + vk::DebugUtilsMessenger debug_callback; vk::SurfaceKHR surface; VKScreenInfo screen_info; - vk::DebugCallback debug_callback; - std::unique_ptr<VKDevice> device; - std::unique_ptr<VKSwapchain> swapchain; - std::unique_ptr<VKMemoryManager> memory_manager; - std::unique_ptr<VKResourceManager> resource_manager; - std::unique_ptr<StateTracker> state_tracker; - std::unique_ptr<VKScheduler> scheduler; - std::unique_ptr<VKBlitScreen> blit_screen; + Device device; + MemoryAllocator memory_allocator; + StateTracker state_tracker; + VKScheduler scheduler; + VKSwapchain swapchain; + VKBlitScreen blit_screen; + RasterizerVulkan rasterizer; }; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/shaders/quad_array.comp b/src/video_core/renderer_vulkan/shaders/quad_array.comp deleted file mode 100644 index 5a5703308..000000000 --- a/src/video_core/renderer_vulkan/shaders/quad_array.comp +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -/* - * Build instructions: - * $ glslangValidator -V $THIS_FILE -o output.spv - * $ spirv-opt -O --strip-debug output.spv -o optimized.spv - * $ xxd -i optimized.spv - * - * Then copy that bytecode to the C++ file - */ - -#version 460 core - -layout (local_size_x = 1024) in; - -layout (std430, set = 0, binding = 0) buffer OutputBuffer { - uint output_indexes[]; -}; - -layout (push_constant) uniform PushConstants { - uint first; -}; - -void main() { - uint primitive = gl_GlobalInvocationID.x; - if (primitive * 6 >= output_indexes.length()) { - return; - } - - const uint quad_map[6] = uint[](0, 1, 2, 0, 2, 3); - for (uint vertex = 0; vertex < 6; ++vertex) { - uint index = first + primitive * 4 + quad_map[vertex]; - output_indexes[primitive * 6 + vertex] = index; - } -} diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp index a551e3de8..a1a32aabe 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp +++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp @@ -12,127 +12,28 @@ #include "common/assert.h" #include "common/common_types.h" #include "common/math_util.h" - #include "core/core.h" #include "core/frontend/emu_window.h" #include "core/memory.h" - #include "video_core/gpu.h" -#include "video_core/morton.h" -#include "video_core/rasterizer_interface.h" +#include "video_core/host_shaders/vulkan_present_frag_spv.h" +#include "video_core/host_shaders/vulkan_present_vert_spv.h" #include "video_core/renderer_vulkan/renderer_vulkan.h" #include "video_core/renderer_vulkan/vk_blit_screen.h" -#include "video_core/renderer_vulkan/vk_device.h" -#include "video_core/renderer_vulkan/vk_image.h" -#include "video_core/renderer_vulkan/vk_memory_manager.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" +#include "video_core/renderer_vulkan/vk_master_semaphore.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_shader_util.h" #include "video_core/renderer_vulkan/vk_swapchain.h" -#include "video_core/renderer_vulkan/wrapper.h" #include "video_core/surface.h" +#include "video_core/textures/decoders.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_memory_allocator.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { namespace { -// Generated from the "shaders/" directory, read the instructions there. -constexpr u8 blit_vertex_code[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x07, 0x00, 0x08, 0x00, 0x27, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, - 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x0f, 0x00, 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, - 0x00, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, - 0x25, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x0b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x0b, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, - 0x0b, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x48, 0x00, 0x05, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x48, 0x00, 0x04, 0x00, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, - 0x48, 0x00, 0x05, 0x00, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x07, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x11, 0x00, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x13, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x13, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x24, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x25, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x16, 0x00, 0x03, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x20, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x08, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x08, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x04, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x09, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x06, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, - 0x0c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, - 0x0c, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, - 0x0e, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, - 0x0e, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x04, 0x00, - 0x10, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, - 0x11, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x12, 0x00, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x12, 0x00, 0x00, 0x00, - 0x13, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x1c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x3f, 0x20, 0x00, 0x04, 0x00, 0x21, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x24, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x25, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x36, 0x00, 0x05, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, - 0x05, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x14, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, - 0x13, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x10, 0x00, 0x00, 0x00, - 0x16, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, - 0x1a, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x1d, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x50, 0x00, 0x07, 0x00, 0x07, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x91, 0x00, 0x05, 0x00, - 0x07, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x05, 0x00, 0x21, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, - 0x0f, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, 0x22, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, - 0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, - 0x3e, 0x00, 0x03, 0x00, 0x24, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, - 0x38, 0x00, 0x01, 0x00}; - -constexpr u8 blit_fragment_code[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x07, 0x00, 0x08, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, - 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x0f, 0x00, 0x07, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, - 0x00, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x10, 0x00, 0x03, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x0d, 0x00, 0x00, 0x00, - 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x0d, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x11, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x16, 0x00, 0x03, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x08, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x08, 0x00, 0x00, 0x00, - 0x09, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x19, 0x00, 0x09, 0x00, 0x0a, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x03, 0x00, - 0x0b, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00, 0x00, - 0x0d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x0f, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x10, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x10, 0x00, 0x00, 0x00, - 0x11, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x36, 0x00, 0x05, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, - 0x05, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, - 0x0d, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, - 0x11, 0x00, 0x00, 0x00, 0x57, 0x00, 0x05, 0x00, 0x07, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, - 0x0e, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, 0x09, 0x00, 0x00, 0x00, - 0x13, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00}; - struct ScreenRectVertex { ScreenRectVertex() = default; explicit ScreenRectVertex(f32 x, f32 y, f32 u, f32 v) : position{{x, y}}, tex_coord{{u, v}} {} @@ -175,9 +76,9 @@ constexpr std::array<f32, 4 * 4> MakeOrthographicMatrix(f32 width, f32 height) { // clang-format on } -std::size_t GetBytesPerPixel(const Tegra::FramebufferConfig& framebuffer) { +u32 GetBytesPerPixel(const Tegra::FramebufferConfig& framebuffer) { using namespace VideoCore::Surface; - return GetBytesPerPixel(PixelFormatFromGPUPixelFormat(framebuffer.pixel_format)); + return BytesPerBlock(PixelFormatFromGPUPixelFormat(framebuffer.pixel_format)); } std::size_t GetSizeInBytes(const Tegra::FramebufferConfig& framebuffer) { @@ -210,17 +111,14 @@ struct VKBlitScreen::BufferData { // Unaligned image data goes here }; -VKBlitScreen::VKBlitScreen(Core::System& system, Core::Frontend::EmuWindow& render_window, - VideoCore::RasterizerInterface& rasterizer, const VKDevice& device, - VKResourceManager& resource_manager, VKMemoryManager& memory_manager, - VKSwapchain& swapchain, VKScheduler& scheduler, - const VKScreenInfo& screen_info) - : system{system}, render_window{render_window}, rasterizer{rasterizer}, device{device}, - resource_manager{resource_manager}, memory_manager{memory_manager}, swapchain{swapchain}, - scheduler{scheduler}, image_count{swapchain.GetImageCount()}, screen_info{screen_info} { - watches.resize(image_count); - std::generate(watches.begin(), watches.end(), - []() { return std::make_unique<VKFenceWatch>(); }); +VKBlitScreen::VKBlitScreen(Core::Memory::Memory& cpu_memory_, + Core::Frontend::EmuWindow& render_window_, const Device& device_, + MemoryAllocator& memory_allocator_, VKSwapchain& swapchain_, + VKScheduler& scheduler_, const VKScreenInfo& screen_info_) + : cpu_memory{cpu_memory_}, render_window{render_window_}, device{device_}, + memory_allocator{memory_allocator_}, swapchain{swapchain_}, scheduler{scheduler_}, + image_count{swapchain.GetImageCount()}, screen_info{screen_info_} { + resource_ticks.resize(image_count); CreateStaticResources(); CreateDynamicResources(); @@ -232,44 +130,40 @@ void VKBlitScreen::Recreate() { CreateDynamicResources(); } -std::tuple<VKFence&, VkSemaphore> VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, - bool use_accelerated) { +VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, bool use_accelerated) { RefreshResources(framebuffer); // Finish any pending renderpass scheduler.RequestOutsideRenderPassOperationContext(); const std::size_t image_index = swapchain.GetImageIndex(); - watches[image_index]->Watch(scheduler.GetFence()); - VKImage* blit_image = use_accelerated ? screen_info.image : raw_images[image_index].get(); + scheduler.Wait(resource_ticks[image_index]); + resource_ticks[image_index] = scheduler.CurrentTick(); - UpdateDescriptorSet(image_index, blit_image->GetPresentView()); + UpdateDescriptorSet(image_index, + use_accelerated ? screen_info.image_view : *raw_image_views[image_index]); BufferData data; SetUniformData(data, framebuffer); SetVertexData(data, framebuffer); - auto map = buffer_commit->Map(); - std::memcpy(map.GetAddress(), &data, sizeof(data)); + const std::span<u8> mapped_span = buffer_commit.Map(); + std::memcpy(mapped_span.data(), &data, sizeof(data)); if (!use_accelerated) { const u64 image_offset = GetRawImageOffset(framebuffer, image_index); - const auto pixel_format = - VideoCore::Surface::PixelFormatFromGPUPixelFormat(framebuffer.pixel_format); const VAddr framebuffer_addr = framebuffer.address + framebuffer.offset; - const auto host_ptr = system.Memory().GetPointer(framebuffer_addr); - rasterizer.FlushRegion(ToCacheAddr(host_ptr), GetSizeInBytes(framebuffer)); + const u8* const host_ptr = cpu_memory.GetPointer(framebuffer_addr); + const size_t size_bytes = GetSizeInBytes(framebuffer); // TODO(Rodrigo): Read this from HLE constexpr u32 block_height_log2 = 4; - VideoCore::MortonSwizzle(VideoCore::MortonSwizzleMode::MortonToLinear, pixel_format, - framebuffer.stride, block_height_log2, framebuffer.height, 0, 1, 1, - map.GetAddress() + image_offset, host_ptr); - - blit_image->Transition(0, 1, 0, 1, VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_ACCESS_TRANSFER_WRITE_BIT, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); + const u32 bytes_per_pixel = GetBytesPerPixel(framebuffer); + Tegra::Texture::UnswizzleTexture( + mapped_span.subspan(image_offset, size_bytes), std::span(host_ptr, size_bytes), + bytes_per_pixel, framebuffer.width, framebuffer.height, 1, block_height_log2, 0); const VkBufferImageCopy copy{ .bufferOffset = image_offset, @@ -291,15 +185,42 @@ std::tuple<VKFence&, VkSemaphore> VKBlitScreen::Draw(const Tegra::FramebufferCon }, }; scheduler.Record( - [buffer = *buffer, image = *blit_image->GetHandle(), copy](vk::CommandBuffer cmdbuf) { - cmdbuf.CopyBufferToImage(buffer, image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, copy); + [buffer = *buffer, image = *raw_images[image_index], copy](vk::CommandBuffer cmdbuf) { + const VkImageMemoryBarrier base_barrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = 0, + .dstAccessMask = 0, + .oldLayout = VK_IMAGE_LAYOUT_GENERAL, + .newLayout = VK_IMAGE_LAYOUT_GENERAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = image, + .subresourceRange = + { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .baseMipLevel = 0, + .levelCount = 1, + .baseArrayLayer = 0, + .layerCount = 1, + }, + }; + VkImageMemoryBarrier read_barrier = base_barrier; + read_barrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT; + read_barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + read_barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; + + VkImageMemoryBarrier write_barrier = base_barrier; + write_barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + write_barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, + 0, read_barrier); + cmdbuf.CopyBufferToImage(buffer, image, VK_IMAGE_LAYOUT_GENERAL, copy); + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, 0, write_barrier); }); } - map.Release(); - - blit_image->Transition(0, 1, 0, 1, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, - VK_ACCESS_SHADER_READ_BIT, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); - scheduler.Record([renderpass = *renderpass, framebuffer = *framebuffers[image_index], descriptor_set = descriptor_sets[image_index], buffer = *buffer, size = swapchain.GetSize(), pipeline = *pipeline, @@ -307,31 +228,31 @@ std::tuple<VKFence&, VkSemaphore> VKBlitScreen::Draw(const Tegra::FramebufferCon const VkClearValue clear_color{ .color = {.float32 = {0.0f, 0.0f, 0.0f, 0.0f}}, }; - - VkRenderPassBeginInfo renderpass_bi; - renderpass_bi.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO; - renderpass_bi.pNext = nullptr; - renderpass_bi.renderPass = renderpass; - renderpass_bi.framebuffer = framebuffer; - renderpass_bi.renderArea.offset.x = 0; - renderpass_bi.renderArea.offset.y = 0; - renderpass_bi.renderArea.extent = size; - renderpass_bi.clearValueCount = 1; - renderpass_bi.pClearValues = &clear_color; - - VkViewport viewport; - viewport.x = 0.0f; - viewport.y = 0.0f; - viewport.width = static_cast<float>(size.width); - viewport.height = static_cast<float>(size.height); - viewport.minDepth = 0.0f; - viewport.maxDepth = 1.0f; - - VkRect2D scissor; - scissor.offset.x = 0; - scissor.offset.y = 0; - scissor.extent = size; - + const VkRenderPassBeginInfo renderpass_bi{ + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, + .pNext = nullptr, + .renderPass = renderpass, + .framebuffer = framebuffer, + .renderArea = + { + .offset = {0, 0}, + .extent = size, + }, + .clearValueCount = 1, + .pClearValues = &clear_color, + }; + const VkViewport viewport{ + .x = 0.0f, + .y = 0.0f, + .width = static_cast<float>(size.width), + .height = static_cast<float>(size.height), + .minDepth = 0.0f, + .maxDepth = 1.0f, + }; + const VkRect2D scissor{ + .offset = {0, 0}, + .extent = size, + }; cmdbuf.BeginRenderPass(renderpass_bi, VK_SUBPASS_CONTENTS_INLINE); cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline); cmdbuf.SetViewport(0, viewport); @@ -342,8 +263,7 @@ std::tuple<VKFence&, VkSemaphore> VKBlitScreen::Draw(const Tegra::FramebufferCon cmdbuf.Draw(4, 1, 0, 0); cmdbuf.EndRenderPass(); }); - - return {scheduler.GetFence(), *semaphores[image_index]}; + return *semaphores[image_index]; } void VKBlitScreen::CreateStaticResources() { @@ -375,8 +295,8 @@ void VKBlitScreen::RefreshResources(const Tegra::FramebufferConfig& framebuffer) } void VKBlitScreen::CreateShaders() { - vertex_shader = BuildShader(device, sizeof(blit_vertex_code), blit_vertex_code); - fragment_shader = BuildShader(device, sizeof(blit_fragment_code), blit_fragment_code); + vertex_shader = BuildShader(device, VULKAN_PRESENT_VERT_SPV); + fragment_shader = BuildShader(device, VULKAN_PRESENT_FRAG_SPV); } void VKBlitScreen::CreateSemaphores() { @@ -423,7 +343,7 @@ void VKBlitScreen::CreateRenderPass() { const VkAttachmentReference color_attachment_ref{ .attachment = 0, - .layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + .layout = VK_IMAGE_LAYOUT_GENERAL, }; const VkSubpassDescription subpass_description{ @@ -711,12 +631,12 @@ void VKBlitScreen::CreateFramebuffers() { void VKBlitScreen::ReleaseRawImages() { for (std::size_t i = 0; i < raw_images.size(); ++i) { - watches[i]->Wait(); + scheduler.Wait(resource_ticks.at(i)); } raw_images.clear(); raw_buffer_commits.clear(); buffer.reset(); - buffer_commit.reset(); + buffer_commit = MemoryCommit{}; } void VKBlitScreen::CreateStagingBuffer(const Tegra::FramebufferConfig& framebuffer) { @@ -733,39 +653,61 @@ void VKBlitScreen::CreateStagingBuffer(const Tegra::FramebufferConfig& framebuff }; buffer = device.GetLogical().CreateBuffer(ci); - buffer_commit = memory_manager.Commit(buffer, true); + buffer_commit = memory_allocator.Commit(buffer, MemoryUsage::Upload); } void VKBlitScreen::CreateRawImages(const Tegra::FramebufferConfig& framebuffer) { raw_images.resize(image_count); + raw_image_views.resize(image_count); raw_buffer_commits.resize(image_count); - const VkImageCreateInfo ci{ - .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, - .pNext = nullptr, - .flags = 0, - .imageType = VK_IMAGE_TYPE_2D, - .format = GetFormat(framebuffer), - .extent = - { - .width = framebuffer.width, - .height = framebuffer.height, - .depth = 1, - }, - .mipLevels = 1, - .arrayLayers = 1, - .samples = VK_SAMPLE_COUNT_1_BIT, - .tiling = VK_IMAGE_TILING_LINEAR, - .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT, - .sharingMode = VK_SHARING_MODE_EXCLUSIVE, - .queueFamilyIndexCount = 0, - .pQueueFamilyIndices = nullptr, - .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, - }; - - for (std::size_t i = 0; i < image_count; ++i) { - raw_images[i] = std::make_unique<VKImage>(device, scheduler, ci, VK_IMAGE_ASPECT_COLOR_BIT); - raw_buffer_commits[i] = memory_manager.Commit(raw_images[i]->GetHandle(), false); + for (size_t i = 0; i < image_count; ++i) { + raw_images[i] = device.GetLogical().CreateImage(VkImageCreateInfo{ + .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .imageType = VK_IMAGE_TYPE_2D, + .format = GetFormat(framebuffer), + .extent = + { + .width = framebuffer.width, + .height = framebuffer.height, + .depth = 1, + }, + .mipLevels = 1, + .arrayLayers = 1, + .samples = VK_SAMPLE_COUNT_1_BIT, + .tiling = VK_IMAGE_TILING_LINEAR, + .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, + }); + raw_buffer_commits[i] = memory_allocator.Commit(raw_images[i], MemoryUsage::DeviceLocal); + raw_image_views[i] = device.GetLogical().CreateImageView(VkImageViewCreateInfo{ + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .image = *raw_images[i], + .viewType = VK_IMAGE_VIEW_TYPE_2D, + .format = GetFormat(framebuffer), + .components = + { + .r = VK_COMPONENT_SWIZZLE_IDENTITY, + .g = VK_COMPONENT_SWIZZLE_IDENTITY, + .b = VK_COMPONENT_SWIZZLE_IDENTITY, + .a = VK_COMPONENT_SWIZZLE_IDENTITY, + }, + .subresourceRange = + { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .baseMipLevel = 0, + .levelCount = 1, + .baseArrayLayer = 0, + .layerCount = 1, + }, + }); } } @@ -792,7 +734,7 @@ void VKBlitScreen::UpdateDescriptorSet(std::size_t image_index, VkImageView imag const VkDescriptorImageInfo image_info{ .sampler = *sampler, .imageView = image_view, - .imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, }; const VkWriteDescriptorSet sampler_write{ diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.h b/src/video_core/renderer_vulkan/vk_blit_screen.h index 243640fab..5e3177685 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.h +++ b/src/video_core/renderer_vulkan/vk_blit_screen.h @@ -5,16 +5,18 @@ #pragma once #include <memory> -#include <tuple> -#include "video_core/renderer_vulkan/vk_memory_manager.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_memory_allocator.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Core { class System; } +namespace Core::Memory { +class Memory; +} + namespace Core::Frontend { class EmuWindow; } @@ -30,26 +32,31 @@ class RasterizerInterface; namespace Vulkan { struct ScreenInfo; + +class Device; class RasterizerVulkan; -class VKDevice; -class VKFence; -class VKImage; class VKScheduler; class VKSwapchain; -class VKBlitScreen final { +struct VKScreenInfo { + VkImageView image_view{}; + u32 width{}; + u32 height{}; + bool is_srgb{}; +}; + +class VKBlitScreen { public: - explicit VKBlitScreen(Core::System& system, Core::Frontend::EmuWindow& render_window, - VideoCore::RasterizerInterface& rasterizer, const VKDevice& device, - VKResourceManager& resource_manager, VKMemoryManager& memory_manager, - VKSwapchain& swapchain, VKScheduler& scheduler, - const VKScreenInfo& screen_info); + explicit VKBlitScreen(Core::Memory::Memory& cpu_memory, + Core::Frontend::EmuWindow& render_window, const Device& device, + MemoryAllocator& memory_manager, VKSwapchain& swapchain, + VKScheduler& scheduler, const VKScreenInfo& screen_info); ~VKBlitScreen(); void Recreate(); - std::tuple<VKFence&, VkSemaphore> Draw(const Tegra::FramebufferConfig& framebuffer, - bool use_accelerated); + [[nodiscard]] VkSemaphore Draw(const Tegra::FramebufferConfig& framebuffer, + bool use_accelerated); private: struct BufferData; @@ -81,12 +88,10 @@ private: u64 GetRawImageOffset(const Tegra::FramebufferConfig& framebuffer, std::size_t image_index) const; - Core::System& system; + Core::Memory::Memory& cpu_memory; Core::Frontend::EmuWindow& render_window; - VideoCore::RasterizerInterface& rasterizer; - const VKDevice& device; - VKResourceManager& resource_manager; - VKMemoryManager& memory_manager; + const Device& device; + MemoryAllocator& memory_allocator; VKSwapchain& swapchain; VKScheduler& scheduler; const std::size_t image_count; @@ -104,13 +109,14 @@ private: vk::Sampler sampler; vk::Buffer buffer; - VKMemoryCommit buffer_commit; + MemoryCommit buffer_commit; - std::vector<std::unique_ptr<VKFenceWatch>> watches; + std::vector<u64> resource_ticks; std::vector<vk::Semaphore> semaphores; - std::vector<std::unique_ptr<VKImage>> raw_images; - std::vector<VKMemoryCommit> raw_buffer_commits; + std::vector<vk::Image> raw_images; + std::vector<vk::ImageView> raw_image_views; + std::vector<MemoryCommit> raw_buffer_commits; u32 raw_width = 0; u32 raw_height = 0; }; diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index 1d2f8b557..848eedd66 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -3,172 +3,308 @@ // Refer to the license.txt file included. #include <algorithm> +#include <array> #include <cstring> -#include <memory> +#include <span> +#include <vector> -#include "core/core.h" #include "video_core/buffer_cache/buffer_cache.h" +#include "video_core/renderer_vulkan/maxwell_to_vk.h" #include "video_core/renderer_vulkan/vk_buffer_cache.h" -#include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_scheduler.h" -#include "video_core/renderer_vulkan/vk_stream_buffer.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" +#include "video_core/renderer_vulkan/vk_update_descriptor.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_memory_allocator.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { - namespace { +VkBufferCopy MakeBufferCopy(const VideoCommon::BufferCopy& copy) { + return VkBufferCopy{ + .srcOffset = copy.src_offset, + .dstOffset = copy.dst_offset, + .size = copy.size, + }; +} -constexpr VkBufferUsageFlags BUFFER_USAGE = - VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT | - VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; - -constexpr VkPipelineStageFlags UPLOAD_PIPELINE_STAGE = - VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_VERTEX_INPUT_BIT | - VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; - -constexpr VkAccessFlags UPLOAD_ACCESS_BARRIERS = - VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_UNIFORM_READ_BIT | - VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | VK_ACCESS_INDEX_READ_BIT; +VkIndexType IndexTypeFromNumElements(const Device& device, u32 num_elements) { + if (num_elements <= 0xff && device.IsExtIndexTypeUint8Supported()) { + return VK_INDEX_TYPE_UINT8_EXT; + } + if (num_elements <= 0xffff) { + return VK_INDEX_TYPE_UINT16; + } + return VK_INDEX_TYPE_UINT32; +} -std::unique_ptr<VKStreamBuffer> CreateStreamBuffer(const VKDevice& device, VKScheduler& scheduler) { - return std::make_unique<VKStreamBuffer>(device, scheduler, BUFFER_USAGE); +size_t BytesPerIndex(VkIndexType index_type) { + switch (index_type) { + case VK_INDEX_TYPE_UINT8_EXT: + return 1; + case VK_INDEX_TYPE_UINT16: + return 2; + case VK_INDEX_TYPE_UINT32: + return 4; + default: + UNREACHABLE_MSG("Invalid index type={}", index_type); + return 1; + } } +template <typename T> +std::array<T, 6> MakeQuadIndices(u32 quad, u32 first) { + std::array<T, 6> indices{0, 1, 2, 0, 2, 3}; + std::ranges::transform(indices, indices.begin(), + [quad, first](u32 index) { return first + index + quad * 4; }); + return indices; +} } // Anonymous namespace -Buffer::Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VKScheduler& scheduler_, - VKStagingBufferPool& staging_pool_, VAddr cpu_addr, std::size_t size) - : BufferBlock{cpu_addr, size}, scheduler{scheduler_}, staging_pool{staging_pool_} { - const VkBufferCreateInfo ci{ +Buffer::Buffer(BufferCacheRuntime&, VideoCommon::NullBufferParams null_params) + : VideoCommon::BufferBase<VideoCore::RasterizerInterface>(null_params) {} + +Buffer::Buffer(BufferCacheRuntime& runtime, VideoCore::RasterizerInterface& rasterizer_, + VAddr cpu_addr_, u64 size_bytes_) + : VideoCommon::BufferBase<VideoCore::RasterizerInterface>(rasterizer_, cpu_addr_, size_bytes_) { + buffer = runtime.device.GetLogical().CreateBuffer(VkBufferCreateInfo{ .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, .flags = 0, - .size = static_cast<VkDeviceSize>(size), - .usage = BUFFER_USAGE | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, + .size = SizeBytes(), + .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | + VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | + VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT | + VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, - }; + }); + if (runtime.device.HasDebuggingToolAttached()) { + buffer.SetObjectNameEXT(fmt::format("Buffer 0x{:x}", CpuAddr()).c_str()); + } + commit = runtime.memory_allocator.Commit(buffer, MemoryUsage::DeviceLocal); +} - buffer.handle = device.GetLogical().CreateBuffer(ci); - buffer.commit = memory_manager.Commit(buffer.handle, false); +BufferCacheRuntime::BufferCacheRuntime(const Device& device_, MemoryAllocator& memory_allocator_, + VKScheduler& scheduler_, StagingBufferPool& staging_pool_, + VKUpdateDescriptorQueue& update_descriptor_queue_, + VKDescriptorPool& descriptor_pool) + : device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_}, + staging_pool{staging_pool_}, update_descriptor_queue{update_descriptor_queue_}, + uint8_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue), + quad_index_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue) {} + +StagingBufferRef BufferCacheRuntime::UploadStagingBuffer(size_t size) { + return staging_pool.Request(size, MemoryUsage::Upload); } -Buffer::~Buffer() = default; +StagingBufferRef BufferCacheRuntime::DownloadStagingBuffer(size_t size) { + return staging_pool.Request(size, MemoryUsage::Download); +} -void Buffer::Upload(std::size_t offset, std::size_t size, const u8* data) { - const auto& staging = staging_pool.GetUnusedBuffer(size, true); - std::memcpy(staging.commit->Map(size), data, size); +void BufferCacheRuntime::Finish() { + scheduler.Finish(); +} +void BufferCacheRuntime::CopyBuffer(VkBuffer dst_buffer, VkBuffer src_buffer, + std::span<const VideoCommon::BufferCopy> copies) { + static constexpr VkMemoryBarrier READ_BARRIER{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT, + .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, + }; + static constexpr VkMemoryBarrier WRITE_BARRIER{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, + }; + // Measuring a popular game, this number never exceeds the specified size once data is warmed up + boost::container::small_vector<VkBufferCopy, 3> vk_copies(copies.size()); + std::ranges::transform(copies, vk_copies.begin(), MakeBufferCopy); scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([src_buffer, dst_buffer, vk_copies](vk::CommandBuffer cmdbuf) { + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, + 0, READ_BARRIER); + cmdbuf.CopyBuffer(src_buffer, dst_buffer, vk_copies); + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + 0, WRITE_BARRIER); + }); +} - const VkBuffer handle = Handle(); - scheduler.Record([staging = *staging.handle, handle, offset, size](vk::CommandBuffer cmdbuf) { - cmdbuf.CopyBuffer(staging, handle, VkBufferCopy{0, offset, size}); - - const VkBufferMemoryBarrier barrier{ - .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, - .pNext = nullptr, - .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, - .dstAccessMask = UPLOAD_ACCESS_BARRIERS, - .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .buffer = handle, - .offset = offset, - .size = size, - }; - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, UPLOAD_PIPELINE_STAGE, 0, {}, - barrier, {}); +void BufferCacheRuntime::BindIndexBuffer(PrimitiveTopology topology, IndexFormat index_format, + u32 base_vertex, u32 num_indices, VkBuffer buffer, + u32 offset, [[maybe_unused]] u32 size) { + VkIndexType vk_index_type = MaxwellToVK::IndexFormat(index_format); + VkDeviceSize vk_offset = offset; + VkBuffer vk_buffer = buffer; + if (topology == PrimitiveTopology::Quads) { + vk_index_type = VK_INDEX_TYPE_UINT32; + std::tie(vk_buffer, vk_offset) = + quad_index_pass.Assemble(index_format, num_indices, base_vertex, buffer, offset); + } else if (vk_index_type == VK_INDEX_TYPE_UINT8_EXT && !device.IsExtIndexTypeUint8Supported()) { + vk_index_type = VK_INDEX_TYPE_UINT16; + std::tie(vk_buffer, vk_offset) = uint8_pass.Assemble(num_indices, buffer, offset); + } + if (vk_buffer == VK_NULL_HANDLE) { + // Vulkan doesn't support null index buffers. Replace it with our own null buffer. + ReserveNullIndexBuffer(); + vk_buffer = *null_index_buffer; + } + scheduler.Record([vk_buffer, vk_offset, vk_index_type](vk::CommandBuffer cmdbuf) { + cmdbuf.BindIndexBuffer(vk_buffer, vk_offset, vk_index_type); }); } -void Buffer::Download(std::size_t offset, std::size_t size, u8* data) { - const auto& staging = staging_pool.GetUnusedBuffer(size, true); - scheduler.RequestOutsideRenderPassOperationContext(); +void BufferCacheRuntime::BindQuadArrayIndexBuffer(u32 first, u32 count) { + ReserveQuadArrayLUT(first + count, true); - const VkBuffer handle = Handle(); - scheduler.Record([staging = *staging.handle, handle, offset, size](vk::CommandBuffer cmdbuf) { - const VkBufferMemoryBarrier barrier{ - .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, - .pNext = nullptr, - .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, - .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT, - .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .buffer = handle, - .offset = offset, - .size = size, - }; - - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | - VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_PIPELINE_STAGE_TRANSFER_BIT, 0, {}, barrier, {}); - cmdbuf.CopyBuffer(handle, staging, VkBufferCopy{offset, 0, size}); + // The LUT has the indices 0, 1, 2, and 3 copied as an array + // To apply these 'first' offsets we can apply an offset based on the modulus. + const VkIndexType index_type = quad_array_lut_index_type; + const size_t sub_first_offset = static_cast<size_t>(first % 4) * (current_num_indices / 4); + const size_t offset = (sub_first_offset + first / 4) * 6ULL * BytesPerIndex(index_type); + scheduler.Record([buffer = *quad_array_lut, index_type, offset](vk::CommandBuffer cmdbuf) { + cmdbuf.BindIndexBuffer(buffer, offset, index_type); }); - scheduler.Finish(); - - std::memcpy(data, staging.commit->Map(size), size); } -void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, - std::size_t size) { - scheduler.RequestOutsideRenderPassOperationContext(); +void BufferCacheRuntime::BindVertexBuffer(u32 index, VkBuffer buffer, u32 offset, u32 size, + u32 stride) { + if (device.IsExtExtendedDynamicStateSupported()) { + scheduler.Record([index, buffer, offset, size, stride](vk::CommandBuffer cmdbuf) { + const VkDeviceSize vk_offset = offset; + const VkDeviceSize vk_size = buffer != VK_NULL_HANDLE ? size : VK_WHOLE_SIZE; + const VkDeviceSize vk_stride = stride; + cmdbuf.BindVertexBuffers2EXT(index, 1, &buffer, &vk_offset, &vk_size, &vk_stride); + }); + } else { + scheduler.Record([index, buffer, offset](vk::CommandBuffer cmdbuf) { + cmdbuf.BindVertexBuffer(index, buffer, offset); + }); + } +} - const VkBuffer dst_buffer = Handle(); - scheduler.Record([src_buffer = src.Handle(), dst_buffer, src_offset, dst_offset, - size](vk::CommandBuffer cmdbuf) { - cmdbuf.CopyBuffer(src_buffer, dst_buffer, VkBufferCopy{src_offset, dst_offset, size}); - - std::array<VkBufferMemoryBarrier, 2> barriers; - barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - barriers[0].pNext = nullptr; - barriers[0].srcAccessMask = VK_ACCESS_TRANSFER_READ_BIT; - barriers[0].dstAccessMask = VK_ACCESS_SHADER_WRITE_BIT; - barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barriers[0].buffer = src_buffer; - barriers[0].offset = src_offset; - barriers[0].size = size; - barriers[1].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - barriers[1].pNext = nullptr; - barriers[1].srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - barriers[1].dstAccessMask = UPLOAD_ACCESS_BARRIERS; - barriers[1].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barriers[1].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barriers[1].buffer = dst_buffer; - barriers[1].offset = dst_offset; - barriers[1].size = size; - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, UPLOAD_PIPELINE_STAGE, 0, {}, - barriers, {}); +void BufferCacheRuntime::BindTransformFeedbackBuffer(u32 index, VkBuffer buffer, u32 offset, + u32 size) { + if (!device.IsExtTransformFeedbackSupported()) { + // Already logged in the rasterizer + return; + } + scheduler.Record([index, buffer, offset, size](vk::CommandBuffer cmdbuf) { + const VkDeviceSize vk_offset = offset; + const VkDeviceSize vk_size = size; + cmdbuf.BindTransformFeedbackBuffersEXT(index, 1, &buffer, &vk_offset, &vk_size); }); } -VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system, - const VKDevice& device, VKMemoryManager& memory_manager, - VKScheduler& scheduler, VKStagingBufferPool& staging_pool) - : VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer>{rasterizer, system, - CreateStreamBuffer(device, - scheduler)}, - device{device}, memory_manager{memory_manager}, scheduler{scheduler}, staging_pool{ - staging_pool} {} +void BufferCacheRuntime::BindBuffer(VkBuffer buffer, u32 offset, u32 size) { + update_descriptor_queue.AddBuffer(buffer, offset, size); +} + +void BufferCacheRuntime::ReserveQuadArrayLUT(u32 num_indices, bool wait_for_idle) { + if (num_indices <= current_num_indices) { + return; + } + if (wait_for_idle) { + scheduler.Finish(); + } + current_num_indices = num_indices; + quad_array_lut_index_type = IndexTypeFromNumElements(device, num_indices); -VKBufferCache::~VKBufferCache() = default; + const u32 num_quads = num_indices / 4; + const u32 num_triangle_indices = num_quads * 6; + const u32 num_first_offset_copies = 4; + const size_t bytes_per_index = BytesPerIndex(quad_array_lut_index_type); + const size_t size_bytes = num_triangle_indices * bytes_per_index * num_first_offset_copies; + quad_array_lut = device.GetLogical().CreateBuffer(VkBufferCreateInfo{ + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .size = size_bytes, + .usage = VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + }); + if (device.HasDebuggingToolAttached()) { + quad_array_lut.SetObjectNameEXT("Quad LUT"); + } + quad_array_lut_commit = memory_allocator.Commit(quad_array_lut, MemoryUsage::DeviceLocal); -std::shared_ptr<Buffer> VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) { - return std::make_shared<Buffer>(device, memory_manager, scheduler, staging_pool, cpu_addr, - size); + const StagingBufferRef staging = staging_pool.Request(size_bytes, MemoryUsage::Upload); + u8* staging_data = staging.mapped_span.data(); + const size_t quad_size = bytes_per_index * 6; + for (u32 first = 0; first < num_first_offset_copies; ++first) { + for (u32 quad = 0; quad < num_quads; ++quad) { + switch (quad_array_lut_index_type) { + case VK_INDEX_TYPE_UINT8_EXT: + std::memcpy(staging_data, MakeQuadIndices<u8>(quad, first).data(), quad_size); + break; + case VK_INDEX_TYPE_UINT16: + std::memcpy(staging_data, MakeQuadIndices<u16>(quad, first).data(), quad_size); + break; + case VK_INDEX_TYPE_UINT32: + std::memcpy(staging_data, MakeQuadIndices<u32>(quad, first).data(), quad_size); + break; + default: + UNREACHABLE(); + break; + } + staging_data += quad_size; + } + } + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([src_buffer = staging.buffer, src_offset = staging.offset, + dst_buffer = *quad_array_lut, size_bytes](vk::CommandBuffer cmdbuf) { + const VkBufferCopy copy{ + .srcOffset = src_offset, + .dstOffset = 0, + .size = size_bytes, + }; + const VkBufferMemoryBarrier write_barrier{ + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_INDEX_READ_BIT, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = dst_buffer, + .offset = 0, + .size = size_bytes, + }; + cmdbuf.CopyBuffer(src_buffer, dst_buffer, copy); + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, + 0, write_barrier); + }); } -VKBufferCache::BufferInfo VKBufferCache::GetEmptyBuffer(std::size_t size) { - size = std::max(size, std::size_t(4)); - const auto& empty = staging_pool.GetUnusedBuffer(size, false); +void BufferCacheRuntime::ReserveNullIndexBuffer() { + if (null_index_buffer) { + return; + } + null_index_buffer = device.GetLogical().CreateBuffer(VkBufferCreateInfo{ + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .size = 4, + .usage = VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + }); + if (device.HasDebuggingToolAttached()) { + null_index_buffer.SetObjectNameEXT("Null index buffer"); + } + null_index_buffer_commit = memory_allocator.Commit(null_index_buffer, MemoryUsage::DeviceLocal); + scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([size, buffer = *empty.handle](vk::CommandBuffer cmdbuf) { - cmdbuf.FillBuffer(buffer, 0, size, 0); + scheduler.Record([buffer = *null_index_buffer](vk::CommandBuffer cmdbuf) { + cmdbuf.FillBuffer(buffer, 0, VK_WHOLE_SIZE, 0); }); - return {*empty.handle, 0, 0}; } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h index 991ee451c..041e6515c 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.h +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h @@ -4,70 +4,124 @@ #pragma once -#include <memory> - -#include "common/common_types.h" #include "video_core/buffer_cache/buffer_cache.h" -#include "video_core/renderer_vulkan/vk_memory_manager.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/renderer_vulkan/vk_compute_pass.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" -#include "video_core/renderer_vulkan/vk_stream_buffer.h" -#include "video_core/renderer_vulkan/wrapper.h" - -namespace Core { -class System; -} +#include "video_core/vulkan_common/vulkan_memory_allocator.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { -class VKDevice; -class VKMemoryManager; +class Device; +class VKDescriptorPool; class VKScheduler; +class VKUpdateDescriptorQueue; -class Buffer final : public VideoCommon::BufferBlock { +class BufferCacheRuntime; + +class Buffer : public VideoCommon::BufferBase<VideoCore::RasterizerInterface> { public: - explicit Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VKScheduler& scheduler, - VKStagingBufferPool& staging_pool, VAddr cpu_addr, std::size_t size); - ~Buffer(); + explicit Buffer(BufferCacheRuntime&, VideoCommon::NullBufferParams null_params); + explicit Buffer(BufferCacheRuntime& runtime, VideoCore::RasterizerInterface& rasterizer_, + VAddr cpu_addr_, u64 size_bytes_); + + [[nodiscard]] VkBuffer Handle() const noexcept { + return *buffer; + } + + operator VkBuffer() const noexcept { + return *buffer; + } + +private: + vk::Buffer buffer; + MemoryCommit commit; +}; + +class BufferCacheRuntime { + friend Buffer; + + using PrimitiveTopology = Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology; + using IndexFormat = Tegra::Engines::Maxwell3D::Regs::IndexFormat; + +public: + explicit BufferCacheRuntime(const Device& device_, MemoryAllocator& memory_manager_, + VKScheduler& scheduler_, StagingBufferPool& staging_pool_, + VKUpdateDescriptorQueue& update_descriptor_queue_, + VKDescriptorPool& descriptor_pool); + + void Finish(); - void Upload(std::size_t offset, std::size_t size, const u8* data); + [[nodiscard]] StagingBufferRef UploadStagingBuffer(size_t size); - void Download(std::size_t offset, std::size_t size, u8* data); + [[nodiscard]] StagingBufferRef DownloadStagingBuffer(size_t size); - void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, - std::size_t size); + void CopyBuffer(VkBuffer src_buffer, VkBuffer dst_buffer, + std::span<const VideoCommon::BufferCopy> copies); - VkBuffer Handle() const { - return *buffer.handle; + void BindIndexBuffer(PrimitiveTopology topology, IndexFormat index_format, u32 num_indices, + u32 base_vertex, VkBuffer buffer, u32 offset, u32 size); + + void BindQuadArrayIndexBuffer(u32 first, u32 count); + + void BindVertexBuffer(u32 index, VkBuffer buffer, u32 offset, u32 size, u32 stride); + + void BindTransformFeedbackBuffer(u32 index, VkBuffer buffer, u32 offset, u32 size); + + std::span<u8> BindMappedUniformBuffer([[maybe_unused]] size_t stage, + [[maybe_unused]] u32 binding_index, u32 size) { + const StagingBufferRef ref = staging_pool.Request(size, MemoryUsage::Upload); + BindBuffer(ref.buffer, static_cast<u32>(ref.offset), size); + return ref.mapped_span; } - u64 Address() const { - return 0; + void BindUniformBuffer(VkBuffer buffer, u32 offset, u32 size) { + BindBuffer(buffer, offset, size); + } + + void BindStorageBuffer(VkBuffer buffer, u32 offset, u32 size, + [[maybe_unused]] bool is_written) { + BindBuffer(buffer, offset, size); } private: + void BindBuffer(VkBuffer buffer, u32 offset, u32 size); + + void ReserveQuadArrayLUT(u32 num_indices, bool wait_for_idle); + + void ReserveNullIndexBuffer(); + + const Device& device; + MemoryAllocator& memory_allocator; VKScheduler& scheduler; - VKStagingBufferPool& staging_pool; + StagingBufferPool& staging_pool; + VKUpdateDescriptorQueue& update_descriptor_queue; - VKBuffer buffer; -}; + vk::Buffer quad_array_lut; + MemoryCommit quad_array_lut_commit; + VkIndexType quad_array_lut_index_type{}; + u32 current_num_indices = 0; -class VKBufferCache final : public VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer> { -public: - explicit VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system, - const VKDevice& device, VKMemoryManager& memory_manager, - VKScheduler& scheduler, VKStagingBufferPool& staging_pool); - ~VKBufferCache(); + vk::Buffer null_index_buffer; + MemoryCommit null_index_buffer_commit; - BufferInfo GetEmptyBuffer(std::size_t size) override; + Uint8Pass uint8_pass; + QuadIndexedPass quad_index_pass; +}; -protected: - std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override; +struct BufferCacheParams { + using Runtime = Vulkan::BufferCacheRuntime; + using Buffer = Vulkan::Buffer; -private: - const VKDevice& device; - VKMemoryManager& memory_manager; - VKScheduler& scheduler; - VKStagingBufferPool& staging_pool; + static constexpr bool IS_OPENGL = false; + static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = false; + static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT = false; + static constexpr bool NEEDS_BIND_UNIFORM_INDEX = false; + static constexpr bool NEEDS_BIND_STORAGE_INDEX = false; + static constexpr bool USE_MEMORY_MAPS = true; }; +using BufferCache = VideoCommon::BufferCache<BufferCacheParams>; + } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_command_pool.cpp b/src/video_core/renderer_vulkan/vk_command_pool.cpp new file mode 100644 index 000000000..a99df9323 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_command_pool.cpp @@ -0,0 +1,46 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <cstddef> + +#include "video_core/renderer_vulkan/vk_command_pool.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" + +namespace Vulkan { + +constexpr size_t COMMAND_BUFFER_POOL_SIZE = 0x1000; + +struct CommandPool::Pool { + vk::CommandPool handle; + vk::CommandBuffers cmdbufs; +}; + +CommandPool::CommandPool(MasterSemaphore& master_semaphore_, const Device& device_) + : ResourcePool(master_semaphore_, COMMAND_BUFFER_POOL_SIZE), device{device_} {} + +CommandPool::~CommandPool() = default; + +void CommandPool::Allocate(size_t begin, size_t end) { + // Command buffers are going to be commited, recorded, executed every single usage cycle. + // They are also going to be reseted when commited. + Pool& pool = pools.emplace_back(); + pool.handle = device.GetLogical().CreateCommandPool({ + .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, + .pNext = nullptr, + .flags = + VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, + .queueFamilyIndex = device.GetGraphicsFamily(), + }); + pool.cmdbufs = pool.handle.Allocate(COMMAND_BUFFER_POOL_SIZE); +} + +VkCommandBuffer CommandPool::Commit() { + const size_t index = CommitResource(); + const auto pool_index = index / COMMAND_BUFFER_POOL_SIZE; + const auto sub_index = index % COMMAND_BUFFER_POOL_SIZE; + return pools[pool_index].cmdbufs[sub_index]; +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_command_pool.h b/src/video_core/renderer_vulkan/vk_command_pool.h new file mode 100644 index 000000000..61c26a22a --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_command_pool.h @@ -0,0 +1,34 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <cstddef> +#include <vector> + +#include "video_core/renderer_vulkan/vk_resource_pool.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" + +namespace Vulkan { + +class Device; +class MasterSemaphore; + +class CommandPool final : public ResourcePool { +public: + explicit CommandPool(MasterSemaphore& master_semaphore_, const Device& device_); + ~CommandPool() override; + + void Allocate(size_t begin, size_t end) override; + + VkCommandBuffer Commit(); + +private: + struct Pool; + + const Device& device; + std::vector<Pool> pools; +}; + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index 182461ed9..2f9a7b028 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -10,131 +10,19 @@ #include "common/alignment.h" #include "common/assert.h" #include "common/common_types.h" +#include "common/div_ceil.h" +#include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h" +#include "video_core/host_shaders/vulkan_uint8_comp_spv.h" #include "video_core/renderer_vulkan/vk_compute_pass.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" -#include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" #include "video_core/renderer_vulkan/vk_update_descriptor.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { - namespace { - -// Quad array SPIR-V module. Generated from the "shaders/" directory, read the instructions there. -constexpr u8 quad_array[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x07, 0x00, 0x08, 0x00, 0x54, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, - 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x0f, 0x00, 0x06, 0x00, 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, - 0x00, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x10, 0x00, 0x06, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x11, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x04, 0x00, 0x13, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x48, 0x00, 0x05, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x14, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x48, 0x00, 0x05, 0x00, 0x29, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x29, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x04, 0x00, 0x4a, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, - 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x09, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x03, 0x00, 0x13, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x03, 0x00, 0x14, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, - 0x15, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, - 0x15, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, - 0x18, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x02, 0x00, - 0x1b, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, 0x29, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x20, 0x00, 0x04, 0x00, 0x2a, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x00, - 0x3b, 0x00, 0x04, 0x00, 0x2a, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, - 0x2b, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x20, 0x00, 0x04, 0x00, 0x2d, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x1c, 0x00, 0x04, 0x00, 0x34, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, - 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x2c, 0x00, 0x09, 0x00, 0x34, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, - 0x35, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, - 0x37, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x3a, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, - 0x34, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x44, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, - 0x00, 0x04, 0x00, 0x00, 0x2c, 0x00, 0x06, 0x00, 0x09, 0x00, 0x00, 0x00, 0x4a, 0x00, 0x00, 0x00, - 0x49, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x36, 0x00, 0x05, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, - 0xf8, 0x00, 0x02, 0x00, 0x05, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x3a, 0x00, 0x00, 0x00, - 0x3b, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x4c, 0x00, 0x00, 0x00, - 0xf8, 0x00, 0x02, 0x00, 0x4c, 0x00, 0x00, 0x00, 0xf6, 0x00, 0x04, 0x00, 0x4b, 0x00, 0x00, 0x00, - 0x4e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x4d, 0x00, 0x00, 0x00, - 0xf8, 0x00, 0x02, 0x00, 0x4d, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x0d, 0x00, 0x00, 0x00, - 0x0e, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, - 0x44, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, - 0x17, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0xae, 0x00, 0x05, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, - 0x12, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, 0x1e, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1d, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, - 0x4b, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1e, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, - 0x21, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x21, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x53, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, - 0x48, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0xb0, 0x00, 0x05, 0x00, 0x1b, 0x00, 0x00, 0x00, - 0x27, 0x00, 0x00, 0x00, 0x53, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0xf6, 0x00, 0x04, 0x00, - 0x23, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, - 0x27, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, - 0x22, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x2d, 0x00, 0x00, 0x00, 0x2e, 0x00, 0x00, 0x00, - 0x2b, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x2f, 0x00, 0x00, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x32, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0x2f, 0x00, 0x00, 0x00, 0x32, 0x00, 0x00, 0x00, - 0x3e, 0x00, 0x03, 0x00, 0x3b, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x07, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x00, 0x00, 0x53, 0x00, 0x00, 0x00, - 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, - 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, - 0x3d, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, - 0x12, 0x00, 0x00, 0x00, 0x53, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x44, 0x00, 0x00, 0x00, - 0x45, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, - 0x3e, 0x00, 0x03, 0x00, 0x45, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x53, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, - 0xf9, 0x00, 0x02, 0x00, 0x21, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x23, 0x00, 0x00, 0x00, - 0xf9, 0x00, 0x02, 0x00, 0x4b, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x4e, 0x00, 0x00, 0x00, - 0xf9, 0x00, 0x02, 0x00, 0x4c, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x4b, 0x00, 0x00, 0x00, - 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00}; - -VkDescriptorSetLayoutBinding BuildQuadArrayPassDescriptorSetLayoutBinding() { - return { - .binding = 0, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .descriptorCount = 1, - .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, - .pImmutableSamplers = nullptr, - }; -} - -VkDescriptorUpdateTemplateEntryKHR BuildQuadArrayPassDescriptorUpdateTemplateEntry() { - return { - .dstBinding = 0, - .dstArrayElement = 0, - .descriptorCount = 1, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .offset = 0, - .stride = sizeof(DescriptorUpdateEntry), - }; -} - VkPushConstantRange BuildComputePushConstantRange(std::size_t size) { return { .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, @@ -143,206 +31,6 @@ VkPushConstantRange BuildComputePushConstantRange(std::size_t size) { }; } -// Uint8 SPIR-V module. Generated from the "shaders/" directory. -constexpr u8 uint8_pass[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x07, 0x00, 0x08, 0x00, 0x2f, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, - 0x51, 0x11, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x61, 0x11, 0x00, 0x00, 0x0a, 0x00, 0x07, 0x00, - 0x53, 0x50, 0x56, 0x5f, 0x4b, 0x48, 0x52, 0x5f, 0x31, 0x36, 0x62, 0x69, 0x74, 0x5f, 0x73, 0x74, - 0x6f, 0x72, 0x61, 0x67, 0x65, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x07, 0x00, 0x53, 0x50, 0x56, 0x5f, - 0x4b, 0x48, 0x52, 0x5f, 0x38, 0x62, 0x69, 0x74, 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, - 0x00, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, - 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x03, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x06, 0x00, 0x05, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, - 0x10, 0x00, 0x06, 0x00, 0x04, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, - 0x0b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x12, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x13, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x13, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x13, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, - 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x1f, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x20, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x20, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x20, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x22, 0x00, 0x00, 0x00, - 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x22, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x2e, 0x00, 0x00, 0x00, - 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, - 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, - 0x09, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, - 0x0a, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, - 0x0a, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, - 0x0d, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, - 0x11, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x03, 0x00, - 0x12, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, 0x13, 0x00, 0x00, 0x00, - 0x12, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x14, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x13, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x14, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x02, 0x00, 0x1a, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, - 0x1e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x03, 0x00, - 0x1f, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, 0x20, 0x00, 0x00, 0x00, - 0x1f, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x20, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x21, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x26, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x11, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x2a, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, - 0x00, 0x04, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x06, 0x00, 0x09, 0x00, 0x00, 0x00, 0x2e, 0x00, 0x00, 0x00, - 0x2c, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, 0x36, 0x00, 0x05, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, - 0xf8, 0x00, 0x02, 0x00, 0x05, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, - 0x08, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x0d, 0x00, 0x00, 0x00, - 0x0e, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, - 0x08, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x10, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x44, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x16, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, - 0x17, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0xb0, 0x00, 0x05, 0x00, - 0x1a, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, - 0xf7, 0x00, 0x03, 0x00, 0x1d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, - 0x1b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, - 0x1c, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, - 0x08, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, - 0x08, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x26, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, - 0x15, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, - 0x11, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, 0x71, 0x00, 0x04, 0x00, - 0x1e, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, - 0x2a, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x24, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, 0x2b, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x00, - 0xf9, 0x00, 0x02, 0x00, 0x1d, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1d, 0x00, 0x00, 0x00, - 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00}; - -// Quad indexed SPIR-V module. Generated from the "shaders/" directory. -constexpr u8 QUAD_INDEXED_SPV[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x07, 0x00, 0x08, 0x00, 0x7c, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, - 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x0f, 0x00, 0x06, 0x00, 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, - 0x00, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x10, 0x00, 0x06, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x11, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x48, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, - 0x48, 0x00, 0x05, 0x00, 0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x16, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x48, 0x00, 0x05, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x22, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x23, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x22, 0x00, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x56, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x57, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x57, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x57, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x59, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x59, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x72, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x20, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, - 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, - 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x0a, 0x00, 0x00, 0x00, - 0x09, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, - 0x0c, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, - 0x0d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x13, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x03, 0x00, 0x15, 0x00, 0x00, 0x00, - 0x09, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, 0x16, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, - 0x20, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, - 0x3b, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x14, 0x00, 0x02, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x04, 0x00, 0x22, 0x00, 0x00, 0x00, - 0x09, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x09, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x24, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x25, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x26, 0x00, 0x00, 0x00, - 0x09, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x2b, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, - 0x3b, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x3f, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x42, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x43, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x09, 0x00, 0x41, 0x00, 0x00, 0x00, - 0x44, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x00, 0x00, - 0x42, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x00, 0x00, 0x43, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, - 0x46, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x41, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x03, 0x00, - 0x56, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, 0x57, 0x00, 0x00, 0x00, - 0x56, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x58, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x57, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x58, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x5b, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x09, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x69, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, - 0x09, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, - 0x00, 0x04, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x06, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x72, 0x00, 0x00, 0x00, - 0x70, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0x36, 0x00, 0x05, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, - 0xf8, 0x00, 0x02, 0x00, 0x05, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x46, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x74, 0x00, 0x00, 0x00, - 0xf8, 0x00, 0x02, 0x00, 0x74, 0x00, 0x00, 0x00, 0xf6, 0x00, 0x04, 0x00, 0x73, 0x00, 0x00, 0x00, - 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x75, 0x00, 0x00, 0x00, - 0xf8, 0x00, 0x02, 0x00, 0x75, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x0e, 0x00, 0x00, 0x00, - 0x0f, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, - 0x09, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, - 0x44, 0x00, 0x05, 0x00, 0x09, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0xaf, 0x00, 0x05, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, - 0x14, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, 0x1e, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1d, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, - 0x73, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x26, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, - 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, - 0xc4, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x82, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x2e, 0x00, 0x00, 0x00, - 0x2b, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0xc4, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x31, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x82, 0x00, 0x05, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x32, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, - 0xf9, 0x00, 0x02, 0x00, 0x35, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x35, 0x00, 0x00, 0x00, - 0xf5, 0x00, 0x07, 0x00, 0x09, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x00, 0x00, 0x6f, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, 0xb0, 0x00, 0x05, 0x00, - 0x1b, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x00, 0x00, - 0xf6, 0x00, 0x04, 0x00, 0x37, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xfa, 0x00, 0x04, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00, - 0xf8, 0x00, 0x02, 0x00, 0x36, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x40, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, - 0x47, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x07, 0x00, 0x00, 0x00, - 0x48, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x4a, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, - 0xc3, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, 0x4a, 0x00, 0x00, 0x00, - 0x2e, 0x00, 0x00, 0x00, 0xc7, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, - 0x4a, 0x00, 0x00, 0x00, 0x32, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x54, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, - 0x5b, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, - 0x4e, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, 0x5d, 0x00, 0x00, 0x00, - 0x5c, 0x00, 0x00, 0x00, 0xcb, 0x00, 0x06, 0x00, 0x09, 0x00, 0x00, 0x00, 0x62, 0x00, 0x00, 0x00, - 0x5d, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, - 0x09, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, - 0x09, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x05, 0x00, 0x69, 0x00, 0x00, 0x00, 0x6a, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, - 0x42, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, - 0x6a, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x09, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, - 0x62, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x5b, 0x00, 0x00, 0x00, - 0x6d, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x00, - 0x3e, 0x00, 0x03, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, - 0x09, 0x00, 0x00, 0x00, 0x6f, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, - 0xf9, 0x00, 0x02, 0x00, 0x35, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x37, 0x00, 0x00, 0x00, - 0xf9, 0x00, 0x02, 0x00, 0x73, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x76, 0x00, 0x00, 0x00, - 0xf9, 0x00, 0x02, 0x00, 0x74, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x73, 0x00, 0x00, 0x00, - 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00}; - std::array<VkDescriptorSetLayoutBinding, 2> BuildInputOutputDescriptorSetBindings() { return {{ { @@ -375,11 +63,11 @@ VkDescriptorUpdateTemplateEntryKHR BuildInputOutputDescriptorUpdateTemplate() { } // Anonymous namespace -VKComputePass::VKComputePass(const VKDevice& device, VKDescriptorPool& descriptor_pool, +VKComputePass::VKComputePass(const Device& device, VKDescriptorPool& descriptor_pool, vk::Span<VkDescriptorSetLayoutBinding> bindings, vk::Span<VkDescriptorUpdateTemplateEntryKHR> templates, - vk::Span<VkPushConstantRange> push_constants, std::size_t code_size, - const u8* code) { + vk::Span<VkPushConstantRange> push_constants, + std::span<const u32> code) { descriptor_set_layout = device.GetLogical().CreateDescriptorSetLayout({ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, .pNext = nullptr, @@ -387,7 +75,6 @@ VKComputePass::VKComputePass(const VKDevice& device, VKDescriptorPool& descripto .bindingCount = bindings.size(), .pBindings = bindings.data(), }); - layout = device.GetLogical().CreatePipelineLayout({ .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, .pNext = nullptr, @@ -397,7 +84,6 @@ VKComputePass::VKComputePass(const VKDevice& device, VKDescriptorPool& descripto .pushConstantRangeCount = push_constants.size(), .pPushConstantRanges = push_constants.data(), }); - if (!templates.empty()) { descriptor_template = device.GetLogical().CreateDescriptorUpdateTemplateKHR({ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_CREATE_INFO_KHR, @@ -414,18 +100,13 @@ VKComputePass::VKComputePass(const VKDevice& device, VKDescriptorPool& descripto descriptor_allocator.emplace(descriptor_pool, *descriptor_set_layout); } - - auto code_copy = std::make_unique<u32[]>(code_size / sizeof(u32) + 1); - std::memcpy(code_copy.get(), code, code_size); - module = device.GetLogical().CreateShaderModule({ .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, .pNext = nullptr, .flags = 0, - .codeSize = code_size, - .pCode = code_copy.get(), + .codeSize = static_cast<u32>(code.size_bytes()), + .pCode = code.data(), }); - pipeline = device.GetLogical().CreateComputePipeline({ .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, .pNext = nullptr, @@ -448,126 +129,70 @@ VKComputePass::VKComputePass(const VKDevice& device, VKDescriptorPool& descripto VKComputePass::~VKComputePass() = default; -VkDescriptorSet VKComputePass::CommitDescriptorSet(VKUpdateDescriptorQueue& update_descriptor_queue, - VKFence& fence) { +VkDescriptorSet VKComputePass::CommitDescriptorSet( + VKUpdateDescriptorQueue& update_descriptor_queue) { if (!descriptor_template) { return nullptr; } - const auto set = descriptor_allocator->Commit(fence); + const VkDescriptorSet set = descriptor_allocator->Commit(); update_descriptor_queue.Send(*descriptor_template, set); return set; } -QuadArrayPass::QuadArrayPass(const VKDevice& device, VKScheduler& scheduler, - VKDescriptorPool& descriptor_pool, - VKStagingBufferPool& staging_buffer_pool, - VKUpdateDescriptorQueue& update_descriptor_queue) - : VKComputePass(device, descriptor_pool, BuildQuadArrayPassDescriptorSetLayoutBinding(), - BuildQuadArrayPassDescriptorUpdateTemplateEntry(), - BuildComputePushConstantRange(sizeof(u32)), std::size(quad_array), quad_array), - scheduler{scheduler}, staging_buffer_pool{staging_buffer_pool}, - update_descriptor_queue{update_descriptor_queue} {} - -QuadArrayPass::~QuadArrayPass() = default; - -std::pair<VkBuffer, VkDeviceSize> QuadArrayPass::Assemble(u32 num_vertices, u32 first) { - const u32 num_triangle_vertices = (num_vertices / 4) * 6; - const std::size_t staging_size = num_triangle_vertices * sizeof(u32); - auto& buffer = staging_buffer_pool.GetUnusedBuffer(staging_size, false); - - update_descriptor_queue.Acquire(); - update_descriptor_queue.AddBuffer(*buffer.handle, 0, staging_size); - const auto set = CommitDescriptorSet(update_descriptor_queue, scheduler.GetFence()); - - scheduler.RequestOutsideRenderPassOperationContext(); - - ASSERT(num_vertices % 4 == 0); - const u32 num_quads = num_vertices / 4; - scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = *buffer.handle, num_quads, - first, set](vk::CommandBuffer cmdbuf) { - constexpr u32 dispatch_size = 1024; - cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); - cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, layout, 0, set, {}); - cmdbuf.PushConstants(layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(first), &first); - cmdbuf.Dispatch(Common::AlignUp(num_quads, dispatch_size) / dispatch_size, 1, 1); - - VkBufferMemoryBarrier barrier; - barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - barrier.pNext = nullptr; - barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; - barrier.dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT; - barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.buffer = buffer; - barrier.offset = 0; - barrier.size = static_cast<VkDeviceSize>(num_quads) * 6 * sizeof(u32); - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, {}, {barrier}, {}); - }); - return {*buffer.handle, 0}; -} - -Uint8Pass::Uint8Pass(const VKDevice& device, VKScheduler& scheduler, - VKDescriptorPool& descriptor_pool, VKStagingBufferPool& staging_buffer_pool, - VKUpdateDescriptorQueue& update_descriptor_queue) +Uint8Pass::Uint8Pass(const Device& device, VKScheduler& scheduler_, + VKDescriptorPool& descriptor_pool, StagingBufferPool& staging_buffer_pool_, + VKUpdateDescriptorQueue& update_descriptor_queue_) : VKComputePass(device, descriptor_pool, BuildInputOutputDescriptorSetBindings(), - BuildInputOutputDescriptorUpdateTemplate(), {}, std::size(uint8_pass), - uint8_pass), - scheduler{scheduler}, staging_buffer_pool{staging_buffer_pool}, - update_descriptor_queue{update_descriptor_queue} {} + BuildInputOutputDescriptorUpdateTemplate(), {}, VULKAN_UINT8_COMP_SPV), + scheduler{scheduler_}, staging_buffer_pool{staging_buffer_pool_}, + update_descriptor_queue{update_descriptor_queue_} {} Uint8Pass::~Uint8Pass() = default; -std::pair<VkBuffer, u64> Uint8Pass::Assemble(u32 num_vertices, VkBuffer src_buffer, - u64 src_offset) { - const auto staging_size = static_cast<u32>(num_vertices * sizeof(u16)); - auto& buffer = staging_buffer_pool.GetUnusedBuffer(staging_size, false); +std::pair<VkBuffer, VkDeviceSize> Uint8Pass::Assemble(u32 num_vertices, VkBuffer src_buffer, + u32 src_offset) { + const u32 staging_size = static_cast<u32>(num_vertices * sizeof(u16)); + const auto staging = staging_buffer_pool.Request(staging_size, MemoryUsage::DeviceLocal); update_descriptor_queue.Acquire(); update_descriptor_queue.AddBuffer(src_buffer, src_offset, num_vertices); - update_descriptor_queue.AddBuffer(*buffer.handle, 0, staging_size); - const auto set = CommitDescriptorSet(update_descriptor_queue, scheduler.GetFence()); + update_descriptor_queue.AddBuffer(staging.buffer, staging.offset, staging_size); + const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue); scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = *buffer.handle, set, + scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = staging.buffer, set, num_vertices](vk::CommandBuffer cmdbuf) { - constexpr u32 dispatch_size = 1024; + static constexpr u32 DISPATCH_SIZE = 1024; + static constexpr VkMemoryBarrier WRITE_BARRIER{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT, + }; cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, layout, 0, set, {}); - cmdbuf.Dispatch(Common::AlignUp(num_vertices, dispatch_size) / dispatch_size, 1, 1); - - VkBufferMemoryBarrier barrier; - barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - barrier.pNext = nullptr; - barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; - barrier.dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT; - barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.buffer = buffer; - barrier.offset = 0; - barrier.size = static_cast<VkDeviceSize>(num_vertices * sizeof(u16)); + cmdbuf.Dispatch(Common::DivCeil(num_vertices, DISPATCH_SIZE), 1, 1); cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, {}, barrier, {}); + VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, WRITE_BARRIER); }); - return {*buffer.handle, 0}; + return {staging.buffer, staging.offset}; } -QuadIndexedPass::QuadIndexedPass(const VKDevice& device, VKScheduler& scheduler, - VKDescriptorPool& descriptor_pool, - VKStagingBufferPool& staging_buffer_pool, - VKUpdateDescriptorQueue& update_descriptor_queue) - : VKComputePass(device, descriptor_pool, BuildInputOutputDescriptorSetBindings(), +QuadIndexedPass::QuadIndexedPass(const Device& device_, VKScheduler& scheduler_, + VKDescriptorPool& descriptor_pool_, + StagingBufferPool& staging_buffer_pool_, + VKUpdateDescriptorQueue& update_descriptor_queue_) + : VKComputePass(device_, descriptor_pool_, BuildInputOutputDescriptorSetBindings(), BuildInputOutputDescriptorUpdateTemplate(), - BuildComputePushConstantRange(sizeof(u32) * 2), std::size(QUAD_INDEXED_SPV), - QUAD_INDEXED_SPV), - scheduler{scheduler}, staging_buffer_pool{staging_buffer_pool}, - update_descriptor_queue{update_descriptor_queue} {} + BuildComputePushConstantRange(sizeof(u32) * 2), VULKAN_QUAD_INDEXED_COMP_SPV), + scheduler{scheduler_}, staging_buffer_pool{staging_buffer_pool_}, + update_descriptor_queue{update_descriptor_queue_} {} QuadIndexedPass::~QuadIndexedPass() = default; -std::pair<VkBuffer, u64> QuadIndexedPass::Assemble( +std::pair<VkBuffer, VkDeviceSize> QuadIndexedPass::Assemble( Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format, u32 num_vertices, u32 base_vertex, - VkBuffer src_buffer, u64 src_offset) { + VkBuffer src_buffer, u32 src_offset) { const u32 index_shift = [index_format] { switch (index_format) { case Tegra::Engines::Maxwell3D::Regs::IndexFormat::UnsignedByte: @@ -584,38 +209,33 @@ std::pair<VkBuffer, u64> QuadIndexedPass::Assemble( const u32 num_tri_vertices = (num_vertices / 4) * 6; const std::size_t staging_size = num_tri_vertices * sizeof(u32); - auto& buffer = staging_buffer_pool.GetUnusedBuffer(staging_size, false); + const auto staging = staging_buffer_pool.Request(staging_size, MemoryUsage::DeviceLocal); update_descriptor_queue.Acquire(); update_descriptor_queue.AddBuffer(src_buffer, src_offset, input_size); - update_descriptor_queue.AddBuffer(*buffer.handle, 0, staging_size); - const auto set = CommitDescriptorSet(update_descriptor_queue, scheduler.GetFence()); + update_descriptor_queue.AddBuffer(staging.buffer, staging.offset, staging_size); + const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue); scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = *buffer.handle, set, + scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = staging.buffer, set, num_tri_vertices, base_vertex, index_shift](vk::CommandBuffer cmdbuf) { - static constexpr u32 dispatch_size = 1024; + static constexpr u32 DISPATCH_SIZE = 1024; + static constexpr VkMemoryBarrier WRITE_BARRIER{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT, + }; const std::array push_constants = {base_vertex, index_shift}; cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, layout, 0, set, {}); cmdbuf.PushConstants(layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants), &push_constants); - cmdbuf.Dispatch(Common::AlignUp(num_tri_vertices, dispatch_size) / dispatch_size, 1, 1); - - VkBufferMemoryBarrier barrier; - barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - barrier.pNext = nullptr; - barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; - barrier.dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT; - barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.buffer = buffer; - barrier.offset = 0; - barrier.size = static_cast<VkDeviceSize>(num_tri_vertices * sizeof(u32)); + cmdbuf.Dispatch(Common::DivCeil(num_tri_vertices, DISPATCH_SIZE), 1, 1); cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, {}, barrier, {}); + VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, WRITE_BARRIER); }); - return {*buffer.handle, 0}; + return {staging.buffer, staging.offset}; } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.h b/src/video_core/renderer_vulkan/vk_compute_pass.h index 230b526bc..17d781d99 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.h +++ b/src/video_core/renderer_vulkan/vk_compute_pass.h @@ -5,33 +5,31 @@ #pragma once #include <optional> +#include <span> #include <utility> #include "common/common_types.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { -class VKDevice; -class VKFence; +class Device; +class StagingBufferPool; class VKScheduler; -class VKStagingBufferPool; class VKUpdateDescriptorQueue; class VKComputePass { public: - explicit VKComputePass(const VKDevice& device, VKDescriptorPool& descriptor_pool, + explicit VKComputePass(const Device& device, VKDescriptorPool& descriptor_pool, vk::Span<VkDescriptorSetLayoutBinding> bindings, vk::Span<VkDescriptorUpdateTemplateEntryKHR> templates, - vk::Span<VkPushConstantRange> push_constants, std::size_t code_size, - const u8* code); + vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code); ~VKComputePass(); protected: - VkDescriptorSet CommitDescriptorSet(VKUpdateDescriptorQueue& update_descriptor_queue, - VKFence& fence); + VkDescriptorSet CommitDescriptorSet(VKUpdateDescriptorQueue& update_descriptor_queue); vk::DescriptorUpdateTemplateKHR descriptor_template; vk::PipelineLayout layout; @@ -43,52 +41,39 @@ private: vk::ShaderModule module; }; -class QuadArrayPass final : public VKComputePass { -public: - explicit QuadArrayPass(const VKDevice& device, VKScheduler& scheduler, - VKDescriptorPool& descriptor_pool, - VKStagingBufferPool& staging_buffer_pool, - VKUpdateDescriptorQueue& update_descriptor_queue); - ~QuadArrayPass(); - - std::pair<VkBuffer, VkDeviceSize> Assemble(u32 num_vertices, u32 first); - -private: - VKScheduler& scheduler; - VKStagingBufferPool& staging_buffer_pool; - VKUpdateDescriptorQueue& update_descriptor_queue; -}; - class Uint8Pass final : public VKComputePass { public: - explicit Uint8Pass(const VKDevice& device, VKScheduler& scheduler, - VKDescriptorPool& descriptor_pool, VKStagingBufferPool& staging_buffer_pool, - VKUpdateDescriptorQueue& update_descriptor_queue); + explicit Uint8Pass(const Device& device_, VKScheduler& scheduler_, + VKDescriptorPool& descriptor_pool_, StagingBufferPool& staging_buffer_pool_, + VKUpdateDescriptorQueue& update_descriptor_queue_); ~Uint8Pass(); - std::pair<VkBuffer, u64> Assemble(u32 num_vertices, VkBuffer src_buffer, u64 src_offset); + /// Assemble uint8 indices into an uint16 index buffer + /// Returns a pair with the staging buffer, and the offset where the assembled data is + std::pair<VkBuffer, VkDeviceSize> Assemble(u32 num_vertices, VkBuffer src_buffer, + u32 src_offset); private: VKScheduler& scheduler; - VKStagingBufferPool& staging_buffer_pool; + StagingBufferPool& staging_buffer_pool; VKUpdateDescriptorQueue& update_descriptor_queue; }; class QuadIndexedPass final : public VKComputePass { public: - explicit QuadIndexedPass(const VKDevice& device, VKScheduler& scheduler, - VKDescriptorPool& descriptor_pool, - VKStagingBufferPool& staging_buffer_pool, - VKUpdateDescriptorQueue& update_descriptor_queue); + explicit QuadIndexedPass(const Device& device_, VKScheduler& scheduler_, + VKDescriptorPool& descriptor_pool_, + StagingBufferPool& staging_buffer_pool_, + VKUpdateDescriptorQueue& update_descriptor_queue_); ~QuadIndexedPass(); - std::pair<VkBuffer, u64> Assemble(Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format, - u32 num_vertices, u32 base_vertex, VkBuffer src_buffer, - u64 src_offset); + std::pair<VkBuffer, VkDeviceSize> Assemble( + Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format, u32 num_vertices, + u32 base_vertex, VkBuffer src_buffer, u32 src_offset); private: VKScheduler& scheduler; - VKStagingBufferPool& staging_buffer_pool; + StagingBufferPool& staging_buffer_pool; VKUpdateDescriptorQueue& update_descriptor_queue; }; diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp index ed9d2991c..3a48219b7 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp @@ -6,25 +6,25 @@ #include "video_core/renderer_vulkan/vk_compute_pipeline.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" -#include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_pipeline_cache.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_shader_decompiler.h" #include "video_core/renderer_vulkan/vk_update_descriptor.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { -VKComputePipeline::VKComputePipeline(const VKDevice& device, VKScheduler& scheduler, - VKDescriptorPool& descriptor_pool, - VKUpdateDescriptorQueue& update_descriptor_queue, - const SPIRVShader& shader) - : device{device}, scheduler{scheduler}, entries{shader.entries}, +VKComputePipeline::VKComputePipeline(const Device& device_, VKScheduler& scheduler_, + VKDescriptorPool& descriptor_pool_, + VKUpdateDescriptorQueue& update_descriptor_queue_, + const SPIRVShader& shader_) + : device{device_}, scheduler{scheduler_}, entries{shader_.entries}, descriptor_set_layout{CreateDescriptorSetLayout()}, - descriptor_allocator{descriptor_pool, *descriptor_set_layout}, - update_descriptor_queue{update_descriptor_queue}, layout{CreatePipelineLayout()}, + descriptor_allocator{descriptor_pool_, *descriptor_set_layout}, + update_descriptor_queue{update_descriptor_queue_}, layout{CreatePipelineLayout()}, descriptor_template{CreateDescriptorUpdateTemplate()}, - shader_module{CreateShaderModule(shader.code)}, pipeline{CreatePipeline()} {} + shader_module{CreateShaderModule(shader_.code)}, pipeline{CreatePipeline()} {} VKComputePipeline::~VKComputePipeline() = default; @@ -32,7 +32,7 @@ VkDescriptorSet VKComputePipeline::CommitDescriptorSet() { if (!descriptor_template) { return {}; } - const auto set = descriptor_allocator.Commit(scheduler.GetFence()); + const VkDescriptorSet set = descriptor_allocator.Commit(); update_descriptor_queue.Send(*descriptor_template, set); return set; } diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.h b/src/video_core/renderer_vulkan/vk_compute_pipeline.h index 6e2f22a4a..7e16575ac 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.h +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.h @@ -7,20 +7,20 @@ #include "common/common_types.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" #include "video_core/renderer_vulkan/vk_shader_decompiler.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { -class VKDevice; +class Device; class VKScheduler; class VKUpdateDescriptorQueue; class VKComputePipeline final { public: - explicit VKComputePipeline(const VKDevice& device, VKScheduler& scheduler, - VKDescriptorPool& descriptor_pool, - VKUpdateDescriptorQueue& update_descriptor_queue, - const SPIRVShader& shader); + explicit VKComputePipeline(const Device& device_, VKScheduler& scheduler_, + VKDescriptorPool& descriptor_pool_, + VKUpdateDescriptorQueue& update_descriptor_queue_, + const SPIRVShader& shader_); ~VKComputePipeline(); VkDescriptorSet CommitDescriptorSet(); @@ -48,7 +48,7 @@ private: vk::Pipeline CreatePipeline() const; - const VKDevice& device; + const Device& device; VKScheduler& scheduler; ShaderEntries entries; diff --git a/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp b/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp index ac4a0884e..ef9fb5910 100644 --- a/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp +++ b/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp @@ -6,23 +6,25 @@ #include "common/common_types.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" -#include "video_core/renderer_vulkan/vk_device.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/renderer_vulkan/vk_resource_pool.h" +#include "video_core/renderer_vulkan/vk_scheduler.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { // Prefer small grow rates to avoid saturating the descriptor pool with barely used pipelines. constexpr std::size_t SETS_GROW_RATE = 0x20; -DescriptorAllocator::DescriptorAllocator(VKDescriptorPool& descriptor_pool, - VkDescriptorSetLayout layout) - : VKFencedPool{SETS_GROW_RATE}, descriptor_pool{descriptor_pool}, layout{layout} {} +DescriptorAllocator::DescriptorAllocator(VKDescriptorPool& descriptor_pool_, + VkDescriptorSetLayout layout_) + : ResourcePool(descriptor_pool_.master_semaphore, SETS_GROW_RATE), + descriptor_pool{descriptor_pool_}, layout{layout_} {} DescriptorAllocator::~DescriptorAllocator() = default; -VkDescriptorSet DescriptorAllocator::Commit(VKFence& fence) { - const std::size_t index = CommitResource(fence); +VkDescriptorSet DescriptorAllocator::Commit() { + const std::size_t index = CommitResource(); return descriptors_allocations[index / SETS_GROW_RATE][index % SETS_GROW_RATE]; } @@ -30,8 +32,9 @@ void DescriptorAllocator::Allocate(std::size_t begin, std::size_t end) { descriptors_allocations.push_back(descriptor_pool.AllocateDescriptors(layout, end - begin)); } -VKDescriptorPool::VKDescriptorPool(const VKDevice& device) - : device{device}, active_pool{AllocateNewPool()} {} +VKDescriptorPool::VKDescriptorPool(const Device& device_, VKScheduler& scheduler) + : device{device_}, master_semaphore{scheduler.GetMasterSemaphore()}, active_pool{ + AllocateNewPool()} {} VKDescriptorPool::~VKDescriptorPool() = default; diff --git a/src/video_core/renderer_vulkan/vk_descriptor_pool.h b/src/video_core/renderer_vulkan/vk_descriptor_pool.h index 9efa66bef..f892be7be 100644 --- a/src/video_core/renderer_vulkan/vk_descriptor_pool.h +++ b/src/video_core/renderer_vulkan/vk_descriptor_pool.h @@ -6,21 +6,24 @@ #include <vector> -#include "video_core/renderer_vulkan/vk_resource_manager.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/renderer_vulkan/vk_resource_pool.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { +class Device; class VKDescriptorPool; +class VKScheduler; -class DescriptorAllocator final : public VKFencedPool { +class DescriptorAllocator final : public ResourcePool { public: explicit DescriptorAllocator(VKDescriptorPool& descriptor_pool, VkDescriptorSetLayout layout); ~DescriptorAllocator() override; + DescriptorAllocator& operator=(const DescriptorAllocator&) = delete; DescriptorAllocator(const DescriptorAllocator&) = delete; - VkDescriptorSet Commit(VKFence& fence); + VkDescriptorSet Commit(); protected: void Allocate(std::size_t begin, std::size_t end) override; @@ -36,15 +39,19 @@ class VKDescriptorPool final { friend DescriptorAllocator; public: - explicit VKDescriptorPool(const VKDevice& device); + explicit VKDescriptorPool(const Device& device, VKScheduler& scheduler); ~VKDescriptorPool(); + VKDescriptorPool(const VKDescriptorPool&) = delete; + VKDescriptorPool& operator=(const VKDescriptorPool&) = delete; + private: vk::DescriptorPool* AllocateNewPool(); vk::DescriptorSets AllocateDescriptors(VkDescriptorSetLayout layout, std::size_t count); - const VKDevice& device; + const Device& device; + MasterSemaphore& master_semaphore; std::vector<vk::DescriptorPool> pools; vk::DescriptorPool* active_pool; diff --git a/src/video_core/renderer_vulkan/vk_fence_manager.cpp b/src/video_core/renderer_vulkan/vk_fence_manager.cpp index d7f65d435..3bec48d14 100644 --- a/src/video_core/renderer_vulkan/vk_fence_manager.cpp +++ b/src/video_core/renderer_vulkan/vk_fence_manager.cpp @@ -3,23 +3,21 @@ // Refer to the license.txt file included. #include <memory> -#include <thread> #include "video_core/renderer_vulkan/vk_buffer_cache.h" -#include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_fence_manager.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_texture_cache.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { -InnerFence::InnerFence(const VKDevice& device, VKScheduler& scheduler, u32 payload, bool is_stubbed) - : VideoCommon::FenceBase(payload, is_stubbed), device{device}, scheduler{scheduler} {} +InnerFence::InnerFence(VKScheduler& scheduler_, u32 payload_, bool is_stubbed_) + : FenceBase{payload_, is_stubbed_}, scheduler{scheduler_} {} -InnerFence::InnerFence(const VKDevice& device, VKScheduler& scheduler, GPUVAddr address, - u32 payload, bool is_stubbed) - : VideoCommon::FenceBase(address, payload, is_stubbed), device{device}, scheduler{scheduler} {} +InnerFence::InnerFence(VKScheduler& scheduler_, GPUVAddr address_, u32 payload_, bool is_stubbed_) + : FenceBase{address_, payload_, is_stubbed_}, scheduler{scheduler_} {} InnerFence::~InnerFence() = default; @@ -27,63 +25,38 @@ void InnerFence::Queue() { if (is_stubbed) { return; } - ASSERT(!event); - - event = device.GetLogical().CreateNewEvent(); - ticks = scheduler.Ticks(); - - scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([event = *event](vk::CommandBuffer cmdbuf) { - cmdbuf.SetEvent(event, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT); - }); + // Get the current tick so we can wait for it + wait_tick = scheduler.CurrentTick(); + scheduler.Flush(); } bool InnerFence::IsSignaled() const { if (is_stubbed) { return true; } - ASSERT(event); - return IsEventSignalled(); + return scheduler.IsFree(wait_tick); } void InnerFence::Wait() { if (is_stubbed) { return; } - ASSERT(event); - - if (ticks >= scheduler.Ticks()) { - scheduler.Flush(); - } - while (!IsEventSignalled()) { - std::this_thread::yield(); - } -} - -bool InnerFence::IsEventSignalled() const { - switch (const VkResult result = event.GetStatus()) { - case VK_EVENT_SET: - return true; - case VK_EVENT_RESET: - return false; - default: - throw vk::Exception(result); - } + scheduler.Wait(wait_tick); } -VKFenceManager::VKFenceManager(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - const VKDevice& device, VKScheduler& scheduler, - VKTextureCache& texture_cache, VKBufferCache& buffer_cache, - VKQueryCache& query_cache) - : GenericFenceManager(system, rasterizer, texture_cache, buffer_cache, query_cache), - device{device}, scheduler{scheduler} {} +VKFenceManager::VKFenceManager(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_, + TextureCache& texture_cache_, BufferCache& buffer_cache_, + VKQueryCache& query_cache_, const Device& device_, + VKScheduler& scheduler_) + : GenericFenceManager{rasterizer_, gpu_, texture_cache_, buffer_cache_, query_cache_}, + scheduler{scheduler_} {} Fence VKFenceManager::CreateFence(u32 value, bool is_stubbed) { - return std::make_shared<InnerFence>(device, scheduler, value, is_stubbed); + return std::make_shared<InnerFence>(scheduler, value, is_stubbed); } Fence VKFenceManager::CreateFence(GPUVAddr addr, u32 value, bool is_stubbed) { - return std::make_shared<InnerFence>(device, scheduler, addr, value, is_stubbed); + return std::make_shared<InnerFence>(scheduler, addr, value, is_stubbed); } void VKFenceManager::QueueFence(Fence& fence) { diff --git a/src/video_core/renderer_vulkan/vk_fence_manager.h b/src/video_core/renderer_vulkan/vk_fence_manager.h index 043fe7947..2f8322d29 100644 --- a/src/video_core/renderer_vulkan/vk_fence_manager.h +++ b/src/video_core/renderer_vulkan/vk_fence_manager.h @@ -8,7 +8,8 @@ #include "video_core/fence_manager.h" #include "video_core/renderer_vulkan/vk_buffer_cache.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/renderer_vulkan/vk_texture_cache.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Core { class System; @@ -20,18 +21,14 @@ class RasterizerInterface; namespace Vulkan { -class VKBufferCache; -class VKDevice; +class Device; class VKQueryCache; class VKScheduler; -class VKTextureCache; class InnerFence : public VideoCommon::FenceBase { public: - explicit InnerFence(const VKDevice& device, VKScheduler& scheduler, u32 payload, - bool is_stubbed); - explicit InnerFence(const VKDevice& device, VKScheduler& scheduler, GPUVAddr address, - u32 payload, bool is_stubbed); + explicit InnerFence(VKScheduler& scheduler_, u32 payload_, bool is_stubbed_); + explicit InnerFence(VKScheduler& scheduler_, GPUVAddr address_, u32 payload_, bool is_stubbed_); ~InnerFence(); void Queue(); @@ -41,24 +38,20 @@ public: void Wait(); private: - bool IsEventSignalled() const; - - const VKDevice& device; VKScheduler& scheduler; - vk::Event event; - u64 ticks = 0; + u64 wait_tick = 0; }; using Fence = std::shared_ptr<InnerFence>; using GenericFenceManager = - VideoCommon::FenceManager<Fence, VKTextureCache, VKBufferCache, VKQueryCache>; + VideoCommon::FenceManager<Fence, TextureCache, BufferCache, VKQueryCache>; class VKFenceManager final : public GenericFenceManager { public: - explicit VKFenceManager(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - const VKDevice& device, VKScheduler& scheduler, - VKTextureCache& texture_cache, VKBufferCache& buffer_cache, - VKQueryCache& query_cache); + explicit VKFenceManager(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu, + TextureCache& texture_cache, BufferCache& buffer_cache, + VKQueryCache& query_cache, const Device& device, + VKScheduler& scheduler); protected: Fence CreateFence(u32 value, bool is_stubbed) override; @@ -68,7 +61,6 @@ protected: void WaitFence(Fence& fence) override; private: - const VKDevice& device; VKScheduler& scheduler; }; diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index 2e46c6278..fc6dd83eb 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -12,13 +12,12 @@ #include "video_core/renderer_vulkan/fixed_pipeline_state.h" #include "video_core/renderer_vulkan/maxwell_to_vk.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" -#include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_graphics_pipeline.h" #include "video_core/renderer_vulkan/vk_pipeline_cache.h" -#include "video_core/renderer_vulkan/vk_renderpass_cache.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_update_descriptor.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { @@ -69,23 +68,45 @@ VkViewportSwizzleNV UnpackViewportSwizzle(u16 swizzle) { }; } +VkSampleCountFlagBits ConvertMsaaMode(Tegra::Texture::MsaaMode msaa_mode) { + switch (msaa_mode) { + case Tegra::Texture::MsaaMode::Msaa1x1: + return VK_SAMPLE_COUNT_1_BIT; + case Tegra::Texture::MsaaMode::Msaa2x1: + case Tegra::Texture::MsaaMode::Msaa2x1_D3D: + return VK_SAMPLE_COUNT_2_BIT; + case Tegra::Texture::MsaaMode::Msaa2x2: + case Tegra::Texture::MsaaMode::Msaa2x2_VC4: + case Tegra::Texture::MsaaMode::Msaa2x2_VC12: + return VK_SAMPLE_COUNT_4_BIT; + case Tegra::Texture::MsaaMode::Msaa4x2: + case Tegra::Texture::MsaaMode::Msaa4x2_D3D: + case Tegra::Texture::MsaaMode::Msaa4x2_VC8: + case Tegra::Texture::MsaaMode::Msaa4x2_VC24: + return VK_SAMPLE_COUNT_8_BIT; + case Tegra::Texture::MsaaMode::Msaa4x4: + return VK_SAMPLE_COUNT_16_BIT; + default: + UNREACHABLE_MSG("Invalid msaa_mode={}", static_cast<int>(msaa_mode)); + return VK_SAMPLE_COUNT_1_BIT; + } +} + } // Anonymous namespace -VKGraphicsPipeline::VKGraphicsPipeline(const VKDevice& device, VKScheduler& scheduler, - VKDescriptorPool& descriptor_pool, - VKUpdateDescriptorQueue& update_descriptor_queue, - VKRenderPassCache& renderpass_cache, +VKGraphicsPipeline::VKGraphicsPipeline(const Device& device_, VKScheduler& scheduler_, + VKDescriptorPool& descriptor_pool_, + VKUpdateDescriptorQueue& update_descriptor_queue_, const GraphicsPipelineCacheKey& key, vk::Span<VkDescriptorSetLayoutBinding> bindings, - const SPIRVProgram& program) - : device{device}, scheduler{scheduler}, cache_key{key}, hash{cache_key.Hash()}, + const SPIRVProgram& program, u32 num_color_buffers) + : device{device_}, scheduler{scheduler_}, cache_key{key}, hash{cache_key.Hash()}, descriptor_set_layout{CreateDescriptorSetLayout(bindings)}, - descriptor_allocator{descriptor_pool, *descriptor_set_layout}, - update_descriptor_queue{update_descriptor_queue}, layout{CreatePipelineLayout()}, - descriptor_template{CreateDescriptorUpdateTemplate(program)}, modules{CreateShaderModules( - program)}, - renderpass{renderpass_cache.GetRenderPass(cache_key.renderpass_params)}, - pipeline{CreatePipeline(cache_key.renderpass_params, program)} {} + descriptor_allocator{descriptor_pool_, *descriptor_set_layout}, + update_descriptor_queue{update_descriptor_queue_}, layout{CreatePipelineLayout()}, + descriptor_template{CreateDescriptorUpdateTemplate(program)}, + modules(CreateShaderModules(program)), + pipeline(CreatePipeline(program, cache_key.renderpass, num_color_buffers)) {} VKGraphicsPipeline::~VKGraphicsPipeline() = default; @@ -93,7 +114,7 @@ VkDescriptorSet VKGraphicsPipeline::CommitDescriptorSet() { if (!descriptor_template) { return {}; } - const auto set = descriptor_allocator.Commit(scheduler.GetFence()); + const VkDescriptorSet set = descriptor_allocator.Commit(); update_descriptor_queue.Send(*descriptor_template, set); return set; } @@ -159,10 +180,12 @@ std::vector<vk::ShaderModule> VKGraphicsPipeline::CreateShaderModules( .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, .pNext = nullptr, .flags = 0, + .codeSize = 0, + .pCode = nullptr, }; - std::vector<vk::ShaderModule> modules; - modules.reserve(Maxwell::MaxShaderStage); + std::vector<vk::ShaderModule> shader_modules; + shader_modules.reserve(Maxwell::MaxShaderStage); for (std::size_t i = 0; i < Maxwell::MaxShaderStage; ++i) { const auto& stage = program[i]; if (!stage) { @@ -173,13 +196,14 @@ std::vector<vk::ShaderModule> VKGraphicsPipeline::CreateShaderModules( ci.codeSize = stage->code.size() * sizeof(u32); ci.pCode = stage->code.data(); - modules.push_back(device.GetLogical().CreateShaderModule(ci)); + shader_modules.push_back(device.GetLogical().CreateShaderModule(ci)); } - return modules; + return shader_modules; } -vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpass_params, - const SPIRVProgram& program) const { +vk::Pipeline VKGraphicsPipeline::CreatePipeline(const SPIRVProgram& program, + VkRenderPass renderpass, + u32 num_color_buffers) const { const auto& state = cache_key.fixed_state; const auto& viewport_swizzles = state.viewport_swizzles; @@ -189,11 +213,7 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa // state is ignored dynamic.raw1 = 0; dynamic.raw2 = 0; - for (FixedPipelineState::VertexBinding& binding : dynamic.vertex_bindings) { - // Enable all vertex bindings - binding.raw = 0; - binding.enabled.Assign(1); - } + dynamic.vertex_strides.fill(0); } else { dynamic = state.dynamic_state; } @@ -201,19 +221,13 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa std::vector<VkVertexInputBindingDescription> vertex_bindings; std::vector<VkVertexInputBindingDivisorDescriptionEXT> vertex_binding_divisors; for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) { - const auto& binding = dynamic.vertex_bindings[index]; - if (!binding.enabled) { - continue; - } const bool instanced = state.binding_divisors[index] != 0; const auto rate = instanced ? VK_VERTEX_INPUT_RATE_INSTANCE : VK_VERTEX_INPUT_RATE_VERTEX; - vertex_bindings.push_back({ .binding = static_cast<u32>(index), - .stride = binding.stride, + .stride = dynamic.vertex_strides[index], .inputRate = rate, }); - if (instanced) { vertex_binding_divisors.push_back({ .binding = static_cast<u32>(index), @@ -229,7 +243,7 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa if (!attribute.enabled) { continue; } - if (input_attributes.find(static_cast<u32>(index)) == input_attributes.end()) { + if (!input_attributes.contains(static_cast<u32>(index))) { // Skip attributes not used by the vertex shaders. continue; } @@ -261,12 +275,12 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa vertex_input_ci.pNext = &input_divisor_ci; } - const auto input_assembly_topology = MaxwellToVK::PrimitiveTopology(device, dynamic.Topology()); + const auto input_assembly_topology = MaxwellToVK::PrimitiveTopology(device, state.topology); const VkPipelineInputAssemblyStateCreateInfo input_assembly_ci{ .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO, .pNext = nullptr, .flags = 0, - .topology = MaxwellToVK::PrimitiveTopology(device, dynamic.Topology()), + .topology = MaxwellToVK::PrimitiveTopology(device, state.topology), .primitiveRestartEnable = state.primitive_restart_enable != 0 && SupportsPrimitiveRestart(input_assembly_topology), }; @@ -289,8 +303,7 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa }; std::array<VkViewportSwizzleNV, Maxwell::NumViewports> swizzles; - std::transform(viewport_swizzles.begin(), viewport_swizzles.end(), swizzles.begin(), - UnpackViewportSwizzle); + std::ranges::transform(viewport_swizzles, swizzles.begin(), UnpackViewportSwizzle); VkPipelineViewportSwizzleStateCreateInfoNV swizzle_ci{ .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_SWIZZLE_STATE_CREATE_INFO_NV, .pNext = nullptr, @@ -311,8 +324,8 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa .rasterizerDiscardEnable = static_cast<VkBool32>(state.rasterize_enable == 0 ? VK_TRUE : VK_FALSE), .polygonMode = VK_POLYGON_MODE_FILL, - .cullMode = - dynamic.cull_enable ? MaxwellToVK::CullFace(dynamic.CullFace()) : VK_CULL_MODE_NONE, + .cullMode = static_cast<VkCullModeFlags>( + dynamic.cull_enable ? MaxwellToVK::CullFace(dynamic.CullFace()) : VK_CULL_MODE_NONE), .frontFace = MaxwellToVK::FrontFace(dynamic.FrontFace()), .depthBiasEnable = state.depth_bias_enable, .depthBiasConstantFactor = 0.0f, @@ -325,7 +338,7 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, .pNext = nullptr, .flags = 0, - .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT, + .rasterizationSamples = ConvertMsaaMode(state.msaa_mode), .sampleShadingEnable = VK_FALSE, .minSampleShading = 0.0f, .pSampleMask = nullptr, @@ -351,8 +364,7 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa }; std::array<VkPipelineColorBlendAttachmentState, Maxwell::NumRenderTargets> cb_attachments; - const auto num_attachments = static_cast<std::size_t>(renderpass_params.num_color_attachments); - for (std::size_t index = 0; index < num_attachments; ++index) { + for (std::size_t index = 0; index < num_color_buffers; ++index) { static constexpr std::array COMPONENT_TABLE{ VK_COLOR_COMPONENT_R_BIT, VK_COLOR_COMPONENT_G_BIT, @@ -386,8 +398,9 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa .flags = 0, .logicOpEnable = VK_FALSE, .logicOp = VK_LOGIC_OP_COPY, - .attachmentCount = static_cast<u32>(num_attachments), + .attachmentCount = num_color_buffers, .pAttachments = cb_attachments.data(), + .blendConstants = {}, }; std::vector dynamic_states{ @@ -400,7 +413,6 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa static constexpr std::array extended{ VK_DYNAMIC_STATE_CULL_MODE_EXT, VK_DYNAMIC_STATE_FRONT_FACE_EXT, - VK_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY_EXT, VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE_EXT, VK_DYNAMIC_STATE_DEPTH_TEST_ENABLE_EXT, VK_DYNAMIC_STATE_DEPTH_WRITE_ENABLE_EXT, @@ -446,8 +458,7 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa stage_ci.pNext = &subgroup_size_ci; } } - - const VkGraphicsPipelineCreateInfo ci{ + return device.GetLogical().CreateGraphicsPipeline(VkGraphicsPipelineCreateInfo{ .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, .pNext = nullptr, .flags = 0, @@ -467,8 +478,7 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa .subpass = 0, .basePipelineHandle = nullptr, .basePipelineIndex = 0, - }; - return device.GetLogical().CreateGraphicsPipeline(ci); + }); } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h index 58aa35efd..8b6a98fe0 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h @@ -8,20 +8,19 @@ #include <optional> #include <vector> +#include "common/common_types.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_vulkan/fixed_pipeline_state.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" -#include "video_core/renderer_vulkan/vk_renderpass_cache.h" #include "video_core/renderer_vulkan/vk_shader_decompiler.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { using Maxwell = Tegra::Engines::Maxwell3D::Regs; struct GraphicsPipelineCacheKey { - RenderPassParams renderpass_params; - u32 padding; + VkRenderPass renderpass; std::array<GPUVAddr, Maxwell::MaxShaderProgram> shaders; FixedPipelineState fixed_state; @@ -34,16 +33,15 @@ struct GraphicsPipelineCacheKey { } std::size_t Size() const noexcept { - return sizeof(renderpass_params) + sizeof(padding) + sizeof(shaders) + fixed_state.Size(); + return sizeof(renderpass) + sizeof(shaders) + fixed_state.Size(); } }; static_assert(std::has_unique_object_representations_v<GraphicsPipelineCacheKey>); static_assert(std::is_trivially_copyable_v<GraphicsPipelineCacheKey>); static_assert(std::is_trivially_constructible_v<GraphicsPipelineCacheKey>); +class Device; class VKDescriptorPool; -class VKDevice; -class VKRenderPassCache; class VKScheduler; class VKUpdateDescriptorQueue; @@ -51,13 +49,12 @@ using SPIRVProgram = std::array<std::optional<SPIRVShader>, Maxwell::MaxShaderSt class VKGraphicsPipeline final { public: - explicit VKGraphicsPipeline(const VKDevice& device, VKScheduler& scheduler, + explicit VKGraphicsPipeline(const Device& device_, VKScheduler& scheduler_, VKDescriptorPool& descriptor_pool, - VKUpdateDescriptorQueue& update_descriptor_queue, - VKRenderPassCache& renderpass_cache, + VKUpdateDescriptorQueue& update_descriptor_queue_, const GraphicsPipelineCacheKey& key, vk::Span<VkDescriptorSetLayoutBinding> bindings, - const SPIRVProgram& program); + const SPIRVProgram& program, u32 num_color_buffers); ~VKGraphicsPipeline(); VkDescriptorSet CommitDescriptorSet(); @@ -70,10 +67,6 @@ public: return *layout; } - VkRenderPass GetRenderPass() const { - return renderpass; - } - GraphicsPipelineCacheKey GetCacheKey() const { return cache_key; } @@ -89,10 +82,10 @@ private: std::vector<vk::ShaderModule> CreateShaderModules(const SPIRVProgram& program) const; - vk::Pipeline CreatePipeline(const RenderPassParams& renderpass_params, - const SPIRVProgram& program) const; + vk::Pipeline CreatePipeline(const SPIRVProgram& program, VkRenderPass renderpass, + u32 num_color_buffers) const; - const VKDevice& device; + const Device& device; VKScheduler& scheduler; const GraphicsPipelineCacheKey cache_key; const u64 hash; @@ -104,7 +97,6 @@ private: vk::DescriptorUpdateTemplateKHR descriptor_template; std::vector<vk::ShaderModule> modules; - VkRenderPass renderpass; vk::Pipeline pipeline; }; diff --git a/src/video_core/renderer_vulkan/vk_image.cpp b/src/video_core/renderer_vulkan/vk_image.cpp deleted file mode 100644 index 1c418ea17..000000000 --- a/src/video_core/renderer_vulkan/vk_image.cpp +++ /dev/null @@ -1,135 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include <memory> -#include <vector> - -#include "common/assert.h" -#include "video_core/renderer_vulkan/vk_device.h" -#include "video_core/renderer_vulkan/vk_image.h" -#include "video_core/renderer_vulkan/vk_scheduler.h" -#include "video_core/renderer_vulkan/wrapper.h" - -namespace Vulkan { - -VKImage::VKImage(const VKDevice& device, VKScheduler& scheduler, const VkImageCreateInfo& image_ci, - VkImageAspectFlags aspect_mask) - : device{device}, scheduler{scheduler}, format{image_ci.format}, aspect_mask{aspect_mask}, - image_num_layers{image_ci.arrayLayers}, image_num_levels{image_ci.mipLevels} { - UNIMPLEMENTED_IF_MSG(image_ci.queueFamilyIndexCount != 0, - "Queue family tracking is not implemented"); - - image = device.GetLogical().CreateImage(image_ci); - - const u32 num_ranges = image_num_layers * image_num_levels; - barriers.resize(num_ranges); - subrange_states.resize(num_ranges, {{}, image_ci.initialLayout}); -} - -VKImage::~VKImage() = default; - -void VKImage::Transition(u32 base_layer, u32 num_layers, u32 base_level, u32 num_levels, - VkPipelineStageFlags new_stage_mask, VkAccessFlags new_access, - VkImageLayout new_layout) { - if (!HasChanged(base_layer, num_layers, base_level, num_levels, new_access, new_layout)) { - return; - } - - std::size_t cursor = 0; - for (u32 layer_it = 0; layer_it < num_layers; ++layer_it) { - for (u32 level_it = 0; level_it < num_levels; ++level_it, ++cursor) { - const u32 layer = base_layer + layer_it; - const u32 level = base_level + level_it; - auto& state = GetSubrangeState(layer, level); - auto& barrier = barriers[cursor]; - barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; - barrier.pNext = nullptr; - barrier.srcAccessMask = state.access; - barrier.dstAccessMask = new_access; - barrier.oldLayout = state.layout; - barrier.newLayout = new_layout; - barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.image = *image; - barrier.subresourceRange.aspectMask = aspect_mask; - barrier.subresourceRange.baseMipLevel = level; - barrier.subresourceRange.levelCount = 1; - barrier.subresourceRange.baseArrayLayer = layer; - barrier.subresourceRange.layerCount = 1; - state.access = new_access; - state.layout = new_layout; - } - } - - scheduler.RequestOutsideRenderPassOperationContext(); - - scheduler.Record([barriers = barriers, cursor](vk::CommandBuffer cmdbuf) { - // TODO(Rodrigo): Implement a way to use the latest stage across subresources. - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, - VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, {}, {}, - vk::Span(barriers.data(), cursor)); - }); -} - -bool VKImage::HasChanged(u32 base_layer, u32 num_layers, u32 base_level, u32 num_levels, - VkAccessFlags new_access, VkImageLayout new_layout) noexcept { - const bool is_full_range = base_layer == 0 && num_layers == image_num_layers && - base_level == 0 && num_levels == image_num_levels; - if (!is_full_range) { - state_diverged = true; - } - - if (!state_diverged) { - auto& state = GetSubrangeState(0, 0); - if (state.access != new_access || state.layout != new_layout) { - return true; - } - } - - for (u32 layer_it = 0; layer_it < num_layers; ++layer_it) { - for (u32 level_it = 0; level_it < num_levels; ++level_it) { - const u32 layer = base_layer + layer_it; - const u32 level = base_level + level_it; - auto& state = GetSubrangeState(layer, level); - if (state.access != new_access || state.layout != new_layout) { - return true; - } - } - } - return false; -} - -void VKImage::CreatePresentView() { - // Image type has to be 2D to be presented. - present_view = device.GetLogical().CreateImageView({ - .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, - .pNext = nullptr, - .flags = 0, - .image = *image, - .viewType = VK_IMAGE_VIEW_TYPE_2D, - .format = format, - .components = - { - .r = VK_COMPONENT_SWIZZLE_IDENTITY, - .g = VK_COMPONENT_SWIZZLE_IDENTITY, - .b = VK_COMPONENT_SWIZZLE_IDENTITY, - .a = VK_COMPONENT_SWIZZLE_IDENTITY, - }, - .subresourceRange = - { - .aspectMask = aspect_mask, - .baseMipLevel = 0, - .levelCount = 1, - .baseArrayLayer = 0, - .layerCount = 1, - }, - }); -} - -VKImage::SubrangeState& VKImage::GetSubrangeState(u32 layer, u32 level) noexcept { - return subrange_states[static_cast<std::size_t>(layer * image_num_levels) + - static_cast<std::size_t>(level)]; -} - -} // namespace Vulkan
\ No newline at end of file diff --git a/src/video_core/renderer_vulkan/vk_image.h b/src/video_core/renderer_vulkan/vk_image.h deleted file mode 100644 index b4d7229e5..000000000 --- a/src/video_core/renderer_vulkan/vk_image.h +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <memory> -#include <vector> - -#include "common/common_types.h" -#include "video_core/renderer_vulkan/wrapper.h" - -namespace Vulkan { - -class VKDevice; -class VKScheduler; - -class VKImage { -public: - explicit VKImage(const VKDevice& device, VKScheduler& scheduler, - const VkImageCreateInfo& image_ci, VkImageAspectFlags aspect_mask); - ~VKImage(); - - /// Records in the passed command buffer an image transition and updates the state of the image. - void Transition(u32 base_layer, u32 num_layers, u32 base_level, u32 num_levels, - VkPipelineStageFlags new_stage_mask, VkAccessFlags new_access, - VkImageLayout new_layout); - - /// Returns a view compatible with presentation, the image has to be 2D. - VkImageView GetPresentView() { - if (!present_view) { - CreatePresentView(); - } - return *present_view; - } - - /// Returns the Vulkan image handler. - const vk::Image& GetHandle() const { - return image; - } - - /// Returns the Vulkan format for this image. - VkFormat GetFormat() const { - return format; - } - - /// Returns the Vulkan aspect mask. - VkImageAspectFlags GetAspectMask() const { - return aspect_mask; - } - -private: - struct SubrangeState final { - VkAccessFlags access = 0; ///< Current access bits. - VkImageLayout layout = VK_IMAGE_LAYOUT_UNDEFINED; ///< Current image layout. - }; - - bool HasChanged(u32 base_layer, u32 num_layers, u32 base_level, u32 num_levels, - VkAccessFlags new_access, VkImageLayout new_layout) noexcept; - - /// Creates a presentation view. - void CreatePresentView(); - - /// Returns the subrange state for a layer and layer. - SubrangeState& GetSubrangeState(u32 layer, u32 level) noexcept; - - const VKDevice& device; ///< Device handler. - VKScheduler& scheduler; ///< Device scheduler. - - const VkFormat format; ///< Vulkan format. - const VkImageAspectFlags aspect_mask; ///< Vulkan aspect mask. - const u32 image_num_layers; ///< Number of layers. - const u32 image_num_levels; ///< Number of mipmap levels. - - vk::Image image; ///< Image handle. - vk::ImageView present_view; ///< Image view compatible with presentation. - - std::vector<VkImageMemoryBarrier> barriers; ///< Pool of barriers. - std::vector<SubrangeState> subrange_states; ///< Current subrange state. - - bool state_diverged = false; ///< True when subresources mismatch in layout. -}; - -} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_master_semaphore.cpp b/src/video_core/renderer_vulkan/vk_master_semaphore.cpp new file mode 100644 index 000000000..56ec5e380 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_master_semaphore.cpp @@ -0,0 +1,56 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <atomic> +#include <chrono> + +#include "core/settings.h" +#include "video_core/renderer_vulkan/vk_master_semaphore.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" + +namespace Vulkan { + +using namespace std::chrono_literals; + +MasterSemaphore::MasterSemaphore(const Device& device) { + static constexpr VkSemaphoreTypeCreateInfoKHR semaphore_type_ci{ + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR, + .pNext = nullptr, + .semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE_KHR, + .initialValue = 0, + }; + static constexpr VkSemaphoreCreateInfo semaphore_ci{ + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, + .pNext = &semaphore_type_ci, + .flags = 0, + }; + semaphore = device.GetLogical().CreateSemaphore(semaphore_ci); + + if (!Settings::values.renderer_debug) { + return; + } + // Validation layers have a bug where they fail to track resource usage when using timeline + // semaphores and synchronizing with GetSemaphoreCounterValueKHR. To workaround this issue, have + // a separate thread waiting for each timeline semaphore value. + debug_thread = std::thread([this] { + u64 counter = 0; + while (!shutdown) { + if (semaphore.Wait(counter, 10'000'000)) { + ++counter; + } + } + }); +} + +MasterSemaphore::~MasterSemaphore() { + shutdown = true; + + // This thread might not be started + if (debug_thread.joinable()) { + debug_thread.join(); + } +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_master_semaphore.h b/src/video_core/renderer_vulkan/vk_master_semaphore.h new file mode 100644 index 000000000..2c7ed654d --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_master_semaphore.h @@ -0,0 +1,75 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <atomic> +#include <thread> + +#include "common/common_types.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" + +namespace Vulkan { + +class Device; + +class MasterSemaphore { +public: + explicit MasterSemaphore(const Device& device); + ~MasterSemaphore(); + + /// Returns the current logical tick. + [[nodiscard]] u64 CurrentTick() const noexcept { + return current_tick.load(std::memory_order_relaxed); + } + + /// Returns the last known GPU tick. + [[nodiscard]] u64 KnownGpuTick() const noexcept { + return gpu_tick.load(std::memory_order_relaxed); + } + + /// Returns the timeline semaphore handle. + [[nodiscard]] VkSemaphore Handle() const noexcept { + return *semaphore; + } + + /// Returns true when a tick has been hit by the GPU. + [[nodiscard]] bool IsFree(u64 tick) { + return gpu_tick.load(std::memory_order_relaxed) >= tick; + } + + /// Advance to the logical tick. + void NextTick() noexcept { + ++current_tick; + } + + /// Refresh the known GPU tick + void Refresh() { + gpu_tick.store(semaphore.GetCounter(), std::memory_order_relaxed); + } + + /// Waits for a tick to be hit on the GPU + void Wait(u64 tick) { + // No need to wait if the GPU is ahead of the tick + if (IsFree(tick)) { + return; + } + // Update the GPU tick and try again + Refresh(); + if (IsFree(tick)) { + return; + } + // If none of the above is hit, fallback to a regular wait + semaphore.Wait(tick); + } + +private: + vk::Semaphore semaphore; ///< Timeline semaphore. + std::atomic<u64> gpu_tick{0}; ///< Current known GPU tick. + std::atomic<u64> current_tick{1}; ///< Current logical tick. + std::atomic<bool> shutdown{false}; ///< True when the object is being destroyed. + std::thread debug_thread; ///< Debug thread to workaround validation layer bugs. +}; + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_memory_manager.cpp b/src/video_core/renderer_vulkan/vk_memory_manager.cpp deleted file mode 100644 index 24c8960ac..000000000 --- a/src/video_core/renderer_vulkan/vk_memory_manager.cpp +++ /dev/null @@ -1,230 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include <algorithm> -#include <optional> -#include <tuple> -#include <vector> - -#include "common/alignment.h" -#include "common/assert.h" -#include "common/common_types.h" -#include "common/logging/log.h" -#include "video_core/renderer_vulkan/vk_device.h" -#include "video_core/renderer_vulkan/vk_memory_manager.h" -#include "video_core/renderer_vulkan/wrapper.h" - -namespace Vulkan { - -namespace { - -u64 GetAllocationChunkSize(u64 required_size) { - static constexpr u64 sizes[] = {16ULL << 20, 32ULL << 20, 64ULL << 20, 128ULL << 20}; - auto it = std::lower_bound(std::begin(sizes), std::end(sizes), required_size); - return it != std::end(sizes) ? *it : Common::AlignUp(required_size, 256ULL << 20); -} - -} // Anonymous namespace - -class VKMemoryAllocation final { -public: - explicit VKMemoryAllocation(const VKDevice& device, vk::DeviceMemory memory, - VkMemoryPropertyFlags properties, u64 allocation_size, u32 type) - : device{device}, memory{std::move(memory)}, properties{properties}, - allocation_size{allocation_size}, shifted_type{ShiftType(type)} {} - - VKMemoryCommit Commit(VkDeviceSize commit_size, VkDeviceSize alignment) { - auto found = TryFindFreeSection(free_iterator, allocation_size, - static_cast<u64>(commit_size), static_cast<u64>(alignment)); - if (!found) { - found = TryFindFreeSection(0, free_iterator, static_cast<u64>(commit_size), - static_cast<u64>(alignment)); - if (!found) { - // Signal out of memory, it'll try to do more allocations. - return nullptr; - } - } - auto commit = std::make_unique<VKMemoryCommitImpl>(device, this, memory, *found, - *found + commit_size); - commits.push_back(commit.get()); - - // Last commit's address is highly probable to be free. - free_iterator = *found + commit_size; - - return commit; - } - - void Free(const VKMemoryCommitImpl* commit) { - ASSERT(commit); - - const auto it = std::find(std::begin(commits), std::end(commits), commit); - if (it == commits.end()) { - UNREACHABLE_MSG("Freeing unallocated commit!"); - return; - } - commits.erase(it); - } - - /// Returns whether this allocation is compatible with the arguments. - bool IsCompatible(VkMemoryPropertyFlags wanted_properties, u32 type_mask) const { - return (wanted_properties & properties) && (type_mask & shifted_type) != 0; - } - -private: - static constexpr u32 ShiftType(u32 type) { - return 1U << type; - } - - /// A memory allocator, it may return a free region between "start" and "end" with the solicited - /// requirements. - std::optional<u64> TryFindFreeSection(u64 start, u64 end, u64 size, u64 alignment) const { - u64 iterator = Common::AlignUp(start, alignment); - while (iterator + size <= end) { - const u64 try_left = iterator; - const u64 try_right = try_left + size; - - bool overlap = false; - for (const auto& commit : commits) { - const auto [commit_left, commit_right] = commit->interval; - if (try_left < commit_right && commit_left < try_right) { - // There's an overlap, continue the search where the overlapping commit ends. - iterator = Common::AlignUp(commit_right, alignment); - overlap = true; - break; - } - } - if (!overlap) { - // A free address has been found. - return try_left; - } - } - - // No free regions where found, return an empty optional. - return std::nullopt; - } - - const VKDevice& device; ///< Vulkan device. - const vk::DeviceMemory memory; ///< Vulkan memory allocation handler. - const VkMemoryPropertyFlags properties; ///< Vulkan properties. - const u64 allocation_size; ///< Size of this allocation. - const u32 shifted_type; ///< Stored Vulkan type of this allocation, shifted. - - /// Hints where the next free region is likely going to be. - u64 free_iterator{}; - - /// Stores all commits done from this allocation. - std::vector<const VKMemoryCommitImpl*> commits; -}; - -VKMemoryManager::VKMemoryManager(const VKDevice& device) - : device{device}, properties{device.GetPhysical().GetMemoryProperties()} {} - -VKMemoryManager::~VKMemoryManager() = default; - -VKMemoryCommit VKMemoryManager::Commit(const VkMemoryRequirements& requirements, - bool host_visible) { - const u64 chunk_size = GetAllocationChunkSize(requirements.size); - - // When a host visible commit is asked, search for host visible and coherent, otherwise search - // for a fast device local type. - const VkMemoryPropertyFlags wanted_properties = - host_visible ? VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT - : VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; - - if (auto commit = TryAllocCommit(requirements, wanted_properties)) { - return commit; - } - - // Commit has failed, allocate more memory. - if (!AllocMemory(wanted_properties, requirements.memoryTypeBits, chunk_size)) { - // TODO(Rodrigo): Handle these situations in some way like flushing to guest memory. - // Allocation has failed, panic. - UNREACHABLE_MSG("Ran out of VRAM!"); - return {}; - } - - // Commit again, this time it won't fail since there's a fresh allocation above. If it does, - // there's a bug. - auto commit = TryAllocCommit(requirements, wanted_properties); - ASSERT(commit); - return commit; -} - -VKMemoryCommit VKMemoryManager::Commit(const vk::Buffer& buffer, bool host_visible) { - auto commit = Commit(device.GetLogical().GetBufferMemoryRequirements(*buffer), host_visible); - buffer.BindMemory(commit->GetMemory(), commit->GetOffset()); - return commit; -} - -VKMemoryCommit VKMemoryManager::Commit(const vk::Image& image, bool host_visible) { - auto commit = Commit(device.GetLogical().GetImageMemoryRequirements(*image), host_visible); - image.BindMemory(commit->GetMemory(), commit->GetOffset()); - return commit; -} - -bool VKMemoryManager::AllocMemory(VkMemoryPropertyFlags wanted_properties, u32 type_mask, - u64 size) { - const u32 type = [&] { - for (u32 type_index = 0; type_index < properties.memoryTypeCount; ++type_index) { - const auto flags = properties.memoryTypes[type_index].propertyFlags; - if ((type_mask & (1U << type_index)) && (flags & wanted_properties)) { - // The type matches in type and in the wanted properties. - return type_index; - } - } - UNREACHABLE_MSG("Couldn't find a compatible memory type!"); - return 0U; - }(); - - // Try to allocate found type. - vk::DeviceMemory memory = device.GetLogical().TryAllocateMemory({ - .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, - .pNext = nullptr, - .allocationSize = size, - .memoryTypeIndex = type, - }); - if (!memory) { - LOG_CRITICAL(Render_Vulkan, "Device allocation failed!"); - return false; - } - - allocations.push_back(std::make_unique<VKMemoryAllocation>(device, std::move(memory), - wanted_properties, size, type)); - return true; -} - -VKMemoryCommit VKMemoryManager::TryAllocCommit(const VkMemoryRequirements& requirements, - VkMemoryPropertyFlags wanted_properties) { - for (auto& allocation : allocations) { - if (!allocation->IsCompatible(wanted_properties, requirements.memoryTypeBits)) { - continue; - } - if (auto commit = allocation->Commit(requirements.size, requirements.alignment)) { - return commit; - } - } - return {}; -} - -VKMemoryCommitImpl::VKMemoryCommitImpl(const VKDevice& device, VKMemoryAllocation* allocation, - const vk::DeviceMemory& memory, u64 begin, u64 end) - : device{device}, memory{memory}, interval{begin, end}, allocation{allocation} {} - -VKMemoryCommitImpl::~VKMemoryCommitImpl() { - allocation->Free(this); -} - -MemoryMap VKMemoryCommitImpl::Map(u64 size, u64 offset_) const { - return MemoryMap{this, memory.Map(interval.first + offset_, size)}; -} - -void VKMemoryCommitImpl::Unmap() const { - memory.Unmap(); -} - -MemoryMap VKMemoryCommitImpl::Map() const { - return Map(interval.second - interval.first); -} - -} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_memory_manager.h b/src/video_core/renderer_vulkan/vk_memory_manager.h deleted file mode 100644 index 1af88e3d4..000000000 --- a/src/video_core/renderer_vulkan/vk_memory_manager.h +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <memory> -#include <utility> -#include <vector> -#include "common/common_types.h" -#include "video_core/renderer_vulkan/wrapper.h" - -namespace Vulkan { - -class MemoryMap; -class VKDevice; -class VKMemoryAllocation; -class VKMemoryCommitImpl; - -using VKMemoryCommit = std::unique_ptr<VKMemoryCommitImpl>; - -class VKMemoryManager final { -public: - explicit VKMemoryManager(const VKDevice& device); - VKMemoryManager(const VKMemoryManager&) = delete; - ~VKMemoryManager(); - - /** - * Commits a memory with the specified requeriments. - * @param requirements Requirements returned from a Vulkan call. - * @param host_visible Signals the allocator that it *must* use host visible and coherent - * memory. When passing false, it will try to allocate device local memory. - * @returns A memory commit. - */ - VKMemoryCommit Commit(const VkMemoryRequirements& requirements, bool host_visible); - - /// Commits memory required by the buffer and binds it. - VKMemoryCommit Commit(const vk::Buffer& buffer, bool host_visible); - - /// Commits memory required by the image and binds it. - VKMemoryCommit Commit(const vk::Image& image, bool host_visible); - -private: - /// Allocates a chunk of memory. - bool AllocMemory(VkMemoryPropertyFlags wanted_properties, u32 type_mask, u64 size); - - /// Tries to allocate a memory commit. - VKMemoryCommit TryAllocCommit(const VkMemoryRequirements& requirements, - VkMemoryPropertyFlags wanted_properties); - - const VKDevice& device; ///< Device handler. - const VkPhysicalDeviceMemoryProperties properties; ///< Physical device properties. - std::vector<std::unique_ptr<VKMemoryAllocation>> allocations; ///< Current allocations. -}; - -class VKMemoryCommitImpl final { - friend VKMemoryAllocation; - friend MemoryMap; - -public: - explicit VKMemoryCommitImpl(const VKDevice& device, VKMemoryAllocation* allocation, - const vk::DeviceMemory& memory, u64 begin, u64 end); - ~VKMemoryCommitImpl(); - - /// Maps a memory region and returns a pointer to it. - /// It's illegal to have more than one memory map at the same time. - MemoryMap Map(u64 size, u64 offset = 0) const; - - /// Maps the whole commit and returns a pointer to it. - /// It's illegal to have more than one memory map at the same time. - MemoryMap Map() const; - - /// Returns the Vulkan memory handler. - VkDeviceMemory GetMemory() const { - return *memory; - } - - /// Returns the start position of the commit relative to the allocation. - VkDeviceSize GetOffset() const { - return static_cast<VkDeviceSize>(interval.first); - } - -private: - /// Unmaps memory. - void Unmap() const; - - const VKDevice& device; ///< Vulkan device. - const vk::DeviceMemory& memory; ///< Vulkan device memory handler. - std::pair<u64, u64> interval{}; ///< Interval where the commit exists. - VKMemoryAllocation* allocation{}; ///< Pointer to the large memory allocation. -}; - -/// Holds ownership of a memory map. -class MemoryMap final { -public: - explicit MemoryMap(const VKMemoryCommitImpl* commit, u8* address) - : commit{commit}, address{address} {} - - ~MemoryMap() { - if (commit) { - commit->Unmap(); - } - } - - /// Prematurely releases the memory map. - void Release() { - commit->Unmap(); - commit = nullptr; - } - - /// Returns the address of the memory map. - u8* GetAddress() const { - return address; - } - - /// Returns the address of the memory map; - operator u8*() const { - return address; - } - -private: - const VKMemoryCommitImpl* commit{}; ///< Mapped memory commit. - u8* address{}; ///< Address to the mapped memory. -}; - -} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index cfdcdd6ab..8991505ca 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -7,6 +7,8 @@ #include <memory> #include <vector> +#include "common/bit_cast.h" +#include "common/cityhash.h" #include "common/microprofile.h" #include "core/core.h" #include "core/memory.h" @@ -17,18 +19,17 @@ #include "video_core/renderer_vulkan/maxwell_to_vk.h" #include "video_core/renderer_vulkan/vk_compute_pipeline.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" -#include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_graphics_pipeline.h" #include "video_core/renderer_vulkan/vk_pipeline_cache.h" #include "video_core/renderer_vulkan/vk_rasterizer.h" -#include "video_core/renderer_vulkan/vk_renderpass_cache.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_update_descriptor.h" -#include "video_core/renderer_vulkan/wrapper.h" #include "video_core/shader/compiler_settings.h" #include "video_core/shader/memory_util.h" #include "video_core/shader_cache.h" #include "video_core/shader_notify.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { @@ -51,7 +52,9 @@ constexpr VkDescriptorType STORAGE_TEXEL_BUFFER = VK_DESCRIPTOR_TYPE_STORAGE_TEX constexpr VkDescriptorType STORAGE_IMAGE = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; constexpr VideoCommon::Shader::CompilerSettings compiler_settings{ - VideoCommon::Shader::CompileDepth::FullDecompile}; + .depth = VideoCommon::Shader::CompileDepth::FullDecompile, + .disable_else_derivation = true, +}; constexpr std::size_t GetStageFromProgram(std::size_t program) { return program == 0 ? 0 : program - 1; @@ -74,7 +77,7 @@ ShaderType GetShaderType(Maxwell::ShaderProgram program) { case Maxwell::ShaderProgram::Fragment: return ShaderType::Fragment; default: - UNIMPLEMENTED_MSG("program={}", static_cast<u32>(program)); + UNIMPLEMENTED_MSG("program={}", program); return ShaderType::Vertex; } } @@ -135,64 +138,54 @@ bool ComputePipelineCacheKey::operator==(const ComputePipelineCacheKey& rhs) con return std::memcmp(&rhs, this, sizeof *this) == 0; } -Shader::Shader(Core::System& system, Tegra::Engines::ShaderType stage, GPUVAddr gpu_addr, - VideoCommon::Shader::ProgramCode program_code, u32 main_offset) - : gpu_addr{gpu_addr}, program_code{std::move(program_code)}, - registry{stage, GetEngine(system, stage)}, shader_ir{this->program_code, main_offset, - compiler_settings, registry}, - entries{GenerateShaderEntries(shader_ir)} {} +Shader::Shader(Tegra::Engines::ConstBufferEngineInterface& engine_, ShaderType stage_, + GPUVAddr gpu_addr_, VAddr cpu_addr_, ProgramCode program_code_, u32 main_offset_) + : gpu_addr(gpu_addr_), program_code(std::move(program_code_)), registry(stage_, engine_), + shader_ir(program_code, main_offset_, compiler_settings, registry), + entries(GenerateShaderEntries(shader_ir)) {} Shader::~Shader() = default; -Tegra::Engines::ConstBufferEngineInterface& Shader::GetEngine(Core::System& system, - Tegra::Engines::ShaderType stage) { - if (stage == ShaderType::Compute) { - return system.GPU().KeplerCompute(); - } else { - return system.GPU().Maxwell3D(); - } -} - -VKPipelineCache::VKPipelineCache(Core::System& system, RasterizerVulkan& rasterizer, - const VKDevice& device, VKScheduler& scheduler, - VKDescriptorPool& descriptor_pool, - VKUpdateDescriptorQueue& update_descriptor_queue, - VKRenderPassCache& renderpass_cache) - : VideoCommon::ShaderCache<Shader>{rasterizer}, system{system}, device{device}, - scheduler{scheduler}, descriptor_pool{descriptor_pool}, - update_descriptor_queue{update_descriptor_queue}, renderpass_cache{renderpass_cache} {} +VKPipelineCache::VKPipelineCache(RasterizerVulkan& rasterizer_, Tegra::GPU& gpu_, + Tegra::Engines::Maxwell3D& maxwell3d_, + Tegra::Engines::KeplerCompute& kepler_compute_, + Tegra::MemoryManager& gpu_memory_, const Device& device_, + VKScheduler& scheduler_, VKDescriptorPool& descriptor_pool_, + VKUpdateDescriptorQueue& update_descriptor_queue_) + : VideoCommon::ShaderCache<Shader>{rasterizer_}, gpu{gpu_}, maxwell3d{maxwell3d_}, + kepler_compute{kepler_compute_}, gpu_memory{gpu_memory_}, device{device_}, + scheduler{scheduler_}, descriptor_pool{descriptor_pool_}, update_descriptor_queue{ + update_descriptor_queue_} {} VKPipelineCache::~VKPipelineCache() = default; std::array<Shader*, Maxwell::MaxShaderProgram> VKPipelineCache::GetShaders() { - const auto& gpu = system.GPU().Maxwell3D(); - std::array<Shader*, Maxwell::MaxShaderProgram> shaders{}; + for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) { const auto program{static_cast<Maxwell::ShaderProgram>(index)}; // Skip stages that are not enabled - if (!gpu.regs.IsShaderConfigEnabled(index)) { + if (!maxwell3d.regs.IsShaderConfigEnabled(index)) { continue; } - auto& memory_manager{system.GPU().MemoryManager()}; - const GPUVAddr program_addr{GetShaderAddress(system, program)}; - const std::optional cpu_addr = memory_manager.GpuToCpuAddress(program_addr); + const GPUVAddr gpu_addr{GetShaderAddress(maxwell3d, program)}; + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); ASSERT(cpu_addr); Shader* result = cpu_addr ? TryGet(*cpu_addr) : null_shader.get(); if (!result) { - const auto host_ptr{memory_manager.GetPointer(program_addr)}; + const u8* const host_ptr{gpu_memory.GetPointer(gpu_addr)}; // No shader found - create a new one - constexpr u32 stage_offset = STAGE_MAIN_OFFSET; + static constexpr u32 stage_offset = STAGE_MAIN_OFFSET; const auto stage = static_cast<ShaderType>(index == 0 ? 0 : index - 1); - ProgramCode code = GetShaderCode(memory_manager, program_addr, host_ptr, false); + ProgramCode code = GetShaderCode(gpu_memory, gpu_addr, host_ptr, false); const std::size_t size_in_bytes = code.size() * sizeof(u64); - auto shader = std::make_unique<Shader>(system, stage, program_addr, std::move(code), - stage_offset); + auto shader = std::make_unique<Shader>(maxwell3d, stage, gpu_addr, *cpu_addr, + std::move(code), stage_offset); result = shader.get(); if (cpu_addr) { @@ -207,7 +200,8 @@ std::array<Shader*, Maxwell::MaxShaderProgram> VKPipelineCache::GetShaders() { } VKGraphicsPipeline* VKPipelineCache::GetGraphicsPipeline( - const GraphicsPipelineCacheKey& key, VideoCommon::Shader::AsyncShaders& async_shaders) { + const GraphicsPipelineCacheKey& key, u32 num_color_buffers, + VideoCommon::Shader::AsyncShaders& async_shaders) { MICROPROFILE_SCOPE(Vulkan_PipelineCache); if (last_graphics_pipeline && last_graphics_key == key) { @@ -215,16 +209,16 @@ VKGraphicsPipeline* VKPipelineCache::GetGraphicsPipeline( } last_graphics_key = key; - if (device.UseAsynchronousShaders() && async_shaders.IsShaderAsync(system.GPU())) { + if (device.UseAsynchronousShaders() && async_shaders.IsShaderAsync(gpu)) { std::unique_lock lock{pipeline_cache}; const auto [pair, is_cache_miss] = graphics_cache.try_emplace(key); if (is_cache_miss) { - system.GPU().ShaderNotify().MarkSharderBuilding(); + gpu.ShaderNotify().MarkSharderBuilding(); LOG_INFO(Render_Vulkan, "Compile 0x{:016X}", key.Hash()); const auto [program, bindings] = DecompileShaders(key.fixed_state); async_shaders.QueueVulkanShader(this, device, scheduler, descriptor_pool, - update_descriptor_queue, renderpass_cache, bindings, - program, key); + update_descriptor_queue, bindings, program, key, + num_color_buffers); } last_graphics_pipeline = pair->second.get(); return last_graphics_pipeline; @@ -233,13 +227,13 @@ VKGraphicsPipeline* VKPipelineCache::GetGraphicsPipeline( const auto [pair, is_cache_miss] = graphics_cache.try_emplace(key); auto& entry = pair->second; if (is_cache_miss) { - system.GPU().ShaderNotify().MarkSharderBuilding(); + gpu.ShaderNotify().MarkSharderBuilding(); LOG_INFO(Render_Vulkan, "Compile 0x{:016X}", key.Hash()); const auto [program, bindings] = DecompileShaders(key.fixed_state); entry = std::make_unique<VKGraphicsPipeline>(device, scheduler, descriptor_pool, - update_descriptor_queue, renderpass_cache, key, - bindings, program); - system.GPU().ShaderNotify().MarkShaderComplete(); + update_descriptor_queue, key, bindings, + program, num_color_buffers); + gpu.ShaderNotify().MarkShaderComplete(); } last_graphics_pipeline = entry.get(); return last_graphics_pipeline; @@ -255,22 +249,21 @@ VKComputePipeline& VKPipelineCache::GetComputePipeline(const ComputePipelineCach } LOG_INFO(Render_Vulkan, "Compile 0x{:016X}", key.Hash()); - auto& memory_manager = system.GPU().MemoryManager(); - const auto program_addr = key.shader; + const GPUVAddr gpu_addr = key.shader; - const auto cpu_addr = memory_manager.GpuToCpuAddress(program_addr); + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); ASSERT(cpu_addr); Shader* shader = cpu_addr ? TryGet(*cpu_addr) : null_kernel.get(); if (!shader) { // No shader found - create a new one - const auto host_ptr = memory_manager.GetPointer(program_addr); + const auto host_ptr = gpu_memory.GetPointer(gpu_addr); - ProgramCode code = GetShaderCode(memory_manager, program_addr, host_ptr, true); + ProgramCode code = GetShaderCode(gpu_memory, gpu_addr, host_ptr, true); const std::size_t size_in_bytes = code.size() * sizeof(u64); - auto shader_info = std::make_unique<Shader>(system, ShaderType::Compute, program_addr, - std::move(code), KERNEL_MAIN_OFFSET); + auto shader_info = std::make_unique<Shader>(kepler_compute, ShaderType::Compute, gpu_addr, + *cpu_addr, std::move(code), KERNEL_MAIN_OFFSET); shader = shader_info.get(); if (cpu_addr) { @@ -298,7 +291,7 @@ VKComputePipeline& VKPipelineCache::GetComputePipeline(const ComputePipelineCach } void VKPipelineCache::EmplacePipeline(std::unique_ptr<VKGraphicsPipeline> pipeline) { - system.GPU().ShaderNotify().MarkShaderComplete(); + gpu.ShaderNotify().MarkShaderComplete(); std::unique_lock lock{pipeline_cache}; graphics_cache.at(pipeline->GetCacheKey()) = std::move(pipeline); } @@ -339,12 +332,8 @@ void VKPipelineCache::OnShaderRemoval(Shader* shader) { std::pair<SPIRVProgram, std::vector<VkDescriptorSetLayoutBinding>> VKPipelineCache::DecompileShaders(const FixedPipelineState& fixed_state) { - auto& memory_manager = system.GPU().MemoryManager(); - const auto& gpu = system.GPU().Maxwell3D(); - Specialization specialization; - if (fixed_state.dynamic_state.Topology() == Maxwell::PrimitiveTopology::Points || - device.IsExtExtendedDynamicStateSupported()) { + if (fixed_state.topology == Maxwell::PrimitiveTopology::Points) { float point_size; std::memcpy(&point_size, &fixed_state.point_size, sizeof(float)); specialization.point_size = point_size; @@ -356,20 +345,24 @@ VKPipelineCache::DecompileShaders(const FixedPipelineState& fixed_state) { specialization.attribute_types[i] = attribute.Type(); } specialization.ndc_minus_one_to_one = fixed_state.ndc_minus_one_to_one; + specialization.early_fragment_tests = fixed_state.early_z; + + // Alpha test + specialization.alpha_test_func = + FixedPipelineState::UnpackComparisonOp(fixed_state.alpha_test_func.Value()); + specialization.alpha_test_ref = Common::BitCast<float>(fixed_state.alpha_test_ref); SPIRVProgram program; std::vector<VkDescriptorSetLayoutBinding> bindings; - for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) { + for (std::size_t index = 1; index < Maxwell::MaxShaderProgram; ++index) { const auto program_enum = static_cast<Maxwell::ShaderProgram>(index); - // Skip stages that are not enabled - if (!gpu.regs.IsShaderConfigEnabled(index)) { + if (!maxwell3d.regs.IsShaderConfigEnabled(index)) { continue; } - - const GPUVAddr gpu_addr = GetShaderAddress(system, program_enum); - const std::optional<VAddr> cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr); + const GPUVAddr gpu_addr = GetShaderAddress(maxwell3d, program_enum); + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); Shader* const shader = cpu_addr ? TryGet(*cpu_addr) : null_shader.get(); const std::size_t stage = index == 0 ? 0 : index - 1; // Stage indices are 0 - 5 @@ -377,12 +370,8 @@ VKPipelineCache::DecompileShaders(const FixedPipelineState& fixed_state) { const auto& entries = shader->GetEntries(); program[stage] = { Decompile(device, shader->GetIR(), program_type, shader->GetRegistry(), specialization), - entries}; - - if (program_enum == Maxwell::ShaderProgram::VertexA) { - // VertexB was combined with VertexA, so we skip the VertexB iteration - ++index; - } + entries, + }; const u32 old_binding = specialization.base_binding; specialization.base_binding = diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.h b/src/video_core/renderer_vulkan/vk_pipeline_cache.h index c04829e77..89d635a3d 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.h +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.h @@ -19,14 +19,13 @@ #include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_vulkan/fixed_pipeline_state.h" #include "video_core/renderer_vulkan/vk_graphics_pipeline.h" -#include "video_core/renderer_vulkan/vk_renderpass_cache.h" #include "video_core/renderer_vulkan/vk_shader_decompiler.h" -#include "video_core/renderer_vulkan/wrapper.h" #include "video_core/shader/async_shaders.h" #include "video_core/shader/memory_util.h" #include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" #include "video_core/shader_cache.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Core { class System; @@ -34,11 +33,10 @@ class System; namespace Vulkan { +class Device; class RasterizerVulkan; class VKComputePipeline; class VKDescriptorPool; -class VKDevice; -class VKFence; class VKScheduler; class VKUpdateDescriptorQueue; @@ -85,8 +83,9 @@ namespace Vulkan { class Shader { public: - explicit Shader(Core::System& system, Tegra::Engines::ShaderType stage, GPUVAddr gpu_addr, - VideoCommon::Shader::ProgramCode program_code, u32 main_offset); + explicit Shader(Tegra::Engines::ConstBufferEngineInterface& engine_, + Tegra::Engines::ShaderType stage_, GPUVAddr gpu_addr, VAddr cpu_addr_, + VideoCommon::Shader::ProgramCode program_code, u32 main_offset_); ~Shader(); GPUVAddr GetGpuAddr() const { @@ -97,22 +96,19 @@ public: return shader_ir; } - const VideoCommon::Shader::Registry& GetRegistry() const { - return registry; - } - const VideoCommon::Shader::ShaderIR& GetIR() const { return shader_ir; } + const VideoCommon::Shader::Registry& GetRegistry() const { + return registry; + } + const ShaderEntries& GetEntries() const { return entries; } private: - static Tegra::Engines::ConstBufferEngineInterface& GetEngine(Core::System& system, - Tegra::Engines::ShaderType stage); - GPUVAddr gpu_addr{}; VideoCommon::Shader::ProgramCode program_code; VideoCommon::Shader::Registry registry; @@ -122,16 +118,18 @@ private: class VKPipelineCache final : public VideoCommon::ShaderCache<Shader> { public: - explicit VKPipelineCache(Core::System& system, RasterizerVulkan& rasterizer, - const VKDevice& device, VKScheduler& scheduler, - VKDescriptorPool& descriptor_pool, - VKUpdateDescriptorQueue& update_descriptor_queue, - VKRenderPassCache& renderpass_cache); + explicit VKPipelineCache(RasterizerVulkan& rasterizer, Tegra::GPU& gpu, + Tegra::Engines::Maxwell3D& maxwell3d, + Tegra::Engines::KeplerCompute& kepler_compute, + Tegra::MemoryManager& gpu_memory, const Device& device, + VKScheduler& scheduler, VKDescriptorPool& descriptor_pool, + VKUpdateDescriptorQueue& update_descriptor_queue); ~VKPipelineCache() override; std::array<Shader*, Maxwell::MaxShaderProgram> GetShaders(); VKGraphicsPipeline* GetGraphicsPipeline(const GraphicsPipelineCacheKey& key, + u32 num_color_buffers, VideoCommon::Shader::AsyncShaders& async_shaders); VKComputePipeline& GetComputePipeline(const ComputePipelineCacheKey& key); @@ -145,12 +143,15 @@ private: std::pair<SPIRVProgram, std::vector<VkDescriptorSetLayoutBinding>> DecompileShaders( const FixedPipelineState& fixed_state); - Core::System& system; - const VKDevice& device; + Tegra::GPU& gpu; + Tegra::Engines::Maxwell3D& maxwell3d; + Tegra::Engines::KeplerCompute& kepler_compute; + Tegra::MemoryManager& gpu_memory; + + const Device& device; VKScheduler& scheduler; VKDescriptorPool& descriptor_pool; VKUpdateDescriptorQueue& update_descriptor_queue; - VKRenderPassCache& renderpass_cache; std::unique_ptr<Shader> null_shader; std::unique_ptr<Shader> null_kernel; diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp index 6cd63d090..7cadd5147 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp @@ -7,37 +7,35 @@ #include <utility> #include <vector> -#include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_query_cache.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" +#include "video_core/renderer_vulkan/vk_resource_pool.h" #include "video_core/renderer_vulkan/vk_scheduler.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { +using VideoCore::QueryType; + namespace { constexpr std::array QUERY_TARGETS = {VK_QUERY_TYPE_OCCLUSION}; -constexpr VkQueryType GetTarget(VideoCore::QueryType type) { +constexpr VkQueryType GetTarget(QueryType type) { return QUERY_TARGETS[static_cast<std::size_t>(type)]; } } // Anonymous namespace -QueryPool::QueryPool() : VKFencedPool{GROW_STEP} {} +QueryPool::QueryPool(const Device& device_, VKScheduler& scheduler, QueryType type_) + : ResourcePool{scheduler.GetMasterSemaphore(), GROW_STEP}, device{device_}, type{type_} {} QueryPool::~QueryPool() = default; -void QueryPool::Initialize(const VKDevice& device_, VideoCore::QueryType type_) { - device = &device_; - type = type_; -} - -std::pair<VkQueryPool, u32> QueryPool::Commit(VKFence& fence) { +std::pair<VkQueryPool, u32> QueryPool::Commit() { std::size_t index; do { - index = CommitResource(fence); + index = CommitResource(); } while (usage[index]); usage[index] = true; @@ -47,7 +45,7 @@ std::pair<VkQueryPool, u32> QueryPool::Commit(VKFence& fence) { void QueryPool::Allocate(std::size_t begin, std::size_t end) { usage.resize(end); - pools.push_back(device->GetLogical().CreateQueryPool({ + pools.push_back(device.GetLogical().CreateQueryPool({ .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, .pNext = nullptr, .flags = 0, @@ -68,32 +66,39 @@ void QueryPool::Reserve(std::pair<VkQueryPool, u32> query) { usage[pool_index * GROW_STEP + static_cast<std::ptrdiff_t>(query.second)] = false; } -VKQueryCache::VKQueryCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - const VKDevice& device, VKScheduler& scheduler) - : VideoCommon::QueryCacheBase<VKQueryCache, CachedQuery, CounterStream, HostCounter, - QueryPool>{system, rasterizer}, - device{device}, scheduler{scheduler} { - for (std::size_t i = 0; i < static_cast<std::size_t>(VideoCore::NumQueryTypes); ++i) { - query_pools[i].Initialize(device, static_cast<VideoCore::QueryType>(i)); +VKQueryCache::VKQueryCache(VideoCore::RasterizerInterface& rasterizer_, + Tegra::Engines::Maxwell3D& maxwell3d_, Tegra::MemoryManager& gpu_memory_, + const Device& device_, VKScheduler& scheduler_) + : QueryCacheBase{rasterizer_, maxwell3d_, gpu_memory_}, device{device_}, scheduler{scheduler_}, + query_pools{ + QueryPool{device_, scheduler_, QueryType::SamplesPassed}, + } {} + +VKQueryCache::~VKQueryCache() { + // TODO(Rodrigo): This is a hack to destroy all HostCounter instances before the base class + // destructor is called. The query cache should be redesigned to have a proper ownership model + // instead of using shared pointers. + for (size_t query_type = 0; query_type < VideoCore::NumQueryTypes; ++query_type) { + auto& stream = Stream(static_cast<QueryType>(query_type)); + stream.Update(false); + stream.Reset(); } } -VKQueryCache::~VKQueryCache() = default; - -std::pair<VkQueryPool, u32> VKQueryCache::AllocateQuery(VideoCore::QueryType type) { - return query_pools[static_cast<std::size_t>(type)].Commit(scheduler.GetFence()); +std::pair<VkQueryPool, u32> VKQueryCache::AllocateQuery(QueryType type) { + return query_pools[static_cast<std::size_t>(type)].Commit(); } -void VKQueryCache::Reserve(VideoCore::QueryType type, std::pair<VkQueryPool, u32> query) { +void VKQueryCache::Reserve(QueryType type, std::pair<VkQueryPool, u32> query) { query_pools[static_cast<std::size_t>(type)].Reserve(query); } -HostCounter::HostCounter(VKQueryCache& cache, std::shared_ptr<HostCounter> dependency, - VideoCore::QueryType type) - : VideoCommon::HostCounterBase<VKQueryCache, HostCounter>{std::move(dependency)}, cache{cache}, - type{type}, query{cache.AllocateQuery(type)}, ticks{cache.Scheduler().Ticks()} { - const vk::Device* logical = &cache.Device().GetLogical(); - cache.Scheduler().Record([logical, query = query](vk::CommandBuffer cmdbuf) { +HostCounter::HostCounter(VKQueryCache& cache_, std::shared_ptr<HostCounter> dependency_, + QueryType type_) + : HostCounterBase{std::move(dependency_)}, cache{cache_}, type{type_}, + query{cache_.AllocateQuery(type_)}, tick{cache_.GetScheduler().CurrentTick()} { + const vk::Device* logical = &cache.GetDevice().GetLogical(); + cache.GetScheduler().Record([logical, query = query](vk::CommandBuffer cmdbuf) { logical->ResetQueryPoolEXT(query.first, query.second, 1); cmdbuf.BeginQuery(query.first, query.second, VK_QUERY_CONTROL_PRECISE_BIT); }); @@ -104,26 +109,28 @@ HostCounter::~HostCounter() { } void HostCounter::EndQuery() { - cache.Scheduler().Record( + cache.GetScheduler().Record( [query = query](vk::CommandBuffer cmdbuf) { cmdbuf.EndQuery(query.first, query.second); }); } u64 HostCounter::BlockingQuery() const { - if (ticks >= cache.Scheduler().Ticks()) { - cache.Scheduler().Flush(); + if (tick >= cache.GetScheduler().CurrentTick()) { + cache.GetScheduler().Flush(); } + u64 data; - const VkResult result = cache.Device().GetLogical().GetQueryResults( + const VkResult query_result = cache.GetDevice().GetLogical().GetQueryResults( query.first, query.second, 1, sizeof(data), &data, sizeof(data), VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT); - switch (result) { + + switch (query_result) { case VK_SUCCESS: return data; case VK_ERROR_DEVICE_LOST: - cache.Device().ReportLoss(); + cache.GetDevice().ReportLoss(); [[fallthrough]]; default: - throw vk::Exception(result); + throw vk::Exception(query_result); } } diff --git a/src/video_core/renderer_vulkan/vk_query_cache.h b/src/video_core/renderer_vulkan/vk_query_cache.h index 40119e6d3..7190946b9 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.h +++ b/src/video_core/renderer_vulkan/vk_query_cache.h @@ -11,8 +11,8 @@ #include "common/common_types.h" #include "video_core/query_cache.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/renderer_vulkan/vk_resource_pool.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace VideoCore { class RasterizerInterface; @@ -21,21 +21,19 @@ class RasterizerInterface; namespace Vulkan { class CachedQuery; +class Device; class HostCounter; -class VKDevice; class VKQueryCache; class VKScheduler; using CounterStream = VideoCommon::CounterStreamBase<VKQueryCache, HostCounter>; -class QueryPool final : public VKFencedPool { +class QueryPool final : public ResourcePool { public: - explicit QueryPool(); + explicit QueryPool(const Device& device, VKScheduler& scheduler, VideoCore::QueryType type); ~QueryPool() override; - void Initialize(const VKDevice& device, VideoCore::QueryType type); - - std::pair<VkQueryPool, u32> Commit(VKFence& fence); + std::pair<VkQueryPool, u32> Commit(); void Reserve(std::pair<VkQueryPool, u32> query); @@ -45,42 +43,43 @@ protected: private: static constexpr std::size_t GROW_STEP = 512; - const VKDevice* device = nullptr; - VideoCore::QueryType type = {}; + const Device& device; + const VideoCore::QueryType type; std::vector<vk::QueryPool> pools; std::vector<bool> usage; }; class VKQueryCache final - : public VideoCommon::QueryCacheBase<VKQueryCache, CachedQuery, CounterStream, HostCounter, - QueryPool> { + : public VideoCommon::QueryCacheBase<VKQueryCache, CachedQuery, CounterStream, HostCounter> { public: - explicit VKQueryCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - const VKDevice& device, VKScheduler& scheduler); + explicit VKQueryCache(VideoCore::RasterizerInterface& rasterizer_, + Tegra::Engines::Maxwell3D& maxwell3d_, Tegra::MemoryManager& gpu_memory_, + const Device& device_, VKScheduler& scheduler_); ~VKQueryCache(); std::pair<VkQueryPool, u32> AllocateQuery(VideoCore::QueryType type); void Reserve(VideoCore::QueryType type, std::pair<VkQueryPool, u32> query); - const VKDevice& Device() const noexcept { + const Device& GetDevice() const noexcept { return device; } - VKScheduler& Scheduler() const noexcept { + VKScheduler& GetScheduler() const noexcept { return scheduler; } private: - const VKDevice& device; + const Device& device; VKScheduler& scheduler; + std::array<QueryPool, VideoCore::NumQueryTypes> query_pools; }; class HostCounter final : public VideoCommon::HostCounterBase<VKQueryCache, HostCounter> { public: - explicit HostCounter(VKQueryCache& cache, std::shared_ptr<HostCounter> dependency, - VideoCore::QueryType type); + explicit HostCounter(VKQueryCache& cache_, std::shared_ptr<HostCounter> dependency_, + VideoCore::QueryType type_); ~HostCounter(); void EndQuery(); @@ -91,13 +90,13 @@ private: VKQueryCache& cache; const VideoCore::QueryType type; const std::pair<VkQueryPool, u32> query; - const u64 ticks; + const u64 tick; }; class CachedQuery : public VideoCommon::CachedQueryBase<HostCounter> { public: - explicit CachedQuery(VKQueryCache&, VideoCore::QueryType, VAddr cpu_addr, u8* host_ptr) - : VideoCommon::CachedQueryBase<HostCounter>{cpu_addr, host_ptr} {} + explicit CachedQuery(VKQueryCache&, VideoCore::QueryType, VAddr cpu_addr_, u8* host_ptr_) + : CachedQueryBase{cpu_addr_, host_ptr_} {} }; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index ff1b52eab..394a1c4e9 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -8,8 +8,6 @@ #include <mutex> #include <vector> -#include <boost/container/static_vector.hpp> - #include "common/alignment.h" #include "common/assert.h" #include "common/logging/log.h" @@ -19,54 +17,54 @@ #include "core/settings.h" #include "video_core/engines/kepler_compute.h" #include "video_core/engines/maxwell_3d.h" +#include "video_core/renderer_vulkan/blit_image.h" #include "video_core/renderer_vulkan/fixed_pipeline_state.h" #include "video_core/renderer_vulkan/maxwell_to_vk.h" #include "video_core/renderer_vulkan/renderer_vulkan.h" #include "video_core/renderer_vulkan/vk_buffer_cache.h" -#include "video_core/renderer_vulkan/vk_compute_pass.h" #include "video_core/renderer_vulkan/vk_compute_pipeline.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" -#include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_graphics_pipeline.h" #include "video_core/renderer_vulkan/vk_pipeline_cache.h" #include "video_core/renderer_vulkan/vk_rasterizer.h" -#include "video_core/renderer_vulkan/vk_renderpass_cache.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" -#include "video_core/renderer_vulkan/vk_sampler_cache.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" #include "video_core/renderer_vulkan/vk_state_tracker.h" #include "video_core/renderer_vulkan/vk_texture_cache.h" #include "video_core/renderer_vulkan/vk_update_descriptor.h" -#include "video_core/renderer_vulkan/wrapper.h" #include "video_core/shader_cache.h" +#include "video_core/texture_cache/texture_cache.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { using Maxwell = Tegra::Engines::Maxwell3D::Regs; +using VideoCommon::ImageViewId; +using VideoCommon::ImageViewType; MICROPROFILE_DEFINE(Vulkan_WaitForWorker, "Vulkan", "Wait for worker", MP_RGB(255, 192, 192)); MICROPROFILE_DEFINE(Vulkan_Drawing, "Vulkan", "Record drawing", MP_RGB(192, 128, 128)); MICROPROFILE_DEFINE(Vulkan_Compute, "Vulkan", "Record compute", MP_RGB(192, 128, 128)); MICROPROFILE_DEFINE(Vulkan_Clearing, "Vulkan", "Record clearing", MP_RGB(192, 128, 128)); -MICROPROFILE_DEFINE(Vulkan_Geometry, "Vulkan", "Setup geometry", MP_RGB(192, 128, 128)); -MICROPROFILE_DEFINE(Vulkan_ConstBuffers, "Vulkan", "Setup constant buffers", MP_RGB(192, 128, 128)); -MICROPROFILE_DEFINE(Vulkan_GlobalBuffers, "Vulkan", "Setup global buffers", MP_RGB(192, 128, 128)); -MICROPROFILE_DEFINE(Vulkan_RenderTargets, "Vulkan", "Setup render targets", MP_RGB(192, 128, 128)); -MICROPROFILE_DEFINE(Vulkan_Textures, "Vulkan", "Setup textures", MP_RGB(192, 128, 128)); -MICROPROFILE_DEFINE(Vulkan_Images, "Vulkan", "Setup images", MP_RGB(192, 128, 128)); MICROPROFILE_DEFINE(Vulkan_PipelineCache, "Vulkan", "Pipeline cache", MP_RGB(192, 128, 128)); namespace { +struct DrawParams { + u32 base_instance; + u32 num_instances; + u32 base_vertex; + u32 num_vertices; + bool is_indexed; +}; -constexpr auto ComputeShaderIndex = static_cast<std::size_t>(Tegra::Engines::ShaderType::Compute); +constexpr auto COMPUTE_SHADER_INDEX = static_cast<size_t>(Tegra::Engines::ShaderType::Compute); -VkViewport GetViewportState(const VKDevice& device, const Maxwell& regs, std::size_t index) { +VkViewport GetViewportState(const Device& device, const Maxwell& regs, size_t index) { const auto& src = regs.viewport_transform[index]; const float width = src.scale_x * 2.0f; const float height = src.scale_y * 2.0f; const float reduce_z = regs.depth_mode == Maxwell::DepthMode::MinusOneToOne ? 1.0f : 0.0f; - VkViewport viewport{ .x = src.translate_x - src.scale_x, .y = src.translate_y - src.scale_y, @@ -75,16 +73,14 @@ VkViewport GetViewportState(const VKDevice& device, const Maxwell& regs, std::si .minDepth = src.translate_z - src.scale_z * reduce_z, .maxDepth = src.translate_z + src.scale_z, }; - if (!device.IsExtDepthRangeUnrestrictedSupported()) { viewport.minDepth = std::clamp(viewport.minDepth, 0.0f, 1.0f); viewport.maxDepth = std::clamp(viewport.maxDepth, 0.0f, 1.0f); } - return viewport; } -VkRect2D GetScissorState(const Maxwell& regs, std::size_t index) { +VkRect2D GetScissorState(const Maxwell& regs, size_t index) { const auto& src = regs.scissor_test[index]; VkRect2D scissor; if (src.enable) { @@ -104,305 +100,157 @@ VkRect2D GetScissorState(const Maxwell& regs, std::size_t index) { std::array<GPUVAddr, Maxwell::MaxShaderProgram> GetShaderAddresses( const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders) { std::array<GPUVAddr, Maxwell::MaxShaderProgram> addresses; - for (std::size_t i = 0; i < std::size(addresses); ++i) { + for (size_t i = 0; i < std::size(addresses); ++i) { addresses[i] = shaders[i] ? shaders[i]->GetGpuAddr() : 0; } return addresses; } -void TransitionImages(const std::vector<ImageView>& views, VkPipelineStageFlags pipeline_stage, - VkAccessFlags access) { - for (auto& [view, layout] : views) { - view->Transition(*layout, pipeline_stage, access); +struct TextureHandle { + constexpr TextureHandle(u32 data, bool via_header_index) { + const Tegra::Texture::TextureHandle handle{data}; + image = handle.tic_id; + sampler = via_header_index ? image : handle.tsc_id.Value(); } -} + + u32 image; + u32 sampler; +}; template <typename Engine, typename Entry> -Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry, - std::size_t stage, std::size_t index = 0) { - const auto stage_type = static_cast<Tegra::Engines::ShaderType>(stage); +TextureHandle GetTextureInfo(const Engine& engine, bool via_header_index, const Entry& entry, + size_t stage, size_t index = 0) { + const auto shader_type = static_cast<Tegra::Engines::ShaderType>(stage); if constexpr (std::is_same_v<Entry, SamplerEntry>) { if (entry.is_separated) { const u32 buffer_1 = entry.buffer; const u32 buffer_2 = entry.secondary_buffer; const u32 offset_1 = entry.offset; const u32 offset_2 = entry.secondary_offset; - const u32 handle_1 = engine.AccessConstBuffer32(stage_type, buffer_1, offset_1); - const u32 handle_2 = engine.AccessConstBuffer32(stage_type, buffer_2, offset_2); - return engine.GetTextureInfo(handle_1 | handle_2); + const u32 handle_1 = engine.AccessConstBuffer32(shader_type, buffer_1, offset_1); + const u32 handle_2 = engine.AccessConstBuffer32(shader_type, buffer_2, offset_2); + return TextureHandle(handle_1 | handle_2, via_header_index); } } if (entry.is_bindless) { - const auto tex_handle = engine.AccessConstBuffer32(stage_type, entry.buffer, entry.offset); - return engine.GetTextureInfo(tex_handle); - } - const auto& gpu_profile = engine.AccessGuestDriverProfile(); - const u32 entry_offset = static_cast<u32>(index * gpu_profile.GetTextureHandlerSize()); - const u32 offset = entry.offset + entry_offset; - if constexpr (std::is_same_v<Engine, Tegra::Engines::Maxwell3D>) { - return engine.GetStageTexture(stage_type, offset); - } else { - return engine.GetTexture(offset); - } -} - -/// @brief Determine if an attachment to be updated has to preserve contents -/// @param is_clear True when a clear is being executed -/// @param regs 3D registers -/// @return True when the contents have to be preserved -bool HasToPreserveColorContents(bool is_clear, const Maxwell& regs) { - if (!is_clear) { - return true; - } - // First we have to make sure all clear masks are enabled. - if (!regs.clear_buffers.R || !regs.clear_buffers.G || !regs.clear_buffers.B || - !regs.clear_buffers.A) { - return true; - } - // If scissors are disabled, the whole screen is cleared - if (!regs.clear_flags.scissor) { - return false; - } - // Then we have to confirm scissor testing clears the whole image - const std::size_t index = regs.clear_buffers.RT; - const auto& scissor = regs.scissor_test[0]; - return scissor.min_x > 0 || scissor.min_y > 0 || scissor.max_x < regs.rt[index].width || - scissor.max_y < regs.rt[index].height; -} - -/// @brief Determine if an attachment to be updated has to preserve contents -/// @param is_clear True when a clear is being executed -/// @param regs 3D registers -/// @return True when the contents have to be preserved -bool HasToPreserveDepthContents(bool is_clear, const Maxwell& regs) { - // If we are not clearing, the contents have to be preserved - if (!is_clear) { - return true; - } - // For depth stencil clears we only have to confirm scissor test covers the whole image - if (!regs.clear_flags.scissor) { - return false; + const u32 raw = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset); + return TextureHandle(raw, via_header_index); + } + const u32 buffer = engine.GetBoundBuffer(); + const u64 offset = (entry.offset + index) * sizeof(u32); + return TextureHandle(engine.AccessConstBuffer32(shader_type, buffer, offset), via_header_index); +} + +ImageViewType ImageViewTypeFromEntry(const SamplerEntry& entry) { + if (entry.is_buffer) { + return ImageViewType::e2D; + } + switch (entry.type) { + case Tegra::Shader::TextureType::Texture1D: + return entry.is_array ? ImageViewType::e1DArray : ImageViewType::e1D; + case Tegra::Shader::TextureType::Texture2D: + return entry.is_array ? ImageViewType::e2DArray : ImageViewType::e2D; + case Tegra::Shader::TextureType::Texture3D: + return ImageViewType::e3D; + case Tegra::Shader::TextureType::TextureCube: + return entry.is_array ? ImageViewType::CubeArray : ImageViewType::Cube; + } + UNREACHABLE(); + return ImageViewType::e2D; +} + +ImageViewType ImageViewTypeFromEntry(const ImageEntry& entry) { + switch (entry.type) { + case Tegra::Shader::ImageType::Texture1D: + return ImageViewType::e1D; + case Tegra::Shader::ImageType::Texture1DArray: + return ImageViewType::e1DArray; + case Tegra::Shader::ImageType::Texture2D: + return ImageViewType::e2D; + case Tegra::Shader::ImageType::Texture2DArray: + return ImageViewType::e2DArray; + case Tegra::Shader::ImageType::Texture3D: + return ImageViewType::e3D; + case Tegra::Shader::ImageType::TextureBuffer: + return ImageViewType::Buffer; + } + UNREACHABLE(); + return ImageViewType::e2D; +} + +void PushImageDescriptors(const ShaderEntries& entries, TextureCache& texture_cache, + VKUpdateDescriptorQueue& update_descriptor_queue, + ImageViewId*& image_view_id_ptr, VkSampler*& sampler_ptr) { + for ([[maybe_unused]] const auto& entry : entries.uniform_texels) { + const ImageViewId image_view_id = *image_view_id_ptr++; + const ImageView& image_view = texture_cache.GetImageView(image_view_id); + update_descriptor_queue.AddTexelBuffer(image_view.BufferView()); } - // Make sure the clear cover the whole image - const auto& scissor = regs.scissor_test[0]; - return scissor.min_x > 0 || scissor.min_y > 0 || scissor.max_x < regs.zeta_width || - scissor.max_y < regs.zeta_height; -} - -template <std::size_t N> -std::array<VkDeviceSize, N> ExpandStrides(const std::array<u16, N>& strides) { - std::array<VkDeviceSize, N> expanded; - std::copy(strides.begin(), strides.end(), expanded.begin()); - return expanded; -} - -} // Anonymous namespace - -class BufferBindings final { -public: - void AddVertexBinding(VkBuffer buffer, VkDeviceSize offset, VkDeviceSize size, u32 stride) { - vertex.buffers[vertex.num_buffers] = buffer; - vertex.offsets[vertex.num_buffers] = offset; - vertex.sizes[vertex.num_buffers] = size; - vertex.strides[vertex.num_buffers] = static_cast<u16>(stride); - ++vertex.num_buffers; - } - - void SetIndexBinding(VkBuffer buffer, VkDeviceSize offset, VkIndexType type) { - index.buffer = buffer; - index.offset = offset; - index.type = type; - } - - void Bind(const VKDevice& device, VKScheduler& scheduler) const { - // Use this large switch case to avoid dispatching more memory in the record lambda than - // what we need. It looks horrible, but it's the best we can do on standard C++. - switch (vertex.num_buffers) { - case 0: - return BindStatic<0>(device, scheduler); - case 1: - return BindStatic<1>(device, scheduler); - case 2: - return BindStatic<2>(device, scheduler); - case 3: - return BindStatic<3>(device, scheduler); - case 4: - return BindStatic<4>(device, scheduler); - case 5: - return BindStatic<5>(device, scheduler); - case 6: - return BindStatic<6>(device, scheduler); - case 7: - return BindStatic<7>(device, scheduler); - case 8: - return BindStatic<8>(device, scheduler); - case 9: - return BindStatic<9>(device, scheduler); - case 10: - return BindStatic<10>(device, scheduler); - case 11: - return BindStatic<11>(device, scheduler); - case 12: - return BindStatic<12>(device, scheduler); - case 13: - return BindStatic<13>(device, scheduler); - case 14: - return BindStatic<14>(device, scheduler); - case 15: - return BindStatic<15>(device, scheduler); - case 16: - return BindStatic<16>(device, scheduler); - case 17: - return BindStatic<17>(device, scheduler); - case 18: - return BindStatic<18>(device, scheduler); - case 19: - return BindStatic<19>(device, scheduler); - case 20: - return BindStatic<20>(device, scheduler); - case 21: - return BindStatic<21>(device, scheduler); - case 22: - return BindStatic<22>(device, scheduler); - case 23: - return BindStatic<23>(device, scheduler); - case 24: - return BindStatic<24>(device, scheduler); - case 25: - return BindStatic<25>(device, scheduler); - case 26: - return BindStatic<26>(device, scheduler); - case 27: - return BindStatic<27>(device, scheduler); - case 28: - return BindStatic<28>(device, scheduler); - case 29: - return BindStatic<29>(device, scheduler); - case 30: - return BindStatic<30>(device, scheduler); - case 31: - return BindStatic<31>(device, scheduler); - case 32: - return BindStatic<32>(device, scheduler); - } - UNREACHABLE(); - } - -private: - // Some of these fields are intentionally left uninitialized to avoid initializing them twice. - struct { - std::size_t num_buffers = 0; - std::array<VkBuffer, Maxwell::NumVertexArrays> buffers; - std::array<VkDeviceSize, Maxwell::NumVertexArrays> offsets; - std::array<VkDeviceSize, Maxwell::NumVertexArrays> sizes; - std::array<u16, Maxwell::NumVertexArrays> strides; - } vertex; - - struct { - VkBuffer buffer = nullptr; - VkDeviceSize offset; - VkIndexType type; - } index; - - template <std::size_t N> - void BindStatic(const VKDevice& device, VKScheduler& scheduler) const { - if (device.IsExtExtendedDynamicStateSupported()) { - if (index.buffer) { - BindStatic<N, true, true>(scheduler); - } else { - BindStatic<N, false, true>(scheduler); - } - } else { - if (index.buffer) { - BindStatic<N, true, false>(scheduler); - } else { - BindStatic<N, false, false>(scheduler); - } + for (const auto& entry : entries.samplers) { + for (size_t i = 0; i < entry.size; ++i) { + const VkSampler sampler = *sampler_ptr++; + const ImageViewId image_view_id = *image_view_id_ptr++; + const ImageView& image_view = texture_cache.GetImageView(image_view_id); + const VkImageView handle = image_view.Handle(ImageViewTypeFromEntry(entry)); + update_descriptor_queue.AddSampledImage(handle, sampler); } } - - template <std::size_t N, bool is_indexed, bool has_extended_dynamic_state> - void BindStatic(VKScheduler& scheduler) const { - static_assert(N <= Maxwell::NumVertexArrays); - if constexpr (N == 0) { - return; - } - - std::array<VkBuffer, N> buffers; - std::array<VkDeviceSize, N> offsets; - std::copy(vertex.buffers.begin(), vertex.buffers.begin() + N, buffers.begin()); - std::copy(vertex.offsets.begin(), vertex.offsets.begin() + N, offsets.begin()); - - if constexpr (has_extended_dynamic_state) { - // With extended dynamic states we can specify the length and stride of a vertex buffer - std::array<VkDeviceSize, N> sizes; - std::array<u16, N> strides; - std::copy(vertex.sizes.begin(), vertex.sizes.begin() + N, sizes.begin()); - std::copy(vertex.strides.begin(), vertex.strides.begin() + N, strides.begin()); - - if constexpr (is_indexed) { - scheduler.Record( - [buffers, offsets, sizes, strides, index = index](vk::CommandBuffer cmdbuf) { - cmdbuf.BindIndexBuffer(index.buffer, index.offset, index.type); - cmdbuf.BindVertexBuffers2EXT(0, static_cast<u32>(N), buffers.data(), - offsets.data(), sizes.data(), - ExpandStrides(strides).data()); - }); - } else { - scheduler.Record([buffers, offsets, sizes, strides](vk::CommandBuffer cmdbuf) { - cmdbuf.BindVertexBuffers2EXT(0, static_cast<u32>(N), buffers.data(), - offsets.data(), sizes.data(), - ExpandStrides(strides).data()); - }); - } - return; - } - - if constexpr (is_indexed) { - // Indexed draw - scheduler.Record([buffers, offsets, index = index](vk::CommandBuffer cmdbuf) { - cmdbuf.BindIndexBuffer(index.buffer, index.offset, index.type); - cmdbuf.BindVertexBuffers(0, static_cast<u32>(N), buffers.data(), offsets.data()); - }); - } else { - // Array draw - scheduler.Record([buffers, offsets](vk::CommandBuffer cmdbuf) { - cmdbuf.BindVertexBuffers(0, static_cast<u32>(N), buffers.data(), offsets.data()); - }); - } + for ([[maybe_unused]] const auto& entry : entries.storage_texels) { + const ImageViewId image_view_id = *image_view_id_ptr++; + const ImageView& image_view = texture_cache.GetImageView(image_view_id); + update_descriptor_queue.AddTexelBuffer(image_view.BufferView()); } -}; - -void RasterizerVulkan::DrawParameters::Draw(vk::CommandBuffer cmdbuf) const { - if (is_indexed) { - cmdbuf.DrawIndexed(num_vertices, num_instances, 0, base_vertex, base_instance); - } else { - cmdbuf.Draw(num_vertices, num_instances, base_vertex, base_instance); + for (const auto& entry : entries.images) { + // TODO: Mark as modified + const ImageViewId image_view_id = *image_view_id_ptr++; + const ImageView& image_view = texture_cache.GetImageView(image_view_id); + const VkImageView handle = image_view.Handle(ImageViewTypeFromEntry(entry)); + update_descriptor_queue.AddImage(handle); + } +} + +DrawParams MakeDrawParams(const Maxwell& regs, u32 num_instances, bool is_instanced, + bool is_indexed) { + DrawParams params{ + .base_instance = regs.vb_base_instance, + .num_instances = is_instanced ? num_instances : 1, + .base_vertex = is_indexed ? regs.vb_element_base : regs.vertex_buffer.first, + .num_vertices = is_indexed ? regs.index_array.count : regs.vertex_buffer.count, + .is_indexed = is_indexed, + }; + if (regs.draw.topology == Maxwell::PrimitiveTopology::Quads) { + // 6 triangle vertices per quad, base vertex is part of the index + // See BindQuadArrayIndexBuffer for more details + params.num_vertices = (params.num_vertices / 4) * 6; + params.base_vertex = 0; + params.is_indexed = true; } + return params; } +} // Anonymous namespace -RasterizerVulkan::RasterizerVulkan(Core::System& system, Core::Frontend::EmuWindow& renderer, - VKScreenInfo& screen_info, const VKDevice& device, - VKResourceManager& resource_manager, - VKMemoryManager& memory_manager, StateTracker& state_tracker, - VKScheduler& scheduler) - : RasterizerAccelerated{system.Memory()}, system{system}, render_window{renderer}, - screen_info{screen_info}, device{device}, resource_manager{resource_manager}, - memory_manager{memory_manager}, state_tracker{state_tracker}, scheduler{scheduler}, - staging_pool(device, memory_manager, scheduler), descriptor_pool(device), - update_descriptor_queue(device, scheduler), renderpass_cache(device), - quad_array_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue), - quad_indexed_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue), - uint8_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue), - texture_cache(system, *this, device, resource_manager, memory_manager, scheduler, - staging_pool), - pipeline_cache(system, *this, device, scheduler, descriptor_pool, update_descriptor_queue, - renderpass_cache), - buffer_cache(*this, system, device, memory_manager, scheduler, staging_pool), - sampler_cache(device), - fence_manager(system, *this, device, scheduler, texture_cache, buffer_cache, query_cache), - query_cache(system, *this, device, scheduler), - wfi_event{device.GetLogical().CreateNewEvent()}, async_shaders{renderer} { +RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_, + Tegra::MemoryManager& gpu_memory_, + Core::Memory::Memory& cpu_memory_, VKScreenInfo& screen_info_, + const Device& device_, MemoryAllocator& memory_allocator_, + StateTracker& state_tracker_, VKScheduler& scheduler_) + : RasterizerAccelerated{cpu_memory_}, gpu{gpu_}, + gpu_memory{gpu_memory_}, maxwell3d{gpu.Maxwell3D()}, kepler_compute{gpu.KeplerCompute()}, + screen_info{screen_info_}, device{device_}, memory_allocator{memory_allocator_}, + state_tracker{state_tracker_}, scheduler{scheduler_}, + staging_pool(device, memory_allocator, scheduler), descriptor_pool(device, scheduler), + update_descriptor_queue(device, scheduler), + blit_image(device, scheduler, state_tracker, descriptor_pool), + texture_cache_runtime{device, scheduler, memory_allocator, staging_pool, blit_image}, + texture_cache(texture_cache_runtime, *this, maxwell3d, kepler_compute, gpu_memory), + buffer_cache_runtime(device, memory_allocator, scheduler, staging_pool, + update_descriptor_queue, descriptor_pool), + buffer_cache(*this, maxwell3d, kepler_compute, gpu_memory, cpu_memory_, buffer_cache_runtime), + pipeline_cache(*this, gpu, maxwell3d, kepler_compute, gpu_memory, device, scheduler, + descriptor_pool, update_descriptor_queue), + query_cache{*this, maxwell3d, gpu_memory, device, scheduler}, + fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache, device, scheduler), + wfi_event(device.GetLogical().CreateEvent()), async_shaders(emu_window_) { scheduler.SetQueryCache(query_cache); if (device.UseAsynchronousShaders()) { async_shaders.AllocateWorkers(); @@ -414,64 +262,57 @@ RasterizerVulkan::~RasterizerVulkan() = default; void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) { MICROPROFILE_SCOPE(Vulkan_Drawing); + SCOPE_EXIT({ gpu.TickWork(); }); FlushWork(); query_cache.UpdateCounters(); - SCOPE_EXIT({ system.GPU().TickWork(); }); - - const auto& gpu = system.GPU().Maxwell3D(); - GraphicsPipelineCacheKey key; - key.fixed_state.Fill(gpu.regs, device.IsExtExtendedDynamicStateSupported()); - - buffer_cache.Map(CalculateGraphicsStreamBufferSize(is_indexed)); + graphics_key.fixed_state.Refresh(maxwell3d, device.IsExtExtendedDynamicStateSupported()); - BufferBindings buffer_bindings; - const DrawParameters draw_params = - SetupGeometry(key.fixed_state, buffer_bindings, is_indexed, is_instanced); + std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; - update_descriptor_queue.Acquire(); - sampled_views.clear(); - image_views.clear(); + texture_cache.SynchronizeGraphicsDescriptors(); + texture_cache.UpdateRenderTargets(false); const auto shaders = pipeline_cache.GetShaders(); - key.shaders = GetShaderAddresses(shaders); - SetupShaderDescriptors(shaders); - - buffer_cache.Unmap(); + graphics_key.shaders = GetShaderAddresses(shaders); - const Texceptions texceptions = UpdateAttachments(false); - SetupImageTransitions(texceptions, color_attachments, zeta_attachment); + graphics_key.shaders = GetShaderAddresses(shaders); + SetupShaderDescriptors(shaders, is_indexed); - key.renderpass_params = GetRenderPassParams(texceptions); - key.padding = 0; + const Framebuffer* const framebuffer = texture_cache.GetFramebuffer(); + graphics_key.renderpass = framebuffer->RenderPass(); - auto* pipeline = pipeline_cache.GetGraphicsPipeline(key, async_shaders); + VKGraphicsPipeline* const pipeline = pipeline_cache.GetGraphicsPipeline( + graphics_key, framebuffer->NumColorBuffers(), async_shaders); if (pipeline == nullptr || pipeline->GetHandle() == VK_NULL_HANDLE) { // Async graphics pipeline was not ready. return; } - scheduler.BindGraphicsPipeline(pipeline->GetHandle()); - - const auto renderpass = pipeline->GetRenderPass(); - const auto [framebuffer, render_area] = ConfigureFramebuffers(renderpass); - scheduler.RequestRenderpass(renderpass, framebuffer, render_area); + BeginTransformFeedback(); + scheduler.RequestRenderpass(framebuffer); + scheduler.BindGraphicsPipeline(pipeline->GetHandle()); UpdateDynamicStates(); - buffer_bindings.Bind(device, scheduler); - - BeginTransformFeedback(); - - const auto pipeline_layout = pipeline->GetLayout(); - const auto descriptor_set = pipeline->CommitDescriptorSet(); + const auto& regs = maxwell3d.regs; + const u32 num_instances = maxwell3d.mme_draw.instance_count; + const DrawParams draw_params = MakeDrawParams(regs, num_instances, is_instanced, is_indexed); + const VkPipelineLayout pipeline_layout = pipeline->GetLayout(); + const VkDescriptorSet descriptor_set = pipeline->CommitDescriptorSet(); scheduler.Record([pipeline_layout, descriptor_set, draw_params](vk::CommandBuffer cmdbuf) { if (descriptor_set) { cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline_layout, - DESCRIPTOR_SET, descriptor_set, {}); + DESCRIPTOR_SET, descriptor_set, nullptr); + } + if (draw_params.is_indexed) { + cmdbuf.DrawIndexed(draw_params.num_vertices, draw_params.num_instances, 0, + draw_params.base_vertex, draw_params.base_instance); + } else { + cmdbuf.Draw(draw_params.num_vertices, draw_params.num_instances, + draw_params.base_vertex, draw_params.base_instance); } - draw_params.Draw(cmdbuf); }); EndTransformFeedback(); @@ -480,17 +321,13 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) { void RasterizerVulkan::Clear() { MICROPROFILE_SCOPE(Vulkan_Clearing); - const auto& gpu = system.GPU().Maxwell3D(); - if (!system.GPU().Maxwell3D().ShouldExecute()) { + if (!maxwell3d.ShouldExecute()) { return; } - sampled_views.clear(); - image_views.clear(); - query_cache.UpdateCounters(); - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; const bool use_color = regs.clear_buffers.R || regs.clear_buffers.G || regs.clear_buffers.B || regs.clear_buffers.A; const bool use_depth = regs.clear_buffers.Z; @@ -499,20 +336,24 @@ void RasterizerVulkan::Clear() { return; } - [[maybe_unused]] const auto texceptions = UpdateAttachments(true); - DEBUG_ASSERT(texceptions.none()); - SetupImageTransitions(0, color_attachments, zeta_attachment); + std::scoped_lock lock{texture_cache.mutex}; + texture_cache.UpdateRenderTargets(true); + const Framebuffer* const framebuffer = texture_cache.GetFramebuffer(); + const VkExtent2D render_area = framebuffer->RenderArea(); + scheduler.RequestRenderpass(framebuffer); - const VkRenderPass renderpass = renderpass_cache.GetRenderPass(GetRenderPassParams(0)); - const auto [framebuffer, render_area] = ConfigureFramebuffers(renderpass); - scheduler.RequestRenderpass(renderpass, framebuffer, render_area); - - VkClearRect clear_rect; - clear_rect.baseArrayLayer = regs.clear_buffers.layer; - clear_rect.layerCount = 1; - clear_rect.rect = GetScissorState(regs, 0); - clear_rect.rect.extent.width = std::min(clear_rect.rect.extent.width, render_area.width); - clear_rect.rect.extent.height = std::min(clear_rect.rect.extent.height, render_area.height); + VkClearRect clear_rect{ + .rect = GetScissorState(regs, 0), + .baseArrayLayer = regs.clear_buffers.layer, + .layerCount = 1, + }; + if (clear_rect.rect.extent.width == 0 || clear_rect.rect.extent.height == 0) { + return; + } + clear_rect.rect.extent = VkExtent2D{ + .width = std::min(clear_rect.rect.extent.width, render_area.width), + .height = std::min(clear_rect.rect.extent.height, render_area.height), + }; if (use_color) { VkClearValue clear_value; @@ -539,7 +380,6 @@ void RasterizerVulkan::Clear() { if (use_stencil) { aspect_flags |= VK_IMAGE_ASPECT_STENCIL_BIT; } - scheduler.Record([clear_depth = regs.clear_depth, clear_stencil = regs.clear_stencil, clear_rect, aspect_flags](vk::CommandBuffer cmdbuf) { VkClearAttachment attachment; @@ -553,51 +393,69 @@ void RasterizerVulkan::Clear() { void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) { MICROPROFILE_SCOPE(Vulkan_Compute); - update_descriptor_queue.Acquire(); - sampled_views.clear(); - image_views.clear(); query_cache.UpdateCounters(); - const auto& launch_desc = system.GPU().KeplerCompute().launch_description; + const auto& launch_desc = kepler_compute.launch_description; auto& pipeline = pipeline_cache.GetComputePipeline({ .shader = code_addr, .shared_memory_size = launch_desc.shared_alloc, - .workgroup_size = - { - launch_desc.block_dim_x, - launch_desc.block_dim_y, - launch_desc.block_dim_z, - }, + .workgroup_size{ + launch_desc.block_dim_x, + launch_desc.block_dim_y, + launch_desc.block_dim_z, + }, }); // Compute dispatches can't be executed inside a renderpass scheduler.RequestOutsideRenderPassOperationContext(); - buffer_cache.Map(CalculateComputeStreamBufferSize()); + image_view_indices.clear(); + sampler_handles.clear(); + + std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; const auto& entries = pipeline.GetEntries(); - SetupComputeConstBuffers(entries); - SetupComputeGlobalBuffers(entries); + buffer_cache.SetEnabledComputeUniformBuffers(entries.enabled_uniform_buffers); + buffer_cache.UnbindComputeStorageBuffers(); + u32 ssbo_index = 0; + for (const auto& buffer : entries.global_buffers) { + buffer_cache.BindComputeStorageBuffer(ssbo_index, buffer.cbuf_index, buffer.cbuf_offset, + buffer.is_written); + ++ssbo_index; + } + buffer_cache.UpdateComputeBuffers(); + + texture_cache.SynchronizeComputeDescriptors(); + SetupComputeUniformTexels(entries); SetupComputeTextures(entries); SetupComputeStorageTexels(entries); SetupComputeImages(entries); - buffer_cache.Unmap(); + const std::span indices_span(image_view_indices.data(), image_view_indices.size()); + texture_cache.FillComputeImageViews(indices_span, image_view_ids); + + update_descriptor_queue.Acquire(); - TransitionImages(sampled_views, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_ACCESS_SHADER_READ_BIT); - TransitionImages(image_views, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT); + buffer_cache.BindHostComputeBuffers(); + ImageViewId* image_view_id_ptr = image_view_ids.data(); + VkSampler* sampler_ptr = sampler_handles.data(); + PushImageDescriptors(entries, texture_cache, update_descriptor_queue, image_view_id_ptr, + sampler_ptr); + + const VkPipeline pipeline_handle = pipeline.GetHandle(); + const VkPipelineLayout pipeline_layout = pipeline.GetLayout(); + const VkDescriptorSet descriptor_set = pipeline.CommitDescriptorSet(); scheduler.Record([grid_x = launch_desc.grid_dim_x, grid_y = launch_desc.grid_dim_y, - grid_z = launch_desc.grid_dim_z, pipeline_handle = pipeline.GetHandle(), - layout = pipeline.GetLayout(), - descriptor_set = pipeline.CommitDescriptorSet()](vk::CommandBuffer cmdbuf) { + grid_z = launch_desc.grid_dim_z, pipeline_handle, pipeline_layout, + descriptor_set](vk::CommandBuffer cmdbuf) { cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, pipeline_handle); - cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, layout, DESCRIPTOR_SET, - descriptor_set, {}); + if (descriptor_set) { + cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, pipeline_layout, + DESCRIPTOR_SET, descriptor_set, nullptr); + } cmdbuf.Dispatch(grid_x, grid_y, grid_z); }); } @@ -611,31 +469,50 @@ void RasterizerVulkan::Query(GPUVAddr gpu_addr, VideoCore::QueryType type, query_cache.Query(gpu_addr, type, timestamp); } +void RasterizerVulkan::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, + u32 size) { + buffer_cache.BindGraphicsUniformBuffer(stage, index, gpu_addr, size); +} + void RasterizerVulkan::FlushAll() {} void RasterizerVulkan::FlushRegion(VAddr addr, u64 size) { if (addr == 0 || size == 0) { return; } - texture_cache.FlushRegion(addr, size); - buffer_cache.FlushRegion(addr, size); + { + std::scoped_lock lock{texture_cache.mutex}; + texture_cache.DownloadMemory(addr, size); + } + { + std::scoped_lock lock{buffer_cache.mutex}; + buffer_cache.DownloadMemory(addr, size); + } query_cache.FlushRegion(addr, size); } bool RasterizerVulkan::MustFlushRegion(VAddr addr, u64 size) { + std::scoped_lock lock{texture_cache.mutex, buffer_cache.mutex}; if (!Settings::IsGPULevelHigh()) { - return buffer_cache.MustFlushRegion(addr, size); + return buffer_cache.IsRegionGpuModified(addr, size); } - return texture_cache.MustFlushRegion(addr, size) || buffer_cache.MustFlushRegion(addr, size); + return texture_cache.IsRegionGpuModified(addr, size) || + buffer_cache.IsRegionGpuModified(addr, size); } void RasterizerVulkan::InvalidateRegion(VAddr addr, u64 size) { if (addr == 0 || size == 0) { return; } - texture_cache.InvalidateRegion(addr, size); + { + std::scoped_lock lock{texture_cache.mutex}; + texture_cache.WriteMemory(addr, size); + } + { + std::scoped_lock lock{buffer_cache.mutex}; + buffer_cache.WriteMemory(addr, size); + } pipeline_cache.InvalidateRegion(addr, size); - buffer_cache.InvalidateRegion(addr, size); query_cache.InvalidateRegion(addr, size); } @@ -643,28 +520,46 @@ void RasterizerVulkan::OnCPUWrite(VAddr addr, u64 size) { if (addr == 0 || size == 0) { return; } - texture_cache.OnCPUWrite(addr, size); pipeline_cache.OnCPUWrite(addr, size); - buffer_cache.OnCPUWrite(addr, size); + { + std::scoped_lock lock{texture_cache.mutex}; + texture_cache.WriteMemory(addr, size); + } + { + std::scoped_lock lock{buffer_cache.mutex}; + buffer_cache.CachedWriteMemory(addr, size); + } } void RasterizerVulkan::SyncGuestHost() { - texture_cache.SyncGuestHost(); - buffer_cache.SyncGuestHost(); pipeline_cache.SyncGuestHost(); + { + std::scoped_lock lock{buffer_cache.mutex}; + buffer_cache.FlushCachedWrites(); + } +} + +void RasterizerVulkan::UnmapMemory(VAddr addr, u64 size) { + { + std::scoped_lock lock{texture_cache.mutex}; + texture_cache.UnmapMemory(addr, size); + } + { + std::scoped_lock lock{buffer_cache.mutex}; + buffer_cache.WriteMemory(addr, size); + } + pipeline_cache.OnCPUWrite(addr, size); } void RasterizerVulkan::SignalSemaphore(GPUVAddr addr, u32 value) { - auto& gpu{system.GPU()}; if (!gpu.IsAsync()) { - gpu.MemoryManager().Write<u32>(addr, value); + gpu_memory.Write<u32>(addr, value); return; } fence_manager.SignalSemaphore(addr, value); } void RasterizerVulkan::SignalSyncPoint(u32 value) { - auto& gpu{system.GPU()}; if (!gpu.IsAsync()) { gpu.IncrementSyncPoint(value); return; @@ -673,7 +568,6 @@ void RasterizerVulkan::SignalSyncPoint(u32 value) { } void RasterizerVulkan::ReleaseFences() { - auto& gpu{system.GPU()}; if (!gpu.IsAsync()) { return; } @@ -707,6 +601,14 @@ void RasterizerVulkan::WaitForIdle() { }); } +void RasterizerVulkan::FragmentBarrier() { + // We already put barriers when a render pass finishes +} + +void RasterizerVulkan::TiledCacheBarrier() { + // TODO: Implementing tiled barriers requires rewriting a good chunk of the Vulkan backend +} + void RasterizerVulkan::FlushCommands() { if (draw_counter > 0) { draw_counter = 0; @@ -717,14 +619,23 @@ void RasterizerVulkan::FlushCommands() { void RasterizerVulkan::TickFrame() { draw_counter = 0; update_descriptor_queue.TickFrame(); - buffer_cache.TickFrame(); + fence_manager.TickFrame(); staging_pool.TickFrame(); + { + std::scoped_lock lock{texture_cache.mutex}; + texture_cache.TickFrame(); + } + { + std::scoped_lock lock{buffer_cache.mutex}; + buffer_cache.TickFrame(); + } } -bool RasterizerVulkan::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src, - const Tegra::Engines::Fermi2D::Regs::Surface& dst, +bool RasterizerVulkan::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src, + const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Config& copy_config) { - texture_cache.DoFermiCopy(src, dst, copy_config); + std::scoped_lock lock{texture_cache.mutex}; + texture_cache.BlitImage(dst, src, copy_config); return true; } @@ -733,28 +644,18 @@ bool RasterizerVulkan::AccelerateDisplay(const Tegra::FramebufferConfig& config, if (!framebuffer_addr) { return false; } - - const auto surface{texture_cache.TryFindFramebufferSurface(framebuffer_addr)}; - if (!surface) { + std::scoped_lock lock{texture_cache.mutex}; + ImageView* const image_view = texture_cache.TryFindFramebufferImageView(framebuffer_addr); + if (!image_view) { return false; } - - // Verify that the cached surface is the same size and format as the requested framebuffer - const auto& params{surface->GetSurfaceParams()}; - ASSERT_MSG(params.width == config.width, "Framebuffer width is different"); - ASSERT_MSG(params.height == config.height, "Framebuffer height is different"); - - screen_info.image = &surface->GetImage(); - screen_info.width = params.width; - screen_info.height = params.height; - screen_info.is_srgb = surface->GetSurfaceParams().srgb_conversion; + screen_info.image_view = image_view->Handle(VideoCommon::ImageViewType::e2D); + screen_info.width = image_view->size.width; + screen_info.height = image_view->size.height; + screen_info.is_srgb = VideoCore::Surface::IsPixelFormatSRGB(image_view->format); return true; } -void RasterizerVulkan::SetupDirtyFlags() { - state_tracker.Initialize(); -} - void RasterizerVulkan::FlushWork() { static constexpr u32 DRAWS_TO_DISPATCH = 4096; @@ -776,178 +677,54 @@ void RasterizerVulkan::FlushWork() { draw_counter = 0; } -RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments(bool is_clear) { - MICROPROFILE_SCOPE(Vulkan_RenderTargets); - auto& maxwell3d = system.GPU().Maxwell3D(); - auto& dirty = maxwell3d.dirty.flags; - auto& regs = maxwell3d.regs; - - const bool update_rendertargets = dirty[VideoCommon::Dirty::RenderTargets]; - dirty[VideoCommon::Dirty::RenderTargets] = false; - - texture_cache.GuardRenderTargets(true); - - Texceptions texceptions; - for (std::size_t rt = 0; rt < Maxwell::NumRenderTargets; ++rt) { - if (update_rendertargets) { - const bool preserve_contents = HasToPreserveColorContents(is_clear, regs); - color_attachments[rt] = texture_cache.GetColorBufferSurface(rt, preserve_contents); - } - if (color_attachments[rt] && WalkAttachmentOverlaps(*color_attachments[rt])) { - texceptions[rt] = true; - } - } - - if (update_rendertargets) { - const bool preserve_contents = HasToPreserveDepthContents(is_clear, regs); - zeta_attachment = texture_cache.GetDepthBufferSurface(preserve_contents); - } - if (zeta_attachment && WalkAttachmentOverlaps(*zeta_attachment)) { - texceptions[ZETA_TEXCEPTION_INDEX] = true; - } - - texture_cache.GuardRenderTargets(false); - - return texceptions; -} - -bool RasterizerVulkan::WalkAttachmentOverlaps(const CachedSurfaceView& attachment) { - bool overlap = false; - for (auto& [view, layout] : sampled_views) { - if (!attachment.IsSameSurface(*view)) { - continue; - } - overlap = true; - *layout = VK_IMAGE_LAYOUT_GENERAL; - } - return overlap; -} - -std::tuple<VkFramebuffer, VkExtent2D> RasterizerVulkan::ConfigureFramebuffers( - VkRenderPass renderpass) { - FramebufferCacheKey key{ - .renderpass = renderpass, - .width = std::numeric_limits<u32>::max(), - .height = std::numeric_limits<u32>::max(), - .layers = std::numeric_limits<u32>::max(), - .views = {}, - }; - - const auto try_push = [&key](const View& view) { - if (!view) { - return false; - } - key.views.push_back(view->GetAttachment()); - key.width = std::min(key.width, view->GetWidth()); - key.height = std::min(key.height, view->GetHeight()); - key.layers = std::min(key.layers, view->GetNumLayers()); - return true; - }; - - const auto& regs = system.GPU().Maxwell3D().regs; - const std::size_t num_attachments = static_cast<std::size_t>(regs.rt_control.count); - for (std::size_t index = 0; index < num_attachments; ++index) { - if (try_push(color_attachments[index])) { - texture_cache.MarkColorBufferInUse(index); - } - } - if (try_push(zeta_attachment)) { - texture_cache.MarkDepthBufferInUse(); - } - - const auto [fbentry, is_cache_miss] = framebuffer_cache.try_emplace(key); - auto& framebuffer = fbentry->second; - if (is_cache_miss) { - framebuffer = device.GetLogical().CreateFramebuffer({ - .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO, - .pNext = nullptr, - .flags = 0, - .renderPass = key.renderpass, - .attachmentCount = static_cast<u32>(key.views.size()), - .pAttachments = key.views.data(), - .width = key.width, - .height = key.height, - .layers = key.layers, - }); - } - - return {*framebuffer, VkExtent2D{key.width, key.height}}; -} - -RasterizerVulkan::DrawParameters RasterizerVulkan::SetupGeometry(FixedPipelineState& fixed_state, - BufferBindings& buffer_bindings, - bool is_indexed, - bool is_instanced) { - MICROPROFILE_SCOPE(Vulkan_Geometry); - - const auto& gpu = system.GPU().Maxwell3D(); - const auto& regs = gpu.regs; - - SetupVertexArrays(buffer_bindings); - - const u32 base_instance = regs.vb_base_instance; - const u32 num_instances = is_instanced ? gpu.mme_draw.instance_count : 1; - const u32 base_vertex = is_indexed ? regs.vb_element_base : regs.vertex_buffer.first; - const u32 num_vertices = is_indexed ? regs.index_array.count : regs.vertex_buffer.count; - - DrawParameters params{base_instance, num_instances, base_vertex, num_vertices, is_indexed}; - SetupIndexBuffer(buffer_bindings, params, is_indexed); - - return params; -} - void RasterizerVulkan::SetupShaderDescriptors( - const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders) { - texture_cache.GuardSamplers(true); - - for (std::size_t stage = 0; stage < Maxwell::MaxShaderStage; ++stage) { - // Skip VertexA stage + const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders, bool is_indexed) { + image_view_indices.clear(); + sampler_handles.clear(); + for (size_t stage = 0; stage < Maxwell::MaxShaderStage; ++stage) { Shader* const shader = shaders[stage + 1]; if (!shader) { continue; } - const auto& entries = shader->GetEntries(); - SetupGraphicsConstBuffers(entries, stage); - SetupGraphicsGlobalBuffers(entries, stage); + const ShaderEntries& entries = shader->GetEntries(); SetupGraphicsUniformTexels(entries, stage); SetupGraphicsTextures(entries, stage); SetupGraphicsStorageTexels(entries, stage); SetupGraphicsImages(entries, stage); + + buffer_cache.SetEnabledUniformBuffers(stage, entries.enabled_uniform_buffers); + buffer_cache.UnbindGraphicsStorageBuffers(stage); + u32 ssbo_index = 0; + for (const auto& buffer : entries.global_buffers) { + buffer_cache.BindGraphicsStorageBuffer(stage, ssbo_index, buffer.cbuf_index, + buffer.cbuf_offset, buffer.is_written); + ++ssbo_index; + } } - texture_cache.GuardSamplers(false); -} + const std::span indices_span(image_view_indices.data(), image_view_indices.size()); + buffer_cache.UpdateGraphicsBuffers(is_indexed); + texture_cache.FillGraphicsImageViews(indices_span, image_view_ids); + + buffer_cache.BindHostGeometryBuffers(is_indexed); -void RasterizerVulkan::SetupImageTransitions( - Texceptions texceptions, const std::array<View, Maxwell::NumRenderTargets>& color_attachments, - const View& zeta_attachment) { - TransitionImages(sampled_views, VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT, VK_ACCESS_SHADER_READ_BIT); - TransitionImages(image_views, VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT, - VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT); + update_descriptor_queue.Acquire(); - for (std::size_t rt = 0; rt < std::size(color_attachments); ++rt) { - const auto color_attachment = color_attachments[rt]; - if (color_attachment == nullptr) { + ImageViewId* image_view_id_ptr = image_view_ids.data(); + VkSampler* sampler_ptr = sampler_handles.data(); + for (size_t stage = 0; stage < Maxwell::MaxShaderStage; ++stage) { + // Skip VertexA stage + Shader* const shader = shaders[stage + 1]; + if (!shader) { continue; } - const auto image_layout = - texceptions[rt] ? VK_IMAGE_LAYOUT_GENERAL : VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; - color_attachment->Transition(image_layout, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, - VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | - VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT); - } - - if (zeta_attachment != nullptr) { - const auto image_layout = texceptions[ZETA_TEXCEPTION_INDEX] - ? VK_IMAGE_LAYOUT_GENERAL - : VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; - zeta_attachment->Transition(image_layout, VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT, - VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | - VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT); + buffer_cache.BindHostStageBuffers(stage); + PushImageDescriptors(shader->GetEntries(), texture_cache, update_descriptor_queue, + image_view_id_ptr, sampler_ptr); } } void RasterizerVulkan::UpdateDynamicStates() { - auto& regs = system.GPU().Maxwell3D().regs; + auto& regs = maxwell3d.regs; UpdateViewportsState(regs); UpdateScissorsState(regs); UpdateDepthBias(regs); @@ -961,14 +738,13 @@ void RasterizerVulkan::UpdateDynamicStates() { UpdateDepthWriteEnable(regs); UpdateDepthCompareOp(regs); UpdateFrontFace(regs); - UpdatePrimitiveTopology(regs); UpdateStencilOp(regs); UpdateStencilTestEnable(regs); } } void RasterizerVulkan::BeginTransformFeedback() { - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; if (regs.tfb_enabled == 0) { return; } @@ -976,322 +752,108 @@ void RasterizerVulkan::BeginTransformFeedback() { LOG_ERROR(Render_Vulkan, "Transform feedbacks used but not supported"); return; } - UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) || regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) || regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry)); - - UNIMPLEMENTED_IF(regs.tfb_bindings[1].buffer_enable); - UNIMPLEMENTED_IF(regs.tfb_bindings[2].buffer_enable); - UNIMPLEMENTED_IF(regs.tfb_bindings[3].buffer_enable); - - const auto& binding = regs.tfb_bindings[0]; - UNIMPLEMENTED_IF(binding.buffer_enable == 0); - UNIMPLEMENTED_IF(binding.buffer_offset != 0); - - const GPUVAddr gpu_addr = binding.Address(); - const VkDeviceSize size = static_cast<VkDeviceSize>(binding.buffer_size); - const auto info = buffer_cache.UploadMemory(gpu_addr, size, 4, true); - - scheduler.Record([buffer = info.handle, offset = info.offset, size](vk::CommandBuffer cmdbuf) { - cmdbuf.BindTransformFeedbackBuffersEXT(0, 1, &buffer, &offset, &size); - cmdbuf.BeginTransformFeedbackEXT(0, 0, nullptr, nullptr); - }); + scheduler.Record( + [](vk::CommandBuffer cmdbuf) { cmdbuf.BeginTransformFeedbackEXT(0, 0, nullptr, nullptr); }); } void RasterizerVulkan::EndTransformFeedback() { - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; if (regs.tfb_enabled == 0) { return; } if (!device.IsExtTransformFeedbackSupported()) { return; } - scheduler.Record( [](vk::CommandBuffer cmdbuf) { cmdbuf.EndTransformFeedbackEXT(0, 0, nullptr, nullptr); }); } -void RasterizerVulkan::SetupVertexArrays(BufferBindings& buffer_bindings) { - const auto& regs = system.GPU().Maxwell3D().regs; - - for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) { - const auto& vertex_array = regs.vertex_array[index]; - if (!vertex_array.IsEnabled()) { - continue; - } - const GPUVAddr start{vertex_array.StartAddress()}; - const GPUVAddr end{regs.vertex_array_limit[index].LimitAddress()}; - - ASSERT(end >= start); - const std::size_t size = end - start; - if (size == 0) { - buffer_bindings.AddVertexBinding(DefaultBuffer(), 0, DEFAULT_BUFFER_SIZE, 0); - continue; - } - const auto info = buffer_cache.UploadMemory(start, size); - buffer_bindings.AddVertexBinding(info.handle, info.offset, size, vertex_array.stride); - } -} - -void RasterizerVulkan::SetupIndexBuffer(BufferBindings& buffer_bindings, DrawParameters& params, - bool is_indexed) { - if (params.num_vertices == 0) { - return; - } - const auto& regs = system.GPU().Maxwell3D().regs; - switch (regs.draw.topology) { - case Maxwell::PrimitiveTopology::Quads: { - if (!params.is_indexed) { - const auto [buffer, offset] = - quad_array_pass.Assemble(params.num_vertices, params.base_vertex); - buffer_bindings.SetIndexBinding(buffer, offset, VK_INDEX_TYPE_UINT32); - params.base_vertex = 0; - params.num_vertices = params.num_vertices * 6 / 4; - params.is_indexed = true; - break; - } - const GPUVAddr gpu_addr = regs.index_array.IndexStart(); - const auto info = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize()); - VkBuffer buffer = info.handle; - u64 offset = info.offset; - std::tie(buffer, offset) = quad_indexed_pass.Assemble( - regs.index_array.format, params.num_vertices, params.base_vertex, buffer, offset); - - buffer_bindings.SetIndexBinding(buffer, offset, VK_INDEX_TYPE_UINT32); - params.num_vertices = (params.num_vertices / 4) * 6; - params.base_vertex = 0; - break; - } - default: { - if (!is_indexed) { - break; - } - const GPUVAddr gpu_addr = regs.index_array.IndexStart(); - const auto info = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize()); - VkBuffer buffer = info.handle; - u64 offset = info.offset; - - auto format = regs.index_array.format; - const bool is_uint8 = format == Maxwell::IndexFormat::UnsignedByte; - if (is_uint8 && !device.IsExtIndexTypeUint8Supported()) { - std::tie(buffer, offset) = uint8_pass.Assemble(params.num_vertices, buffer, offset); - format = Maxwell::IndexFormat::UnsignedShort; - } - - buffer_bindings.SetIndexBinding(buffer, offset, MaxwellToVK::IndexFormat(device, format)); - break; - } - } -} - -void RasterizerVulkan::SetupGraphicsConstBuffers(const ShaderEntries& entries, std::size_t stage) { - MICROPROFILE_SCOPE(Vulkan_ConstBuffers); - const auto& gpu = system.GPU().Maxwell3D(); - const auto& shader_stage = gpu.state.shader_stages[stage]; - for (const auto& entry : entries.const_buffers) { - SetupConstBuffer(entry, shader_stage.const_buffers[entry.GetIndex()]); - } -} - -void RasterizerVulkan::SetupGraphicsGlobalBuffers(const ShaderEntries& entries, std::size_t stage) { - MICROPROFILE_SCOPE(Vulkan_GlobalBuffers); - auto& gpu{system.GPU()}; - const auto cbufs{gpu.Maxwell3D().state.shader_stages[stage]}; - - for (const auto& entry : entries.global_buffers) { - const auto addr = cbufs.const_buffers[entry.GetCbufIndex()].address + entry.GetCbufOffset(); - SetupGlobalBuffer(entry, addr); - } -} - -void RasterizerVulkan::SetupGraphicsUniformTexels(const ShaderEntries& entries, std::size_t stage) { - MICROPROFILE_SCOPE(Vulkan_Textures); - const auto& gpu = system.GPU().Maxwell3D(); +void RasterizerVulkan::SetupGraphicsUniformTexels(const ShaderEntries& entries, size_t stage) { + const auto& regs = maxwell3d.regs; + const bool via_header_index = regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex; for (const auto& entry : entries.uniform_texels) { - const auto image = GetTextureInfo(gpu, entry, stage).tic; - SetupUniformTexels(image, entry); + const TextureHandle handle = GetTextureInfo(maxwell3d, via_header_index, entry, stage); + image_view_indices.push_back(handle.image); } } -void RasterizerVulkan::SetupGraphicsTextures(const ShaderEntries& entries, std::size_t stage) { - MICROPROFILE_SCOPE(Vulkan_Textures); - const auto& gpu = system.GPU().Maxwell3D(); +void RasterizerVulkan::SetupGraphicsTextures(const ShaderEntries& entries, size_t stage) { + const auto& regs = maxwell3d.regs; + const bool via_header_index = regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex; for (const auto& entry : entries.samplers) { - for (std::size_t i = 0; i < entry.size; ++i) { - const auto texture = GetTextureInfo(gpu, entry, stage, i); - SetupTexture(texture, entry); + for (size_t index = 0; index < entry.size; ++index) { + const TextureHandle handle = + GetTextureInfo(maxwell3d, via_header_index, entry, stage, index); + image_view_indices.push_back(handle.image); + + Sampler* const sampler = texture_cache.GetGraphicsSampler(handle.sampler); + sampler_handles.push_back(sampler->Handle()); } } } -void RasterizerVulkan::SetupGraphicsStorageTexels(const ShaderEntries& entries, std::size_t stage) { - MICROPROFILE_SCOPE(Vulkan_Textures); - const auto& gpu = system.GPU().Maxwell3D(); +void RasterizerVulkan::SetupGraphicsStorageTexels(const ShaderEntries& entries, size_t stage) { + const auto& regs = maxwell3d.regs; + const bool via_header_index = regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex; for (const auto& entry : entries.storage_texels) { - const auto image = GetTextureInfo(gpu, entry, stage).tic; - SetupStorageTexel(image, entry); + const TextureHandle handle = GetTextureInfo(maxwell3d, via_header_index, entry, stage); + image_view_indices.push_back(handle.image); } } -void RasterizerVulkan::SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage) { - MICROPROFILE_SCOPE(Vulkan_Images); - const auto& gpu = system.GPU().Maxwell3D(); +void RasterizerVulkan::SetupGraphicsImages(const ShaderEntries& entries, size_t stage) { + const auto& regs = maxwell3d.regs; + const bool via_header_index = regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex; for (const auto& entry : entries.images) { - const auto tic = GetTextureInfo(gpu, entry, stage).tic; - SetupImage(tic, entry); - } -} - -void RasterizerVulkan::SetupComputeConstBuffers(const ShaderEntries& entries) { - MICROPROFILE_SCOPE(Vulkan_ConstBuffers); - const auto& launch_desc = system.GPU().KeplerCompute().launch_description; - for (const auto& entry : entries.const_buffers) { - const auto& config = launch_desc.const_buffer_config[entry.GetIndex()]; - const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value(); - Tegra::Engines::ConstBufferInfo buffer; - buffer.address = config.Address(); - buffer.size = config.size; - buffer.enabled = mask[entry.GetIndex()]; - SetupConstBuffer(entry, buffer); - } -} - -void RasterizerVulkan::SetupComputeGlobalBuffers(const ShaderEntries& entries) { - MICROPROFILE_SCOPE(Vulkan_GlobalBuffers); - const auto cbufs{system.GPU().KeplerCompute().launch_description.const_buffer_config}; - for (const auto& entry : entries.global_buffers) { - const auto addr{cbufs[entry.GetCbufIndex()].Address() + entry.GetCbufOffset()}; - SetupGlobalBuffer(entry, addr); + const TextureHandle handle = GetTextureInfo(maxwell3d, via_header_index, entry, stage); + image_view_indices.push_back(handle.image); } } void RasterizerVulkan::SetupComputeUniformTexels(const ShaderEntries& entries) { - MICROPROFILE_SCOPE(Vulkan_Textures); - const auto& gpu = system.GPU().KeplerCompute(); + const bool via_header_index = kepler_compute.launch_description.linked_tsc; for (const auto& entry : entries.uniform_texels) { - const auto image = GetTextureInfo(gpu, entry, ComputeShaderIndex).tic; - SetupUniformTexels(image, entry); + const TextureHandle handle = + GetTextureInfo(kepler_compute, via_header_index, entry, COMPUTE_SHADER_INDEX); + image_view_indices.push_back(handle.image); } } void RasterizerVulkan::SetupComputeTextures(const ShaderEntries& entries) { - MICROPROFILE_SCOPE(Vulkan_Textures); - const auto& gpu = system.GPU().KeplerCompute(); + const bool via_header_index = kepler_compute.launch_description.linked_tsc; for (const auto& entry : entries.samplers) { - for (std::size_t i = 0; i < entry.size; ++i) { - const auto texture = GetTextureInfo(gpu, entry, ComputeShaderIndex, i); - SetupTexture(texture, entry); + for (size_t index = 0; index < entry.size; ++index) { + const TextureHandle handle = GetTextureInfo(kepler_compute, via_header_index, entry, + COMPUTE_SHADER_INDEX, index); + image_view_indices.push_back(handle.image); + + Sampler* const sampler = texture_cache.GetComputeSampler(handle.sampler); + sampler_handles.push_back(sampler->Handle()); } } } void RasterizerVulkan::SetupComputeStorageTexels(const ShaderEntries& entries) { - MICROPROFILE_SCOPE(Vulkan_Textures); - const auto& gpu = system.GPU().KeplerCompute(); + const bool via_header_index = kepler_compute.launch_description.linked_tsc; for (const auto& entry : entries.storage_texels) { - const auto image = GetTextureInfo(gpu, entry, ComputeShaderIndex).tic; - SetupStorageTexel(image, entry); + const TextureHandle handle = + GetTextureInfo(kepler_compute, via_header_index, entry, COMPUTE_SHADER_INDEX); + image_view_indices.push_back(handle.image); } } void RasterizerVulkan::SetupComputeImages(const ShaderEntries& entries) { - MICROPROFILE_SCOPE(Vulkan_Images); - const auto& gpu = system.GPU().KeplerCompute(); + const bool via_header_index = kepler_compute.launch_description.linked_tsc; for (const auto& entry : entries.images) { - const auto tic = GetTextureInfo(gpu, entry, ComputeShaderIndex).tic; - SetupImage(tic, entry); + const TextureHandle handle = + GetTextureInfo(kepler_compute, via_header_index, entry, COMPUTE_SHADER_INDEX); + image_view_indices.push_back(handle.image); } } -void RasterizerVulkan::SetupConstBuffer(const ConstBufferEntry& entry, - const Tegra::Engines::ConstBufferInfo& buffer) { - if (!buffer.enabled) { - // Set values to zero to unbind buffers - update_descriptor_queue.AddBuffer(DefaultBuffer(), 0, DEFAULT_BUFFER_SIZE); - return; - } - - // Align the size to avoid bad std140 interactions - const std::size_t size = - Common::AlignUp(CalculateConstBufferSize(entry, buffer), 4 * sizeof(float)); - ASSERT(size <= MaxConstbufferSize); - - const auto info = - buffer_cache.UploadMemory(buffer.address, size, device.GetUniformBufferAlignment()); - update_descriptor_queue.AddBuffer(info.handle, info.offset, size); -} - -void RasterizerVulkan::SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAddr address) { - auto& memory_manager{system.GPU().MemoryManager()}; - const auto actual_addr = memory_manager.Read<u64>(address); - const auto size = memory_manager.Read<u32>(address + 8); - - if (size == 0) { - // Sometimes global memory pointers don't have a proper size. Upload a dummy entry - // because Vulkan doesn't like empty buffers. - // Note: Do *not* use DefaultBuffer() here, storage buffers can be written breaking the - // default buffer. - static constexpr std::size_t dummy_size = 4; - const auto info = buffer_cache.GetEmptyBuffer(dummy_size); - update_descriptor_queue.AddBuffer(info.handle, info.offset, dummy_size); - return; - } - - const auto info = buffer_cache.UploadMemory( - actual_addr, size, device.GetStorageBufferAlignment(), entry.IsWritten()); - update_descriptor_queue.AddBuffer(info.handle, info.offset, size); -} - -void RasterizerVulkan::SetupUniformTexels(const Tegra::Texture::TICEntry& tic, - const UniformTexelEntry& entry) { - const auto view = texture_cache.GetTextureSurface(tic, entry); - ASSERT(view->IsBufferView()); - - update_descriptor_queue.AddTexelBuffer(view->GetBufferView()); -} - -void RasterizerVulkan::SetupTexture(const Tegra::Texture::FullTextureInfo& texture, - const SamplerEntry& entry) { - auto view = texture_cache.GetTextureSurface(texture.tic, entry); - ASSERT(!view->IsBufferView()); - - const VkImageView image_view = view->GetImageView(texture.tic.x_source, texture.tic.y_source, - texture.tic.z_source, texture.tic.w_source); - const auto sampler = sampler_cache.GetSampler(texture.tsc); - update_descriptor_queue.AddSampledImage(sampler, image_view); - - VkImageLayout* const image_layout = update_descriptor_queue.LastImageLayout(); - *image_layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; - sampled_views.push_back(ImageView{std::move(view), image_layout}); -} - -void RasterizerVulkan::SetupStorageTexel(const Tegra::Texture::TICEntry& tic, - const StorageTexelEntry& entry) { - const auto view = texture_cache.GetImageSurface(tic, entry); - ASSERT(view->IsBufferView()); - - update_descriptor_queue.AddTexelBuffer(view->GetBufferView()); -} - -void RasterizerVulkan::SetupImage(const Tegra::Texture::TICEntry& tic, const ImageEntry& entry) { - auto view = texture_cache.GetImageSurface(tic, entry); - - if (entry.is_written) { - view->MarkAsModified(texture_cache.Tick()); - } - - UNIMPLEMENTED_IF(tic.IsBuffer()); - - const VkImageView image_view = - view->GetImageView(tic.x_source, tic.y_source, tic.z_source, tic.w_source); - update_descriptor_queue.AddImage(image_view); - - VkImageLayout* const image_layout = update_descriptor_queue.LastImageLayout(); - *image_layout = VK_IMAGE_LAYOUT_GENERAL; - image_views.push_back(ImageView{std::move(view), image_layout}); -} - void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs) { if (!state_tracker.TouchViewports()) { return; @@ -1304,7 +866,8 @@ void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& reg GetViewportState(device, regs, 8), GetViewportState(device, regs, 9), GetViewportState(device, regs, 10), GetViewportState(device, regs, 11), GetViewportState(device, regs, 12), GetViewportState(device, regs, 13), - GetViewportState(device, regs, 14), GetViewportState(device, regs, 15)}; + GetViewportState(device, regs, 14), GetViewportState(device, regs, 15), + }; scheduler.Record([viewports](vk::CommandBuffer cmdbuf) { cmdbuf.SetViewport(0, viewports); }); } @@ -1312,13 +875,14 @@ void RasterizerVulkan::UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs if (!state_tracker.TouchScissors()) { return; } - const std::array scissors = { + const std::array scissors{ GetScissorState(regs, 0), GetScissorState(regs, 1), GetScissorState(regs, 2), GetScissorState(regs, 3), GetScissorState(regs, 4), GetScissorState(regs, 5), GetScissorState(regs, 6), GetScissorState(regs, 7), GetScissorState(regs, 8), GetScissorState(regs, 9), GetScissorState(regs, 10), GetScissorState(regs, 11), GetScissorState(regs, 12), GetScissorState(regs, 13), GetScissorState(regs, 14), - GetScissorState(regs, 15)}; + GetScissorState(regs, 15), + }; scheduler.Record([scissors](vk::CommandBuffer cmdbuf) { cmdbuf.SetScissor(0, scissors); }); } @@ -1442,16 +1006,6 @@ void RasterizerVulkan::UpdateFrontFace(Tegra::Engines::Maxwell3D::Regs& regs) { [front_face](vk::CommandBuffer cmdbuf) { cmdbuf.SetFrontFaceEXT(front_face); }); } -void RasterizerVulkan::UpdatePrimitiveTopology(Tegra::Engines::Maxwell3D::Regs& regs) { - const Maxwell::PrimitiveTopology primitive_topology = regs.draw.topology.Value(); - if (!state_tracker.ChangePrimitiveTopology(primitive_topology)) { - return; - } - scheduler.Record([this, primitive_topology](vk::CommandBuffer cmdbuf) { - cmdbuf.SetPrimitiveTopologyEXT(MaxwellToVK::PrimitiveTopology(device, primitive_topology)); - }); -} - void RasterizerVulkan::UpdateStencilOp(Tegra::Engines::Maxwell3D::Regs& regs) { if (!state_tracker.TouchStencilOp()) { return; @@ -1493,101 +1047,4 @@ void RasterizerVulkan::UpdateStencilTestEnable(Tegra::Engines::Maxwell3D::Regs& }); } -std::size_t RasterizerVulkan::CalculateGraphicsStreamBufferSize(bool is_indexed) const { - std::size_t size = CalculateVertexArraysSize(); - if (is_indexed) { - size = Common::AlignUp(size, 4) + CalculateIndexBufferSize(); - } - size += Maxwell::MaxConstBuffers * (MaxConstbufferSize + device.GetUniformBufferAlignment()); - return size; -} - -std::size_t RasterizerVulkan::CalculateComputeStreamBufferSize() const { - return Tegra::Engines::KeplerCompute::NumConstBuffers * - (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); -} - -std::size_t RasterizerVulkan::CalculateVertexArraysSize() const { - const auto& regs = system.GPU().Maxwell3D().regs; - - std::size_t size = 0; - for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) { - // This implementation assumes that all attributes are used in the shader. - const GPUVAddr start{regs.vertex_array[index].StartAddress()}; - const GPUVAddr end{regs.vertex_array_limit[index].LimitAddress()}; - DEBUG_ASSERT(end >= start); - - size += (end - start) * regs.vertex_array[index].enable; - } - return size; -} - -std::size_t RasterizerVulkan::CalculateIndexBufferSize() const { - const auto& regs = system.GPU().Maxwell3D().regs; - return static_cast<std::size_t>(regs.index_array.count) * - static_cast<std::size_t>(regs.index_array.FormatSizeInBytes()); -} - -std::size_t RasterizerVulkan::CalculateConstBufferSize( - const ConstBufferEntry& entry, const Tegra::Engines::ConstBufferInfo& buffer) const { - if (entry.IsIndirect()) { - // Buffer is accessed indirectly, so upload the entire thing - return buffer.size; - } else { - // Buffer is accessed directly, upload just what we use - return entry.GetSize(); - } -} - -RenderPassParams RasterizerVulkan::GetRenderPassParams(Texceptions texceptions) const { - const auto& regs = system.GPU().Maxwell3D().regs; - const std::size_t num_attachments = static_cast<std::size_t>(regs.rt_control.count); - - RenderPassParams params; - params.color_formats = {}; - std::size_t color_texceptions = 0; - - std::size_t index = 0; - for (std::size_t rt = 0; rt < num_attachments; ++rt) { - const auto& rendertarget = regs.rt[rt]; - if (rendertarget.Address() == 0 || rendertarget.format == Tegra::RenderTargetFormat::NONE) { - continue; - } - params.color_formats[index] = static_cast<u8>(rendertarget.format); - color_texceptions |= (texceptions[rt] ? 1ULL : 0ULL) << index; - ++index; - } - params.num_color_attachments = static_cast<u8>(index); - params.texceptions = static_cast<u8>(color_texceptions); - - params.zeta_format = regs.zeta_enable ? static_cast<u8>(regs.zeta.format) : 0; - params.zeta_texception = texceptions[ZETA_TEXCEPTION_INDEX]; - return params; -} - -VkBuffer RasterizerVulkan::DefaultBuffer() { - if (default_buffer) { - return *default_buffer; - } - - default_buffer = device.GetLogical().CreateBuffer({ - .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, - .pNext = nullptr, - .flags = 0, - .size = DEFAULT_BUFFER_SIZE, - .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | - VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, - .sharingMode = VK_SHARING_MODE_EXCLUSIVE, - .queueFamilyIndexCount = 0, - .pQueueFamilyIndices = nullptr, - }); - default_buffer_commit = memory_manager.Commit(default_buffer, false); - - scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([buffer = *default_buffer](vk::CommandBuffer cmdbuf) { - cmdbuf.FillBuffer(buffer, 0, DEFAULT_BUFFER_SIZE, 0); - }); - return *default_buffer; -} - } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index f640ba649..acea1ba2d 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -11,28 +11,25 @@ #include <vector> #include <boost/container/static_vector.hpp> -#include <boost/functional/hash.hpp> #include "common/common_types.h" #include "video_core/rasterizer_accelerated.h" #include "video_core/rasterizer_interface.h" +#include "video_core/renderer_vulkan/blit_image.h" #include "video_core/renderer_vulkan/fixed_pipeline_state.h" #include "video_core/renderer_vulkan/vk_buffer_cache.h" -#include "video_core/renderer_vulkan/vk_compute_pass.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" #include "video_core/renderer_vulkan/vk_fence_manager.h" -#include "video_core/renderer_vulkan/vk_memory_manager.h" +#include "video_core/renderer_vulkan/vk_graphics_pipeline.h" #include "video_core/renderer_vulkan/vk_pipeline_cache.h" #include "video_core/renderer_vulkan/vk_query_cache.h" -#include "video_core/renderer_vulkan/vk_renderpass_cache.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" -#include "video_core/renderer_vulkan/vk_sampler_cache.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" #include "video_core/renderer_vulkan/vk_texture_cache.h" #include "video_core/renderer_vulkan/vk_update_descriptor.h" -#include "video_core/renderer_vulkan/wrapper.h" #include "video_core/shader/async_shaders.h" +#include "video_core/vulkan_common/vulkan_memory_allocator.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Core { class System; @@ -50,66 +47,15 @@ namespace Vulkan { struct VKScreenInfo; -using ImageViewsPack = boost::container::static_vector<VkImageView, Maxwell::NumRenderTargets + 1>; - -struct FramebufferCacheKey { - VkRenderPass renderpass{}; - u32 width = 0; - u32 height = 0; - u32 layers = 0; - ImageViewsPack views; - - std::size_t Hash() const noexcept { - std::size_t hash = 0; - boost::hash_combine(hash, static_cast<VkRenderPass>(renderpass)); - for (const auto& view : views) { - boost::hash_combine(hash, static_cast<VkImageView>(view)); - } - boost::hash_combine(hash, width); - boost::hash_combine(hash, height); - boost::hash_combine(hash, layers); - return hash; - } - - bool operator==(const FramebufferCacheKey& rhs) const noexcept { - return std::tie(renderpass, views, width, height, layers) == - std::tie(rhs.renderpass, rhs.views, rhs.width, rhs.height, rhs.layers); - } - - bool operator!=(const FramebufferCacheKey& rhs) const noexcept { - return !operator==(rhs); - } -}; - -} // namespace Vulkan - -namespace std { - -template <> -struct hash<Vulkan::FramebufferCacheKey> { - std::size_t operator()(const Vulkan::FramebufferCacheKey& k) const noexcept { - return k.Hash(); - } -}; - -} // namespace std - -namespace Vulkan { - class StateTracker; -class BufferBindings; - -struct ImageView { - View view; - VkImageLayout* layout = nullptr; -}; class RasterizerVulkan final : public VideoCore::RasterizerAccelerated { public: - explicit RasterizerVulkan(Core::System& system, Core::Frontend::EmuWindow& render_window, - VKScreenInfo& screen_info, const VKDevice& device, - VKResourceManager& resource_manager, VKMemoryManager& memory_manager, - StateTracker& state_tracker, VKScheduler& scheduler); + explicit RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_, + Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_, + VKScreenInfo& screen_info_, const Device& device_, + MemoryAllocator& memory_allocator_, StateTracker& state_tracker_, + VKScheduler& scheduler_); ~RasterizerVulkan() override; void Draw(bool is_indexed, bool is_instanced) override; @@ -117,25 +63,28 @@ public: void DispatchCompute(GPUVAddr code_addr) override; void ResetCounter(VideoCore::QueryType type) override; void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override; + void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override; void FlushAll() override; void FlushRegion(VAddr addr, u64 size) override; bool MustFlushRegion(VAddr addr, u64 size) override; void InvalidateRegion(VAddr addr, u64 size) override; void OnCPUWrite(VAddr addr, u64 size) override; void SyncGuestHost() override; + void UnmapMemory(VAddr addr, u64 size) override; void SignalSemaphore(GPUVAddr addr, u32 value) override; void SignalSyncPoint(u32 value) override; void ReleaseFences() override; void FlushAndInvalidateRegion(VAddr addr, u64 size) override; void WaitForIdle() override; + void FragmentBarrier() override; + void TiledCacheBarrier() override; void FlushCommands() override; void TickFrame() override; - bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src, - const Tegra::Engines::Fermi2D::Regs::Surface& dst, + bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src, + const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Config& copy_config) override; bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, u32 pixel_stride) override; - void SetupDirtyFlags() override; VideoCommon::Shader::AsyncShaders& GetAsyncShaders() { return async_shaders; @@ -146,45 +95,22 @@ public: } /// Maximum supported size that a constbuffer can have in bytes. - static constexpr std::size_t MaxConstbufferSize = 0x10000; + static constexpr size_t MaxConstbufferSize = 0x10000; static_assert(MaxConstbufferSize % (4 * sizeof(float)) == 0, "The maximum size of a constbuffer must be a multiple of the size of GLvec4"); private: - struct DrawParameters { - void Draw(vk::CommandBuffer cmdbuf) const; + static constexpr size_t MAX_TEXTURES = 192; + static constexpr size_t MAX_IMAGES = 48; + static constexpr size_t MAX_IMAGE_VIEWS = MAX_TEXTURES + MAX_IMAGES; - u32 base_instance = 0; - u32 num_instances = 0; - u32 base_vertex = 0; - u32 num_vertices = 0; - bool is_indexed = 0; - }; - - using Texceptions = std::bitset<Maxwell::NumRenderTargets + 1>; - - static constexpr std::size_t ZETA_TEXCEPTION_INDEX = 8; static constexpr VkDeviceSize DEFAULT_BUFFER_SIZE = 4 * sizeof(float); void FlushWork(); - /// @brief Updates the currently bound attachments - /// @param is_clear True when the framebuffer is updated as a clear - /// @return Bitfield of attachments being used as sampled textures - Texceptions UpdateAttachments(bool is_clear); - - std::tuple<VkFramebuffer, VkExtent2D> ConfigureFramebuffers(VkRenderPass renderpass); - - /// Setups geometry buffers and state. - DrawParameters SetupGeometry(FixedPipelineState& fixed_state, BufferBindings& buffer_bindings, - bool is_indexed, bool is_instanced); - /// Setup descriptors in the graphics pipeline. - void SetupShaderDescriptors(const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders); - - void SetupImageTransitions(Texceptions texceptions, - const std::array<View, Maxwell::NumRenderTargets>& color_attachments, - const View& zeta_attachment); + void SetupShaderDescriptors(const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders, + bool is_indexed); void UpdateDynamicStates(); @@ -192,18 +118,6 @@ private: void EndTransformFeedback(); - bool WalkAttachmentOverlaps(const CachedSurfaceView& attachment); - - void SetupVertexArrays(BufferBindings& buffer_bindings); - - void SetupIndexBuffer(BufferBindings& buffer_bindings, DrawParameters& params, bool is_indexed); - - /// Setup constant buffers in the graphics pipeline. - void SetupGraphicsConstBuffers(const ShaderEntries& entries, std::size_t stage); - - /// Setup global buffers in the graphics pipeline. - void SetupGraphicsGlobalBuffers(const ShaderEntries& entries, std::size_t stage); - /// Setup uniform texels in the graphics pipeline. void SetupGraphicsUniformTexels(const ShaderEntries& entries, std::size_t stage); @@ -216,12 +130,6 @@ private: /// Setup images in the graphics pipeline. void SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage); - /// Setup constant buffers in the compute pipeline. - void SetupComputeConstBuffers(const ShaderEntries& entries); - - /// Setup global buffers in the compute pipeline. - void SetupComputeGlobalBuffers(const ShaderEntries& entries); - /// Setup texel buffers in the compute pipeline. void SetupComputeUniformTexels(const ShaderEntries& entries); @@ -234,19 +142,6 @@ private: /// Setup images in the compute pipeline. void SetupComputeImages(const ShaderEntries& entries); - void SetupConstBuffer(const ConstBufferEntry& entry, - const Tegra::Engines::ConstBufferInfo& buffer); - - void SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAddr address); - - void SetupUniformTexels(const Tegra::Texture::TICEntry& image, const UniformTexelEntry& entry); - - void SetupTexture(const Tegra::Texture::FullTextureInfo& texture, const SamplerEntry& entry); - - void SetupStorageTexel(const Tegra::Texture::TICEntry& tic, const StorageTexelEntry& entry); - - void SetupImage(const Tegra::Texture::TICEntry& tic, const ImageEntry& entry); - void UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs); void UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs); void UpdateDepthBias(Tegra::Engines::Maxwell3D::Regs& regs); @@ -260,64 +155,43 @@ private: void UpdateDepthWriteEnable(Tegra::Engines::Maxwell3D::Regs& regs); void UpdateDepthCompareOp(Tegra::Engines::Maxwell3D::Regs& regs); void UpdateFrontFace(Tegra::Engines::Maxwell3D::Regs& regs); - void UpdatePrimitiveTopology(Tegra::Engines::Maxwell3D::Regs& regs); void UpdateStencilOp(Tegra::Engines::Maxwell3D::Regs& regs); void UpdateStencilTestEnable(Tegra::Engines::Maxwell3D::Regs& regs); - std::size_t CalculateGraphicsStreamBufferSize(bool is_indexed) const; - - std::size_t CalculateComputeStreamBufferSize() const; - - std::size_t CalculateVertexArraysSize() const; - - std::size_t CalculateIndexBufferSize() const; - - std::size_t CalculateConstBufferSize(const ConstBufferEntry& entry, - const Tegra::Engines::ConstBufferInfo& buffer) const; - - RenderPassParams GetRenderPassParams(Texceptions texceptions) const; + Tegra::GPU& gpu; + Tegra::MemoryManager& gpu_memory; + Tegra::Engines::Maxwell3D& maxwell3d; + Tegra::Engines::KeplerCompute& kepler_compute; - VkBuffer DefaultBuffer(); - - Core::System& system; - Core::Frontend::EmuWindow& render_window; VKScreenInfo& screen_info; - const VKDevice& device; - VKResourceManager& resource_manager; - VKMemoryManager& memory_manager; + const Device& device; + MemoryAllocator& memory_allocator; StateTracker& state_tracker; VKScheduler& scheduler; - VKStagingBufferPool staging_pool; + StagingBufferPool staging_pool; VKDescriptorPool descriptor_pool; VKUpdateDescriptorQueue update_descriptor_queue; - VKRenderPassCache renderpass_cache; - QuadArrayPass quad_array_pass; - QuadIndexedPass quad_indexed_pass; - Uint8Pass uint8_pass; + BlitImageHelper blit_image; + + GraphicsPipelineCacheKey graphics_key; - VKTextureCache texture_cache; + TextureCacheRuntime texture_cache_runtime; + TextureCache texture_cache; + BufferCacheRuntime buffer_cache_runtime; + BufferCache buffer_cache; VKPipelineCache pipeline_cache; - VKBufferCache buffer_cache; - VKSamplerCache sampler_cache; - VKFenceManager fence_manager; VKQueryCache query_cache; + VKFenceManager fence_manager; - vk::Buffer default_buffer; - VKMemoryCommit default_buffer_commit; vk::Event wfi_event; VideoCommon::Shader::AsyncShaders async_shaders; - std::array<View, Maxwell::NumRenderTargets> color_attachments; - View zeta_attachment; - - std::vector<ImageView> sampled_views; - std::vector<ImageView> image_views; + boost::container::static_vector<u32, MAX_IMAGE_VIEWS> image_view_indices; + std::array<VideoCommon::ImageViewId, MAX_IMAGE_VIEWS> image_view_ids; + boost::container::static_vector<VkSampler, MAX_TEXTURES> sampler_handles; u32 draw_counter = 0; - - // TODO(Rodrigo): Invalidate on image destruction - std::unordered_map<FramebufferCacheKey, vk::Framebuffer> framebuffer_cache; }; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_renderpass_cache.cpp b/src/video_core/renderer_vulkan/vk_renderpass_cache.cpp deleted file mode 100644 index 80284cf92..000000000 --- a/src/video_core/renderer_vulkan/vk_renderpass_cache.cpp +++ /dev/null @@ -1,158 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include <cstring> -#include <memory> -#include <vector> - -#include "common/cityhash.h" -#include "video_core/engines/maxwell_3d.h" -#include "video_core/renderer_vulkan/maxwell_to_vk.h" -#include "video_core/renderer_vulkan/vk_device.h" -#include "video_core/renderer_vulkan/vk_renderpass_cache.h" -#include "video_core/renderer_vulkan/wrapper.h" - -namespace Vulkan { - -std::size_t RenderPassParams::Hash() const noexcept { - const u64 hash = Common::CityHash64(reinterpret_cast<const char*>(this), sizeof *this); - return static_cast<std::size_t>(hash); -} - -bool RenderPassParams::operator==(const RenderPassParams& rhs) const noexcept { - return std::memcmp(&rhs, this, sizeof *this) == 0; -} - -VKRenderPassCache::VKRenderPassCache(const VKDevice& device) : device{device} {} - -VKRenderPassCache::~VKRenderPassCache() = default; - -VkRenderPass VKRenderPassCache::GetRenderPass(const RenderPassParams& params) { - const auto [pair, is_cache_miss] = cache.try_emplace(params); - auto& entry = pair->second; - if (is_cache_miss) { - entry = CreateRenderPass(params); - } - return *entry; -} - -vk::RenderPass VKRenderPassCache::CreateRenderPass(const RenderPassParams& params) const { - using namespace VideoCore::Surface; - const std::size_t num_attachments = static_cast<std::size_t>(params.num_color_attachments); - - std::vector<VkAttachmentDescription> descriptors; - descriptors.reserve(num_attachments); - - std::vector<VkAttachmentReference> color_references; - color_references.reserve(num_attachments); - - for (std::size_t rt = 0; rt < num_attachments; ++rt) { - const auto guest_format = static_cast<Tegra::RenderTargetFormat>(params.color_formats[rt]); - const PixelFormat pixel_format = PixelFormatFromRenderTargetFormat(guest_format); - const auto format = MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, pixel_format); - ASSERT_MSG(format.attachable, "Trying to attach a non-attachable format with format={}", - static_cast<int>(pixel_format)); - - // TODO(Rodrigo): Add MAY_ALIAS_BIT when it's needed. - const VkImageLayout color_layout = ((params.texceptions >> rt) & 1) != 0 - ? VK_IMAGE_LAYOUT_GENERAL - : VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; - descriptors.push_back({ - .flags = VK_ATTACHMENT_DESCRIPTION_MAY_ALIAS_BIT, - .format = format.format, - .samples = VK_SAMPLE_COUNT_1_BIT, - .loadOp = VK_ATTACHMENT_LOAD_OP_LOAD, - .storeOp = VK_ATTACHMENT_STORE_OP_STORE, - .stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE, - .stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE, - .initialLayout = color_layout, - .finalLayout = color_layout, - }); - - color_references.push_back({ - .attachment = static_cast<u32>(rt), - .layout = color_layout, - }); - } - - VkAttachmentReference zeta_attachment_ref; - const bool has_zeta = params.zeta_format != 0; - if (has_zeta) { - const auto guest_format = static_cast<Tegra::DepthFormat>(params.zeta_format); - const PixelFormat pixel_format = PixelFormatFromDepthFormat(guest_format); - const auto format = MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, pixel_format); - ASSERT_MSG(format.attachable, "Trying to attach a non-attachable format with format={}", - static_cast<int>(pixel_format)); - - const VkImageLayout zeta_layout = params.zeta_texception != 0 - ? VK_IMAGE_LAYOUT_GENERAL - : VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; - descriptors.push_back({ - .flags = 0, - .format = format.format, - .samples = VK_SAMPLE_COUNT_1_BIT, - .loadOp = VK_ATTACHMENT_LOAD_OP_LOAD, - .storeOp = VK_ATTACHMENT_STORE_OP_STORE, - .stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD, - .stencilStoreOp = VK_ATTACHMENT_STORE_OP_STORE, - .initialLayout = zeta_layout, - .finalLayout = zeta_layout, - }); - - zeta_attachment_ref = { - .attachment = static_cast<u32>(num_attachments), - .layout = zeta_layout, - }; - } - - const VkSubpassDescription subpass_description{ - .flags = 0, - .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, - .inputAttachmentCount = 0, - .pInputAttachments = nullptr, - .colorAttachmentCount = static_cast<u32>(color_references.size()), - .pColorAttachments = color_references.data(), - .pResolveAttachments = nullptr, - .pDepthStencilAttachment = has_zeta ? &zeta_attachment_ref : nullptr, - .preserveAttachmentCount = 0, - .pPreserveAttachments = nullptr, - }; - - VkAccessFlags access = 0; - VkPipelineStageFlags stage = 0; - if (!color_references.empty()) { - access |= VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; - stage |= VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; - } - - if (has_zeta) { - access |= VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | - VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; - stage |= VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT; - } - - const VkSubpassDependency subpass_dependency{ - .srcSubpass = VK_SUBPASS_EXTERNAL, - .dstSubpass = 0, - .srcStageMask = stage, - .dstStageMask = stage, - .srcAccessMask = 0, - .dstAccessMask = access, - .dependencyFlags = 0, - }; - - return device.GetLogical().CreateRenderPass({ - .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO, - .pNext = nullptr, - .flags = 0, - .attachmentCount = static_cast<u32>(descriptors.size()), - .pAttachments = descriptors.data(), - .subpassCount = 1, - .pSubpasses = &subpass_description, - .dependencyCount = 1, - .pDependencies = &subpass_dependency, - }); -} - -} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_renderpass_cache.h b/src/video_core/renderer_vulkan/vk_renderpass_cache.h deleted file mode 100644 index 8b0fec720..000000000 --- a/src/video_core/renderer_vulkan/vk_renderpass_cache.h +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <type_traits> -#include <unordered_map> - -#include <boost/container/static_vector.hpp> -#include <boost/functional/hash.hpp> - -#include "video_core/engines/maxwell_3d.h" -#include "video_core/renderer_vulkan/wrapper.h" -#include "video_core/surface.h" - -namespace Vulkan { - -class VKDevice; - -struct RenderPassParams { - std::array<u8, Tegra::Engines::Maxwell3D::Regs::NumRenderTargets> color_formats; - u8 num_color_attachments; - u8 texceptions; - - u8 zeta_format; - u8 zeta_texception; - - std::size_t Hash() const noexcept; - - bool operator==(const RenderPassParams& rhs) const noexcept; - - bool operator!=(const RenderPassParams& rhs) const noexcept { - return !operator==(rhs); - } -}; -static_assert(std::has_unique_object_representations_v<RenderPassParams>); -static_assert(std::is_trivially_copyable_v<RenderPassParams>); -static_assert(std::is_trivially_constructible_v<RenderPassParams>); - -} // namespace Vulkan - -namespace std { - -template <> -struct hash<Vulkan::RenderPassParams> { - std::size_t operator()(const Vulkan::RenderPassParams& k) const noexcept { - return k.Hash(); - } -}; - -} // namespace std - -namespace Vulkan { - -class VKRenderPassCache final { -public: - explicit VKRenderPassCache(const VKDevice& device); - ~VKRenderPassCache(); - - VkRenderPass GetRenderPass(const RenderPassParams& params); - -private: - vk::RenderPass CreateRenderPass(const RenderPassParams& params) const; - - const VKDevice& device; - std::unordered_map<RenderPassParams, vk::RenderPass> cache; -}; - -} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_resource_manager.cpp b/src/video_core/renderer_vulkan/vk_resource_manager.cpp deleted file mode 100644 index f19330a36..000000000 --- a/src/video_core/renderer_vulkan/vk_resource_manager.cpp +++ /dev/null @@ -1,311 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include <algorithm> -#include <optional> -#include "common/assert.h" -#include "common/logging/log.h" -#include "video_core/renderer_vulkan/vk_device.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" -#include "video_core/renderer_vulkan/wrapper.h" - -namespace Vulkan { - -namespace { - -// TODO(Rodrigo): Fine tune these numbers. -constexpr std::size_t COMMAND_BUFFER_POOL_SIZE = 0x1000; -constexpr std::size_t FENCES_GROW_STEP = 0x40; - -constexpr VkFenceCreateInfo BuildFenceCreateInfo() { - return { - .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO, - .pNext = nullptr, - .flags = 0, - }; -} - -} // Anonymous namespace - -class CommandBufferPool final : public VKFencedPool { -public: - explicit CommandBufferPool(const VKDevice& device) - : VKFencedPool(COMMAND_BUFFER_POOL_SIZE), device{device} {} - - void Allocate(std::size_t begin, std::size_t end) override { - // Command buffers are going to be commited, recorded, executed every single usage cycle. - // They are also going to be reseted when commited. - Pool& pool = pools.emplace_back(); - pool.handle = device.GetLogical().CreateCommandPool({ - .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, - .pNext = nullptr, - .flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | - VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, - .queueFamilyIndex = device.GetGraphicsFamily(), - }); - pool.cmdbufs = pool.handle.Allocate(COMMAND_BUFFER_POOL_SIZE); - } - - VkCommandBuffer Commit(VKFence& fence) { - const std::size_t index = CommitResource(fence); - const auto pool_index = index / COMMAND_BUFFER_POOL_SIZE; - const auto sub_index = index % COMMAND_BUFFER_POOL_SIZE; - return pools[pool_index].cmdbufs[sub_index]; - } - -private: - struct Pool { - vk::CommandPool handle; - vk::CommandBuffers cmdbufs; - }; - - const VKDevice& device; - std::vector<Pool> pools; -}; - -VKResource::VKResource() = default; - -VKResource::~VKResource() = default; - -VKFence::VKFence(const VKDevice& device) - : device{device}, handle{device.GetLogical().CreateFence(BuildFenceCreateInfo())} {} - -VKFence::~VKFence() = default; - -void VKFence::Wait() { - switch (const VkResult result = handle.Wait()) { - case VK_SUCCESS: - return; - case VK_ERROR_DEVICE_LOST: - device.ReportLoss(); - [[fallthrough]]; - default: - throw vk::Exception(result); - } -} - -void VKFence::Release() { - ASSERT(is_owned); - is_owned = false; -} - -void VKFence::Commit() { - is_owned = true; - is_used = true; -} - -bool VKFence::Tick(bool gpu_wait, bool owner_wait) { - if (!is_used) { - // If a fence is not used it's always free. - return true; - } - if (is_owned && !owner_wait) { - // The fence is still being owned (Release has not been called) and ownership wait has - // not been asked. - return false; - } - - if (gpu_wait) { - // Wait for the fence if it has been requested. - (void)handle.Wait(); - } else { - if (handle.GetStatus() != VK_SUCCESS) { - // Vulkan fence is not ready, not much it can do here - return false; - } - } - - // Broadcast resources their free state. - for (auto* resource : protected_resources) { - resource->OnFenceRemoval(this); - } - protected_resources.clear(); - - // Prepare fence for reusage. - handle.Reset(); - is_used = false; - return true; -} - -void VKFence::Protect(VKResource* resource) { - protected_resources.push_back(resource); -} - -void VKFence::Unprotect(VKResource* resource) { - const auto it = std::find(protected_resources.begin(), protected_resources.end(), resource); - ASSERT(it != protected_resources.end()); - - resource->OnFenceRemoval(this); - protected_resources.erase(it); -} - -void VKFence::RedirectProtection(VKResource* old_resource, VKResource* new_resource) noexcept { - std::replace(std::begin(protected_resources), std::end(protected_resources), old_resource, - new_resource); -} - -VKFenceWatch::VKFenceWatch() = default; - -VKFenceWatch::VKFenceWatch(VKFence& initial_fence) { - Watch(initial_fence); -} - -VKFenceWatch::VKFenceWatch(VKFenceWatch&& rhs) noexcept { - fence = std::exchange(rhs.fence, nullptr); - if (fence) { - fence->RedirectProtection(&rhs, this); - } -} - -VKFenceWatch& VKFenceWatch::operator=(VKFenceWatch&& rhs) noexcept { - fence = std::exchange(rhs.fence, nullptr); - if (fence) { - fence->RedirectProtection(&rhs, this); - } - return *this; -} - -VKFenceWatch::~VKFenceWatch() { - if (fence) { - fence->Unprotect(this); - } -} - -void VKFenceWatch::Wait() { - if (fence == nullptr) { - return; - } - fence->Wait(); - fence->Unprotect(this); -} - -void VKFenceWatch::Watch(VKFence& new_fence) { - Wait(); - fence = &new_fence; - fence->Protect(this); -} - -bool VKFenceWatch::TryWatch(VKFence& new_fence) { - if (fence) { - return false; - } - fence = &new_fence; - fence->Protect(this); - return true; -} - -void VKFenceWatch::OnFenceRemoval(VKFence* signaling_fence) { - ASSERT_MSG(signaling_fence == fence, "Removing the wrong fence"); - fence = nullptr; -} - -VKFencedPool::VKFencedPool(std::size_t grow_step) : grow_step{grow_step} {} - -VKFencedPool::~VKFencedPool() = default; - -std::size_t VKFencedPool::CommitResource(VKFence& fence) { - const auto Search = [&](std::size_t begin, std::size_t end) -> std::optional<std::size_t> { - for (std::size_t iterator = begin; iterator < end; ++iterator) { - if (watches[iterator]->TryWatch(fence)) { - // The resource is now being watched, a free resource was successfully found. - return iterator; - } - } - return {}; - }; - // Try to find a free resource from the hinted position to the end. - auto found = Search(free_iterator, watches.size()); - if (!found) { - // Search from beginning to the hinted position. - found = Search(0, free_iterator); - if (!found) { - // Both searches failed, the pool is full; handle it. - const std::size_t free_resource = ManageOverflow(); - - // Watch will wait for the resource to be free. - watches[free_resource]->Watch(fence); - found = free_resource; - } - } - // Free iterator is hinted to the resource after the one that's been commited. - free_iterator = (*found + 1) % watches.size(); - return *found; -} - -std::size_t VKFencedPool::ManageOverflow() { - const std::size_t old_capacity = watches.size(); - Grow(); - - // The last entry is guaranted to be free, since it's the first element of the freshly - // allocated resources. - return old_capacity; -} - -void VKFencedPool::Grow() { - const std::size_t old_capacity = watches.size(); - watches.resize(old_capacity + grow_step); - std::generate(watches.begin() + old_capacity, watches.end(), - []() { return std::make_unique<VKFenceWatch>(); }); - Allocate(old_capacity, old_capacity + grow_step); -} - -VKResourceManager::VKResourceManager(const VKDevice& device) : device{device} { - GrowFences(FENCES_GROW_STEP); - command_buffer_pool = std::make_unique<CommandBufferPool>(device); -} - -VKResourceManager::~VKResourceManager() = default; - -VKFence& VKResourceManager::CommitFence() { - const auto StepFences = [&](bool gpu_wait, bool owner_wait) -> VKFence* { - const auto Tick = [=](auto& fence) { return fence->Tick(gpu_wait, owner_wait); }; - const auto hinted = fences.begin() + fences_iterator; - - auto it = std::find_if(hinted, fences.end(), Tick); - if (it == fences.end()) { - it = std::find_if(fences.begin(), hinted, Tick); - if (it == hinted) { - return nullptr; - } - } - fences_iterator = std::distance(fences.begin(), it) + 1; - if (fences_iterator >= fences.size()) - fences_iterator = 0; - - auto& fence = *it; - fence->Commit(); - return fence.get(); - }; - - VKFence* found_fence = StepFences(false, false); - if (!found_fence) { - // Try again, this time waiting. - found_fence = StepFences(true, false); - - if (!found_fence) { - // Allocate new fences and try again. - LOG_INFO(Render_Vulkan, "Allocating new fences {} -> {}", fences.size(), - fences.size() + FENCES_GROW_STEP); - - GrowFences(FENCES_GROW_STEP); - found_fence = StepFences(true, false); - ASSERT(found_fence != nullptr); - } - } - return *found_fence; -} - -VkCommandBuffer VKResourceManager::CommitCommandBuffer(VKFence& fence) { - return command_buffer_pool->Commit(fence); -} - -void VKResourceManager::GrowFences(std::size_t new_fences_count) { - const std::size_t previous_size = fences.size(); - fences.resize(previous_size + new_fences_count); - - std::generate(fences.begin() + previous_size, fences.end(), - [this] { return std::make_unique<VKFence>(device); }); -} - -} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_resource_manager.h b/src/video_core/renderer_vulkan/vk_resource_manager.h deleted file mode 100644 index f683d2276..000000000 --- a/src/video_core/renderer_vulkan/vk_resource_manager.h +++ /dev/null @@ -1,196 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <cstddef> -#include <memory> -#include <vector> -#include "video_core/renderer_vulkan/wrapper.h" - -namespace Vulkan { - -class VKDevice; -class VKFence; -class VKResourceManager; - -class CommandBufferPool; - -/// Interface for a Vulkan resource -class VKResource { -public: - explicit VKResource(); - virtual ~VKResource(); - - /** - * Signals the object that an owning fence has been signaled. - * @param signaling_fence Fence that signals its usage end. - */ - virtual void OnFenceRemoval(VKFence* signaling_fence) = 0; -}; - -/** - * Fences take ownership of objects, protecting them from GPU-side or driver-side concurrent access. - * They must be commited from the resource manager. Their usage flow is: commit the fence from the - * resource manager, protect resources with it and use them, send the fence to an execution queue - * and Wait for it if needed and then call Release. Used resources will automatically be signaled - * when they are free to be reused. - * @brief Protects resources for concurrent usage and signals its release. - */ -class VKFence { - friend class VKResourceManager; - -public: - explicit VKFence(const VKDevice& device); - ~VKFence(); - - /** - * Waits for the fence to be signaled. - * @warning You must have ownership of the fence and it has to be previously sent to a queue to - * call this function. - */ - void Wait(); - - /** - * Releases ownership of the fence. Pass after it has been sent to an execution queue. - * Unmanaged usage of the fence after the call will result in undefined behavior because it may - * be being used for something else. - */ - void Release(); - - /// Protects a resource with this fence. - void Protect(VKResource* resource); - - /// Removes protection for a resource. - void Unprotect(VKResource* resource); - - /// Redirects one protected resource to a new address. - void RedirectProtection(VKResource* old_resource, VKResource* new_resource) noexcept; - - /// Retreives the fence. - operator VkFence() const { - return *handle; - } - -private: - /// Take ownership of the fence. - void Commit(); - - /** - * Updates the fence status. - * @warning Waiting for the owner might soft lock the execution. - * @param gpu_wait Wait for the fence to be signaled by the driver. - * @param owner_wait Wait for the owner to signal its freedom. - * @returns True if the fence is free. Waiting for gpu and owner will always return true. - */ - bool Tick(bool gpu_wait, bool owner_wait); - - const VKDevice& device; ///< Device handler - vk::Fence handle; ///< Vulkan fence - std::vector<VKResource*> protected_resources; ///< List of resources protected by this fence - bool is_owned = false; ///< The fence has been commited but not released yet. - bool is_used = false; ///< The fence has been commited but it has not been checked to be free. -}; - -/** - * A fence watch is used to keep track of the usage of a fence and protect a resource or set of - * resources without having to inherit VKResource from their handlers. - */ -class VKFenceWatch final : public VKResource { -public: - explicit VKFenceWatch(); - VKFenceWatch(VKFence& initial_fence); - VKFenceWatch(VKFenceWatch&&) noexcept; - VKFenceWatch(const VKFenceWatch&) = delete; - ~VKFenceWatch() override; - - VKFenceWatch& operator=(VKFenceWatch&&) noexcept; - - /// Waits for the fence to be released. - void Wait(); - - /** - * Waits for a previous fence and watches a new one. - * @param new_fence New fence to wait to. - */ - void Watch(VKFence& new_fence); - - /** - * Checks if it's currently being watched and starts watching it if it's available. - * @returns True if a watch has started, false if it's being watched. - */ - bool TryWatch(VKFence& new_fence); - - void OnFenceRemoval(VKFence* signaling_fence) override; - - /** - * Do not use it paired with Watch. Use TryWatch instead. - * Returns true when the watch is free. - */ - bool IsUsed() const { - return fence != nullptr; - } - -private: - VKFence* fence{}; ///< Fence watching this resource. nullptr when the watch is free. -}; - -/** - * Handles a pool of resources protected by fences. Manages resource overflow allocating more - * resources. - */ -class VKFencedPool { -public: - explicit VKFencedPool(std::size_t grow_step); - virtual ~VKFencedPool(); - -protected: - /** - * Commits a free resource and protects it with a fence. It may allocate new resources. - * @param fence Fence that protects the commited resource. - * @returns Index of the resource commited. - */ - std::size_t CommitResource(VKFence& fence); - - /// Called when a chunk of resources have to be allocated. - virtual void Allocate(std::size_t begin, std::size_t end) = 0; - -private: - /// Manages pool overflow allocating new resources. - std::size_t ManageOverflow(); - - /// Allocates a new page of resources. - void Grow(); - - std::size_t grow_step = 0; ///< Number of new resources created after an overflow - std::size_t free_iterator = 0; ///< Hint to where the next free resources is likely to be found - std::vector<std::unique_ptr<VKFenceWatch>> watches; ///< Set of watched resources -}; - -/** - * The resource manager handles all resources that can be protected with a fence avoiding - * driver-side or GPU-side concurrent usage. Usage is documented in VKFence. - */ -class VKResourceManager final { -public: - explicit VKResourceManager(const VKDevice& device); - ~VKResourceManager(); - - /// Commits a fence. It has to be sent to a queue and released. - VKFence& CommitFence(); - - /// Commits an unused command buffer and protects it with a fence. - VkCommandBuffer CommitCommandBuffer(VKFence& fence); - -private: - /// Allocates new fences. - void GrowFences(std::size_t new_fences_count); - - const VKDevice& device; ///< Device handler. - std::size_t fences_iterator = 0; ///< Index where a free fence is likely to be found. - std::vector<std::unique_ptr<VKFence>> fences; ///< Pool of fences. - std::unique_ptr<CommandBufferPool> command_buffer_pool; ///< Pool of command buffers. -}; - -} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_resource_pool.cpp b/src/video_core/renderer_vulkan/vk_resource_pool.cpp new file mode 100644 index 000000000..a8bf7bda8 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_resource_pool.cpp @@ -0,0 +1,63 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <optional> + +#include "video_core/renderer_vulkan/vk_master_semaphore.h" +#include "video_core/renderer_vulkan/vk_resource_pool.h" + +namespace Vulkan { + +ResourcePool::ResourcePool(MasterSemaphore& master_semaphore_, size_t grow_step_) + : master_semaphore{master_semaphore_}, grow_step{grow_step_} {} + +ResourcePool::~ResourcePool() = default; + +size_t ResourcePool::CommitResource() { + // Refresh semaphore to query updated results + master_semaphore.Refresh(); + const u64 gpu_tick = master_semaphore.KnownGpuTick(); + const auto search = [this, gpu_tick](size_t begin, size_t end) -> std::optional<size_t> { + for (size_t iterator = begin; iterator < end; ++iterator) { + if (gpu_tick >= ticks[iterator]) { + ticks[iterator] = master_semaphore.CurrentTick(); + return iterator; + } + } + return std::nullopt; + }; + // Try to find a free resource from the hinted position to the end. + std::optional<size_t> found = search(hint_iterator, ticks.size()); + if (!found) { + // Search from beginning to the hinted position. + found = search(0, hint_iterator); + if (!found) { + // Both searches failed, the pool is full; handle it. + const size_t free_resource = ManageOverflow(); + + ticks[free_resource] = master_semaphore.CurrentTick(); + found = free_resource; + } + } + // Free iterator is hinted to the resource after the one that's been commited. + hint_iterator = (*found + 1) % ticks.size(); + return *found; +} + +size_t ResourcePool::ManageOverflow() { + const size_t old_capacity = ticks.size(); + Grow(); + + // The last entry is guaranted to be free, since it's the first element of the freshly + // allocated resources. + return old_capacity; +} + +void ResourcePool::Grow() { + const size_t old_capacity = ticks.size(); + ticks.resize(old_capacity + grow_step); + Allocate(old_capacity, old_capacity + grow_step); +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_resource_pool.h b/src/video_core/renderer_vulkan/vk_resource_pool.h new file mode 100644 index 000000000..9d0bb3b4d --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_resource_pool.h @@ -0,0 +1,43 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <vector> + +#include "common/common_types.h" + +namespace Vulkan { + +class MasterSemaphore; + +/** + * Handles a pool of resources protected by fences. Manages resource overflow allocating more + * resources. + */ +class ResourcePool { +public: + explicit ResourcePool(MasterSemaphore& master_semaphore, size_t grow_step); + virtual ~ResourcePool(); + +protected: + size_t CommitResource(); + + /// Called when a chunk of resources have to be allocated. + virtual void Allocate(size_t begin, size_t end) = 0; + +private: + /// Manages pool overflow allocating new resources. + size_t ManageOverflow(); + + /// Allocates a new page of resources. + void Grow(); + + MasterSemaphore& master_semaphore; + size_t grow_step = 0; ///< Number of new resources created after an overflow + size_t hint_iterator = 0; ///< Hint to where the next free resources is likely to be found + std::vector<u64> ticks; ///< Ticks for each resource +}; + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp deleted file mode 100644 index b068888f9..000000000 --- a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include <unordered_map> - -#include "video_core/renderer_vulkan/maxwell_to_vk.h" -#include "video_core/renderer_vulkan/vk_sampler_cache.h" -#include "video_core/renderer_vulkan/wrapper.h" -#include "video_core/textures/texture.h" - -using Tegra::Texture::TextureMipmapFilter; - -namespace Vulkan { - -namespace { - -VkBorderColor ConvertBorderColor(std::array<float, 4> color) { - // TODO(Rodrigo): Manage integer border colors - if (color == std::array<float, 4>{0, 0, 0, 0}) { - return VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK; - } else if (color == std::array<float, 4>{0, 0, 0, 1}) { - return VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK; - } else if (color == std::array<float, 4>{1, 1, 1, 1}) { - return VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE; - } - if (color[0] + color[1] + color[2] > 1.35f) { - // If color elements are brighter than roughly 0.5 average, use white border - return VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE; - } else if (color[3] > 0.5f) { - return VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK; - } else { - return VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK; - } -} - -} // Anonymous namespace - -VKSamplerCache::VKSamplerCache(const VKDevice& device) : device{device} {} - -VKSamplerCache::~VKSamplerCache() = default; - -vk::Sampler VKSamplerCache::CreateSampler(const Tegra::Texture::TSCEntry& tsc) const { - const bool arbitrary_borders = device.IsExtCustomBorderColorSupported(); - const std::array color = tsc.GetBorderColor(); - - VkSamplerCustomBorderColorCreateInfoEXT border{ - .sType = VK_STRUCTURE_TYPE_SAMPLER_CUSTOM_BORDER_COLOR_CREATE_INFO_EXT, - .pNext = nullptr, - .customBorderColor = {}, - .format = VK_FORMAT_UNDEFINED, - }; - std::memcpy(&border.customBorderColor, color.data(), sizeof(color)); - - return device.GetLogical().CreateSampler({ - .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO, - .pNext = arbitrary_borders ? &border : nullptr, - .flags = 0, - .magFilter = MaxwellToVK::Sampler::Filter(tsc.mag_filter), - .minFilter = MaxwellToVK::Sampler::Filter(tsc.min_filter), - .mipmapMode = MaxwellToVK::Sampler::MipmapMode(tsc.mipmap_filter), - .addressModeU = MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_u, tsc.mag_filter), - .addressModeV = MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_v, tsc.mag_filter), - .addressModeW = MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_p, tsc.mag_filter), - .mipLodBias = tsc.GetLodBias(), - .anisotropyEnable = - static_cast<VkBool32>(tsc.GetMaxAnisotropy() > 1.0f ? VK_TRUE : VK_FALSE), - .maxAnisotropy = tsc.GetMaxAnisotropy(), - .compareEnable = tsc.depth_compare_enabled, - .compareOp = MaxwellToVK::Sampler::DepthCompareFunction(tsc.depth_compare_func), - .minLod = tsc.mipmap_filter == TextureMipmapFilter::None ? 0.0f : tsc.GetMinLod(), - .maxLod = tsc.mipmap_filter == TextureMipmapFilter::None ? 0.25f : tsc.GetMaxLod(), - .borderColor = - arbitrary_borders ? VK_BORDER_COLOR_INT_CUSTOM_EXT : ConvertBorderColor(color), - .unnormalizedCoordinates = VK_FALSE, - }); -} - -VkSampler VKSamplerCache::ToSamplerType(const vk::Sampler& sampler) const { - return *sampler; -} - -} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_sampler_cache.h b/src/video_core/renderer_vulkan/vk_sampler_cache.h deleted file mode 100644 index a33d1c0ee..000000000 --- a/src/video_core/renderer_vulkan/vk_sampler_cache.h +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include "video_core/renderer_vulkan/wrapper.h" -#include "video_core/sampler_cache.h" -#include "video_core/textures/texture.h" - -namespace Vulkan { - -class VKDevice; - -class VKSamplerCache final : public VideoCommon::SamplerCache<VkSampler, vk::Sampler> { -public: - explicit VKSamplerCache(const VKDevice& device); - ~VKSamplerCache(); - -protected: - vk::Sampler CreateSampler(const Tegra::Texture::TSCEntry& tsc) const override; - - VkSampler ToSamplerType(const vk::Sampler& sampler) const override; - -private: - const VKDevice& device; -}; - -} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index dbbd0961a..f35c120b0 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -10,12 +10,14 @@ #include "common/microprofile.h" #include "common/thread.h" -#include "video_core/renderer_vulkan/vk_device.h" +#include "video_core/renderer_vulkan/vk_command_pool.h" +#include "video_core/renderer_vulkan/vk_master_semaphore.h" #include "video_core/renderer_vulkan/vk_query_cache.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_state_tracker.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/renderer_vulkan/vk_texture_cache.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { @@ -35,10 +37,10 @@ void VKScheduler::CommandChunk::ExecuteAll(vk::CommandBuffer cmdbuf) { last = nullptr; } -VKScheduler::VKScheduler(const VKDevice& device, VKResourceManager& resource_manager, - StateTracker& state_tracker) - : device{device}, resource_manager{resource_manager}, state_tracker{state_tracker}, - next_fence{&resource_manager.CommitFence()} { +VKScheduler::VKScheduler(const Device& device_, StateTracker& state_tracker_) + : device{device_}, state_tracker{state_tracker_}, + master_semaphore{std::make_unique<MasterSemaphore>(device)}, + command_pool{std::make_unique<CommandPool>(*master_semaphore, device)} { AcquireNewChunk(); AllocateNewContext(); worker_thread = std::thread(&VKScheduler::WorkerThread, this); @@ -50,20 +52,15 @@ VKScheduler::~VKScheduler() { worker_thread.join(); } -void VKScheduler::Flush(bool release_fence, VkSemaphore semaphore) { +void VKScheduler::Flush(VkSemaphore semaphore) { SubmitExecution(semaphore); - if (release_fence) { - current_fence->Release(); - } AllocateNewContext(); } -void VKScheduler::Finish(bool release_fence, VkSemaphore semaphore) { +void VKScheduler::Finish(VkSemaphore semaphore) { + const u64 presubmit_tick = CurrentTick(); SubmitExecution(semaphore); - current_fence->Wait(); - if (release_fence) { - current_fence->Release(); - } + Wait(presubmit_tick); AllocateNewContext(); } @@ -88,38 +85,39 @@ void VKScheduler::DispatchWork() { AcquireNewChunk(); } -void VKScheduler::RequestRenderpass(VkRenderPass renderpass, VkFramebuffer framebuffer, - VkExtent2D render_area) { - if (renderpass == state.renderpass && framebuffer == state.framebuffer && +void VKScheduler::RequestRenderpass(const Framebuffer* framebuffer) { + const VkRenderPass renderpass = framebuffer->RenderPass(); + const VkFramebuffer framebuffer_handle = framebuffer->Handle(); + const VkExtent2D render_area = framebuffer->RenderArea(); + if (renderpass == state.renderpass && framebuffer_handle == state.framebuffer && render_area.width == state.render_area.width && render_area.height == state.render_area.height) { return; } - const bool end_renderpass = state.renderpass != nullptr; + EndRenderPass(); state.renderpass = renderpass; - state.framebuffer = framebuffer; + state.framebuffer = framebuffer_handle; state.render_area = render_area; - const VkRenderPassBeginInfo renderpass_bi{ - .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, - .pNext = nullptr, - .renderPass = renderpass, - .framebuffer = framebuffer, - .renderArea = - { - .offset = {.x = 0, .y = 0}, - .extent = render_area, - }, - .clearValueCount = 0, - .pClearValues = nullptr, - }; - - Record([renderpass_bi, end_renderpass](vk::CommandBuffer cmdbuf) { - if (end_renderpass) { - cmdbuf.EndRenderPass(); - } + Record([renderpass, framebuffer_handle, render_area](vk::CommandBuffer cmdbuf) { + const VkRenderPassBeginInfo renderpass_bi{ + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, + .pNext = nullptr, + .renderPass = renderpass, + .framebuffer = framebuffer_handle, + .renderArea = + { + .offset = {.x = 0, .y = 0}, + .extent = render_area, + }, + .clearValueCount = 0, + .pClearValues = nullptr, + }; cmdbuf.BeginRenderPass(renderpass_bi, VK_SUBPASS_CONTENTS_INLINE); }); + num_renderpass_images = framebuffer->NumImages(); + renderpass_images = framebuffer->Images(); + renderpass_image_ranges = framebuffer->ImageRanges(); } void VKScheduler::RequestOutsideRenderPassOperationContext() { @@ -160,18 +158,38 @@ void VKScheduler::SubmitExecution(VkSemaphore semaphore) { current_cmdbuf.End(); + const VkSemaphore timeline_semaphore = master_semaphore->Handle(); + const u32 num_signal_semaphores = semaphore ? 2U : 1U; + + const u64 signal_value = master_semaphore->CurrentTick(); + const u64 wait_value = signal_value - 1; + const VkPipelineStageFlags wait_stage_mask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT; + + master_semaphore->NextTick(); + + const std::array signal_values{signal_value, u64(0)}; + const std::array signal_semaphores{timeline_semaphore, semaphore}; + + const VkTimelineSemaphoreSubmitInfoKHR timeline_si{ + .sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR, + .pNext = nullptr, + .waitSemaphoreValueCount = 1, + .pWaitSemaphoreValues = &wait_value, + .signalSemaphoreValueCount = num_signal_semaphores, + .pSignalSemaphoreValues = signal_values.data(), + }; const VkSubmitInfo submit_info{ .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO, - .pNext = nullptr, - .waitSemaphoreCount = 0, - .pWaitSemaphores = nullptr, - .pWaitDstStageMask = nullptr, + .pNext = &timeline_si, + .waitSemaphoreCount = 1, + .pWaitSemaphores = &timeline_semaphore, + .pWaitDstStageMask = &wait_stage_mask, .commandBufferCount = 1, .pCommandBuffers = current_cmdbuf.address(), - .signalSemaphoreCount = semaphore ? 1U : 0U, - .pSignalSemaphores = &semaphore, + .signalSemaphoreCount = num_signal_semaphores, + .pSignalSemaphores = signal_semaphores.data(), }; - switch (const VkResult result = device.GetGraphicsQueue().Submit(submit_info, *current_fence)) { + switch (const VkResult result = device.GetGraphicsQueue().Submit(submit_info)) { case VK_SUCCESS: break; case VK_ERROR_DEVICE_LOST: @@ -183,14 +201,9 @@ void VKScheduler::SubmitExecution(VkSemaphore semaphore) { } void VKScheduler::AllocateNewContext() { - ++ticks; - std::unique_lock lock{mutex}; - current_fence = next_fence; - next_fence = &resource_manager.CommitFence(); - current_cmdbuf = vk::CommandBuffer(resource_manager.CommitCommandBuffer(*current_fence), - device.GetDispatchLoader()); + current_cmdbuf = vk::CommandBuffer(command_pool->Commit(), device.GetDispatchLoader()); current_cmdbuf.Begin({ .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, .pNext = nullptr, @@ -218,8 +231,37 @@ void VKScheduler::EndRenderPass() { if (!state.renderpass) { return; } + Record([num_images = num_renderpass_images, images = renderpass_images, + ranges = renderpass_image_ranges](vk::CommandBuffer cmdbuf) { + std::array<VkImageMemoryBarrier, 9> barriers; + for (size_t i = 0; i < num_images; ++i) { + barriers[i] = VkImageMemoryBarrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, + .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT | + VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | + VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, + .oldLayout = VK_IMAGE_LAYOUT_GENERAL, + .newLayout = VK_IMAGE_LAYOUT_GENERAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = images[i], + .subresourceRange = ranges[i], + }; + } + cmdbuf.EndRenderPass(); + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | + VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT | + VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, nullptr, nullptr, + vk::Span(barriers.data(), num_images)); + }); state.renderpass = nullptr; - Record([](vk::CommandBuffer cmdbuf) { cmdbuf.EndRenderPass(); }); + num_renderpass_images = 0; } void VKScheduler::AcquireNewChunk() { diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index 970a65566..3ce48e9d2 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -6,52 +6,37 @@ #include <atomic> #include <condition_variable> +#include <cstddef> #include <memory> #include <stack> #include <thread> #include <utility> +#include "common/alignment.h" #include "common/common_types.h" #include "common/threadsafe_queue.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/renderer_vulkan/vk_master_semaphore.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { +class CommandPool; +class Device; +class Framebuffer; class StateTracker; -class VKDevice; -class VKFence; class VKQueryCache; -class VKResourceManager; - -class VKFenceView { -public: - VKFenceView() = default; - VKFenceView(VKFence* const& fence) : fence{fence} {} - - VKFence* operator->() const noexcept { - return fence; - } - - operator VKFence&() const noexcept { - return *fence; - } - -private: - VKFence* const& fence; -}; /// The scheduler abstracts command buffer and fence management with an interface that's able to do /// OpenGL-like operations on Vulkan command buffers. class VKScheduler { public: - explicit VKScheduler(const VKDevice& device, VKResourceManager& resource_manager, - StateTracker& state_tracker); + explicit VKScheduler(const Device& device, StateTracker& state_tracker); ~VKScheduler(); /// Sends the current execution context to the GPU. - void Flush(bool release_fence = true, VkSemaphore semaphore = nullptr); + void Flush(VkSemaphore semaphore = nullptr); /// Sends the current execution context to the GPU and waits for it to complete. - void Finish(bool release_fence = true, VkSemaphore semaphore = nullptr); + void Finish(VkSemaphore semaphore = nullptr); /// Waits for the worker thread to finish executing everything. After this function returns it's /// safe to touch worker resources. @@ -61,8 +46,7 @@ public: void DispatchWork(); /// Requests to begin a renderpass. - void RequestRenderpass(VkRenderPass renderpass, VkFramebuffer framebuffer, - VkExtent2D render_area); + void RequestRenderpass(const Framebuffer* framebuffer); /// Requests the current executino context to be able to execute operations only allowed outside /// of a renderpass. @@ -71,6 +55,9 @@ public: /// Binds a pipeline to the current execution context. void BindGraphicsPipeline(VkPipeline pipeline); + /// Invalidates current command buffer state except for render passes + void InvalidateState(); + /// Assigns the query cache. void SetQueryCache(VKQueryCache& query_cache_) { query_cache = &query_cache_; @@ -86,14 +73,24 @@ public: (void)chunk->Record(command); } - /// Gets a reference to the current fence. - VKFenceView GetFence() const { - return current_fence; + /// Returns the current command buffer tick. + [[nodiscard]] u64 CurrentTick() const noexcept { + return master_semaphore->CurrentTick(); } - /// Returns the current command buffer tick. - u64 Ticks() const { - return ticks; + /// Returns true when a tick has been triggered by the GPU. + [[nodiscard]] bool IsFree(u64 tick) const noexcept { + return master_semaphore->IsFree(tick); + } + + /// Waits for the given tick to trigger on the GPU. + void Wait(u64 tick) { + master_semaphore->Wait(tick); + } + + /// Returns the master timeline semaphore. + [[nodiscard]] MasterSemaphore& GetMasterSemaphore() const noexcept { + return *master_semaphore; } private: @@ -118,7 +115,7 @@ private: template <typename T> class TypedCommand final : public Command { public: - explicit TypedCommand(T&& command) : command{std::move(command)} {} + explicit TypedCommand(T&& command_) : command{std::move(command_)} {} ~TypedCommand() override = default; TypedCommand(TypedCommand&&) = delete; @@ -141,12 +138,11 @@ private: using FuncType = TypedCommand<T>; static_assert(sizeof(FuncType) < sizeof(data), "Lambda is too large"); + command_offset = Common::AlignUp(command_offset, alignof(FuncType)); if (command_offset > sizeof(data) - sizeof(FuncType)) { return false; } - - Command* current_last = last; - + Command* const current_last = last; last = new (data.data() + command_offset) FuncType(std::move(command)); if (current_last) { @@ -154,7 +150,6 @@ private: } else { first = last; } - command_offset += sizeof(FuncType); return true; } @@ -167,8 +162,15 @@ private: Command* first = nullptr; Command* last = nullptr; - std::size_t command_offset = 0; - std::array<u8, 0x8000> data{}; + size_t command_offset = 0; + alignas(std::max_align_t) std::array<u8, 0x8000> data{}; + }; + + struct State { + VkRenderPass renderpass = nullptr; + VkFramebuffer framebuffer = nullptr; + VkExtent2D render_area = {0, 0}; + VkPipeline graphics_pipeline = nullptr; }; void WorkerThread(); @@ -177,39 +179,35 @@ private: void AllocateNewContext(); - void InvalidateState(); - void EndPendingOperations(); void EndRenderPass(); void AcquireNewChunk(); - const VKDevice& device; - VKResourceManager& resource_manager; + const Device& device; StateTracker& state_tracker; + std::unique_ptr<MasterSemaphore> master_semaphore; + std::unique_ptr<CommandPool> command_pool; + VKQueryCache* query_cache = nullptr; vk::CommandBuffer current_cmdbuf; - VKFence* current_fence = nullptr; - VKFence* next_fence = nullptr; - - struct State { - VkRenderPass renderpass = nullptr; - VkFramebuffer framebuffer = nullptr; - VkExtent2D render_area = {0, 0}; - VkPipeline graphics_pipeline = nullptr; - } state; std::unique_ptr<CommandChunk> chunk; std::thread worker_thread; + State state; + + u32 num_renderpass_images = 0; + std::array<VkImage, 9> renderpass_images{}; + std::array<VkImageSubresourceRange, 9> renderpass_image_ranges{}; + Common::SPSCQueue<std::unique_ptr<CommandChunk>> chunk_queue; Common::SPSCQueue<std::unique_ptr<CommandChunk>> chunk_reserve; std::mutex mutex; std::condition_variable cv; - std::atomic<u64> ticks = 0; bool quit = false; }; diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp index cd7d7a4e4..40e2e0d38 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp @@ -22,11 +22,11 @@ #include "video_core/engines/shader_bytecode.h" #include "video_core/engines/shader_header.h" #include "video_core/engines/shader_type.h" -#include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_shader_decompiler.h" #include "video_core/shader/node.h" #include "video_core/shader/shader_ir.h" #include "video_core/shader/transform_feedback.h" +#include "video_core/vulkan_common/vulkan_device.h" namespace Vulkan { @@ -55,8 +55,8 @@ enum class Type { Void, Bool, Bool2, Float, Int, Uint, HalfFloat }; class Expression final { public: - Expression(Id id, Type type) : id{id}, type{type} { - ASSERT(type != Type::Void); + Expression(Id id_, Type type_) : id{id_}, type{type_} { + ASSERT(type_ != Type::Void); } Expression() : type{Type::Void} {} @@ -102,7 +102,7 @@ struct GenericVaryingDescription { bool is_scalar = false; }; -spv::Dim GetSamplerDim(const Sampler& sampler) { +spv::Dim GetSamplerDim(const SamplerEntry& sampler) { ASSERT(!sampler.is_buffer); switch (sampler.type) { case Tegra::Shader::TextureType::Texture1D: @@ -114,12 +114,12 @@ spv::Dim GetSamplerDim(const Sampler& sampler) { case Tegra::Shader::TextureType::TextureCube: return spv::Dim::Cube; default: - UNIMPLEMENTED_MSG("Unimplemented sampler type={}", static_cast<int>(sampler.type)); + UNIMPLEMENTED_MSG("Unimplemented sampler type={}", sampler.type); return spv::Dim::Dim2D; } } -std::pair<spv::Dim, bool> GetImageDim(const Image& image) { +std::pair<spv::Dim, bool> GetImageDim(const ImageEntry& image) { switch (image.type) { case Tegra::Shader::ImageType::Texture1D: return {spv::Dim::Dim1D, false}; @@ -134,7 +134,7 @@ std::pair<spv::Dim, bool> GetImageDim(const Image& image) { case Tegra::Shader::ImageType::Texture3D: return {spv::Dim::Dim3D, false}; default: - UNIMPLEMENTED_MSG("Unimplemented image type={}", static_cast<int>(image.type)); + UNIMPLEMENTED_MSG("Unimplemented image type={}", image.type); return {spv::Dim::Dim2D, false}; } } @@ -274,12 +274,12 @@ bool IsPrecise(Operation operand) { class SPIRVDecompiler final : public Sirit::Module { public: - explicit SPIRVDecompiler(const VKDevice& device, const ShaderIR& ir, ShaderType stage, - const Registry& registry, const Specialization& specialization) - : Module(0x00010300), device{device}, ir{ir}, stage{stage}, header{ir.GetHeader()}, - registry{registry}, specialization{specialization} { - if (stage != ShaderType::Compute) { - transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo()); + explicit SPIRVDecompiler(const Device& device_, const ShaderIR& ir_, ShaderType stage_, + const Registry& registry_, const Specialization& specialization_) + : Module(0x00010300), device{device_}, ir{ir_}, stage{stage_}, header{ir_.GetHeader()}, + registry{registry_}, specialization{specialization_} { + if (stage_ != ShaderType::Compute) { + transform_feedback = BuildTransformFeedback(registry_.GetGraphicsInfo()); } AddCapability(spv::Capability::Shader); @@ -293,6 +293,7 @@ public: AddCapability(spv::Capability::DrawParameters); AddCapability(spv::Capability::SubgroupBallotKHR); AddCapability(spv::Capability::SubgroupVoteKHR); + AddExtension("SPV_KHR_16bit_storage"); AddExtension("SPV_KHR_shader_ballot"); AddExtension("SPV_KHR_subgroup_vote"); AddExtension("SPV_KHR_storage_buffer_storage_class"); @@ -307,7 +308,6 @@ public: "supported on this device"); } } - if (ir.UsesLayer() || ir.UsesViewportIndex()) { if (ir.UsesViewportIndex()) { AddCapability(spv::Capability::MultiViewport); @@ -317,15 +317,13 @@ public: AddCapability(spv::Capability::ShaderViewportIndexLayerEXT); } } - if (device.IsFormatlessImageLoadSupported()) { AddCapability(spv::Capability::StorageImageReadWithoutFormat); } - if (device.IsFloat16Supported()) { AddCapability(spv::Capability::Float16); } - t_scalar_half = Name(TypeFloat(device.IsFloat16Supported() ? 16 : 32), "scalar_half"); + t_scalar_half = Name(TypeFloat(device_.IsFloat16Supported() ? 16 : 32), "scalar_half"); t_half = Name(TypeVector(t_scalar_half, 2), "half"); const Id main = Decompile(); @@ -369,6 +367,9 @@ public: if (header.ps.omap.depth) { AddExecutionMode(main, spv::ExecutionMode::DepthReplacing); } + if (specialization.early_fragment_tests) { + AddExecutionMode(main, spv::ExecutionMode::EarlyFragmentTests); + } break; case ShaderType::Compute: const auto workgroup_size = specialization.workgroup_size; @@ -972,7 +973,7 @@ private: return binding; } - void DeclareImage(const Image& image, u32& binding) { + void DeclareImage(const ImageEntry& image, u32& binding) { const auto [dim, arrayed] = GetImageDim(image); constexpr int depth = 0; constexpr bool ms = false; @@ -1080,9 +1081,9 @@ private: indices.point_size = AddBuiltIn(t_float, spv::BuiltIn::PointSize, "point_size"); } - const auto& output_attributes = ir.GetOutputAttributes(); - const bool declare_clip_distances = - std::any_of(output_attributes.begin(), output_attributes.end(), [](const auto& index) { + const auto& ir_output_attributes = ir.GetOutputAttributes(); + const bool declare_clip_distances = std::any_of( + ir_output_attributes.begin(), ir_output_attributes.end(), [](const auto& index) { return index == Attribute::Index::ClipDistances0123 || index == Attribute::Index::ClipDistances4567; }); @@ -1246,7 +1247,7 @@ private: const Id pointer = ArrayPass(type_descriptor.scalar, attribute_id, elements); return {OpLoad(GetTypeDefinition(type), pointer), type}; } - UNIMPLEMENTED_MSG("Unhandled input attribute: {}", static_cast<u32>(attribute)); + UNIMPLEMENTED_MSG("Unhandled input attribute: {}", attribute); return {v_float_zero, Type::Float}; } @@ -1333,7 +1334,10 @@ private: } if (const auto comment = std::get_if<CommentNode>(&*node)) { - Name(OpUndef(t_void), comment->GetText()); + if (device.HasDebuggingToolAttached()) { + // We should insert comments with OpString instead of using named variables + Name(OpUndef(t_int), comment->GetText()); + } return {}; } @@ -1882,7 +1886,7 @@ private: case Tegra::Shader::TextureType::Texture3D: return 3; default: - UNREACHABLE_MSG("Invalid texture type={}", static_cast<int>(type)); + UNREACHABLE_MSG("Invalid texture type={}", type); return 2; } }(); @@ -2067,6 +2071,46 @@ private: return {}; } + Id MaxwellToSpirvComparison(Maxwell::ComparisonOp compare_op, Id operand_1, Id operand_2) { + using Compare = Maxwell::ComparisonOp; + switch (compare_op) { + case Compare::NeverOld: + return v_false; // Never let the test pass + case Compare::LessOld: + return OpFOrdLessThan(t_bool, operand_1, operand_2); + case Compare::EqualOld: + return OpFOrdEqual(t_bool, operand_1, operand_2); + case Compare::LessEqualOld: + return OpFOrdLessThanEqual(t_bool, operand_1, operand_2); + case Compare::GreaterOld: + return OpFOrdGreaterThan(t_bool, operand_1, operand_2); + case Compare::NotEqualOld: + return OpFOrdNotEqual(t_bool, operand_1, operand_2); + case Compare::GreaterEqualOld: + return OpFOrdGreaterThanEqual(t_bool, operand_1, operand_2); + default: + UNREACHABLE(); + return v_true; + } + } + + void AlphaTest(Id pointer) { + if (specialization.alpha_test_func == Maxwell::ComparisonOp::AlwaysOld) { + return; + } + const Id true_label = OpLabel(); + const Id discard_label = OpLabel(); + const Id alpha_reference = Constant(t_float, specialization.alpha_test_ref); + const Id alpha_value = OpLoad(t_float, pointer); + const Id condition = + MaxwellToSpirvComparison(specialization.alpha_test_func, alpha_value, alpha_reference); + + OpBranchConditional(condition, true_label, discard_label); + AddLabel(discard_label); + OpKill(); + AddLabel(true_label); + } + void PreExit() { if (stage == ShaderType::Vertex && specialization.ndc_minus_one_to_one) { const u32 position_index = out_indices.position.value(); @@ -2078,8 +2122,7 @@ private: OpStore(z_pointer, depth); } if (stage == ShaderType::Fragment) { - const auto SafeGetRegister = [&](u32 reg) { - // TODO(Rodrigo): Replace with contains once C++20 releases + const auto SafeGetRegister = [this](u32 reg) { if (const auto it = registers.find(reg); it != registers.end()) { return OpLoad(t_float, it->second); } @@ -2089,8 +2132,6 @@ private: UNIMPLEMENTED_IF_MSG(header.ps.omap.sample_mask != 0, "Sample mask write is unimplemented"); - // TODO(Rodrigo): Alpha testing - // Write the color outputs using the data in the shader registers, disabled // rendertargets/components are skipped in the register assignment. u32 current_reg = 0; @@ -2102,6 +2143,9 @@ private: } const Id pointer = AccessElement(t_out_float, frag_colors[rt], component); OpStore(pointer, SafeGetRegister(current_reg)); + if (rt == 0 && component == 3) { + AlphaTest(pointer); + } ++current_reg; } } @@ -2701,7 +2745,7 @@ private: }; static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount)); - const VKDevice& device; + const Device& device; const ShaderIR& ir; const ShaderType stage; const Tegra::Shader::Header header; @@ -2843,7 +2887,7 @@ private: class ExprDecompiler { public: - explicit ExprDecompiler(SPIRVDecompiler& decomp) : decomp{decomp} {} + explicit ExprDecompiler(SPIRVDecompiler& decomp_) : decomp{decomp_} {} Id operator()(const ExprAnd& expr) { const Id type_def = decomp.GetTypeDefinition(Type::Bool); @@ -2899,7 +2943,7 @@ private: class ASTDecompiler { public: - explicit ASTDecompiler(SPIRVDecompiler& decomp) : decomp{decomp} {} + explicit ASTDecompiler(SPIRVDecompiler& decomp_) : decomp{decomp_} {} void operator()(const ASTProgram& ast) { ASTNode current = ast.nodes.GetFirst(); @@ -3062,7 +3106,11 @@ ShaderEntries GenerateShaderEntries(const VideoCommon::Shader::ShaderIR& ir) { entries.const_buffers.emplace_back(cbuf.second, cbuf.first); } for (const auto& [base, usage] : ir.GetGlobalMemory()) { - entries.global_buffers.emplace_back(base.cbuf_index, base.cbuf_offset, usage.is_written); + entries.global_buffers.emplace_back(GlobalBufferEntry{ + .cbuf_index = base.cbuf_index, + .cbuf_offset = base.cbuf_offset, + .is_written = usage.is_written, + }); } for (const auto& sampler : ir.GetSamplers()) { if (sampler.is_buffer) { @@ -3083,13 +3131,16 @@ ShaderEntries GenerateShaderEntries(const VideoCommon::Shader::ShaderIR& ir) { entries.attributes.insert(GetGenericAttributeLocation(attribute)); } } + for (const auto& buffer : entries.const_buffers) { + entries.enabled_uniform_buffers |= 1U << buffer.GetIndex(); + } entries.clip_distances = ir.GetClipDistances(); entries.shader_length = ir.GetLength(); entries.uses_warps = ir.UsesWarps(); return entries; } -std::vector<u32> Decompile(const VKDevice& device, const VideoCommon::Shader::ShaderIR& ir, +std::vector<u32> Decompile(const Device& device, const VideoCommon::Shader::ShaderIR& ir, ShaderType stage, const VideoCommon::Shader::Registry& registry, const Specialization& specialization) { return SPIRVDecompiler(device, ir, stage, registry, specialization).Assemble(); diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.h b/src/video_core/renderer_vulkan/vk_shader_decompiler.h index 2b0e90396..5d94132a5 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.h +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.h @@ -15,23 +15,21 @@ #include "video_core/shader/shader_ir.h" namespace Vulkan { -class VKDevice; -} -namespace Vulkan { +class Device; using Maxwell = Tegra::Engines::Maxwell3D::Regs; -using UniformTexelEntry = VideoCommon::Shader::Sampler; -using SamplerEntry = VideoCommon::Shader::Sampler; -using StorageTexelEntry = VideoCommon::Shader::Image; -using ImageEntry = VideoCommon::Shader::Image; +using UniformTexelEntry = VideoCommon::Shader::SamplerEntry; +using SamplerEntry = VideoCommon::Shader::SamplerEntry; +using StorageTexelEntry = VideoCommon::Shader::ImageEntry; +using ImageEntry = VideoCommon::Shader::ImageEntry; constexpr u32 DESCRIPTOR_SET = 0; class ConstBufferEntry : public VideoCommon::Shader::ConstBuffer { public: - explicit constexpr ConstBufferEntry(const VideoCommon::Shader::ConstBuffer& entry, u32 index) - : VideoCommon::Shader::ConstBuffer{entry}, index{index} {} + explicit constexpr ConstBufferEntry(const ConstBuffer& entry_, u32 index_) + : ConstBuffer{entry_}, index{index_} {} constexpr u32 GetIndex() const { return index; @@ -41,24 +39,7 @@ private: u32 index{}; }; -class GlobalBufferEntry { -public: - constexpr explicit GlobalBufferEntry(u32 cbuf_index, u32 cbuf_offset, bool is_written) - : cbuf_index{cbuf_index}, cbuf_offset{cbuf_offset}, is_written{is_written} {} - - constexpr u32 GetCbufIndex() const { - return cbuf_index; - } - - constexpr u32 GetCbufOffset() const { - return cbuf_offset; - } - - constexpr bool IsWritten() const { - return is_written; - } - -private: +struct GlobalBufferEntry { u32 cbuf_index{}; u32 cbuf_offset{}; bool is_written{}; @@ -80,6 +61,7 @@ struct ShaderEntries { std::set<u32> attributes; std::array<bool, Maxwell::NumClipDistances> clip_distances{}; std::size_t shader_length{}; + u32 enabled_uniform_buffers{}; bool uses_warps{}; }; @@ -95,6 +77,9 @@ struct Specialization final { std::bitset<Maxwell::NumVertexAttributes> enabled_attributes; std::array<Maxwell::VertexAttribute::Type, Maxwell::NumVertexAttributes> attribute_types{}; bool ndc_minus_one_to_one{}; + bool early_fragment_tests{}; + float alpha_test_ref{}; + Maxwell::ComparisonOp alpha_test_func{}; }; // Old gcc versions don't consider this trivially copyable. // static_assert(std::is_trivially_copyable_v<Specialization>); @@ -106,7 +91,7 @@ struct SPIRVShader { ShaderEntries GenerateShaderEntries(const VideoCommon::Shader::ShaderIR& ir); -std::vector<u32> Decompile(const VKDevice& device, const VideoCommon::Shader::ShaderIR& ir, +std::vector<u32> Decompile(const Device& device, const VideoCommon::Shader::ShaderIR& ir, Tegra::Engines::ShaderType stage, const VideoCommon::Shader::Registry& registry, const Specialization& specialization); diff --git a/src/video_core/renderer_vulkan/vk_shader_util.cpp b/src/video_core/renderer_vulkan/vk_shader_util.cpp index c1a218d76..aaad4f292 100644 --- a/src/video_core/renderer_vulkan/vk_shader_util.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_util.cpp @@ -7,24 +7,19 @@ #include "common/assert.h" #include "common/common_types.h" -#include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_shader_util.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { -vk::ShaderModule BuildShader(const VKDevice& device, std::size_t code_size, const u8* code_data) { - // Avoid undefined behavior by copying to a staging allocation - ASSERT(code_size % sizeof(u32) == 0); - const auto data = std::make_unique<u32[]>(code_size / sizeof(u32)); - std::memcpy(data.get(), code_data, code_size); - +vk::ShaderModule BuildShader(const Device& device, std::span<const u32> code) { return device.GetLogical().CreateShaderModule({ .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, .pNext = nullptr, .flags = 0, - .codeSize = code_size, - .pCode = data.get(), + .codeSize = static_cast<u32>(code.size_bytes()), + .pCode = code.data(), }); } diff --git a/src/video_core/renderer_vulkan/vk_shader_util.h b/src/video_core/renderer_vulkan/vk_shader_util.h index d1d3f3cae..9517cbe84 100644 --- a/src/video_core/renderer_vulkan/vk_shader_util.h +++ b/src/video_core/renderer_vulkan/vk_shader_util.h @@ -4,13 +4,15 @@ #pragma once +#include <span> + #include "common/common_types.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { -class VKDevice; +class Device; -vk::ShaderModule BuildShader(const VKDevice& device, std::size_t code_size, const u8* code_data); +vk::ShaderModule BuildShader(const Device& device, std::span<const u32> code); } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp index 5eca0ab91..7a1232497 100644 --- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp +++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp @@ -3,76 +3,214 @@ // Refer to the license.txt file included. #include <algorithm> -#include <unordered_map> #include <utility> #include <vector> +#include <fmt/format.h> + +#include "common/alignment.h" +#include "common/assert.h" #include "common/bit_util.h" #include "common/common_types.h" -#include "video_core/renderer_vulkan/vk_device.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { +namespace { +// Maximum potential alignment of a Vulkan buffer +constexpr VkDeviceSize MAX_ALIGNMENT = 256; +// Maximum size to put elements in the stream buffer +constexpr VkDeviceSize MAX_STREAM_BUFFER_REQUEST_SIZE = 8 * 1024 * 1024; +// Stream buffer size in bytes +constexpr VkDeviceSize STREAM_BUFFER_SIZE = 128 * 1024 * 1024; +constexpr VkDeviceSize REGION_SIZE = STREAM_BUFFER_SIZE / StagingBufferPool::NUM_SYNCS; -VKStagingBufferPool::StagingBuffer::StagingBuffer(std::unique_ptr<VKBuffer> buffer, VKFence& fence, - u64 last_epoch) - : buffer{std::move(buffer)}, watch{fence}, last_epoch{last_epoch} {} +constexpr VkMemoryPropertyFlags HOST_FLAGS = + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; +constexpr VkMemoryPropertyFlags STREAM_FLAGS = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | HOST_FLAGS; -VKStagingBufferPool::StagingBuffer::StagingBuffer(StagingBuffer&& rhs) noexcept { - buffer = std::move(rhs.buffer); - watch = std::move(rhs.watch); - last_epoch = rhs.last_epoch; +bool IsStreamHeap(VkMemoryHeap heap) noexcept { + return STREAM_BUFFER_SIZE < (heap.size * 2) / 3; } -VKStagingBufferPool::StagingBuffer::~StagingBuffer() = default; +std::optional<u32> FindMemoryTypeIndex(const VkPhysicalDeviceMemoryProperties& props, u32 type_mask, + VkMemoryPropertyFlags flags) noexcept { + for (u32 type_index = 0; type_index < props.memoryTypeCount; ++type_index) { + if (((type_mask >> type_index) & 1) == 0) { + // Memory type is incompatible + continue; + } + const VkMemoryType& memory_type = props.memoryTypes[type_index]; + if ((memory_type.propertyFlags & flags) != flags) { + // Memory type doesn't have the flags we want + continue; + } + if (!IsStreamHeap(props.memoryHeaps[memory_type.heapIndex])) { + // Memory heap is not suitable for streaming + continue; + } + // Success! + return type_index; + } + return std::nullopt; +} -VKStagingBufferPool::StagingBuffer& VKStagingBufferPool::StagingBuffer::operator=( - StagingBuffer&& rhs) noexcept { - buffer = std::move(rhs.buffer); - watch = std::move(rhs.watch); - last_epoch = rhs.last_epoch; - return *this; +u32 FindMemoryTypeIndex(const VkPhysicalDeviceMemoryProperties& props, u32 type_mask) { + // Try to find a DEVICE_LOCAL_BIT type, Nvidia and AMD have a dedicated heap for this + std::optional<u32> type = FindMemoryTypeIndex(props, type_mask, STREAM_FLAGS); + if (type) { + return *type; + } + // Otherwise try without the DEVICE_LOCAL_BIT + type = FindMemoryTypeIndex(props, type_mask, HOST_FLAGS); + if (type) { + return *type; + } + // This should never happen, and in case it does, signal it as an out of memory situation + throw vk::Exception(VK_ERROR_OUT_OF_DEVICE_MEMORY); } -VKStagingBufferPool::VKStagingBufferPool(const VKDevice& device, VKMemoryManager& memory_manager, - VKScheduler& scheduler) - : device{device}, memory_manager{memory_manager}, scheduler{scheduler} {} +size_t Region(size_t iterator) noexcept { + return iterator / REGION_SIZE; +} +} // Anonymous namespace -VKStagingBufferPool::~VKStagingBufferPool() = default; +StagingBufferPool::StagingBufferPool(const Device& device_, MemoryAllocator& memory_allocator_, + VKScheduler& scheduler_) + : device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_} { + const vk::Device& dev = device.GetLogical(); + stream_buffer = dev.CreateBuffer(VkBufferCreateInfo{ + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .size = STREAM_BUFFER_SIZE, + .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | + VK_BUFFER_USAGE_INDEX_BUFFER_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + }); + if (device.HasDebuggingToolAttached()) { + stream_buffer.SetObjectNameEXT("Stream Buffer"); + } + VkMemoryDedicatedRequirements dedicated_reqs{ + .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS, + .pNext = nullptr, + .prefersDedicatedAllocation = VK_FALSE, + .requiresDedicatedAllocation = VK_FALSE, + }; + const auto requirements = dev.GetBufferMemoryRequirements(*stream_buffer, &dedicated_reqs); + const bool make_dedicated = dedicated_reqs.prefersDedicatedAllocation == VK_TRUE || + dedicated_reqs.requiresDedicatedAllocation == VK_TRUE; + const VkMemoryDedicatedAllocateInfo dedicated_info{ + .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO, + .pNext = nullptr, + .image = nullptr, + .buffer = *stream_buffer, + }; + const auto memory_properties = device.GetPhysical().GetMemoryProperties(); + stream_memory = dev.AllocateMemory(VkMemoryAllocateInfo{ + .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, + .pNext = make_dedicated ? &dedicated_info : nullptr, + .allocationSize = requirements.size, + .memoryTypeIndex = FindMemoryTypeIndex(memory_properties, requirements.memoryTypeBits), + }); + if (device.HasDebuggingToolAttached()) { + stream_memory.SetObjectNameEXT("Stream Buffer Memory"); + } + stream_buffer.BindMemory(*stream_memory, 0); + stream_pointer = stream_memory.Map(0, STREAM_BUFFER_SIZE); +} -VKBuffer& VKStagingBufferPool::GetUnusedBuffer(std::size_t size, bool host_visible) { - if (const auto buffer = TryGetReservedBuffer(size, host_visible)) { - return *buffer; +StagingBufferPool::~StagingBufferPool() = default; + +StagingBufferRef StagingBufferPool::Request(size_t size, MemoryUsage usage) { + if (usage == MemoryUsage::Upload && size <= MAX_STREAM_BUFFER_REQUEST_SIZE) { + return GetStreamBuffer(size); } - return CreateStagingBuffer(size, host_visible); + return GetStagingBuffer(size, usage); } -void VKStagingBufferPool::TickFrame() { - ++epoch; - current_delete_level = (current_delete_level + 1) % NumLevels; +void StagingBufferPool::TickFrame() { + current_delete_level = (current_delete_level + 1) % NUM_LEVELS; - ReleaseCache(true); - ReleaseCache(false); + ReleaseCache(MemoryUsage::DeviceLocal); + ReleaseCache(MemoryUsage::Upload); + ReleaseCache(MemoryUsage::Download); } -VKBuffer* VKStagingBufferPool::TryGetReservedBuffer(std::size_t size, bool host_visible) { - for (auto& entry : GetCache(host_visible)[Common::Log2Ceil64(size)].entries) { - if (entry.watch.TryWatch(scheduler.GetFence())) { - entry.last_epoch = epoch; - return &*entry.buffer; +StagingBufferRef StagingBufferPool::GetStreamBuffer(size_t size) { + if (AreRegionsActive(Region(free_iterator) + 1, + std::min(Region(iterator + size) + 1, NUM_SYNCS))) { + // Avoid waiting for the previous usages to be free + return GetStagingBuffer(size, MemoryUsage::Upload); + } + const u64 current_tick = scheduler.CurrentTick(); + std::fill(sync_ticks.begin() + Region(used_iterator), sync_ticks.begin() + Region(iterator), + current_tick); + used_iterator = iterator; + free_iterator = std::max(free_iterator, iterator + size); + + if (iterator + size >= STREAM_BUFFER_SIZE) { + std::fill(sync_ticks.begin() + Region(used_iterator), sync_ticks.begin() + NUM_SYNCS, + current_tick); + used_iterator = 0; + iterator = 0; + free_iterator = size; + + if (AreRegionsActive(0, Region(size) + 1)) { + // Avoid waiting for the previous usages to be free + return GetStagingBuffer(size, MemoryUsage::Upload); } } - return nullptr; + const size_t offset = iterator; + iterator = Common::AlignUp(iterator + size, MAX_ALIGNMENT); + return StagingBufferRef{ + .buffer = *stream_buffer, + .offset = static_cast<VkDeviceSize>(offset), + .mapped_span = std::span<u8>(stream_pointer + offset, size), + }; } -VKBuffer& VKStagingBufferPool::CreateStagingBuffer(std::size_t size, bool host_visible) { - const u32 log2 = Common::Log2Ceil64(size); +bool StagingBufferPool::AreRegionsActive(size_t region_begin, size_t region_end) const { + const u64 gpu_tick = scheduler.GetMasterSemaphore().KnownGpuTick(); + return std::any_of(sync_ticks.begin() + region_begin, sync_ticks.begin() + region_end, + [gpu_tick](u64 sync_tick) { return gpu_tick < sync_tick; }); +}; - auto buffer = std::make_unique<VKBuffer>(); - buffer->handle = device.GetLogical().CreateBuffer({ +StagingBufferRef StagingBufferPool::GetStagingBuffer(size_t size, MemoryUsage usage) { + if (const std::optional<StagingBufferRef> ref = TryGetReservedBuffer(size, usage)) { + return *ref; + } + return CreateStagingBuffer(size, usage); +} + +std::optional<StagingBufferRef> StagingBufferPool::TryGetReservedBuffer(size_t size, + MemoryUsage usage) { + StagingBuffers& cache_level = GetCache(usage)[Common::Log2Ceil64(size)]; + + const auto is_free = [this](const StagingBuffer& entry) { + return scheduler.IsFree(entry.tick); + }; + auto& entries = cache_level.entries; + const auto hint_it = entries.begin() + cache_level.iterate_index; + auto it = std::find_if(entries.begin() + cache_level.iterate_index, entries.end(), is_free); + if (it == entries.end()) { + it = std::find_if(entries.begin(), hint_it, is_free); + if (it == hint_it) { + return std::nullopt; + } + } + cache_level.iterate_index = std::distance(entries.begin(), it) + 1; + it->tick = scheduler.CurrentTick(); + return it->Ref(); +} + +StagingBufferRef StagingBufferPool::CreateStagingBuffer(size_t size, MemoryUsage usage) { + const u32 log2 = Common::Log2Ceil64(size); + vk::Buffer buffer = device.GetLogical().CreateBuffer({ .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, .flags = 0, @@ -84,48 +222,63 @@ VKBuffer& VKStagingBufferPool::CreateStagingBuffer(std::size_t size, bool host_v .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, }); - buffer->commit = memory_manager.Commit(buffer->handle, host_visible); - - auto& entries = GetCache(host_visible)[log2].entries; - return *entries.emplace_back(std::move(buffer), scheduler.GetFence(), epoch).buffer; -} + if (device.HasDebuggingToolAttached()) { + ++buffer_index; + buffer.SetObjectNameEXT(fmt::format("Staging Buffer {}", buffer_index).c_str()); + } + MemoryCommit commit = memory_allocator.Commit(buffer, usage); + const std::span<u8> mapped_span = IsHostVisible(usage) ? commit.Map() : std::span<u8>{}; -VKStagingBufferPool::StagingBuffersCache& VKStagingBufferPool::GetCache(bool host_visible) { - return host_visible ? host_staging_buffers : device_staging_buffers; + StagingBuffer& entry = GetCache(usage)[log2].entries.emplace_back(StagingBuffer{ + .buffer = std::move(buffer), + .commit = std::move(commit), + .mapped_span = mapped_span, + .tick = scheduler.CurrentTick(), + }); + return entry.Ref(); } -void VKStagingBufferPool::ReleaseCache(bool host_visible) { - auto& cache = GetCache(host_visible); - const u64 size = ReleaseLevel(cache, current_delete_level); - if (size == 0) { - return; +StagingBufferPool::StagingBuffersCache& StagingBufferPool::GetCache(MemoryUsage usage) { + switch (usage) { + case MemoryUsage::DeviceLocal: + return device_local_cache; + case MemoryUsage::Upload: + return upload_cache; + case MemoryUsage::Download: + return download_cache; + default: + UNREACHABLE_MSG("Invalid memory usage={}", usage); + return upload_cache; } } -u64 VKStagingBufferPool::ReleaseLevel(StagingBuffersCache& cache, std::size_t log2) { - static constexpr std::size_t deletions_per_tick = 16; +void StagingBufferPool::ReleaseCache(MemoryUsage usage) { + ReleaseLevel(GetCache(usage), current_delete_level); +} +void StagingBufferPool::ReleaseLevel(StagingBuffersCache& cache, size_t log2) { + constexpr size_t deletions_per_tick = 16; auto& staging = cache[log2]; auto& entries = staging.entries; - const std::size_t old_size = entries.size(); + const size_t old_size = entries.size(); - const auto is_deleteable = [this](const auto& entry) { - static constexpr u64 epochs_to_destroy = 180; - return entry.last_epoch + epochs_to_destroy < epoch && !entry.watch.IsUsed(); + const auto is_deleteable = [this](const StagingBuffer& entry) { + return scheduler.IsFree(entry.tick); }; - const std::size_t begin_offset = staging.delete_index; - const std::size_t end_offset = std::min(begin_offset + deletions_per_tick, old_size); - const auto begin = std::begin(entries) + begin_offset; - const auto end = std::begin(entries) + end_offset; + const size_t begin_offset = staging.delete_index; + const size_t end_offset = std::min(begin_offset + deletions_per_tick, old_size); + const auto begin = entries.begin() + begin_offset; + const auto end = entries.begin() + end_offset; entries.erase(std::remove_if(begin, end, is_deleteable), end); - const std::size_t new_size = entries.size(); + const size_t new_size = entries.size(); staging.delete_index += deletions_per_tick; if (staging.delete_index >= new_size) { staging.delete_index = 0; } - - return (1ULL << log2) * (old_size - new_size); + if (staging.iterate_index > new_size) { + staging.iterate_index = 0; + } } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h index 3c4901437..69f7618de 100644 --- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h +++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h @@ -9,73 +9,97 @@ #include "common/common_types.h" -#include "video_core/renderer_vulkan/vk_memory_manager.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_memory_allocator.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { -class VKDevice; -class VKFenceWatch; +class Device; class VKScheduler; -struct VKBuffer final { - vk::Buffer handle; - VKMemoryCommit commit; +struct StagingBufferRef { + VkBuffer buffer; + VkDeviceSize offset; + std::span<u8> mapped_span; }; -class VKStagingBufferPool final { +class StagingBufferPool { public: - explicit VKStagingBufferPool(const VKDevice& device, VKMemoryManager& memory_manager, - VKScheduler& scheduler); - ~VKStagingBufferPool(); + static constexpr size_t NUM_SYNCS = 16; - VKBuffer& GetUnusedBuffer(std::size_t size, bool host_visible); + explicit StagingBufferPool(const Device& device, MemoryAllocator& memory_allocator, + VKScheduler& scheduler); + ~StagingBufferPool(); + + StagingBufferRef Request(size_t size, MemoryUsage usage); void TickFrame(); private: - struct StagingBuffer final { - explicit StagingBuffer(std::unique_ptr<VKBuffer> buffer, VKFence& fence, u64 last_epoch); - StagingBuffer(StagingBuffer&& rhs) noexcept; - StagingBuffer(const StagingBuffer&) = delete; - ~StagingBuffer(); - - StagingBuffer& operator=(StagingBuffer&& rhs) noexcept; + struct StreamBufferCommit { + size_t upper_bound; + u64 tick; + }; - std::unique_ptr<VKBuffer> buffer; - VKFenceWatch watch; - u64 last_epoch = 0; + struct StagingBuffer { + vk::Buffer buffer; + MemoryCommit commit; + std::span<u8> mapped_span; + u64 tick = 0; + + StagingBufferRef Ref() const noexcept { + return { + .buffer = *buffer, + .offset = 0, + .mapped_span = mapped_span, + }; + } }; - struct StagingBuffers final { + struct StagingBuffers { std::vector<StagingBuffer> entries; - std::size_t delete_index = 0; + size_t delete_index = 0; + size_t iterate_index = 0; }; - static constexpr std::size_t NumLevels = sizeof(std::size_t) * CHAR_BIT; - using StagingBuffersCache = std::array<StagingBuffers, NumLevels>; + static constexpr size_t NUM_LEVELS = sizeof(size_t) * CHAR_BIT; + using StagingBuffersCache = std::array<StagingBuffers, NUM_LEVELS>; + + StagingBufferRef GetStreamBuffer(size_t size); - VKBuffer* TryGetReservedBuffer(std::size_t size, bool host_visible); + bool AreRegionsActive(size_t region_begin, size_t region_end) const; - VKBuffer& CreateStagingBuffer(std::size_t size, bool host_visible); + StagingBufferRef GetStagingBuffer(size_t size, MemoryUsage usage); - StagingBuffersCache& GetCache(bool host_visible); + std::optional<StagingBufferRef> TryGetReservedBuffer(size_t size, MemoryUsage usage); - void ReleaseCache(bool host_visible); + StagingBufferRef CreateStagingBuffer(size_t size, MemoryUsage usage); - u64 ReleaseLevel(StagingBuffersCache& cache, std::size_t log2); + StagingBuffersCache& GetCache(MemoryUsage usage); - const VKDevice& device; - VKMemoryManager& memory_manager; + void ReleaseCache(MemoryUsage usage); + + void ReleaseLevel(StagingBuffersCache& cache, size_t log2); + + const Device& device; + MemoryAllocator& memory_allocator; VKScheduler& scheduler; - StagingBuffersCache host_staging_buffers; - StagingBuffersCache device_staging_buffers; + vk::Buffer stream_buffer; + vk::DeviceMemory stream_memory; + u8* stream_pointer = nullptr; + + size_t iterator = 0; + size_t used_iterator = 0; + size_t free_iterator = 0; + std::array<u64, NUM_SYNCS> sync_ticks{}; - u64 epoch = 0; + StagingBuffersCache device_local_cache; + StagingBuffersCache upload_cache; + StagingBuffersCache download_cache; - std::size_t current_delete_level = 0; + size_t current_delete_level = 0; + u64 buffer_index = 0; }; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_state_tracker.cpp b/src/video_core/renderer_vulkan/vk_state_tracker.cpp index 4bd1009f9..956f86845 100644 --- a/src/video_core/renderer_vulkan/vk_state_tracker.cpp +++ b/src/video_core/renderer_vulkan/vk_state_tracker.cpp @@ -3,6 +3,7 @@ // Refer to the license.txt file included. #include <algorithm> +#include <array> #include <cstddef> #include <iterator> @@ -14,12 +15,10 @@ #include "video_core/renderer_vulkan/vk_state_tracker.h" #define OFF(field_name) MAXWELL3D_REG_INDEX(field_name) -#define NUM(field_name) (sizeof(Maxwell3D::Regs::field_name) / sizeof(u32)) +#define NUM(field_name) (sizeof(Maxwell3D::Regs::field_name) / (sizeof(u32))) namespace Vulkan { - namespace { - using namespace Dirty; using namespace VideoCommon::Dirty; using Tegra::Engines::Maxwell3D; @@ -29,21 +28,18 @@ using Table = Maxwell3D::DirtyState::Table; using Flags = Maxwell3D::DirtyState::Flags; Flags MakeInvalidationFlags() { + static constexpr int INVALIDATION_FLAGS[]{ + Viewports, Scissors, DepthBias, BlendConstants, DepthBounds, + StencilProperties, CullMode, DepthBoundsEnable, DepthTestEnable, DepthWriteEnable, + DepthCompareOp, FrontFace, StencilOp, StencilTestEnable, VertexBuffers, + }; Flags flags{}; - flags[Viewports] = true; - flags[Scissors] = true; - flags[DepthBias] = true; - flags[BlendConstants] = true; - flags[DepthBounds] = true; - flags[StencilProperties] = true; - flags[CullMode] = true; - flags[DepthBoundsEnable] = true; - flags[DepthTestEnable] = true; - flags[DepthWriteEnable] = true; - flags[DepthCompareOp] = true; - flags[FrontFace] = true; - flags[StencilOp] = true; - flags[StencilTestEnable] = true; + for (const int flag : INVALIDATION_FLAGS) { + flags[flag] = true; + } + for (int index = VertexBuffer0; index <= VertexBuffer31; ++index) { + flags[index] = true; + } return flags; } @@ -130,15 +126,40 @@ void SetupDirtyStencilTestEnable(Tables& tables) { tables[0][OFF(stencil_enable)] = StencilTestEnable; } -} // Anonymous namespace +void SetupDirtyBlending(Tables& tables) { + tables[0][OFF(color_mask_common)] = Blending; + tables[0][OFF(independent_blend_enable)] = Blending; + FillBlock(tables[0], OFF(color_mask), NUM(color_mask), Blending); + FillBlock(tables[0], OFF(blend), NUM(blend), Blending); + FillBlock(tables[0], OFF(independent_blend), NUM(independent_blend), Blending); +} + +void SetupDirtyInstanceDivisors(Tables& tables) { + static constexpr size_t divisor_offset = 3; + for (size_t index = 0; index < Regs::NumVertexArrays; ++index) { + tables[0][OFF(instanced_arrays) + index] = InstanceDivisors; + tables[0][OFF(vertex_array) + index * NUM(vertex_array[0]) + divisor_offset] = + InstanceDivisors; + } +} -StateTracker::StateTracker(Core::System& system) - : system{system}, invalidation_flags{MakeInvalidationFlags()} {} +void SetupDirtyVertexAttributes(Tables& tables) { + FillBlock(tables[0], OFF(vertex_attrib_format), NUM(vertex_attrib_format), VertexAttributes); +} + +void SetupDirtyViewportSwizzles(Tables& tables) { + static constexpr size_t swizzle_offset = 6; + for (size_t index = 0; index < Regs::NumViewports; ++index) { + tables[0][OFF(viewport_transform) + index * NUM(viewport_transform[0]) + swizzle_offset] = + ViewportSwizzles; + } +} +} // Anonymous namespace -void StateTracker::Initialize() { - auto& dirty = system.GPU().Maxwell3D().dirty; - auto& tables = dirty.tables; - SetupDirtyRenderTargets(tables); +StateTracker::StateTracker(Tegra::GPU& gpu) + : flags{gpu.Maxwell3D().dirty.flags}, invalidation_flags{MakeInvalidationFlags()} { + auto& tables = gpu.Maxwell3D().dirty.tables; + SetupDirtyFlags(tables); SetupDirtyViewports(tables); SetupDirtyScissors(tables); SetupDirtyDepthBias(tables); @@ -153,11 +174,10 @@ void StateTracker::Initialize() { SetupDirtyFrontFace(tables); SetupDirtyStencilOp(tables); SetupDirtyStencilTestEnable(tables); -} - -void StateTracker::InvalidateCommandBufferState() { - system.GPU().Maxwell3D().dirty.flags |= invalidation_flags; - current_topology = INVALID_TOPOLOGY; + SetupDirtyBlending(tables); + SetupDirtyInstanceDivisors(tables); + SetupDirtyVertexAttributes(tables); + SetupDirtyViewportSwizzles(tables); } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_state_tracker.h b/src/video_core/renderer_vulkan/vk_state_tracker.h index 13a6ce786..84e918a71 100644 --- a/src/video_core/renderer_vulkan/vk_state_tracker.h +++ b/src/video_core/renderer_vulkan/vk_state_tracker.h @@ -35,6 +35,11 @@ enum : u8 { StencilOp, StencilTestEnable, + Blending, + InstanceDivisors, + VertexAttributes, + ViewportSwizzles, + Last }; static_assert(Last <= std::numeric_limits<u8>::max()); @@ -45,11 +50,20 @@ class StateTracker { using Maxwell = Tegra::Engines::Maxwell3D::Regs; public: - explicit StateTracker(Core::System& system); + explicit StateTracker(Tegra::GPU& gpu); - void Initialize(); + void InvalidateCommandBufferState() { + flags |= invalidation_flags; + current_topology = INVALID_TOPOLOGY; + } - void InvalidateCommandBufferState(); + void InvalidateViewports() { + flags[Dirty::Viewports] = true; + } + + void InvalidateScissors() { + flags[Dirty::Scissors] = true; + } bool TouchViewports() { return Exchange(Dirty::Viewports, false); @@ -121,13 +135,12 @@ private: static constexpr auto INVALID_TOPOLOGY = static_cast<Maxwell::PrimitiveTopology>(~0u); bool Exchange(std::size_t id, bool new_value) const noexcept { - auto& flags = system.GPU().Maxwell3D().dirty.flags; const bool is_dirty = flags[id]; flags[id] = new_value; return is_dirty; } - Core::System& system; + Tegra::Engines::Maxwell3D::DirtyState::Flags& flags; Tegra::Engines::Maxwell3D::DirtyState::Flags invalidation_flags; Maxwell::PrimitiveTopology current_topology = INVALID_TOPOLOGY; }; diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.cpp b/src/video_core/renderer_vulkan/vk_stream_buffer.cpp index a5526a3f5..a09fe084e 100644 --- a/src/video_core/renderer_vulkan/vk_stream_buffer.cpp +++ b/src/video_core/renderer_vulkan/vk_stream_buffer.cpp @@ -10,16 +10,19 @@ #include "common/alignment.h" #include "common/assert.h" -#include "video_core/renderer_vulkan/vk_device.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_stream_buffer.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { namespace { +constexpr VkBufferUsageFlags BUFFER_USAGE = + VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT | + VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; + constexpr u64 WATCHES_INITIAL_RESERVE = 0x4000; constexpr u64 WATCHES_RESERVE_CHUNK = 0x1000; @@ -57,17 +60,16 @@ u32 GetMemoryType(const VkPhysicalDeviceMemoryProperties& properties, } // Anonymous namespace -VKStreamBuffer::VKStreamBuffer(const VKDevice& device, VKScheduler& scheduler, - VkBufferUsageFlags usage) - : device{device}, scheduler{scheduler} { - CreateBuffers(usage); +VKStreamBuffer::VKStreamBuffer(const Device& device_, VKScheduler& scheduler_) + : device{device_}, scheduler{scheduler_} { + CreateBuffers(); ReserveWatches(current_watches, WATCHES_INITIAL_RESERVE); ReserveWatches(previous_watches, WATCHES_INITIAL_RESERVE); } VKStreamBuffer::~VKStreamBuffer() = default; -std::tuple<u8*, u64, bool> VKStreamBuffer::Map(u64 size, u64 alignment) { +std::pair<u8*, u64> VKStreamBuffer::Map(u64 size, u64 alignment) { ASSERT(size <= stream_buffer_size); mapped_size = size; @@ -77,7 +79,6 @@ std::tuple<u8*, u64, bool> VKStreamBuffer::Map(u64 size, u64 alignment) { WaitPendingOperations(offset); - bool invalidated = false; if (offset + size > stream_buffer_size) { // The buffer would overflow, save the amount of used watches and reset the state. invalidation_mark = current_watch_cursor; @@ -91,11 +92,9 @@ std::tuple<u8*, u64, bool> VKStreamBuffer::Map(u64 size, u64 alignment) { // Ensure that we don't wait for uncommitted fences. scheduler.Flush(); - - invalidated = true; } - return {memory.Map(offset, size), offset, invalidated}; + return std::make_pair(memory.Map(offset, size), offset); } void VKStreamBuffer::Unmap(u64 size) { @@ -111,23 +110,24 @@ void VKStreamBuffer::Unmap(u64 size) { } auto& watch = current_watches[current_watch_cursor++]; watch.upper_bound = offset; - watch.fence.Watch(scheduler.GetFence()); + watch.tick = scheduler.CurrentTick(); } -void VKStreamBuffer::CreateBuffers(VkBufferUsageFlags usage) { +void VKStreamBuffer::CreateBuffers() { const auto memory_properties = device.GetPhysical().GetMemoryProperties(); const u32 preferred_type = GetMemoryType(memory_properties); const u32 preferred_heap = memory_properties.memoryTypes[preferred_type].heapIndex; // Substract from the preferred heap size some bytes to avoid getting out of memory. const VkDeviceSize heap_size = memory_properties.memoryHeaps[preferred_heap].size; - const VkDeviceSize allocable_size = heap_size - 9 * 1024 * 1024; + // As per DXVK's example, using `heap_size / 2` + const VkDeviceSize allocable_size = heap_size / 2; buffer = device.GetLogical().CreateBuffer({ .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, .flags = 0, .size = std::min(PREFERRED_STREAM_BUFFER_SIZE, allocable_size), - .usage = usage, + .usage = BUFFER_USAGE, .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, @@ -157,7 +157,7 @@ void VKStreamBuffer::WaitPendingOperations(u64 requested_upper_bound) { while (requested_upper_bound < wait_bound && wait_cursor < *invalidation_mark) { auto& watch = previous_watches[wait_cursor]; wait_bound = watch.upper_bound; - watch.fence.Wait(); + scheduler.Wait(watch.tick); ++wait_cursor; } } diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.h b/src/video_core/renderer_vulkan/vk_stream_buffer.h index 689f0d276..2e9c8cb46 100644 --- a/src/video_core/renderer_vulkan/vk_stream_buffer.h +++ b/src/video_core/renderer_vulkan/vk_stream_buffer.h @@ -5,32 +5,29 @@ #pragma once #include <optional> -#include <tuple> +#include <utility> #include <vector> #include "common/common_types.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { -class VKDevice; -class VKFence; +class Device; class VKFenceWatch; class VKScheduler; class VKStreamBuffer final { public: - explicit VKStreamBuffer(const VKDevice& device, VKScheduler& scheduler, - VkBufferUsageFlags usage); + explicit VKStreamBuffer(const Device& device, VKScheduler& scheduler); ~VKStreamBuffer(); /** * Reserves a region of memory from the stream buffer. * @param size Size to reserve. - * @returns A tuple in the following order: Raw memory pointer (with offset added), buffer - * offset and a boolean that's true when buffer has been invalidated. + * @returns A pair of a raw memory pointer (with offset added), and the buffer offset */ - std::tuple<u8*, u64, bool> Map(u64 size, u64 alignment); + std::pair<u8*, u64> Map(u64 size, u64 alignment); /// Ensures that "size" bytes of memory are available to the GPU, potentially recording a copy. void Unmap(u64 size); @@ -44,20 +41,20 @@ public: } private: - struct Watch final { - VKFenceWatch fence; + struct Watch { + u64 tick{}; u64 upper_bound{}; }; /// Creates Vulkan buffer handles committing the required the required memory. - void CreateBuffers(VkBufferUsageFlags usage); + void CreateBuffers(); /// Increases the amount of watches available. void ReserveWatches(std::vector<Watch>& watches, std::size_t grow_size); void WaitPendingOperations(u64 requested_upper_bound); - const VKDevice& device; ///< Vulkan device manager. + const Device& device; ///< Vulkan device manager. VKScheduler& scheduler; ///< Command scheduler. vk::Buffer buffer; ///< Mapped buffer. diff --git a/src/video_core/renderer_vulkan/vk_swapchain.cpp b/src/video_core/renderer_vulkan/vk_swapchain.cpp index 6bfd2abae..0b63bd6c8 100644 --- a/src/video_core/renderer_vulkan/vk_swapchain.cpp +++ b/src/video_core/renderer_vulkan/vk_swapchain.cpp @@ -11,10 +11,10 @@ #include "common/logging/log.h" #include "core/core.h" #include "core/frontend/framebuffer_layout.h" -#include "video_core/renderer_vulkan/vk_device.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" +#include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_swapchain.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { @@ -56,8 +56,11 @@ VkExtent2D ChooseSwapExtent(const VkSurfaceCapabilitiesKHR& capabilities, u32 wi } // Anonymous namespace -VKSwapchain::VKSwapchain(VkSurfaceKHR surface, const VKDevice& device) - : surface{surface}, device{device} {} +VKSwapchain::VKSwapchain(VkSurfaceKHR surface_, const Device& device_, VKScheduler& scheduler_, + u32 width, u32 height, bool srgb) + : surface{surface_}, device{device_}, scheduler{scheduler_} { + Create(width, height, srgb); +} VKSwapchain::~VKSwapchain() = default; @@ -75,21 +78,18 @@ void VKSwapchain::Create(u32 width, u32 height, bool srgb) { CreateSemaphores(); CreateImageViews(); - fences.resize(image_count, nullptr); + resource_ticks.clear(); + resource_ticks.resize(image_count); } void VKSwapchain::AcquireNextImage() { device.GetLogical().AcquireNextImageKHR(*swapchain, std::numeric_limits<u64>::max(), *present_semaphores[frame_index], {}, &image_index); - if (auto& fence = fences[image_index]; fence) { - fence->Wait(); - fence->Release(); - fence = nullptr; - } + scheduler.Wait(resource_ticks[image_index]); } -bool VKSwapchain::Present(VkSemaphore render_semaphore, VKFence& fence) { +bool VKSwapchain::Present(VkSemaphore render_semaphore) { const VkSemaphore present_semaphore{*present_semaphores[frame_index]}; const std::array<VkSemaphore, 2> semaphores{present_semaphore, render_semaphore}; const auto present_queue{device.GetPresentQueue()}; @@ -123,8 +123,7 @@ bool VKSwapchain::Present(VkSemaphore render_semaphore, VKFence& fence) { break; } - ASSERT(fences[image_index] == nullptr); - fences[image_index] = &fence; + resource_ticks[image_index] = scheduler.CurrentTick(); frame_index = (frame_index + 1) % static_cast<u32>(image_count); return recreated; } diff --git a/src/video_core/renderer_vulkan/vk_swapchain.h b/src/video_core/renderer_vulkan/vk_swapchain.h index a35d61345..a728511e0 100644 --- a/src/video_core/renderer_vulkan/vk_swapchain.h +++ b/src/video_core/renderer_vulkan/vk_swapchain.h @@ -7,7 +7,7 @@ #include <vector> #include "common/common_types.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Layout { struct FramebufferLayout; @@ -15,12 +15,13 @@ struct FramebufferLayout; namespace Vulkan { -class VKDevice; -class VKFence; +class Device; +class VKScheduler; class VKSwapchain { public: - explicit VKSwapchain(VkSurfaceKHR surface, const VKDevice& device); + explicit VKSwapchain(VkSurfaceKHR surface, const Device& device, VKScheduler& scheduler, + u32 width, u32 height, bool srgb); ~VKSwapchain(); /// Creates (or recreates) the swapchain with a given size. @@ -31,7 +32,7 @@ public: /// Presents the rendered image to the swapchain. Returns true when the swapchains had to be /// recreated. Takes responsability for the ownership of fence. - bool Present(VkSemaphore render_semaphore, VKFence& fence); + bool Present(VkSemaphore render_semaphore); /// Returns true when the framebuffer layout has changed. bool HasFramebufferChanged(const Layout::FramebufferLayout& framebuffer) const; @@ -73,7 +74,8 @@ private: void Destroy(); const VkSurfaceKHR surface; - const VKDevice& device; + const Device& device; + VKScheduler& scheduler; vk::SwapchainKHR swapchain; @@ -81,7 +83,7 @@ private: std::vector<VkImage> images; std::vector<vk::ImageView> image_views; std::vector<vk::Framebuffer> framebuffers; - std::vector<VKFence*> fences; + std::vector<u64> resource_ticks; std::vector<vk::Semaphore> present_semaphores; u32 image_index{}; diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 2c6f54101..22a1014a9 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -4,614 +4,1167 @@ #include <algorithm> #include <array> -#include <cstddef> -#include <cstring> -#include <memory> -#include <variant> +#include <span> #include <vector> -#include "common/assert.h" -#include "common/common_types.h" -#include "core/core.h" -#include "video_core/engines/maxwell_3d.h" -#include "video_core/morton.h" +#include "video_core/engines/fermi_2d.h" +#include "video_core/renderer_vulkan/blit_image.h" #include "video_core/renderer_vulkan/maxwell_to_vk.h" -#include "video_core/renderer_vulkan/vk_device.h" -#include "video_core/renderer_vulkan/vk_memory_manager.h" #include "video_core/renderer_vulkan/vk_rasterizer.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" #include "video_core/renderer_vulkan/vk_texture_cache.h" -#include "video_core/renderer_vulkan/wrapper.h" -#include "video_core/surface.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_memory_allocator.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { -using VideoCore::MortonSwizzle; -using VideoCore::MortonSwizzleMode; - +using Tegra::Engines::Fermi2D; using Tegra::Texture::SwizzleSource; -using VideoCore::Surface::PixelFormat; -using VideoCore::Surface::SurfaceTarget; +using Tegra::Texture::TextureMipmapFilter; +using VideoCommon::BufferImageCopy; +using VideoCommon::ImageInfo; +using VideoCommon::ImageType; +using VideoCommon::SubresourceRange; +using VideoCore::Surface::IsPixelFormatASTC; namespace { -VkImageType SurfaceTargetToImage(SurfaceTarget target) { - switch (target) { - case SurfaceTarget::Texture1D: - case SurfaceTarget::Texture1DArray: +constexpr std::array ATTACHMENT_REFERENCES{ + VkAttachmentReference{0, VK_IMAGE_LAYOUT_GENERAL}, + VkAttachmentReference{1, VK_IMAGE_LAYOUT_GENERAL}, + VkAttachmentReference{2, VK_IMAGE_LAYOUT_GENERAL}, + VkAttachmentReference{3, VK_IMAGE_LAYOUT_GENERAL}, + VkAttachmentReference{4, VK_IMAGE_LAYOUT_GENERAL}, + VkAttachmentReference{5, VK_IMAGE_LAYOUT_GENERAL}, + VkAttachmentReference{6, VK_IMAGE_LAYOUT_GENERAL}, + VkAttachmentReference{7, VK_IMAGE_LAYOUT_GENERAL}, + VkAttachmentReference{8, VK_IMAGE_LAYOUT_GENERAL}, +}; + +constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) { + if (color == std::array<float, 4>{0, 0, 0, 0}) { + return VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK; + } else if (color == std::array<float, 4>{0, 0, 0, 1}) { + return VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK; + } else if (color == std::array<float, 4>{1, 1, 1, 1}) { + return VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE; + } + if (color[0] + color[1] + color[2] > 1.35f) { + // If color elements are brighter than roughly 0.5 average, use white border + return VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE; + } else if (color[3] > 0.5f) { + return VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK; + } else { + return VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK; + } +} + +[[nodiscard]] VkImageType ConvertImageType(const ImageType type) { + switch (type) { + case ImageType::e1D: return VK_IMAGE_TYPE_1D; - case SurfaceTarget::Texture2D: - case SurfaceTarget::Texture2DArray: - case SurfaceTarget::TextureCubemap: - case SurfaceTarget::TextureCubeArray: + case ImageType::e2D: + case ImageType::Linear: return VK_IMAGE_TYPE_2D; - case SurfaceTarget::Texture3D: + case ImageType::e3D: return VK_IMAGE_TYPE_3D; - case SurfaceTarget::TextureBuffer: - UNREACHABLE(); - return {}; + case ImageType::Buffer: + break; } - UNREACHABLE_MSG("Unknown texture target={}", static_cast<u32>(target)); + UNREACHABLE_MSG("Invalid image type={}", type); return {}; } -VkImageAspectFlags PixelFormatToImageAspect(PixelFormat pixel_format) { - if (pixel_format < PixelFormat::MaxColorFormat) { - return VK_IMAGE_ASPECT_COLOR_BIT; - } else if (pixel_format < PixelFormat::MaxDepthFormat) { - return VK_IMAGE_ASPECT_DEPTH_BIT; - } else if (pixel_format < PixelFormat::MaxDepthStencilFormat) { - return VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; - } else { - UNREACHABLE_MSG("Invalid pixel format={}", static_cast<int>(pixel_format)); - return VK_IMAGE_ASPECT_COLOR_BIT; +[[nodiscard]] VkSampleCountFlagBits ConvertSampleCount(u32 num_samples) { + switch (num_samples) { + case 1: + return VK_SAMPLE_COUNT_1_BIT; + case 2: + return VK_SAMPLE_COUNT_2_BIT; + case 4: + return VK_SAMPLE_COUNT_4_BIT; + case 8: + return VK_SAMPLE_COUNT_8_BIT; + case 16: + return VK_SAMPLE_COUNT_16_BIT; + default: + UNREACHABLE_MSG("Invalid number of samples={}", num_samples); + return VK_SAMPLE_COUNT_1_BIT; } } -VkImageViewType GetImageViewType(SurfaceTarget target) { - switch (target) { - case SurfaceTarget::Texture1D: - return VK_IMAGE_VIEW_TYPE_1D; - case SurfaceTarget::Texture2D: - return VK_IMAGE_VIEW_TYPE_2D; - case SurfaceTarget::Texture3D: - return VK_IMAGE_VIEW_TYPE_3D; - case SurfaceTarget::Texture1DArray: - return VK_IMAGE_VIEW_TYPE_1D_ARRAY; - case SurfaceTarget::Texture2DArray: - return VK_IMAGE_VIEW_TYPE_2D_ARRAY; - case SurfaceTarget::TextureCubemap: - return VK_IMAGE_VIEW_TYPE_CUBE; - case SurfaceTarget::TextureCubeArray: - return VK_IMAGE_VIEW_TYPE_CUBE_ARRAY; - case SurfaceTarget::TextureBuffer: - break; +[[nodiscard]] VkImageUsageFlags ImageUsageFlags(const MaxwellToVK::FormatInfo& info, + PixelFormat format) { + VkImageUsageFlags usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT | + VK_IMAGE_USAGE_SAMPLED_BIT; + if (info.attachable) { + switch (VideoCore::Surface::GetFormatType(format)) { + case VideoCore::Surface::SurfaceType::ColorTexture: + usage |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; + break; + case VideoCore::Surface::SurfaceType::Depth: + case VideoCore::Surface::SurfaceType::DepthStencil: + usage |= VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT; + break; + default: + UNREACHABLE_MSG("Invalid surface type"); + } } - UNREACHABLE(); - return {}; + if (info.storage) { + usage |= VK_IMAGE_USAGE_STORAGE_BIT; + } + return usage; } -vk::Buffer CreateBuffer(const VKDevice& device, const SurfaceParams& params, - std::size_t host_memory_size) { - // TODO(Rodrigo): Move texture buffer creation to the buffer cache - return device.GetLogical().CreateBuffer({ - .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, +/// Returns the preferred format for a VkImage +[[nodiscard]] PixelFormat StorageFormat(PixelFormat format) { + switch (format) { + case PixelFormat::A8B8G8R8_SRGB: + return PixelFormat::A8B8G8R8_UNORM; + default: + return format; + } +} + +[[nodiscard]] VkImageCreateInfo MakeImageCreateInfo(const Device& device, const ImageInfo& info) { + const PixelFormat format = StorageFormat(info.format); + const auto format_info = MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, false, format); + VkImageCreateFlags flags = VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT; + if (info.type == ImageType::e2D && info.resources.layers >= 6 && + info.size.width == info.size.height) { + flags |= VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT; + } + if (info.type == ImageType::e3D) { + flags |= VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT; + } + const auto [samples_x, samples_y] = VideoCommon::SamplesLog2(info.num_samples); + return VkImageCreateInfo{ + .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, .pNext = nullptr, - .flags = 0, - .size = static_cast<VkDeviceSize>(host_memory_size), - .usage = VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | - VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | - VK_BUFFER_USAGE_TRANSFER_DST_BIT, + .flags = flags, + .imageType = ConvertImageType(info.type), + .format = format_info.format, + .extent{ + .width = info.size.width >> samples_x, + .height = info.size.height >> samples_y, + .depth = info.size.depth, + }, + .mipLevels = static_cast<u32>(info.resources.levels), + .arrayLayers = static_cast<u32>(info.resources.layers), + .samples = ConvertSampleCount(info.num_samples), + .tiling = VK_IMAGE_TILING_OPTIMAL, + .usage = ImageUsageFlags(format_info, format), .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, - }); -} - -VkBufferViewCreateInfo GenerateBufferViewCreateInfo(const VKDevice& device, - const SurfaceParams& params, VkBuffer buffer, - std::size_t host_memory_size) { - ASSERT(params.IsBuffer()); - - return { - .sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO, - .pNext = nullptr, - .flags = 0, - .buffer = buffer, - .format = - MaxwellToVK::SurfaceFormat(device, FormatType::Buffer, params.pixel_format).format, - .offset = 0, - .range = static_cast<VkDeviceSize>(host_memory_size), + .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, }; } -VkImageCreateInfo GenerateImageCreateInfo(const VKDevice& device, const SurfaceParams& params) { - ASSERT(!params.IsBuffer()); - - const auto [format, attachable, storage] = - MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, params.pixel_format); +[[nodiscard]] vk::Image MakeImage(const Device& device, const ImageInfo& info) { + if (info.type == ImageType::Buffer) { + return vk::Image{}; + } + return device.GetLogical().CreateImage(MakeImageCreateInfo(device, info)); +} - VkImageCreateInfo ci{ - .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, +[[nodiscard]] vk::Buffer MakeBuffer(const Device& device, const ImageInfo& info) { + if (info.type != ImageType::Buffer) { + return vk::Buffer{}; + } + const size_t bytes_per_block = VideoCore::Surface::BytesPerBlock(info.format); + return device.GetLogical().CreateBuffer(VkBufferCreateInfo{ + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, .flags = 0, - .imageType = SurfaceTargetToImage(params.target), - .format = format, - .extent = {}, - .mipLevels = params.num_levels, - .arrayLayers = static_cast<u32>(params.GetNumLayers()), - .samples = VK_SAMPLE_COUNT_1_BIT, - .tiling = VK_IMAGE_TILING_OPTIMAL, - .usage = VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT | - VK_IMAGE_USAGE_TRANSFER_SRC_BIT, + .size = info.size.width * bytes_per_block, + .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | + VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | + VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT, .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, - .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, - }; - if (attachable) { - ci.usage |= params.IsPixelFormatZeta() ? VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT - : VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; - } - if (storage) { - ci.usage |= VK_IMAGE_USAGE_STORAGE_BIT; - } - - switch (params.target) { - case SurfaceTarget::TextureCubemap: - case SurfaceTarget::TextureCubeArray: - ci.flags |= VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT; - [[fallthrough]]; - case SurfaceTarget::Texture1D: - case SurfaceTarget::Texture1DArray: - case SurfaceTarget::Texture2D: - case SurfaceTarget::Texture2DArray: - ci.extent = {params.width, params.height, 1}; - break; - case SurfaceTarget::Texture3D: - ci.flags |= VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT; - ci.extent = {params.width, params.height, params.depth}; - break; - case SurfaceTarget::TextureBuffer: - UNREACHABLE(); - } - - return ci; + }); } -u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source, Tegra::Texture::SwizzleSource y_source, - Tegra::Texture::SwizzleSource z_source, Tegra::Texture::SwizzleSource w_source) { - return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) | - (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source); +[[nodiscard]] VkImageAspectFlags ImageAspectMask(PixelFormat format) { + switch (VideoCore::Surface::GetFormatType(format)) { + case VideoCore::Surface::SurfaceType::ColorTexture: + return VK_IMAGE_ASPECT_COLOR_BIT; + case VideoCore::Surface::SurfaceType::Depth: + return VK_IMAGE_ASPECT_DEPTH_BIT; + case VideoCore::Surface::SurfaceType::DepthStencil: + return VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; + default: + UNREACHABLE_MSG("Invalid surface type"); + return VkImageAspectFlags{}; + } } -} // Anonymous namespace - -CachedSurface::CachedSurface(Core::System& system, const VKDevice& device, - VKResourceManager& resource_manager, VKMemoryManager& memory_manager, - VKScheduler& scheduler, VKStagingBufferPool& staging_pool, - GPUVAddr gpu_addr, const SurfaceParams& params) - : SurfaceBase<View>{gpu_addr, params, device.IsOptimalAstcSupported()}, system{system}, - device{device}, resource_manager{resource_manager}, - memory_manager{memory_manager}, scheduler{scheduler}, staging_pool{staging_pool} { - if (params.IsBuffer()) { - buffer = CreateBuffer(device, params, host_memory_size); - commit = memory_manager.Commit(buffer, false); - - const auto buffer_view_ci = - GenerateBufferViewCreateInfo(device, params, *buffer, host_memory_size); - format = buffer_view_ci.format; - - buffer_view = device.GetLogical().CreateBufferView(buffer_view_ci); - } else { - const auto image_ci = GenerateImageCreateInfo(device, params); - format = image_ci.format; - - image.emplace(device, scheduler, image_ci, PixelFormatToImageAspect(params.pixel_format)); - commit = memory_manager.Commit(image->GetHandle(), false); +[[nodiscard]] VkImageAspectFlags ImageViewAspectMask(const VideoCommon::ImageViewInfo& info) { + if (info.IsRenderTarget()) { + return ImageAspectMask(info.format); } - - // TODO(Rodrigo): Move this to a virtual function. - u32 num_layers = 1; - if (params.is_layered || params.target == SurfaceTarget::Texture3D) { - num_layers = params.depth; + const bool is_first = info.Swizzle()[0] == SwizzleSource::R; + switch (info.format) { + case PixelFormat::D24_UNORM_S8_UINT: + case PixelFormat::D32_FLOAT_S8_UINT: + return is_first ? VK_IMAGE_ASPECT_DEPTH_BIT : VK_IMAGE_ASPECT_STENCIL_BIT; + case PixelFormat::S8_UINT_D24_UNORM: + return is_first ? VK_IMAGE_ASPECT_STENCIL_BIT : VK_IMAGE_ASPECT_DEPTH_BIT; + case PixelFormat::D16_UNORM: + case PixelFormat::D32_FLOAT: + return VK_IMAGE_ASPECT_DEPTH_BIT; + default: + return VK_IMAGE_ASPECT_COLOR_BIT; } - main_view = CreateView(ViewParams(params.target, 0, num_layers, 0, params.num_levels)); } -CachedSurface::~CachedSurface() = default; - -void CachedSurface::UploadTexture(const std::vector<u8>& staging_buffer) { - // To upload data we have to be outside of a renderpass - scheduler.RequestOutsideRenderPassOperationContext(); +[[nodiscard]] VkAttachmentDescription AttachmentDescription(const Device& device, + const ImageView* image_view) { + using MaxwellToVK::SurfaceFormat; + const PixelFormat pixel_format = image_view->format; + return VkAttachmentDescription{ + .flags = VK_ATTACHMENT_DESCRIPTION_MAY_ALIAS_BIT, + .format = SurfaceFormat(device, FormatType::Optimal, true, pixel_format).format, + .samples = image_view->Samples(), + .loadOp = VK_ATTACHMENT_LOAD_OP_LOAD, + .storeOp = VK_ATTACHMENT_STORE_OP_STORE, + .stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD, + .stencilStoreOp = VK_ATTACHMENT_STORE_OP_STORE, + .initialLayout = VK_IMAGE_LAYOUT_GENERAL, + .finalLayout = VK_IMAGE_LAYOUT_GENERAL, + }; +} - if (params.IsBuffer()) { - UploadBuffer(staging_buffer); - } else { - UploadImage(staging_buffer); +[[nodiscard]] VkComponentSwizzle ComponentSwizzle(SwizzleSource swizzle) { + switch (swizzle) { + case SwizzleSource::Zero: + return VK_COMPONENT_SWIZZLE_ZERO; + case SwizzleSource::R: + return VK_COMPONENT_SWIZZLE_R; + case SwizzleSource::G: + return VK_COMPONENT_SWIZZLE_G; + case SwizzleSource::B: + return VK_COMPONENT_SWIZZLE_B; + case SwizzleSource::A: + return VK_COMPONENT_SWIZZLE_A; + case SwizzleSource::OneFloat: + case SwizzleSource::OneInt: + return VK_COMPONENT_SWIZZLE_ONE; } + UNREACHABLE_MSG("Invalid swizzle={}", swizzle); + return VK_COMPONENT_SWIZZLE_ZERO; } -void CachedSurface::DownloadTexture(std::vector<u8>& staging_buffer) { - UNIMPLEMENTED_IF(params.IsBuffer()); - - if (params.pixel_format == VideoCore::Surface::PixelFormat::A1B5G5R5_UNORM) { - LOG_WARNING(Render_Vulkan, "A1B5G5R5 flushing is stubbed"); +[[nodiscard]] VkImageViewType ImageViewType(VideoCommon::ImageViewType type) { + switch (type) { + case VideoCommon::ImageViewType::e1D: + return VK_IMAGE_VIEW_TYPE_1D; + case VideoCommon::ImageViewType::e2D: + return VK_IMAGE_VIEW_TYPE_2D; + case VideoCommon::ImageViewType::Cube: + return VK_IMAGE_VIEW_TYPE_CUBE; + case VideoCommon::ImageViewType::e3D: + return VK_IMAGE_VIEW_TYPE_3D; + case VideoCommon::ImageViewType::e1DArray: + return VK_IMAGE_VIEW_TYPE_1D_ARRAY; + case VideoCommon::ImageViewType::e2DArray: + return VK_IMAGE_VIEW_TYPE_2D_ARRAY; + case VideoCommon::ImageViewType::CubeArray: + return VK_IMAGE_VIEW_TYPE_CUBE_ARRAY; + case VideoCommon::ImageViewType::Rect: + LOG_WARNING(Render_Vulkan, "Unnormalized image view type not supported"); + return VK_IMAGE_VIEW_TYPE_2D; + case VideoCommon::ImageViewType::Buffer: + UNREACHABLE_MSG("Texture buffers can't be image views"); + return VK_IMAGE_VIEW_TYPE_1D; } + UNREACHABLE_MSG("Invalid image view type={}", type); + return VK_IMAGE_VIEW_TYPE_2D; +} - // We can't copy images to buffers inside a renderpass - scheduler.RequestOutsideRenderPassOperationContext(); +[[nodiscard]] VkImageSubresourceLayers MakeImageSubresourceLayers( + VideoCommon::SubresourceLayers subresource, VkImageAspectFlags aspect_mask) { + return VkImageSubresourceLayers{ + .aspectMask = aspect_mask, + .mipLevel = static_cast<u32>(subresource.base_level), + .baseArrayLayer = static_cast<u32>(subresource.base_layer), + .layerCount = static_cast<u32>(subresource.num_layers), + }; +} - FullTransition(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_READ_BIT, - VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL); +[[nodiscard]] VkOffset3D MakeOffset3D(VideoCommon::Offset3D offset3d) { + return VkOffset3D{ + .x = offset3d.x, + .y = offset3d.y, + .z = offset3d.z, + }; +} - const auto& buffer = staging_pool.GetUnusedBuffer(host_memory_size, true); - // TODO(Rodrigo): Do this in a single copy - for (u32 level = 0; level < params.num_levels; ++level) { - scheduler.Record([image = *image->GetHandle(), buffer = *buffer.handle, - copy = GetBufferImageCopy(level)](vk::CommandBuffer cmdbuf) { - cmdbuf.CopyImageToBuffer(image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, buffer, copy); - }); - } - scheduler.Finish(); +[[nodiscard]] VkExtent3D MakeExtent3D(VideoCommon::Extent3D extent3d) { + return VkExtent3D{ + .width = static_cast<u32>(extent3d.width), + .height = static_cast<u32>(extent3d.height), + .depth = static_cast<u32>(extent3d.depth), + }; +} - // TODO(Rodrigo): Use an intern buffer for staging buffers and avoid this unnecessary memcpy. - std::memcpy(staging_buffer.data(), buffer.commit->Map(host_memory_size), host_memory_size); +[[nodiscard]] VkImageCopy MakeImageCopy(const VideoCommon::ImageCopy& copy, + VkImageAspectFlags aspect_mask) noexcept { + return VkImageCopy{ + .srcSubresource = MakeImageSubresourceLayers(copy.src_subresource, aspect_mask), + .srcOffset = MakeOffset3D(copy.src_offset), + .dstSubresource = MakeImageSubresourceLayers(copy.dst_subresource, aspect_mask), + .dstOffset = MakeOffset3D(copy.dst_offset), + .extent = MakeExtent3D(copy.extent), + }; } -void CachedSurface::DecorateSurfaceName() { - // TODO(Rodrigo): Add name decorations +[[nodiscard]] std::vector<VkBufferCopy> TransformBufferCopies( + std::span<const VideoCommon::BufferCopy> copies, size_t buffer_offset) { + std::vector<VkBufferCopy> result(copies.size()); + std::ranges::transform( + copies, result.begin(), [buffer_offset](const VideoCommon::BufferCopy& copy) { + return VkBufferCopy{ + .srcOffset = static_cast<VkDeviceSize>(copy.src_offset + buffer_offset), + .dstOffset = static_cast<VkDeviceSize>(copy.dst_offset), + .size = static_cast<VkDeviceSize>(copy.size), + }; + }); + return result; } -View CachedSurface::CreateView(const ViewParams& params) { - // TODO(Rodrigo): Add name decorations - return views[params] = std::make_shared<CachedSurfaceView>(device, *this, params); +[[nodiscard]] std::vector<VkBufferImageCopy> TransformBufferImageCopies( + std::span<const BufferImageCopy> copies, size_t buffer_offset, VkImageAspectFlags aspect_mask) { + struct Maker { + VkBufferImageCopy operator()(const BufferImageCopy& copy) const { + return VkBufferImageCopy{ + .bufferOffset = copy.buffer_offset + buffer_offset, + .bufferRowLength = copy.buffer_row_length, + .bufferImageHeight = copy.buffer_image_height, + .imageSubresource = + { + .aspectMask = aspect_mask, + .mipLevel = static_cast<u32>(copy.image_subresource.base_level), + .baseArrayLayer = static_cast<u32>(copy.image_subresource.base_layer), + .layerCount = static_cast<u32>(copy.image_subresource.num_layers), + }, + .imageOffset = + { + .x = copy.image_offset.x, + .y = copy.image_offset.y, + .z = copy.image_offset.z, + }, + .imageExtent = + { + .width = copy.image_extent.width, + .height = copy.image_extent.height, + .depth = copy.image_extent.depth, + }, + }; + } + size_t buffer_offset; + VkImageAspectFlags aspect_mask; + }; + if (aspect_mask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { + std::vector<VkBufferImageCopy> result(copies.size() * 2); + std::ranges::transform(copies, result.begin(), + Maker{buffer_offset, VK_IMAGE_ASPECT_DEPTH_BIT}); + std::ranges::transform(copies, result.begin() + copies.size(), + Maker{buffer_offset, VK_IMAGE_ASPECT_STENCIL_BIT}); + return result; + } else { + std::vector<VkBufferImageCopy> result(copies.size()); + std::ranges::transform(copies, result.begin(), Maker{buffer_offset, aspect_mask}); + return result; + } } -void CachedSurface::UploadBuffer(const std::vector<u8>& staging_buffer) { - const auto& src_buffer = staging_pool.GetUnusedBuffer(host_memory_size, true); - std::memcpy(src_buffer.commit->Map(host_memory_size), staging_buffer.data(), host_memory_size); +[[nodiscard]] VkImageSubresourceRange MakeSubresourceRange(VkImageAspectFlags aspect_mask, + const SubresourceRange& range) { + return VkImageSubresourceRange{ + .aspectMask = aspect_mask, + .baseMipLevel = static_cast<u32>(range.base.level), + .levelCount = static_cast<u32>(range.extent.levels), + .baseArrayLayer = static_cast<u32>(range.base.layer), + .layerCount = static_cast<u32>(range.extent.layers), + }; +} - scheduler.Record([src_buffer = *src_buffer.handle, dst_buffer = *buffer, - size = host_memory_size](vk::CommandBuffer cmdbuf) { - VkBufferCopy copy; - copy.srcOffset = 0; - copy.dstOffset = 0; - copy.size = size; - cmdbuf.CopyBuffer(src_buffer, dst_buffer, copy); +[[nodiscard]] VkImageSubresourceRange MakeSubresourceRange(const ImageView* image_view) { + SubresourceRange range = image_view->range; + if (True(image_view->flags & VideoCommon::ImageViewFlagBits::Slice)) { + // Slice image views always affect a single layer, but their subresource range corresponds + // to the slice. Override the value to affect a single layer. + range.base.layer = 0; + range.extent.layers = 1; + } + return MakeSubresourceRange(ImageAspectMask(image_view->format), range); +} - VkBufferMemoryBarrier barrier; - barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - barrier.pNext = nullptr; - barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; - barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; // They'll be ignored anyway - barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.buffer = dst_buffer; - barrier.offset = 0; - barrier.size = size; - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_VERTEX_SHADER_BIT, - 0, {}, barrier, {}); - }); +[[nodiscard]] VkImageSubresourceLayers MakeSubresourceLayers(const ImageView* image_view) { + return VkImageSubresourceLayers{ + .aspectMask = ImageAspectMask(image_view->format), + .mipLevel = static_cast<u32>(image_view->range.base.level), + .baseArrayLayer = static_cast<u32>(image_view->range.base.layer), + .layerCount = static_cast<u32>(image_view->range.extent.layers), + }; } -void CachedSurface::UploadImage(const std::vector<u8>& staging_buffer) { - const auto& src_buffer = staging_pool.GetUnusedBuffer(host_memory_size, true); - std::memcpy(src_buffer.commit->Map(host_memory_size), staging_buffer.data(), host_memory_size); - - FullTransition(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); - - for (u32 level = 0; level < params.num_levels; ++level) { - const VkBufferImageCopy copy = GetBufferImageCopy(level); - if (image->GetAspectMask() == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { - scheduler.Record([buffer = *src_buffer.handle, image = *image->GetHandle(), - copy](vk::CommandBuffer cmdbuf) { - std::array<VkBufferImageCopy, 2> copies = {copy, copy}; - copies[0].imageSubresource.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT; - copies[1].imageSubresource.aspectMask = VK_IMAGE_ASPECT_STENCIL_BIT; - cmdbuf.CopyBufferToImage(buffer, image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, - copies); - }); - } else { - scheduler.Record([buffer = *src_buffer.handle, image = *image->GetHandle(), - copy](vk::CommandBuffer cmdbuf) { - cmdbuf.CopyBufferToImage(buffer, image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, copy); - }); - } +[[nodiscard]] constexpr SwizzleSource ConvertGreenRed(SwizzleSource value) { + switch (value) { + case SwizzleSource::G: + return SwizzleSource::R; + default: + return value; } } -VkBufferImageCopy CachedSurface::GetBufferImageCopy(u32 level) const { - return { - .bufferOffset = params.GetHostMipmapLevelOffset(level, is_converted), - .bufferRowLength = 0, - .bufferImageHeight = 0, - .imageSubresource = +void CopyBufferToImage(vk::CommandBuffer cmdbuf, VkBuffer src_buffer, VkImage image, + VkImageAspectFlags aspect_mask, bool is_initialized, + std::span<const VkBufferImageCopy> copies) { + static constexpr VkAccessFlags WRITE_ACCESS_FLAGS = + VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; + static constexpr VkAccessFlags READ_ACCESS_FLAGS = VK_ACCESS_SHADER_READ_BIT | + VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT; + const VkImageMemoryBarrier read_barrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = WRITE_ACCESS_FLAGS, + .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .oldLayout = is_initialized ? VK_IMAGE_LAYOUT_GENERAL : VK_IMAGE_LAYOUT_UNDEFINED, + .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = image, + .subresourceRange{ + .aspectMask = aspect_mask, + .baseMipLevel = 0, + .levelCount = VK_REMAINING_MIP_LEVELS, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, + }, + }; + const VkImageMemoryBarrier write_barrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = WRITE_ACCESS_FLAGS | READ_ACCESS_FLAGS, + .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + .newLayout = VK_IMAGE_LAYOUT_GENERAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = image, + .subresourceRange{ + .aspectMask = aspect_mask, + .baseMipLevel = 0, + .levelCount = VK_REMAINING_MIP_LEVELS, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, + }, + }; + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, + read_barrier); + cmdbuf.CopyBufferToImage(src_buffer, image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, copies); + // TODO: Move this to another API + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, + write_barrier); +} + +[[nodiscard]] VkImageBlit MakeImageBlit(const std::array<Offset2D, 2>& dst_region, + const std::array<Offset2D, 2>& src_region, + const VkImageSubresourceLayers& dst_layers, + const VkImageSubresourceLayers& src_layers) { + return VkImageBlit{ + .srcSubresource = src_layers, + .srcOffsets = { - .aspectMask = image->GetAspectMask(), - .mipLevel = level, - .baseArrayLayer = 0, - .layerCount = static_cast<u32>(params.GetNumLayers()), + { + .x = src_region[0].x, + .y = src_region[0].y, + .z = 0, + }, + { + .x = src_region[1].x, + .y = src_region[1].y, + .z = 1, + }, }, - .imageOffset = {.x = 0, .y = 0, .z = 0}, - .imageExtent = + .dstSubresource = dst_layers, + .dstOffsets = { - .width = params.GetMipWidth(level), - .height = params.GetMipHeight(level), - .depth = params.target == SurfaceTarget::Texture3D ? params.GetMipDepth(level) : 1U, + { + .x = dst_region[0].x, + .y = dst_region[0].y, + .z = 0, + }, + { + .x = dst_region[1].x, + .y = dst_region[1].y, + .z = 1, + }, }, }; } -VkImageSubresourceRange CachedSurface::GetImageSubresourceRange() const { - return {image->GetAspectMask(), 0, params.num_levels, 0, - static_cast<u32>(params.GetNumLayers())}; +[[nodiscard]] VkImageResolve MakeImageResolve(const std::array<Offset2D, 2>& dst_region, + const std::array<Offset2D, 2>& src_region, + const VkImageSubresourceLayers& dst_layers, + const VkImageSubresourceLayers& src_layers) { + return VkImageResolve{ + .srcSubresource = src_layers, + .srcOffset = + { + .x = src_region[0].x, + .y = src_region[0].y, + .z = 0, + }, + .dstSubresource = dst_layers, + .dstOffset = + { + .x = dst_region[0].x, + .y = dst_region[0].y, + .z = 0, + }, + .extent = + { + .width = static_cast<u32>(dst_region[1].x - dst_region[0].x), + .height = static_cast<u32>(dst_region[1].y - dst_region[0].y), + .depth = 1, + }, + }; } -CachedSurfaceView::CachedSurfaceView(const VKDevice& device, CachedSurface& surface, - const ViewParams& params) - : VideoCommon::ViewBase{params}, params{surface.GetSurfaceParams()}, - image{surface.GetImageHandle()}, buffer_view{surface.GetBufferViewHandle()}, - aspect_mask{surface.GetAspectMask()}, device{device}, surface{surface}, - base_level{params.base_level}, num_levels{params.num_levels}, - image_view_type{image ? GetImageViewType(params.target) : VK_IMAGE_VIEW_TYPE_1D} { - if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) { - base_layer = 0; - num_layers = 1; - base_slice = params.base_layer; - num_slices = params.num_layers; - } else { - base_layer = params.base_layer; - num_layers = params.num_layers; +struct RangedBarrierRange { + u32 min_mip = std::numeric_limits<u32>::max(); + u32 max_mip = std::numeric_limits<u32>::min(); + u32 min_layer = std::numeric_limits<u32>::max(); + u32 max_layer = std::numeric_limits<u32>::min(); + + void AddLayers(const VkImageSubresourceLayers& layers) { + min_mip = std::min(min_mip, layers.mipLevel); + max_mip = std::max(max_mip, layers.mipLevel + 1); + min_layer = std::min(min_layer, layers.baseArrayLayer); + max_layer = std::max(max_layer, layers.baseArrayLayer + layers.layerCount); + } + + VkImageSubresourceRange SubresourceRange(VkImageAspectFlags aspect_mask) const noexcept { + return VkImageSubresourceRange{ + .aspectMask = aspect_mask, + .baseMipLevel = min_mip, + .levelCount = max_mip - min_mip, + .baseArrayLayer = min_layer, + .layerCount = max_layer - min_layer, + }; } +}; + +} // Anonymous namespace + +void TextureCacheRuntime::Finish() { + scheduler.Finish(); } -CachedSurfaceView::~CachedSurfaceView() = default; +StagingBufferRef TextureCacheRuntime::UploadStagingBuffer(size_t size) { + return staging_buffer_pool.Request(size, MemoryUsage::Upload); +} -VkImageView CachedSurfaceView::GetImageView(SwizzleSource x_source, SwizzleSource y_source, - SwizzleSource z_source, SwizzleSource w_source) { - const u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source); - if (last_image_view && last_swizzle == new_swizzle) { - return last_image_view; - } - last_swizzle = new_swizzle; +StagingBufferRef TextureCacheRuntime::DownloadStagingBuffer(size_t size) { + return staging_buffer_pool.Request(size, MemoryUsage::Download); +} - const auto [entry, is_cache_miss] = view_cache.try_emplace(new_swizzle); - auto& image_view = entry->second; - if (!is_cache_miss) { - return last_image_view = *image_view; +void TextureCacheRuntime::BlitImage(Framebuffer* dst_framebuffer, ImageView& dst, ImageView& src, + const std::array<Offset2D, 2>& dst_region, + const std::array<Offset2D, 2>& src_region, + Tegra::Engines::Fermi2D::Filter filter, + Tegra::Engines::Fermi2D::Operation operation) { + const VkImageAspectFlags aspect_mask = ImageAspectMask(src.format); + const bool is_dst_msaa = dst.Samples() != VK_SAMPLE_COUNT_1_BIT; + const bool is_src_msaa = src.Samples() != VK_SAMPLE_COUNT_1_BIT; + ASSERT(aspect_mask == ImageAspectMask(dst.format)); + if (aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT && !is_src_msaa && !is_dst_msaa) { + blit_image_helper.BlitColor(dst_framebuffer, src, dst_region, src_region, filter, + operation); + return; } - - std::array swizzle{MaxwellToVK::SwizzleSource(x_source), MaxwellToVK::SwizzleSource(y_source), - MaxwellToVK::SwizzleSource(z_source), MaxwellToVK::SwizzleSource(w_source)}; - if (params.pixel_format == VideoCore::Surface::PixelFormat::A1B5G5R5_UNORM) { - // A1B5G5R5 is implemented as A1R5G5B5, we have to change the swizzle here. - std::swap(swizzle[0], swizzle[2]); + if (aspect_mask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { + if (!device.IsBlitDepthStencilSupported()) { + UNIMPLEMENTED_IF(is_src_msaa || is_dst_msaa); + blit_image_helper.BlitDepthStencil(dst_framebuffer, src.DepthView(), src.StencilView(), + dst_region, src_region, filter, operation); + return; + } } - - // Games can sample depth or stencil values on textures. This is decided by the swizzle value on - // hardware. To emulate this on Vulkan we specify it in the aspect. - VkImageAspectFlags aspect = aspect_mask; - if (aspect == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { - UNIMPLEMENTED_IF(x_source != SwizzleSource::R && x_source != SwizzleSource::G); - const bool is_first = x_source == SwizzleSource::R; - switch (params.pixel_format) { - case VideoCore::Surface::PixelFormat::D24_UNORM_S8_UINT: - case VideoCore::Surface::PixelFormat::D32_FLOAT_S8_UINT: - aspect = is_first ? VK_IMAGE_ASPECT_DEPTH_BIT : VK_IMAGE_ASPECT_STENCIL_BIT; - break; - case VideoCore::Surface::PixelFormat::S8_UINT_D24_UNORM: - aspect = is_first ? VK_IMAGE_ASPECT_STENCIL_BIT : VK_IMAGE_ASPECT_DEPTH_BIT; - break; - default: - aspect = VK_IMAGE_ASPECT_DEPTH_BIT; - UNIMPLEMENTED(); + ASSERT(src.ImageFormat() == dst.ImageFormat()); + ASSERT(!(is_dst_msaa && !is_src_msaa)); + ASSERT(operation == Fermi2D::Operation::SrcCopy); + + const VkImage dst_image = dst.ImageHandle(); + const VkImage src_image = src.ImageHandle(); + const VkImageSubresourceLayers dst_layers = MakeSubresourceLayers(&dst); + const VkImageSubresourceLayers src_layers = MakeSubresourceLayers(&src); + const bool is_resolve = is_src_msaa && !is_dst_msaa; + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([filter, dst_region, src_region, dst_image, src_image, dst_layers, src_layers, + aspect_mask, is_resolve](vk::CommandBuffer cmdbuf) { + const std::array read_barriers{ + VkImageMemoryBarrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | + VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT, + .oldLayout = VK_IMAGE_LAYOUT_GENERAL, + .newLayout = VK_IMAGE_LAYOUT_GENERAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = src_image, + .subresourceRange{ + .aspectMask = aspect_mask, + .baseMipLevel = 0, + .levelCount = VK_REMAINING_MIP_LEVELS, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, + }, + }, + VkImageMemoryBarrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | + VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .oldLayout = VK_IMAGE_LAYOUT_GENERAL, + .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = dst_image, + .subresourceRange{ + .aspectMask = aspect_mask, + .baseMipLevel = 0, + .levelCount = VK_REMAINING_MIP_LEVELS, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, + }, + }, + }; + VkImageMemoryBarrier write_barrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | + VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, + .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + .newLayout = VK_IMAGE_LAYOUT_GENERAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = dst_image, + .subresourceRange{ + .aspectMask = aspect_mask, + .baseMipLevel = 0, + .levelCount = VK_REMAINING_MIP_LEVELS, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, + }, + }; + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, + 0, nullptr, nullptr, read_barriers); + if (is_resolve) { + cmdbuf.ResolveImage(src_image, VK_IMAGE_LAYOUT_GENERAL, dst_image, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + MakeImageResolve(dst_region, src_region, dst_layers, src_layers)); + } else { + const bool is_linear = filter == Fermi2D::Filter::Bilinear; + const VkFilter vk_filter = is_linear ? VK_FILTER_LINEAR : VK_FILTER_NEAREST; + cmdbuf.BlitImage( + src_image, VK_IMAGE_LAYOUT_GENERAL, dst_image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + MakeImageBlit(dst_region, src_region, dst_layers, src_layers), vk_filter); } + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + 0, write_barrier); + }); +} - // Make sure we sample the first component - std::transform( - swizzle.begin(), swizzle.end(), swizzle.begin(), [](VkComponentSwizzle component) { - return component == VK_COMPONENT_SWIZZLE_G ? VK_COMPONENT_SWIZZLE_R : component; - }); +void TextureCacheRuntime::ConvertImage(Framebuffer* dst, ImageView& dst_view, ImageView& src_view) { + switch (dst_view.format) { + case PixelFormat::R16_UNORM: + if (src_view.format == PixelFormat::D16_UNORM) { + return blit_image_helper.ConvertD16ToR16(dst, src_view); + } + break; + case PixelFormat::R32_FLOAT: + if (src_view.format == PixelFormat::D32_FLOAT) { + return blit_image_helper.ConvertD32ToR32(dst, src_view); + } + break; + case PixelFormat::D16_UNORM: + if (src_view.format == PixelFormat::R16_UNORM) { + return blit_image_helper.ConvertR16ToD16(dst, src_view); + } + break; + case PixelFormat::D32_FLOAT: + if (src_view.format == PixelFormat::R32_FLOAT) { + return blit_image_helper.ConvertR32ToD32(dst, src_view); + } + break; + default: + break; } + UNIMPLEMENTED_MSG("Unimplemented format copy from {} to {}", src_view.format, dst_view.format); +} - if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) { - ASSERT(base_slice == 0); - ASSERT(num_slices == params.depth); - } +void TextureCacheRuntime::CopyImage(Image& dst, Image& src, + std::span<const VideoCommon::ImageCopy> copies) { + std::vector<VkImageCopy> vk_copies(copies.size()); + const VkImageAspectFlags aspect_mask = dst.AspectMask(); + ASSERT(aspect_mask == src.AspectMask()); - image_view = device.GetLogical().CreateImageView({ - .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, - .pNext = nullptr, - .flags = 0, - .image = surface.GetImageHandle(), - .viewType = image_view_type, - .format = surface.GetImage().GetFormat(), - .components = - { - .r = swizzle[0], - .g = swizzle[1], - .b = swizzle[2], - .a = swizzle[3], + std::ranges::transform(copies, vk_copies.begin(), [aspect_mask](const auto& copy) { + return MakeImageCopy(copy, aspect_mask); + }); + const VkImage dst_image = dst.Handle(); + const VkImage src_image = src.Handle(); + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([dst_image, src_image, aspect_mask, vk_copies](vk::CommandBuffer cmdbuf) { + RangedBarrierRange dst_range; + RangedBarrierRange src_range; + for (const VkImageCopy& copy : vk_copies) { + dst_range.AddLayers(copy.dstSubresource); + src_range.AddLayers(copy.srcSubresource); + } + const std::array read_barriers{ + VkImageMemoryBarrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | + VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT, + .oldLayout = VK_IMAGE_LAYOUT_GENERAL, + .newLayout = VK_IMAGE_LAYOUT_GENERAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = src_image, + .subresourceRange = src_range.SubresourceRange(aspect_mask), }, - .subresourceRange = - { - .aspectMask = aspect, - .baseMipLevel = base_level, - .levelCount = num_levels, - .baseArrayLayer = base_layer, - .layerCount = num_layers, + VkImageMemoryBarrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | + VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .oldLayout = VK_IMAGE_LAYOUT_GENERAL, + .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = dst_image, + .subresourceRange = dst_range.SubresourceRange(aspect_mask), }, + }; + const VkImageMemoryBarrier write_barrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT | + VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | + VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | + VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, + .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + .newLayout = VK_IMAGE_LAYOUT_GENERAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = dst_image, + .subresourceRange = dst_range.SubresourceRange(aspect_mask), + }; + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, + 0, {}, {}, read_barriers); + cmdbuf.CopyImage(src_image, VK_IMAGE_LAYOUT_GENERAL, dst_image, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, vk_copies); + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + 0, write_barrier); }); - - return last_image_view = *image_view; } -VkImageView CachedSurfaceView::GetAttachment() { - if (render_target) { - return *render_target; +Image::Image(TextureCacheRuntime& runtime, const ImageInfo& info_, GPUVAddr gpu_addr_, + VAddr cpu_addr_) + : VideoCommon::ImageBase(info_, gpu_addr_, cpu_addr_), scheduler{&runtime.scheduler}, + image(MakeImage(runtime.device, info)), buffer(MakeBuffer(runtime.device, info)), + aspect_mask(ImageAspectMask(info.format)) { + if (image) { + commit = runtime.memory_allocator.Commit(image, MemoryUsage::DeviceLocal); + } else { + commit = runtime.memory_allocator.Commit(buffer, MemoryUsage::DeviceLocal); + } + if (IsPixelFormatASTC(info.format) && !runtime.device.IsOptimalAstcSupported()) { + flags |= VideoCommon::ImageFlagBits::Converted; + } + if (runtime.device.HasDebuggingToolAttached()) { + if (image) { + image.SetObjectNameEXT(VideoCommon::Name(*this).c_str()); + } else { + buffer.SetObjectNameEXT(VideoCommon::Name(*this).c_str()); + } } +} - VkImageViewCreateInfo ci{ - .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, - .pNext = nullptr, - .flags = 0, - .image = surface.GetImageHandle(), - .viewType = VK_IMAGE_VIEW_TYPE_1D, - .format = surface.GetImage().GetFormat(), - .components = - { - .r = VK_COMPONENT_SWIZZLE_IDENTITY, - .g = VK_COMPONENT_SWIZZLE_IDENTITY, - .b = VK_COMPONENT_SWIZZLE_IDENTITY, - .a = VK_COMPONENT_SWIZZLE_IDENTITY, +void Image::UploadMemory(const StagingBufferRef& map, std::span<const BufferImageCopy> copies) { + // TODO: Move this to another API + scheduler->RequestOutsideRenderPassOperationContext(); + std::vector vk_copies = TransformBufferImageCopies(copies, map.offset, aspect_mask); + const VkBuffer src_buffer = map.buffer; + const VkImage vk_image = *image; + const VkImageAspectFlags vk_aspect_mask = aspect_mask; + const bool is_initialized = std::exchange(initialized, true); + scheduler->Record([src_buffer, vk_image, vk_aspect_mask, is_initialized, + vk_copies](vk::CommandBuffer cmdbuf) { + CopyBufferToImage(cmdbuf, src_buffer, vk_image, vk_aspect_mask, is_initialized, vk_copies); + }); +} + +void Image::UploadMemory(const StagingBufferRef& map, + std::span<const VideoCommon::BufferCopy> copies) { + // TODO: Move this to another API + scheduler->RequestOutsideRenderPassOperationContext(); + std::vector vk_copies = TransformBufferCopies(copies, map.offset); + const VkBuffer src_buffer = map.buffer; + const VkBuffer dst_buffer = *buffer; + scheduler->Record([src_buffer, dst_buffer, vk_copies](vk::CommandBuffer cmdbuf) { + // TODO: Barriers + cmdbuf.CopyBuffer(src_buffer, dst_buffer, vk_copies); + }); +} + +void Image::DownloadMemory(const StagingBufferRef& map, std::span<const BufferImageCopy> copies) { + std::vector vk_copies = TransformBufferImageCopies(copies, map.offset, aspect_mask); + scheduler->Record([buffer = map.buffer, image = *image, aspect_mask = aspect_mask, + vk_copies](vk::CommandBuffer cmdbuf) { + const VkImageMemoryBarrier read_barrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT, + .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT, + .oldLayout = VK_IMAGE_LAYOUT_GENERAL, + .newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = image, + .subresourceRange{ + .aspectMask = aspect_mask, + .baseMipLevel = 0, + .levelCount = VK_REMAINING_MIP_LEVELS, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, }, - .subresourceRange = - { + }; + const VkImageMemoryBarrier image_write_barrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = 0, + .dstAccessMask = VK_ACCESS_MEMORY_WRITE_BIT, + .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + .newLayout = VK_IMAGE_LAYOUT_GENERAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = image, + .subresourceRange{ .aspectMask = aspect_mask, - .baseMipLevel = base_level, - .levelCount = num_levels, + .baseMipLevel = 0, + .levelCount = VK_REMAINING_MIP_LEVELS, .baseArrayLayer = 0, - .layerCount = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, }, + }; + const VkMemoryBarrier memory_write_barrier{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT, + .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, + }; + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, + 0, read_barrier); + cmdbuf.CopyImageToBuffer(image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, buffer, vk_copies); + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + 0, memory_write_barrier, nullptr, image_write_barrier); + }); +} + +ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewInfo& info, + ImageId image_id_, Image& image) + : VideoCommon::ImageViewBase{info, image.info, image_id_}, device{&runtime.device}, + image_handle{image.Handle()}, image_format{image.info.format}, samples{ConvertSampleCount( + image.info.num_samples)} { + const VkImageAspectFlags aspect_mask = ImageViewAspectMask(info); + std::array<SwizzleSource, 4> swizzle{ + SwizzleSource::R, + SwizzleSource::G, + SwizzleSource::B, + SwizzleSource::A, }; - if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) { - ci.viewType = num_slices > 1 ? VK_IMAGE_VIEW_TYPE_2D_ARRAY : VK_IMAGE_VIEW_TYPE_2D; - ci.subresourceRange.baseArrayLayer = base_slice; - ci.subresourceRange.layerCount = num_slices; - } else { - ci.viewType = image_view_type; - ci.subresourceRange.baseArrayLayer = base_layer; - ci.subresourceRange.layerCount = num_layers; + if (!info.IsRenderTarget()) { + swizzle = info.Swizzle(); + if ((aspect_mask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) != 0) { + std::ranges::transform(swizzle, swizzle.begin(), ConvertGreenRed); + } + } + const auto format_info = MaxwellToVK::SurfaceFormat(*device, FormatType::Optimal, true, format); + const VkFormat vk_format = format_info.format; + const VkImageViewUsageCreateInfo image_view_usage{ + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_USAGE_CREATE_INFO, + .pNext = nullptr, + .usage = ImageUsageFlags(format_info, format), + }; + const VkImageViewCreateInfo create_info{ + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .pNext = &image_view_usage, + .flags = 0, + .image = image.Handle(), + .viewType = VkImageViewType{}, + .format = vk_format, + .components{ + .r = ComponentSwizzle(swizzle[0]), + .g = ComponentSwizzle(swizzle[1]), + .b = ComponentSwizzle(swizzle[2]), + .a = ComponentSwizzle(swizzle[3]), + }, + .subresourceRange = MakeSubresourceRange(aspect_mask, info.range), + }; + const auto create = [&](VideoCommon::ImageViewType view_type, std::optional<u32> num_layers) { + VkImageViewCreateInfo ci{create_info}; + ci.viewType = ImageViewType(view_type); + if (num_layers) { + ci.subresourceRange.layerCount = *num_layers; + } + vk::ImageView handle = device->GetLogical().CreateImageView(ci); + if (device->HasDebuggingToolAttached()) { + handle.SetObjectNameEXT(VideoCommon::Name(*this, view_type).c_str()); + } + image_views[static_cast<size_t>(view_type)] = std::move(handle); + }; + switch (info.type) { + case VideoCommon::ImageViewType::e1D: + case VideoCommon::ImageViewType::e1DArray: + create(VideoCommon::ImageViewType::e1D, 1); + create(VideoCommon::ImageViewType::e1DArray, std::nullopt); + render_target = Handle(VideoCommon::ImageViewType::e1DArray); + break; + case VideoCommon::ImageViewType::e2D: + case VideoCommon::ImageViewType::e2DArray: + create(VideoCommon::ImageViewType::e2D, 1); + create(VideoCommon::ImageViewType::e2DArray, std::nullopt); + render_target = Handle(VideoCommon::ImageViewType::e2DArray); + break; + case VideoCommon::ImageViewType::e3D: + create(VideoCommon::ImageViewType::e3D, std::nullopt); + render_target = Handle(VideoCommon::ImageViewType::e3D); + break; + case VideoCommon::ImageViewType::Cube: + case VideoCommon::ImageViewType::CubeArray: + create(VideoCommon::ImageViewType::Cube, 6); + create(VideoCommon::ImageViewType::CubeArray, std::nullopt); + break; + case VideoCommon::ImageViewType::Rect: + UNIMPLEMENTED(); + break; + case VideoCommon::ImageViewType::Buffer: + buffer_view = device->GetLogical().CreateBufferView(VkBufferViewCreateInfo{ + .sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .buffer = image.Buffer(), + .format = vk_format, + .offset = 0, // TODO: Redesign buffer cache to support this + .range = image.guest_size_bytes, + }); + break; } - render_target = device.GetLogical().CreateImageView(ci); - return *render_target; } -VKTextureCache::VKTextureCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - const VKDevice& device, VKResourceManager& resource_manager, - VKMemoryManager& memory_manager, VKScheduler& scheduler, - VKStagingBufferPool& staging_pool) - : TextureCache(system, rasterizer, device.IsOptimalAstcSupported()), device{device}, - resource_manager{resource_manager}, memory_manager{memory_manager}, scheduler{scheduler}, - staging_pool{staging_pool} {} - -VKTextureCache::~VKTextureCache() = default; +ImageView::ImageView(TextureCacheRuntime&, const VideoCommon::NullImageParams& params) + : VideoCommon::ImageViewBase{params} {} -Surface VKTextureCache::CreateSurface(GPUVAddr gpu_addr, const SurfaceParams& params) { - return std::make_shared<CachedSurface>(system, device, resource_manager, memory_manager, - scheduler, staging_pool, gpu_addr, params); +VkImageView ImageView::DepthView() { + if (depth_view) { + return *depth_view; + } + depth_view = MakeDepthStencilView(VK_IMAGE_ASPECT_DEPTH_BIT); + return *depth_view; } -void VKTextureCache::ImageCopy(Surface& src_surface, Surface& dst_surface, - const VideoCommon::CopyParams& copy_params) { - const bool src_3d = src_surface->GetSurfaceParams().target == SurfaceTarget::Texture3D; - const bool dst_3d = dst_surface->GetSurfaceParams().target == SurfaceTarget::Texture3D; - UNIMPLEMENTED_IF(src_3d); - - // The texture cache handles depth in OpenGL terms, we have to handle it as subresource and - // dimension respectively. - const u32 dst_base_layer = dst_3d ? 0 : copy_params.dest_z; - const u32 dst_offset_z = dst_3d ? copy_params.dest_z : 0; - - const u32 extent_z = dst_3d ? copy_params.depth : 1; - const u32 num_layers = dst_3d ? 1 : copy_params.depth; - - // We can't copy inside a renderpass - scheduler.RequestOutsideRenderPassOperationContext(); - - src_surface->Transition(copy_params.source_z, copy_params.depth, copy_params.source_level, 1, - VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_READ_BIT, - VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL); - dst_surface->Transition(dst_base_layer, num_layers, copy_params.dest_level, 1, - VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); - - const VkImageCopy copy{ - .srcSubresource = - { - .aspectMask = src_surface->GetAspectMask(), - .mipLevel = copy_params.source_level, - .baseArrayLayer = copy_params.source_z, - .layerCount = num_layers, - }, - .srcOffset = - { - .x = static_cast<s32>(copy_params.source_x), - .y = static_cast<s32>(copy_params.source_y), - .z = 0, - }, - .dstSubresource = - { - .aspectMask = dst_surface->GetAspectMask(), - .mipLevel = copy_params.dest_level, - .baseArrayLayer = dst_base_layer, - .layerCount = num_layers, - }, - .dstOffset = - { - .x = static_cast<s32>(copy_params.dest_x), - .y = static_cast<s32>(copy_params.dest_y), - .z = static_cast<s32>(dst_offset_z), - }, - .extent = - { - .width = copy_params.width, - .height = copy_params.height, - .depth = extent_z, - }, - }; +VkImageView ImageView::StencilView() { + if (stencil_view) { + return *stencil_view; + } + stencil_view = MakeDepthStencilView(VK_IMAGE_ASPECT_STENCIL_BIT); + return *stencil_view; +} - const VkImage src_image = src_surface->GetImageHandle(); - const VkImage dst_image = dst_surface->GetImageHandle(); - scheduler.Record([src_image, dst_image, copy](vk::CommandBuffer cmdbuf) { - cmdbuf.CopyImage(src_image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, dst_image, - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, copy); +vk::ImageView ImageView::MakeDepthStencilView(VkImageAspectFlags aspect_mask) { + return device->GetLogical().CreateImageView({ + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .image = image_handle, + .viewType = ImageViewType(type), + .format = MaxwellToVK::SurfaceFormat(*device, FormatType::Optimal, true, format).format, + .components{ + .r = VK_COMPONENT_SWIZZLE_IDENTITY, + .g = VK_COMPONENT_SWIZZLE_IDENTITY, + .b = VK_COMPONENT_SWIZZLE_IDENTITY, + .a = VK_COMPONENT_SWIZZLE_IDENTITY, + }, + .subresourceRange = MakeSubresourceRange(aspect_mask, range), }); } -void VKTextureCache::ImageBlit(View& src_view, View& dst_view, - const Tegra::Engines::Fermi2D::Config& copy_config) { - // We can't blit inside a renderpass - scheduler.RequestOutsideRenderPassOperationContext(); - - src_view->Transition(VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_ACCESS_TRANSFER_READ_BIT); - dst_view->Transition(VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_ACCESS_TRANSFER_WRITE_BIT); - - VkImageBlit blit; - blit.srcSubresource = src_view->GetImageSubresourceLayers(); - blit.srcOffsets[0].x = copy_config.src_rect.left; - blit.srcOffsets[0].y = copy_config.src_rect.top; - blit.srcOffsets[0].z = 0; - blit.srcOffsets[1].x = copy_config.src_rect.right; - blit.srcOffsets[1].y = copy_config.src_rect.bottom; - blit.srcOffsets[1].z = 1; - blit.dstSubresource = dst_view->GetImageSubresourceLayers(); - blit.dstOffsets[0].x = copy_config.dst_rect.left; - blit.dstOffsets[0].y = copy_config.dst_rect.top; - blit.dstOffsets[0].z = 0; - blit.dstOffsets[1].x = copy_config.dst_rect.right; - blit.dstOffsets[1].y = copy_config.dst_rect.bottom; - blit.dstOffsets[1].z = 1; - - const bool is_linear = copy_config.filter == Tegra::Engines::Fermi2D::Filter::Linear; - - scheduler.Record([src_image = src_view->GetImage(), dst_image = dst_view->GetImage(), blit, - is_linear](vk::CommandBuffer cmdbuf) { - cmdbuf.BlitImage(src_image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, dst_image, - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, blit, - is_linear ? VK_FILTER_LINEAR : VK_FILTER_NEAREST); +Sampler::Sampler(TextureCacheRuntime& runtime, const Tegra::Texture::TSCEntry& tsc) { + const auto& device = runtime.device; + const bool arbitrary_borders = runtime.device.IsExtCustomBorderColorSupported(); + const std::array<float, 4> color = tsc.BorderColor(); + // C++20 bit_cast + VkClearColorValue border_color; + std::memcpy(&border_color, &color, sizeof(color)); + const VkSamplerCustomBorderColorCreateInfoEXT border_ci{ + .sType = VK_STRUCTURE_TYPE_SAMPLER_CUSTOM_BORDER_COLOR_CREATE_INFO_EXT, + .pNext = nullptr, + .customBorderColor = border_color, + .format = VK_FORMAT_UNDEFINED, + }; + const void* pnext = nullptr; + if (arbitrary_borders) { + pnext = &border_ci; + } + const VkSamplerReductionModeCreateInfoEXT reduction_ci{ + .sType = VK_STRUCTURE_TYPE_SAMPLER_REDUCTION_MODE_CREATE_INFO_EXT, + .pNext = pnext, + .reductionMode = MaxwellToVK::SamplerReduction(tsc.reduction_filter), + }; + if (runtime.device.IsExtSamplerFilterMinmaxSupported()) { + pnext = &reduction_ci; + } else if (reduction_ci.reductionMode != VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE_EXT) { + LOG_WARNING(Render_Vulkan, "VK_EXT_sampler_filter_minmax is required"); + } + // Some games have samplers with garbage. Sanitize them here. + const float max_anisotropy = std::clamp(tsc.MaxAnisotropy(), 1.0f, 16.0f); + sampler = device.GetLogical().CreateSampler(VkSamplerCreateInfo{ + .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO, + .pNext = pnext, + .flags = 0, + .magFilter = MaxwellToVK::Sampler::Filter(tsc.mag_filter), + .minFilter = MaxwellToVK::Sampler::Filter(tsc.min_filter), + .mipmapMode = MaxwellToVK::Sampler::MipmapMode(tsc.mipmap_filter), + .addressModeU = MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_u, tsc.mag_filter), + .addressModeV = MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_v, tsc.mag_filter), + .addressModeW = MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_p, tsc.mag_filter), + .mipLodBias = tsc.LodBias(), + .anisotropyEnable = static_cast<VkBool32>(max_anisotropy > 1.0f ? VK_TRUE : VK_FALSE), + .maxAnisotropy = max_anisotropy, + .compareEnable = tsc.depth_compare_enabled, + .compareOp = MaxwellToVK::Sampler::DepthCompareFunction(tsc.depth_compare_func), + .minLod = tsc.mipmap_filter == TextureMipmapFilter::None ? 0.0f : tsc.MinLod(), + .maxLod = tsc.mipmap_filter == TextureMipmapFilter::None ? 0.25f : tsc.MaxLod(), + .borderColor = + arbitrary_borders ? VK_BORDER_COLOR_INT_CUSTOM_EXT : ConvertBorderColor(color), + .unnormalizedCoordinates = VK_FALSE, }); } -void VKTextureCache::BufferCopy(Surface& src_surface, Surface& dst_surface) { - // Currently unimplemented. PBO copies should be dropped and we should use a render pass to - // convert from color to depth and viceversa. - LOG_WARNING(Render_Vulkan, "Unimplemented"); +Framebuffer::Framebuffer(TextureCacheRuntime& runtime, std::span<ImageView*, NUM_RT> color_buffers, + ImageView* depth_buffer, const VideoCommon::RenderTargets& key) { + std::vector<VkAttachmentDescription> descriptions; + std::vector<VkImageView> attachments; + RenderPassKey renderpass_key{}; + s32 num_layers = 1; + + for (size_t index = 0; index < NUM_RT; ++index) { + const ImageView* const color_buffer = color_buffers[index]; + if (!color_buffer) { + renderpass_key.color_formats[index] = PixelFormat::Invalid; + continue; + } + descriptions.push_back(AttachmentDescription(runtime.device, color_buffer)); + attachments.push_back(color_buffer->RenderTarget()); + renderpass_key.color_formats[index] = color_buffer->format; + num_layers = std::max(num_layers, color_buffer->range.extent.layers); + images[num_images] = color_buffer->ImageHandle(); + image_ranges[num_images] = MakeSubresourceRange(color_buffer); + samples = color_buffer->Samples(); + ++num_images; + } + const size_t num_colors = attachments.size(); + const VkAttachmentReference* depth_attachment = + depth_buffer ? &ATTACHMENT_REFERENCES[num_colors] : nullptr; + if (depth_buffer) { + descriptions.push_back(AttachmentDescription(runtime.device, depth_buffer)); + attachments.push_back(depth_buffer->RenderTarget()); + renderpass_key.depth_format = depth_buffer->format; + num_layers = std::max(num_layers, depth_buffer->range.extent.layers); + images[num_images] = depth_buffer->ImageHandle(); + image_ranges[num_images] = MakeSubresourceRange(depth_buffer); + samples = depth_buffer->Samples(); + ++num_images; + } else { + renderpass_key.depth_format = PixelFormat::Invalid; + } + renderpass_key.samples = samples; + + const auto& device = runtime.device.GetLogical(); + const auto [cache_pair, is_new] = runtime.renderpass_cache.try_emplace(renderpass_key); + if (is_new) { + const VkSubpassDescription subpass{ + .flags = 0, + .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, + .inputAttachmentCount = 0, + .pInputAttachments = nullptr, + .colorAttachmentCount = static_cast<u32>(num_colors), + .pColorAttachments = num_colors != 0 ? ATTACHMENT_REFERENCES.data() : nullptr, + .pResolveAttachments = nullptr, + .pDepthStencilAttachment = depth_attachment, + .preserveAttachmentCount = 0, + .pPreserveAttachments = nullptr, + }; + cache_pair->second = device.CreateRenderPass(VkRenderPassCreateInfo{ + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .attachmentCount = static_cast<u32>(descriptions.size()), + .pAttachments = descriptions.data(), + .subpassCount = 1, + .pSubpasses = &subpass, + .dependencyCount = 0, + .pDependencies = nullptr, + }); + } + renderpass = *cache_pair->second; + render_area = VkExtent2D{ + .width = key.size.width, + .height = key.size.height, + }; + num_color_buffers = static_cast<u32>(num_colors); + framebuffer = device.CreateFramebuffer(VkFramebufferCreateInfo{ + .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .renderPass = renderpass, + .attachmentCount = static_cast<u32>(attachments.size()), + .pAttachments = attachments.data(), + .width = key.size.width, + .height = key.size.height, + .layers = static_cast<u32>(std::max(num_layers, 1)), + }); + if (runtime.device.HasDebuggingToolAttached()) { + framebuffer.SetObjectNameEXT(VideoCommon::Name(key).c_str()); + } } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index 807e26c8a..b08c23459 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -4,225 +4,253 @@ #pragma once -#include <memory> -#include <unordered_map> - -#include "common/common_types.h" -#include "video_core/renderer_vulkan/vk_image.h" -#include "video_core/renderer_vulkan/vk_memory_manager.h" -#include "video_core/renderer_vulkan/vk_scheduler.h" -#include "video_core/renderer_vulkan/wrapper.h" -#include "video_core/texture_cache/surface_base.h" -#include "video_core/texture_cache/texture_cache.h" - -namespace Core { -class System; -} +#include <compare> +#include <span> -namespace VideoCore { -class RasterizerInterface; -} +#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" +#include "video_core/texture_cache/texture_cache.h" +#include "video_core/vulkan_common/vulkan_memory_allocator.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { -class RasterizerVulkan; -class VKDevice; -class VKResourceManager; +using VideoCommon::ImageId; +using VideoCommon::NUM_RT; +using VideoCommon::Offset2D; +using VideoCommon::RenderTargets; +using VideoCore::Surface::PixelFormat; + +class BlitImageHelper; +class Device; +class Image; +class ImageView; +class Framebuffer; +class StagingBufferPool; class VKScheduler; -class VKStagingBufferPool; -class CachedSurfaceView; -class CachedSurface; +struct RenderPassKey { + constexpr auto operator<=>(const RenderPassKey&) const noexcept = default; -using Surface = std::shared_ptr<CachedSurface>; -using View = std::shared_ptr<CachedSurfaceView>; -using TextureCacheBase = VideoCommon::TextureCache<Surface, View>; + std::array<PixelFormat, NUM_RT> color_formats; + PixelFormat depth_format; + VkSampleCountFlagBits samples; +}; -using VideoCommon::SurfaceParams; -using VideoCommon::ViewParams; +} // namespace Vulkan -class CachedSurface final : public VideoCommon::SurfaceBase<View> { - friend CachedSurfaceView; +namespace std { +template <> +struct hash<Vulkan::RenderPassKey> { + [[nodiscard]] constexpr size_t operator()(const Vulkan::RenderPassKey& key) const noexcept { + size_t value = static_cast<size_t>(key.depth_format) << 48; + value ^= static_cast<size_t>(key.samples) << 52; + for (size_t i = 0; i < key.color_formats.size(); ++i) { + value ^= static_cast<size_t>(key.color_formats[i]) << (i * 6); + } + return value; + } +}; +} // namespace std -public: - explicit CachedSurface(Core::System& system, const VKDevice& device, - VKResourceManager& resource_manager, VKMemoryManager& memory_manager, - VKScheduler& scheduler, VKStagingBufferPool& staging_pool, - GPUVAddr gpu_addr, const SurfaceParams& params); - ~CachedSurface(); +namespace Vulkan { - void UploadTexture(const std::vector<u8>& staging_buffer) override; - void DownloadTexture(std::vector<u8>& staging_buffer) override; +struct TextureCacheRuntime { + const Device& device; + VKScheduler& scheduler; + MemoryAllocator& memory_allocator; + StagingBufferPool& staging_buffer_pool; + BlitImageHelper& blit_image_helper; + std::unordered_map<RenderPassKey, vk::RenderPass> renderpass_cache{}; - void FullTransition(VkPipelineStageFlags new_stage_mask, VkAccessFlags new_access, - VkImageLayout new_layout) { - image->Transition(0, static_cast<u32>(params.GetNumLayers()), 0, params.num_levels, - new_stage_mask, new_access, new_layout); - } + void Finish(); - void Transition(u32 base_layer, u32 num_layers, u32 base_level, u32 num_levels, - VkPipelineStageFlags new_stage_mask, VkAccessFlags new_access, - VkImageLayout new_layout) { - image->Transition(base_layer, num_layers, base_level, num_levels, new_stage_mask, - new_access, new_layout); - } + [[nodiscard]] StagingBufferRef UploadStagingBuffer(size_t size); - VKImage& GetImage() { - return *image; - } + [[nodiscard]] StagingBufferRef DownloadStagingBuffer(size_t size); - const VKImage& GetImage() const { - return *image; - } + void BlitImage(Framebuffer* dst_framebuffer, ImageView& dst, ImageView& src, + const std::array<Offset2D, 2>& dst_region, + const std::array<Offset2D, 2>& src_region, + Tegra::Engines::Fermi2D::Filter filter, + Tegra::Engines::Fermi2D::Operation operation); + + void CopyImage(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies); - VkImage GetImageHandle() const { - return *image->GetHandle(); + void ConvertImage(Framebuffer* dst, ImageView& dst_view, ImageView& src_view); + + [[nodiscard]] bool CanAccelerateImageUpload(Image&) const noexcept { + return false; } - VkImageAspectFlags GetAspectMask() const { - return image->GetAspectMask(); + void AccelerateImageUpload(Image&, const StagingBufferRef&, + std::span<const VideoCommon::SwizzleParameters>) { + UNREACHABLE(); } - VkBufferView GetBufferViewHandle() const { - return *buffer_view; + void InsertUploadMemoryBarrier() {} + + bool HasBrokenTextureViewFormats() const noexcept { + // No known Vulkan driver has broken image views + return false; } +}; -protected: - void DecorateSurfaceName(); +class Image : public VideoCommon::ImageBase { +public: + explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr, + VAddr cpu_addr); - View CreateView(const ViewParams& params) override; + void UploadMemory(const StagingBufferRef& map, + std::span<const VideoCommon::BufferImageCopy> copies); -private: - void UploadBuffer(const std::vector<u8>& staging_buffer); + void UploadMemory(const StagingBufferRef& map, std::span<const VideoCommon::BufferCopy> copies); - void UploadImage(const std::vector<u8>& staging_buffer); + void DownloadMemory(const StagingBufferRef& map, + std::span<const VideoCommon::BufferImageCopy> copies); - VkBufferImageCopy GetBufferImageCopy(u32 level) const; + [[nodiscard]] VkImage Handle() const noexcept { + return *image; + } - VkImageSubresourceRange GetImageSubresourceRange() const; + [[nodiscard]] VkBuffer Buffer() const noexcept { + return *buffer; + } - Core::System& system; - const VKDevice& device; - VKResourceManager& resource_manager; - VKMemoryManager& memory_manager; - VKScheduler& scheduler; - VKStagingBufferPool& staging_pool; + [[nodiscard]] VkImageCreateFlags AspectMask() const noexcept { + return aspect_mask; + } - std::optional<VKImage> image; +private: + VKScheduler* scheduler; + vk::Image image; vk::Buffer buffer; - vk::BufferView buffer_view; - VKMemoryCommit commit; - - VkFormat format = VK_FORMAT_UNDEFINED; + MemoryCommit commit; + VkImageAspectFlags aspect_mask = 0; + bool initialized = false; }; -class CachedSurfaceView final : public VideoCommon::ViewBase { +class ImageView : public VideoCommon::ImageViewBase { public: - explicit CachedSurfaceView(const VKDevice& device, CachedSurface& surface, - const ViewParams& params); - ~CachedSurfaceView(); + explicit ImageView(TextureCacheRuntime&, const VideoCommon::ImageViewInfo&, ImageId, Image&); + explicit ImageView(TextureCacheRuntime&, const VideoCommon::NullImageParams&); - VkImageView GetImageView(Tegra::Texture::SwizzleSource x_source, - Tegra::Texture::SwizzleSource y_source, - Tegra::Texture::SwizzleSource z_source, - Tegra::Texture::SwizzleSource w_source); + [[nodiscard]] VkImageView DepthView(); - VkImageView GetAttachment(); + [[nodiscard]] VkImageView StencilView(); - bool IsSameSurface(const CachedSurfaceView& rhs) const { - return &surface == &rhs.surface; + [[nodiscard]] VkImageView Handle(VideoCommon::ImageViewType query_type) const noexcept { + return *image_views[static_cast<size_t>(query_type)]; } - u32 GetWidth() const { - return params.GetMipWidth(base_level); + [[nodiscard]] VkBufferView BufferView() const noexcept { + return *buffer_view; } - u32 GetHeight() const { - return params.GetMipHeight(base_level); + [[nodiscard]] VkImage ImageHandle() const noexcept { + return image_handle; } - u32 GetNumLayers() const { - return num_layers; + [[nodiscard]] VkImageView RenderTarget() const noexcept { + return render_target; } - bool IsBufferView() const { - return buffer_view; + [[nodiscard]] PixelFormat ImageFormat() const noexcept { + return image_format; } - VkImage GetImage() const { - return image; + [[nodiscard]] VkSampleCountFlagBits Samples() const noexcept { + return samples; } - VkBufferView GetBufferView() const { - return buffer_view; - } +private: + [[nodiscard]] vk::ImageView MakeDepthStencilView(VkImageAspectFlags aspect_mask); - VkImageSubresourceRange GetImageSubresourceRange() const { - return {aspect_mask, base_level, num_levels, base_layer, num_layers}; - } + const Device* device = nullptr; + std::array<vk::ImageView, VideoCommon::NUM_IMAGE_VIEW_TYPES> image_views; + vk::ImageView depth_view; + vk::ImageView stencil_view; + vk::BufferView buffer_view; + VkImage image_handle = VK_NULL_HANDLE; + VkImageView render_target = VK_NULL_HANDLE; + PixelFormat image_format = PixelFormat::Invalid; + VkSampleCountFlagBits samples = VK_SAMPLE_COUNT_1_BIT; +}; - VkImageSubresourceLayers GetImageSubresourceLayers() const { - return {surface.GetAspectMask(), base_level, base_layer, num_layers}; - } +class ImageAlloc : public VideoCommon::ImageAllocBase {}; - void Transition(VkImageLayout new_layout, VkPipelineStageFlags new_stage_mask, - VkAccessFlags new_access) const { - surface.Transition(base_layer, num_layers, base_level, num_levels, new_stage_mask, - new_access, new_layout); - } +class Sampler { +public: + explicit Sampler(TextureCacheRuntime&, const Tegra::Texture::TSCEntry&); - void MarkAsModified(u64 tick) { - surface.MarkAsModified(true, tick); + [[nodiscard]] VkSampler Handle() const noexcept { + return *sampler; } private: - // Store a copy of these values to avoid double dereference when reading them - const SurfaceParams params; - const VkImage image; - const VkBufferView buffer_view; - const VkImageAspectFlags aspect_mask; - - const VKDevice& device; - CachedSurface& surface; - const u32 base_level; - const u32 num_levels; - const VkImageViewType image_view_type; - u32 base_layer = 0; - u32 num_layers = 0; - u32 base_slice = 0; - u32 num_slices = 0; - - VkImageView last_image_view = nullptr; - u32 last_swizzle = 0; - - vk::ImageView render_target; - std::unordered_map<u32, vk::ImageView> view_cache; + vk::Sampler sampler; }; -class VKTextureCache final : public TextureCacheBase { +class Framebuffer { public: - explicit VKTextureCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - const VKDevice& device, VKResourceManager& resource_manager, - VKMemoryManager& memory_manager, VKScheduler& scheduler, - VKStagingBufferPool& staging_pool); - ~VKTextureCache(); + explicit Framebuffer(TextureCacheRuntime&, std::span<ImageView*, NUM_RT> color_buffers, + ImageView* depth_buffer, const VideoCommon::RenderTargets& key); -private: - Surface CreateSurface(GPUVAddr gpu_addr, const SurfaceParams& params) override; + [[nodiscard]] VkFramebuffer Handle() const noexcept { + return *framebuffer; + } - void ImageCopy(Surface& src_surface, Surface& dst_surface, - const VideoCommon::CopyParams& copy_params) override; + [[nodiscard]] VkRenderPass RenderPass() const noexcept { + return renderpass; + } - void ImageBlit(View& src_view, View& dst_view, - const Tegra::Engines::Fermi2D::Config& copy_config) override; + [[nodiscard]] VkExtent2D RenderArea() const noexcept { + return render_area; + } - void BufferCopy(Surface& src_surface, Surface& dst_surface) override; + [[nodiscard]] VkSampleCountFlagBits Samples() const noexcept { + return samples; + } - const VKDevice& device; - VKResourceManager& resource_manager; - VKMemoryManager& memory_manager; - VKScheduler& scheduler; - VKStagingBufferPool& staging_pool; + [[nodiscard]] u32 NumColorBuffers() const noexcept { + return num_color_buffers; + } + + [[nodiscard]] u32 NumImages() const noexcept { + return num_images; + } + + [[nodiscard]] const std::array<VkImage, 9>& Images() const noexcept { + return images; + } + + [[nodiscard]] const std::array<VkImageSubresourceRange, 9>& ImageRanges() const noexcept { + return image_ranges; + } + +private: + vk::Framebuffer framebuffer; + VkRenderPass renderpass{}; + VkExtent2D render_area{}; + VkSampleCountFlagBits samples = VK_SAMPLE_COUNT_1_BIT; + u32 num_color_buffers = 0; + u32 num_images = 0; + std::array<VkImage, 9> images{}; + std::array<VkImageSubresourceRange, 9> image_ranges{}; }; +struct TextureCacheParams { + static constexpr bool ENABLE_VALIDATION = true; + static constexpr bool FRAMEBUFFER_BLITS = false; + static constexpr bool HAS_EMULATED_COPIES = false; + + using Runtime = Vulkan::TextureCacheRuntime; + using Image = Vulkan::Image; + using ImageAlloc = Vulkan::ImageAlloc; + using ImageView = Vulkan::ImageView; + using Sampler = Vulkan::Sampler; + using Framebuffer = Vulkan::Framebuffer; +}; + +using TextureCache = VideoCommon::TextureCache<TextureCacheParams>; + } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_update_descriptor.cpp b/src/video_core/renderer_vulkan/vk_update_descriptor.cpp index 351c048d2..f99273c6a 100644 --- a/src/video_core/renderer_vulkan/vk_update_descriptor.cpp +++ b/src/video_core/renderer_vulkan/vk_update_descriptor.cpp @@ -7,15 +7,15 @@ #include "common/assert.h" #include "common/logging/log.h" -#include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_update_descriptor.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { -VKUpdateDescriptorQueue::VKUpdateDescriptorQueue(const VKDevice& device, VKScheduler& scheduler) - : device{device}, scheduler{scheduler} {} +VKUpdateDescriptorQueue::VKUpdateDescriptorQueue(const Device& device_, VKScheduler& scheduler_) + : device{device_}, scheduler{scheduler_} {} VKUpdateDescriptorQueue::~VKUpdateDescriptorQueue() = default; diff --git a/src/video_core/renderer_vulkan/vk_update_descriptor.h b/src/video_core/renderer_vulkan/vk_update_descriptor.h index 945320c72..e214f7195 100644 --- a/src/video_core/renderer_vulkan/vk_update_descriptor.h +++ b/src/video_core/renderer_vulkan/vk_update_descriptor.h @@ -8,11 +8,11 @@ #include <boost/container/static_vector.hpp> #include "common/common_types.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { -class VKDevice; +class Device; class VKScheduler; struct DescriptorUpdateEntry { @@ -31,7 +31,7 @@ struct DescriptorUpdateEntry { class VKUpdateDescriptorQueue final { public: - explicit VKUpdateDescriptorQueue(const VKDevice& device, VKScheduler& scheduler); + explicit VKUpdateDescriptorQueue(const Device& device_, VKScheduler& scheduler_); ~VKUpdateDescriptorQueue(); void TickFrame(); @@ -40,32 +40,36 @@ public: void Send(VkDescriptorUpdateTemplateKHR update_template, VkDescriptorSet set); - void AddSampledImage(VkSampler sampler, VkImageView image_view) { - payload.emplace_back(VkDescriptorImageInfo{sampler, image_view, {}}); + void AddSampledImage(VkImageView image_view, VkSampler sampler) { + payload.emplace_back(VkDescriptorImageInfo{ + .sampler = sampler, + .imageView = image_view, + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + }); } void AddImage(VkImageView image_view) { - payload.emplace_back(VkDescriptorImageInfo{{}, image_view, {}}); + payload.emplace_back(VkDescriptorImageInfo{ + .sampler = VK_NULL_HANDLE, + .imageView = image_view, + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + }); } - void AddBuffer(VkBuffer buffer, u64 offset, std::size_t size) { - payload.emplace_back(VkDescriptorBufferInfo{buffer, offset, size}); + void AddBuffer(VkBuffer buffer, u64 offset, size_t size) { + payload.emplace_back(VkDescriptorBufferInfo{ + .buffer = buffer, + .offset = offset, + .range = size, + }); } void AddTexelBuffer(VkBufferView texel_buffer) { payload.emplace_back(texel_buffer); } - VkImageLayout* LastImageLayout() { - return &payload.back().image.imageLayout; - } - - const VkImageLayout* LastImageLayout() const { - return &payload.back().image.imageLayout; - } - private: - const VKDevice& device; + const Device& device; VKScheduler& scheduler; const DescriptorUpdateEntry* upload_start = nullptr; diff --git a/src/video_core/sampler_cache.cpp b/src/video_core/sampler_cache.cpp deleted file mode 100644 index 53c7ef12d..000000000 --- a/src/video_core/sampler_cache.cpp +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include "common/cityhash.h" -#include "common/common_types.h" -#include "video_core/sampler_cache.h" - -namespace VideoCommon { - -std::size_t SamplerCacheKey::Hash() const { - static_assert(sizeof(raw) % sizeof(u64) == 0); - return static_cast<std::size_t>( - Common::CityHash64(reinterpret_cast<const char*>(raw.data()), sizeof(raw) / sizeof(u64))); -} - -bool SamplerCacheKey::operator==(const SamplerCacheKey& rhs) const { - return raw == rhs.raw; -} - -} // namespace VideoCommon diff --git a/src/video_core/sampler_cache.h b/src/video_core/sampler_cache.h deleted file mode 100644 index cbe3ad071..000000000 --- a/src/video_core/sampler_cache.h +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <cstddef> -#include <unordered_map> - -#include "video_core/textures/texture.h" - -namespace VideoCommon { - -struct SamplerCacheKey final : public Tegra::Texture::TSCEntry { - std::size_t Hash() const; - - bool operator==(const SamplerCacheKey& rhs) const; - - bool operator!=(const SamplerCacheKey& rhs) const { - return !operator==(rhs); - } -}; - -} // namespace VideoCommon - -namespace std { - -template <> -struct hash<VideoCommon::SamplerCacheKey> { - std::size_t operator()(const VideoCommon::SamplerCacheKey& k) const noexcept { - return k.Hash(); - } -}; - -} // namespace std - -namespace VideoCommon { - -template <typename SamplerType, typename SamplerStorageType> -class SamplerCache { -public: - SamplerType GetSampler(const Tegra::Texture::TSCEntry& tsc) { - const auto [entry, is_cache_miss] = cache.try_emplace(SamplerCacheKey{tsc}); - auto& sampler = entry->second; - if (is_cache_miss) { - sampler = CreateSampler(tsc); - } - return ToSamplerType(sampler); - } - -protected: - virtual SamplerStorageType CreateSampler(const Tegra::Texture::TSCEntry& tsc) const = 0; - - virtual SamplerType ToSamplerType(const SamplerStorageType& sampler) const = 0; - -private: - std::unordered_map<SamplerCacheKey, SamplerStorageType> cache; -}; - -} // namespace VideoCommon
\ No newline at end of file diff --git a/src/video_core/shader/ast.cpp b/src/video_core/shader/ast.cpp index 3f96d9076..db11144c7 100644 --- a/src/video_core/shader/ast.cpp +++ b/src/video_core/shader/ast.cpp @@ -212,16 +212,15 @@ public: } void operator()(const ExprPredicate& expr) { - inner += "P" + std::to_string(expr.predicate); + inner += fmt::format("P{}", expr.predicate); } void operator()(const ExprCondCode& expr) { - u32 cc = static_cast<u32>(expr.cc); - inner += "CC" + std::to_string(cc); + inner += fmt::format("CC{}", expr.cc); } void operator()(const ExprVar& expr) { - inner += "V" + std::to_string(expr.var_index); + inner += fmt::format("V{}", expr.var_index); } void operator()(const ExprBoolean& expr) { @@ -229,7 +228,7 @@ public: } void operator()(const ExprGprEqual& expr) { - inner += "( gpr_" + std::to_string(expr.gpr) + " == " + std::to_string(expr.value) + ')'; + inner += fmt::format("(gpr_{} == {})", expr.gpr, expr.value); } const std::string& GetResult() const { @@ -374,8 +373,8 @@ std::string ASTManager::Print() const { return printer.GetResult(); } -ASTManager::ASTManager(bool full_decompile, bool disable_else_derivation) - : full_decompile{full_decompile}, disable_else_derivation{disable_else_derivation} {}; +ASTManager::ASTManager(bool do_full_decompile, bool disable_else_derivation_) + : full_decompile{do_full_decompile}, disable_else_derivation{disable_else_derivation_} {} ASTManager::~ASTManager() { Clear(); diff --git a/src/video_core/shader/ast.h b/src/video_core/shader/ast.h index cca13bcde..dc49b369e 100644 --- a/src/video_core/shader/ast.h +++ b/src/video_core/shader/ast.h @@ -76,7 +76,7 @@ public: class ASTIfThen { public: - explicit ASTIfThen(Expr condition) : condition{std::move(condition)} {} + explicit ASTIfThen(Expr condition_) : condition{std::move(condition_)} {} Expr condition; ASTZipper nodes{}; }; @@ -88,63 +88,68 @@ public: class ASTBlockEncoded { public: - explicit ASTBlockEncoded(u32 start, u32 end) : start{start}, end{end} {} + explicit ASTBlockEncoded(u32 start_, u32 _) : start{start_}, end{_} {} u32 start; u32 end; }; class ASTBlockDecoded { public: - explicit ASTBlockDecoded(NodeBlock&& new_nodes) : nodes(std::move(new_nodes)) {} + explicit ASTBlockDecoded(NodeBlock&& new_nodes_) : nodes(std::move(new_nodes_)) {} NodeBlock nodes; }; class ASTVarSet { public: - explicit ASTVarSet(u32 index, Expr condition) : index{index}, condition{std::move(condition)} {} + explicit ASTVarSet(u32 index_, Expr condition_) + : index{index_}, condition{std::move(condition_)} {} + u32 index; Expr condition; }; class ASTLabel { public: - explicit ASTLabel(u32 index) : index{index} {} + explicit ASTLabel(u32 index_) : index{index_} {} u32 index; bool unused{}; }; class ASTGoto { public: - explicit ASTGoto(Expr condition, u32 label) : condition{std::move(condition)}, label{label} {} + explicit ASTGoto(Expr condition_, u32 label_) + : condition{std::move(condition_)}, label{label_} {} + Expr condition; u32 label; }; class ASTDoWhile { public: - explicit ASTDoWhile(Expr condition) : condition{std::move(condition)} {} + explicit ASTDoWhile(Expr condition_) : condition{std::move(condition_)} {} Expr condition; ASTZipper nodes{}; }; class ASTReturn { public: - explicit ASTReturn(Expr condition, bool kills) - : condition{std::move(condition)}, kills{kills} {} + explicit ASTReturn(Expr condition_, bool kills_) + : condition{std::move(condition_)}, kills{kills_} {} + Expr condition; bool kills; }; class ASTBreak { public: - explicit ASTBreak(Expr condition) : condition{std::move(condition)} {} + explicit ASTBreak(Expr condition_) : condition{std::move(condition_)} {} Expr condition; }; class ASTBase { public: - explicit ASTBase(ASTNode parent, ASTData data) - : data{std::move(data)}, parent{std::move(parent)} {} + explicit ASTBase(ASTNode parent_, ASTData data_) + : data{std::move(data_)}, parent{std::move(parent_)} {} template <class U, class... Args> static ASTNode Make(ASTNode parent, Args&&... args) { @@ -199,55 +204,48 @@ public: } std::optional<u32> GetGotoLabel() const { - auto inner = std::get_if<ASTGoto>(&data); - if (inner) { + if (const auto* inner = std::get_if<ASTGoto>(&data)) { return {inner->label}; } - return {}; + return std::nullopt; } Expr GetGotoCondition() const { - auto inner = std::get_if<ASTGoto>(&data); - if (inner) { + if (const auto* inner = std::get_if<ASTGoto>(&data)) { return inner->condition; } return nullptr; } void MarkLabelUnused() { - auto inner = std::get_if<ASTLabel>(&data); - if (inner) { + if (auto* inner = std::get_if<ASTLabel>(&data)) { inner->unused = true; } } bool IsLabelUnused() const { - auto inner = std::get_if<ASTLabel>(&data); - if (inner) { + if (const auto* inner = std::get_if<ASTLabel>(&data)) { return inner->unused; } return true; } std::optional<u32> GetLabelIndex() const { - auto inner = std::get_if<ASTLabel>(&data); - if (inner) { + if (const auto* inner = std::get_if<ASTLabel>(&data)) { return {inner->index}; } - return {}; + return std::nullopt; } Expr GetIfCondition() const { - auto inner = std::get_if<ASTIfThen>(&data); - if (inner) { + if (const auto* inner = std::get_if<ASTIfThen>(&data)) { return inner->condition; } return nullptr; } void SetGotoCondition(Expr new_condition) { - auto inner = std::get_if<ASTGoto>(&data); - if (inner) { + if (auto* inner = std::get_if<ASTGoto>(&data)) { inner->condition = std::move(new_condition); } } @@ -307,7 +305,7 @@ private: class ASTManager final { public: - ASTManager(bool full_decompile, bool disable_else_derivation); + explicit ASTManager(bool do_full_decompile, bool disable_else_derivation_); ~ASTManager(); ASTManager(const ASTManager& o) = delete; diff --git a/src/video_core/shader/async_shaders.cpp b/src/video_core/shader/async_shaders.cpp index f815584f7..02adcf9c7 100644 --- a/src/video_core/shader/async_shaders.cpp +++ b/src/video_core/shader/async_shaders.cpp @@ -13,21 +13,22 @@ namespace VideoCommon::Shader { -AsyncShaders::AsyncShaders(Core::Frontend::EmuWindow& emu_window) : emu_window(emu_window) {} +AsyncShaders::AsyncShaders(Core::Frontend::EmuWindow& emu_window_) : emu_window(emu_window_) {} AsyncShaders::~AsyncShaders() { KillWorkers(); } void AsyncShaders::AllocateWorkers() { - // Max worker threads we should allow - constexpr u32 MAX_THREADS = 4; - // Deduce how many threads we can use - const u32 threads_used = std::thread::hardware_concurrency() / 4; - // Always allow at least 1 thread regardless of our settings - const auto max_worker_count = std::max(1U, threads_used); - // Don't use more than MAX_THREADS - const auto num_workers = std::min(max_worker_count, MAX_THREADS); + // Use at least one thread + u32 num_workers = 1; + + // Deduce how many more threads we can use + const u32 thread_count = std::thread::hardware_concurrency(); + if (thread_count >= 8) { + // Increase async workers by 1 for every 2 threads >= 8 + num_workers += 1 + (thread_count - 8) / 2; + } // If we already have workers queued, ignore if (num_workers == worker_threads.size()) { @@ -42,8 +43,8 @@ void AsyncShaders::AllocateWorkers() { // Create workers for (std::size_t i = 0; i < num_workers; i++) { context_list.push_back(emu_window.CreateSharedContext()); - worker_threads.push_back( - std::thread(&AsyncShaders::ShaderCompilerThread, this, context_list[i].get())); + worker_threads.emplace_back(&AsyncShaders::ShaderCompilerThread, this, + context_list[i].get()); } } @@ -63,6 +64,7 @@ void AsyncShaders::FreeWorkers() { void AsyncShaders::KillWorkers() { is_thread_exiting.store(true); + cv.notify_all(); for (auto& thread : worker_threads) { thread.detach(); } @@ -73,11 +75,11 @@ void AsyncShaders::KillWorkers() { worker_threads.clear(); } -bool AsyncShaders::HasWorkQueued() { +bool AsyncShaders::HasWorkQueued() const { return !pending_queue.empty(); } -bool AsyncShaders::HasCompletedWork() { +bool AsyncShaders::HasCompletedWork() const { std::shared_lock lock{completed_mutex}; return !finished_work.empty(); } @@ -102,11 +104,10 @@ bool AsyncShaders::IsShaderAsync(const Tegra::GPU& gpu) const { } std::vector<AsyncShaders::Result> AsyncShaders::GetCompletedWork() { - std::vector<AsyncShaders::Result> results; + std::vector<Result> results; { std::unique_lock lock{completed_mutex}; - results.assign(std::make_move_iterator(finished_work.begin()), - std::make_move_iterator(finished_work.end())); + results = std::move(finished_work); finished_work.clear(); } return results; @@ -115,11 +116,10 @@ std::vector<AsyncShaders::Result> AsyncShaders::GetCompletedWork() { void AsyncShaders::QueueOpenGLShader(const OpenGL::Device& device, Tegra::Engines::ShaderType shader_type, u64 uid, std::vector<u64> code, std::vector<u64> code_b, - u32 main_offset, - VideoCommon::Shader::CompilerSettings compiler_settings, - const VideoCommon::Shader::Registry& registry, - VAddr cpu_addr) { - WorkerParams params{ + u32 main_offset, CompilerSettings compiler_settings, + const Registry& registry, VAddr cpu_addr) { + std::unique_lock lock(queue_mutex); + pending_queue.push({ .backend = device.UseAssemblyShaders() ? Backend::GLASM : Backend::OpenGL, .device = &device, .shader_type = shader_type, @@ -130,35 +130,48 @@ void AsyncShaders::QueueOpenGLShader(const OpenGL::Device& device, .compiler_settings = compiler_settings, .registry = registry, .cpu_address = cpu_addr, - }; - std::unique_lock lock(queue_mutex); - pending_queue.push(std::move(params)); + .pp_cache = nullptr, + .vk_device = nullptr, + .scheduler = nullptr, + .descriptor_pool = nullptr, + .update_descriptor_queue = nullptr, + .bindings{}, + .program{}, + .key{}, + .num_color_buffers = 0, + }); cv.notify_one(); } void AsyncShaders::QueueVulkanShader(Vulkan::VKPipelineCache* pp_cache, - const Vulkan::VKDevice& device, Vulkan::VKScheduler& scheduler, + const Vulkan::Device& device, Vulkan::VKScheduler& scheduler, Vulkan::VKDescriptorPool& descriptor_pool, Vulkan::VKUpdateDescriptorQueue& update_descriptor_queue, - Vulkan::VKRenderPassCache& renderpass_cache, std::vector<VkDescriptorSetLayoutBinding> bindings, Vulkan::SPIRVProgram program, - Vulkan::GraphicsPipelineCacheKey key) { - WorkerParams params{ + Vulkan::GraphicsPipelineCacheKey key, u32 num_color_buffers) { + std::unique_lock lock(queue_mutex); + pending_queue.push({ .backend = Backend::Vulkan, + .device = nullptr, + .shader_type{}, + .uid = 0, + .code{}, + .code_b{}, + .main_offset = 0, + .compiler_settings{}, + .registry{}, + .cpu_address = 0, .pp_cache = pp_cache, .vk_device = &device, .scheduler = &scheduler, .descriptor_pool = &descriptor_pool, .update_descriptor_queue = &update_descriptor_queue, - .renderpass_cache = &renderpass_cache, - .bindings = bindings, - .program = program, + .bindings = std::move(bindings), + .program = std::move(program), .key = key, - }; - - std::unique_lock lock(queue_mutex); - pending_queue.push(std::move(params)); + .num_color_buffers = num_color_buffers, + }); cv.notify_one(); } @@ -210,8 +223,8 @@ void AsyncShaders::ShaderCompilerThread(Core::Frontend::GraphicsContext* context } else if (work.backend == Backend::Vulkan) { auto pipeline = std::make_unique<Vulkan::VKGraphicsPipeline>( *work.vk_device, *work.scheduler, *work.descriptor_pool, - *work.update_descriptor_queue, *work.renderpass_cache, work.key, work.bindings, - work.program); + *work.update_descriptor_queue, work.key, work.bindings, work.program, + work.num_color_buffers); work.pp_cache->EmplacePipeline(std::move(pipeline)); } diff --git a/src/video_core/shader/async_shaders.h b/src/video_core/shader/async_shaders.h index d5ae814d5..7fdff6e56 100644 --- a/src/video_core/shader/async_shaders.h +++ b/src/video_core/shader/async_shaders.h @@ -5,19 +5,19 @@ #pragma once #include <condition_variable> -#include <deque> #include <memory> #include <shared_mutex> #include <thread> -#include "common/bit_field.h" + +#include <glad/glad.h> + #include "common/common_types.h" #include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_shader_decompiler.h" -#include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_pipeline_cache.h" #include "video_core/renderer_vulkan/vk_scheduler.h" -#include "video_core/renderer_vulkan/vk_update_descriptor.h" +#include "video_core/vulkan_common/vulkan_device.h" namespace Core::Frontend { class EmuWindow; @@ -57,7 +57,7 @@ public: Tegra::Engines::ShaderType shader_type; }; - explicit AsyncShaders(Core::Frontend::EmuWindow& emu_window); + explicit AsyncShaders(Core::Frontend::EmuWindow& emu_window_); ~AsyncShaders(); /// Start up shader worker threads @@ -70,34 +70,34 @@ public: void KillWorkers(); /// Check to see if any shaders have actually been compiled - bool HasCompletedWork(); + [[nodiscard]] bool HasCompletedWork() const; /// Deduce if a shader can be build on another thread of MUST be built in sync. We cannot build /// every shader async as some shaders are only built and executed once. We try to "guess" which /// shader would be used only once - bool IsShaderAsync(const Tegra::GPU& gpu) const; + [[nodiscard]] bool IsShaderAsync(const Tegra::GPU& gpu) const; /// Pulls completed compiled shaders - std::vector<Result> GetCompletedWork(); + [[nodiscard]] std::vector<Result> GetCompletedWork(); void QueueOpenGLShader(const OpenGL::Device& device, Tegra::Engines::ShaderType shader_type, u64 uid, std::vector<u64> code, std::vector<u64> code_b, u32 main_offset, - VideoCommon::Shader::CompilerSettings compiler_settings, - const VideoCommon::Shader::Registry& registry, VAddr cpu_addr); + CompilerSettings compiler_settings, const Registry& registry, + VAddr cpu_addr); - void QueueVulkanShader(Vulkan::VKPipelineCache* pp_cache, const Vulkan::VKDevice& device, + void QueueVulkanShader(Vulkan::VKPipelineCache* pp_cache, const Vulkan::Device& device, Vulkan::VKScheduler& scheduler, Vulkan::VKDescriptorPool& descriptor_pool, Vulkan::VKUpdateDescriptorQueue& update_descriptor_queue, - Vulkan::VKRenderPassCache& renderpass_cache, std::vector<VkDescriptorSetLayoutBinding> bindings, - Vulkan::SPIRVProgram program, Vulkan::GraphicsPipelineCacheKey key); + Vulkan::SPIRVProgram program, Vulkan::GraphicsPipelineCacheKey key, + u32 num_color_buffers); private: void ShaderCompilerThread(Core::Frontend::GraphicsContext* context); /// Check our worker queue to see if we have any work queued already - bool HasWorkQueued(); + [[nodiscard]] bool HasWorkQueued() const; struct WorkerParams { Backend backend; @@ -108,30 +108,30 @@ private: std::vector<u64> code; std::vector<u64> code_b; u32 main_offset; - VideoCommon::Shader::CompilerSettings compiler_settings; - std::optional<VideoCommon::Shader::Registry> registry; + CompilerSettings compiler_settings; + std::optional<Registry> registry; VAddr cpu_address; // For Vulkan Vulkan::VKPipelineCache* pp_cache; - const Vulkan::VKDevice* vk_device; + const Vulkan::Device* vk_device; Vulkan::VKScheduler* scheduler; Vulkan::VKDescriptorPool* descriptor_pool; Vulkan::VKUpdateDescriptorQueue* update_descriptor_queue; - Vulkan::VKRenderPassCache* renderpass_cache; std::vector<VkDescriptorSetLayoutBinding> bindings; Vulkan::SPIRVProgram program; Vulkan::GraphicsPipelineCacheKey key; + u32 num_color_buffers; }; std::condition_variable cv; - std::mutex queue_mutex; - std::shared_mutex completed_mutex; + mutable std::mutex queue_mutex; + mutable std::shared_mutex completed_mutex; std::atomic<bool> is_thread_exiting{}; std::vector<std::unique_ptr<Core::Frontend::GraphicsContext>> context_list; std::vector<std::thread> worker_threads; std::queue<WorkerParams> pending_queue; - std::vector<AsyncShaders::Result> finished_work; + std::vector<Result> finished_work; Core::Frontend::EmuWindow& emu_window; }; diff --git a/src/video_core/shader/control_flow.cpp b/src/video_core/shader/control_flow.cpp index 336397cdb..43d965f2f 100644 --- a/src/video_core/shader/control_flow.cpp +++ b/src/video_core/shader/control_flow.cpp @@ -66,8 +66,8 @@ struct BlockInfo { }; struct CFGRebuildState { - explicit CFGRebuildState(const ProgramCode& program_code, u32 start, Registry& registry) - : program_code{program_code}, registry{registry}, start{start} {} + explicit CFGRebuildState(const ProgramCode& program_code_, u32 start_, Registry& registry_) + : program_code{program_code_}, registry{registry_}, start{start_} {} const ProgramCode& program_code; Registry& registry; @@ -241,10 +241,10 @@ std::pair<ParseResult, ParseInfo> ParseCode(CFGRebuildState& state, u32 address) ParseInfo parse_info{}; SingleBranch single_branch{}; - const auto insert_label = [](CFGRebuildState& state, u32 address) { - const auto pair = state.labels.emplace(address); + const auto insert_label = [](CFGRebuildState& rebuild_state, u32 label_address) { + const auto pair = rebuild_state.labels.emplace(label_address); if (pair.second) { - state.inspect_queries.push_back(address); + rebuild_state.inspect_queries.push_back(label_address); } }; @@ -257,7 +257,7 @@ std::pair<ParseResult, ParseInfo> ParseCode(CFGRebuildState& state, u32 address) single_branch.ignore = false; break; } - if (state.registered.count(offset) != 0) { + if (state.registered.contains(offset)) { single_branch.address = offset; single_branch.ignore = true; break; @@ -547,13 +547,13 @@ bool TryQuery(CFGRebuildState& state) { gather_labels(q2.ssy_stack, state.ssy_labels, block); gather_labels(q2.pbk_stack, state.pbk_labels, block); if (std::holds_alternative<SingleBranch>(*block.branch)) { - const auto branch = std::get_if<SingleBranch>(block.branch.get()); + auto* branch = std::get_if<SingleBranch>(block.branch.get()); if (!branch->condition.IsUnconditional()) { q2.address = block.end + 1; state.queries.push_back(q2); } - Query conditional_query{q2}; + auto& conditional_query = state.queries.emplace_back(q2); if (branch->is_sync) { if (branch->address == unassigned_branch) { branch->address = conditional_query.ssy_stack.top(); @@ -567,21 +567,21 @@ bool TryQuery(CFGRebuildState& state) { conditional_query.pbk_stack.pop(); } conditional_query.address = branch->address; - state.queries.push_back(std::move(conditional_query)); return true; } - const auto multi_branch = std::get_if<MultiBranch>(block.branch.get()); + + const auto* multi_branch = std::get_if<MultiBranch>(block.branch.get()); for (const auto& branch_case : multi_branch->branches) { - Query conditional_query{q2}; + auto& conditional_query = state.queries.emplace_back(q2); conditional_query.address = branch_case.address; - state.queries.push_back(std::move(conditional_query)); } + return true; } void InsertBranch(ASTManager& mm, const BlockBranchInfo& branch_info) { - const auto get_expr = ([&](const Condition& cond) -> Expr { - Expr result{}; + const auto get_expr = [](const Condition& cond) -> Expr { + Expr result; if (cond.cc != ConditionCode::T) { result = MakeExpr<ExprCondCode>(cond.cc); } @@ -594,10 +594,10 @@ void InsertBranch(ASTManager& mm, const BlockBranchInfo& branch_info) { } Expr extra = MakeExpr<ExprPredicate>(pred); if (negate) { - extra = MakeExpr<ExprNot>(extra); + extra = MakeExpr<ExprNot>(std::move(extra)); } if (result) { - return MakeExpr<ExprAnd>(extra, result); + return MakeExpr<ExprAnd>(std::move(extra), std::move(result)); } return extra; } @@ -605,9 +605,10 @@ void InsertBranch(ASTManager& mm, const BlockBranchInfo& branch_info) { return result; } return MakeExpr<ExprBoolean>(true); - }); + }; + if (std::holds_alternative<SingleBranch>(*branch_info)) { - const auto branch = std::get_if<SingleBranch>(branch_info.get()); + const auto* branch = std::get_if<SingleBranch>(branch_info.get()); if (branch->address < 0) { if (branch->kill) { mm.InsertReturn(get_expr(branch->condition), true); @@ -619,7 +620,7 @@ void InsertBranch(ASTManager& mm, const BlockBranchInfo& branch_info) { mm.InsertGoto(get_expr(branch->condition), branch->address); return; } - const auto multi_branch = std::get_if<MultiBranch>(branch_info.get()); + const auto* multi_branch = std::get_if<MultiBranch>(branch_info.get()); for (const auto& branch_case : multi_branch->branches) { mm.InsertGoto(MakeExpr<ExprGprEqual>(multi_branch->gpr, branch_case.cmp_value), branch_case.address); @@ -631,12 +632,12 @@ void DecompileShader(CFGRebuildState& state) { for (auto label : state.labels) { state.manager->DeclareLabel(label); } - for (auto& block : state.block_info) { - if (state.labels.count(block.start) != 0) { + for (const auto& block : state.block_info) { + if (state.labels.contains(block.start)) { state.manager->InsertLabel(block.start); } const bool ignore = BlockBranchIsIgnored(block.branch); - u32 end = ignore ? block.end + 1 : block.end; + const u32 end = ignore ? block.end + 1 : block.end; state.manager->InsertBlock(block.start, end); if (!ignore) { InsertBranch(*state.manager, block.branch); @@ -736,7 +737,7 @@ std::unique_ptr<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, auto back = result_out->blocks.begin(); auto next = std::next(back); while (next != result_out->blocks.end()) { - if (state.labels.count(next->start) == 0 && next->start == back->end + 1) { + if (!state.labels.contains(next->start) && next->start == back->end + 1) { back->end = next->end; next = result_out->blocks.erase(next); continue; diff --git a/src/video_core/shader/control_flow.h b/src/video_core/shader/control_flow.h index 62a3510d8..37bf96492 100644 --- a/src/video_core/shader/control_flow.h +++ b/src/video_core/shader/control_flow.h @@ -42,10 +42,10 @@ struct Condition { class SingleBranch { public: SingleBranch() = default; - SingleBranch(Condition condition, s32 address, bool kill, bool is_sync, bool is_brk, - bool ignore) - : condition{condition}, address{address}, kill{kill}, is_sync{is_sync}, is_brk{is_brk}, - ignore{ignore} {} + explicit SingleBranch(Condition condition_, s32 address_, bool kill_, bool is_sync_, + bool is_brk_, bool ignore_) + : condition{condition_}, address{address_}, kill{kill_}, is_sync{is_sync_}, is_brk{is_brk_}, + ignore{ignore_} {} bool operator==(const SingleBranch& b) const { return std::tie(condition, address, kill, is_sync, is_brk, ignore) == @@ -65,15 +65,15 @@ public: }; struct CaseBranch { - CaseBranch(u32 cmp_value, u32 address) : cmp_value{cmp_value}, address{address} {} + explicit CaseBranch(u32 cmp_value_, u32 address_) : cmp_value{cmp_value_}, address{address_} {} u32 cmp_value; u32 address; }; class MultiBranch { public: - MultiBranch(u32 gpr, std::vector<CaseBranch>&& branches) - : gpr{gpr}, branches{std::move(branches)} {} + explicit MultiBranch(u32 gpr_, std::vector<CaseBranch>&& branches_) + : gpr{gpr_}, branches{std::move(branches_)} {} u32 gpr{}; std::vector<CaseBranch> branches{}; diff --git a/src/video_core/shader/decode.cpp b/src/video_core/shader/decode.cpp index eeac328a6..6576d1208 100644 --- a/src/video_core/shader/decode.cpp +++ b/src/video_core/shader/decode.cpp @@ -25,7 +25,7 @@ using Tegra::Shader::OpCode; namespace { void DeduceTextureHandlerSize(VideoCore::GuestDriverProfile& gpu_driver, - const std::list<Sampler>& used_samplers) { + const std::list<SamplerEntry>& used_samplers) { if (gpu_driver.IsTextureHandlerSizeKnown() || used_samplers.size() <= 1) { return; } @@ -43,9 +43,9 @@ void DeduceTextureHandlerSize(VideoCore::GuestDriverProfile& gpu_driver, } } -std::optional<u32> TryDeduceSamplerSize(const Sampler& sampler_to_deduce, +std::optional<u32> TryDeduceSamplerSize(const SamplerEntry& sampler_to_deduce, VideoCore::GuestDriverProfile& gpu_driver, - const std::list<Sampler>& used_samplers) { + const std::list<SamplerEntry>& used_samplers) { const u32 base_offset = sampler_to_deduce.offset; u32 max_offset{std::numeric_limits<u32>::max()}; for (const auto& sampler : used_samplers) { @@ -66,7 +66,7 @@ std::optional<u32> TryDeduceSamplerSize(const Sampler& sampler_to_deduce, class ASTDecoder { public: - ASTDecoder(ShaderIR& ir) : ir(ir) {} + explicit ASTDecoder(ShaderIR& ir_) : ir(ir_) {} void operator()(ASTProgram& ast) { ASTNode current = ast.nodes.GetFirst(); @@ -153,8 +153,8 @@ void ShaderIR::Decode() { const auto& blocks = shader_info.blocks; NodeBlock current_block; u32 current_label = static_cast<u32>(exit_branch); - for (auto& block : blocks) { - if (shader_info.labels.count(block.start) != 0) { + for (const auto& block : blocks) { + if (shader_info.labels.contains(block.start)) { insert_block(current_block, current_label); current_block.clear(); current_label = block.start; diff --git a/src/video_core/shader/decode/arithmetic.cpp b/src/video_core/shader/decode/arithmetic.cpp index 4db329fa5..15eb700e7 100644 --- a/src/video_core/shader/decode/arithmetic.cpp +++ b/src/video_core/shader/decode/arithmetic.cpp @@ -110,8 +110,7 @@ u32 ShaderIR::DecodeArithmetic(NodeBlock& bb, u32 pc) { case SubOp::Sqrt: return Operation(OperationCode::FSqrt, PRECISE, op_a); default: - UNIMPLEMENTED_MSG("Unhandled MUFU sub op={0:x}", - static_cast<unsigned>(instr.sub_op.Value())); + UNIMPLEMENTED_MSG("Unhandled MUFU sub op={0:x}", instr.sub_op.Value()); return Immediate(0); } }(); @@ -137,7 +136,8 @@ u32 ShaderIR::DecodeArithmetic(NodeBlock& bb, u32 pc) { break; } case OpCode::Id::FCMP_RR: - case OpCode::Id::FCMP_RC: { + case OpCode::Id::FCMP_RC: + case OpCode::Id::FCMP_IMMR: { UNIMPLEMENTED_IF(instr.fcmp.ftz == 0); Node op_c = GetRegister(instr.gpr39); Node comp = GetPredicateComparisonFloat(instr.fcmp.cond, std::move(op_c), Immediate(0.0f)); diff --git a/src/video_core/shader/decode/arithmetic_half.cpp b/src/video_core/shader/decode/arithmetic_half.cpp index a276aee44..88103fede 100644 --- a/src/video_core/shader/decode/arithmetic_half.cpp +++ b/src/video_core/shader/decode/arithmetic_half.cpp @@ -53,6 +53,9 @@ u32 ShaderIR::DecodeArithmeticHalf(NodeBlock& bb, u32 pc) { absolute_a = ((instr.value >> 44) & 1) != 0; absolute_b = ((instr.value >> 54) & 1) != 0; break; + default: + UNREACHABLE(); + break; } Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.alu_half.type_a); diff --git a/src/video_core/shader/decode/arithmetic_integer.cpp b/src/video_core/shader/decode/arithmetic_integer.cpp index 73155966f..7b5bb7003 100644 --- a/src/video_core/shader/decode/arithmetic_integer.cpp +++ b/src/video_core/shader/decode/arithmetic_integer.cpp @@ -83,7 +83,7 @@ u32 ShaderIR::DecodeArithmeticInteger(NodeBlock& bb, u32 pc) { case IAdd3Height::UpperHalfWord: return BitfieldExtract(value, 16, 16); default: - UNIMPLEMENTED_MSG("Unhandled IADD3 height: {}", static_cast<u32>(height)); + UNIMPLEMENTED_MSG("Unhandled IADD3 height: {}", height); return Immediate(0); } }; @@ -258,7 +258,7 @@ u32 ShaderIR::DecodeArithmeticInteger(NodeBlock& bb, u32 pc) { case OpCode::Id::LEA_IMM: case OpCode::Id::LEA_RZ: case OpCode::Id::LEA_HI: { - auto [op_a, op_b, op_c] = [&]() -> std::tuple<Node, Node, Node> { + auto [op_a_, op_b_, op_c_] = [&]() -> std::tuple<Node, Node, Node> { switch (opcode->get().GetId()) { case OpCode::Id::LEA_R2: { return {GetRegister(instr.gpr20), GetRegister(instr.gpr39), @@ -294,8 +294,9 @@ u32 ShaderIR::DecodeArithmeticInteger(NodeBlock& bb, u32 pc) { UNIMPLEMENTED_IF_MSG(instr.lea.pred48 != static_cast<u64>(Pred::UnusedIndex), "Unhandled LEA Predicate"); - Node value = Operation(OperationCode::ILogicalShiftLeft, std::move(op_a), std::move(op_c)); - value = Operation(OperationCode::IAdd, std::move(op_b), std::move(value)); + Node value = + Operation(OperationCode::ILogicalShiftLeft, std::move(op_a_), std::move(op_c_)); + value = Operation(OperationCode::IAdd, std::move(op_b_), std::move(value)); SetRegister(bb, instr.gpr0, std::move(value)); break; diff --git a/src/video_core/shader/decode/arithmetic_integer_immediate.cpp b/src/video_core/shader/decode/arithmetic_integer_immediate.cpp index 73880db0e..73580277a 100644 --- a/src/video_core/shader/decode/arithmetic_integer_immediate.cpp +++ b/src/video_core/shader/decode/arithmetic_integer_immediate.cpp @@ -28,23 +28,26 @@ u32 ShaderIR::DecodeArithmeticIntegerImmediate(NodeBlock& bb, u32 pc) { case OpCode::Id::IADD32I: { UNIMPLEMENTED_IF_MSG(instr.iadd32i.saturate, "IADD32I saturation is not implemented"); - op_a = GetOperandAbsNegInteger(op_a, false, instr.iadd32i.negate_a, true); + op_a = GetOperandAbsNegInteger(std::move(op_a), false, instr.iadd32i.negate_a != 0, true); - const Node value = Operation(OperationCode::IAdd, PRECISE, op_a, op_b); + Node value = Operation(OperationCode::IAdd, PRECISE, std::move(op_a), std::move(op_b)); - SetInternalFlagsFromInteger(bb, value, instr.op_32.generates_cc); - SetRegister(bb, instr.gpr0, value); + SetInternalFlagsFromInteger(bb, value, instr.op_32.generates_cc != 0); + SetRegister(bb, instr.gpr0, std::move(value)); break; } case OpCode::Id::LOP32I: { - if (instr.alu.lop32i.invert_a) - op_a = Operation(OperationCode::IBitwiseNot, NO_PRECISE, op_a); + if (instr.alu.lop32i.invert_a) { + op_a = Operation(OperationCode::IBitwiseNot, NO_PRECISE, std::move(op_a)); + } - if (instr.alu.lop32i.invert_b) - op_b = Operation(OperationCode::IBitwiseNot, NO_PRECISE, op_b); + if (instr.alu.lop32i.invert_b) { + op_b = Operation(OperationCode::IBitwiseNot, NO_PRECISE, std::move(op_b)); + } - WriteLogicOperation(bb, instr.gpr0, instr.alu.lop32i.operation, op_a, op_b, - PredicateResultMode::None, Pred::UnusedIndex, instr.op_32.generates_cc); + WriteLogicOperation(bb, instr.gpr0, instr.alu.lop32i.operation, std::move(op_a), + std::move(op_b), PredicateResultMode::None, Pred::UnusedIndex, + instr.op_32.generates_cc != 0); break; } default: @@ -58,18 +61,18 @@ u32 ShaderIR::DecodeArithmeticIntegerImmediate(NodeBlock& bb, u32 pc) { void ShaderIR::WriteLogicOperation(NodeBlock& bb, Register dest, LogicOperation logic_op, Node op_a, Node op_b, PredicateResultMode predicate_mode, Pred predicate, bool sets_cc) { - const Node result = [&]() { + Node result = [&] { switch (logic_op) { case LogicOperation::And: - return Operation(OperationCode::IBitwiseAnd, PRECISE, op_a, op_b); + return Operation(OperationCode::IBitwiseAnd, PRECISE, std::move(op_a), std::move(op_b)); case LogicOperation::Or: - return Operation(OperationCode::IBitwiseOr, PRECISE, op_a, op_b); + return Operation(OperationCode::IBitwiseOr, PRECISE, std::move(op_a), std::move(op_b)); case LogicOperation::Xor: - return Operation(OperationCode::IBitwiseXor, PRECISE, op_a, op_b); + return Operation(OperationCode::IBitwiseXor, PRECISE, std::move(op_a), std::move(op_b)); case LogicOperation::PassB: return op_b; default: - UNIMPLEMENTED_MSG("Unimplemented logic operation={}", static_cast<u32>(logic_op)); + UNIMPLEMENTED_MSG("Unimplemented logic operation={}", logic_op); return Immediate(0); } }(); @@ -84,13 +87,12 @@ void ShaderIR::WriteLogicOperation(NodeBlock& bb, Register dest, LogicOperation return; case PredicateResultMode::NotZero: { // Set the predicate to true if the result is not zero. - const Node compare = Operation(OperationCode::LogicalINotEqual, result, Immediate(0)); - SetPredicate(bb, static_cast<u64>(predicate), compare); + Node compare = Operation(OperationCode::LogicalINotEqual, std::move(result), Immediate(0)); + SetPredicate(bb, static_cast<u64>(predicate), std::move(compare)); break; } default: - UNIMPLEMENTED_MSG("Unimplemented predicate result mode: {}", - static_cast<u32>(predicate_mode)); + UNIMPLEMENTED_MSG("Unimplemented predicate result mode: {}", predicate_mode); } } diff --git a/src/video_core/shader/decode/conversion.cpp b/src/video_core/shader/decode/conversion.cpp index b9989c88c..fea7a54df 100644 --- a/src/video_core/shader/decode/conversion.cpp +++ b/src/video_core/shader/decode/conversion.cpp @@ -244,7 +244,7 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) { return Operation(OperationCode::FTrunc, value); default: UNIMPLEMENTED_MSG("Unimplemented F2F rounding mode {}", - static_cast<u32>(instr.conversion.f2f.rounding.Value())); + instr.conversion.f2f.rounding.Value()); return value; } }(); @@ -300,7 +300,7 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) { return Operation(OperationCode::FTrunc, PRECISE, value); default: UNIMPLEMENTED_MSG("Unimplemented F2I rounding mode {}", - static_cast<u32>(instr.conversion.f2i.rounding.Value())); + instr.conversion.f2i.rounding.Value()); return Immediate(0); } }(); diff --git a/src/video_core/shader/decode/half_set.cpp b/src/video_core/shader/decode/half_set.cpp index b2e88fa20..fa83108cd 100644 --- a/src/video_core/shader/decode/half_set.cpp +++ b/src/video_core/shader/decode/half_set.cpp @@ -22,13 +22,13 @@ u32 ShaderIR::DecodeHalfSet(NodeBlock& bb, u32 pc) { const Instruction instr = {program_code[pc]}; const auto opcode = OpCode::Decode(instr); - PredCondition cond; - bool bf; - bool ftz; - bool neg_a; - bool abs_a; - bool neg_b; - bool abs_b; + PredCondition cond{}; + bool bf = false; + bool ftz = false; + bool neg_a = false; + bool abs_a = false; + bool neg_b = false; + bool abs_b = false; switch (opcode->get().GetId()) { case OpCode::Id::HSET2_C: case OpCode::Id::HSET2_IMM: diff --git a/src/video_core/shader/decode/image.cpp b/src/video_core/shader/decode/image.cpp index e75ca4fdb..5470e8cf4 100644 --- a/src/video_core/shader/decode/image.cpp +++ b/src/video_core/shader/decode/image.cpp @@ -119,6 +119,8 @@ ComponentType GetComponentType(Tegra::Engines::SamplerDescriptor descriptor, return descriptor.r_type; } break; + default: + break; } UNIMPLEMENTED_MSG("Texture format not implemented={}", format); return ComponentType::FLOAT; @@ -210,19 +212,20 @@ u32 GetComponentSize(TextureFormat format, std::size_t component) { return 0; case TextureFormat::R8G24: if (component == 0) { - return 8; + return 24; } if (component == 1) { - return 24; + return 8; } return 0; case TextureFormat::R8G8: return (component == 0 || component == 1) ? 8 : 0; case TextureFormat::G4R4: return (component == 0 || component == 1) ? 4 : 0; + default: + UNIMPLEMENTED_MSG("Texture format not implemented={}", format); + return 0; } - UNIMPLEMENTED_MSG("Texture format not implemented={}", format); - return 0; } std::size_t GetImageComponentMask(TextureFormat format) { @@ -257,9 +260,10 @@ std::size_t GetImageComponentMask(TextureFormat format) { case TextureFormat::R8: case TextureFormat::R1: return std::size_t{R}; + default: + UNIMPLEMENTED_MSG("Texture format not implemented={}", format); + return std::size_t{R | G | B | A}; } - UNIMPLEMENTED_MSG("Texture format not implemented={}", format); - return std::size_t{R | G | B | A}; } std::size_t GetImageTypeNumCoordinates(Tegra::Shader::ImageType image_type) { @@ -354,9 +358,9 @@ u32 ShaderIR::DecodeImage(NodeBlock& bb, u32 pc) { instr.suldst.GetStoreDataLayout() != StoreType::Bits64); auto descriptor = [this, instr] { - std::optional<Tegra::Engines::SamplerDescriptor> descriptor; + std::optional<Tegra::Engines::SamplerDescriptor> sampler_descriptor; if (instr.suldst.is_immediate) { - descriptor = + sampler_descriptor = registry.ObtainBoundSampler(static_cast<u32>(instr.image.index.Value())); } else { const Node image_register = GetRegister(instr.gpr39); @@ -364,12 +368,12 @@ u32 ShaderIR::DecodeImage(NodeBlock& bb, u32 pc) { static_cast<s64>(global_code.size())); const auto buffer = std::get<1>(result); const auto offset = std::get<2>(result); - descriptor = registry.ObtainBindlessSampler(buffer, offset); + sampler_descriptor = registry.ObtainBindlessSampler(buffer, offset); } - if (!descriptor) { + if (!sampler_descriptor) { UNREACHABLE_MSG("Failed to obtain image descriptor"); } - return *descriptor; + return *sampler_descriptor; }(); const auto comp_mask = GetImageComponentMask(descriptor.format); @@ -463,7 +467,10 @@ u32 ShaderIR::DecodeImage(NodeBlock& bb, u32 pc) { return OperationCode::AtomicImageXor; case Tegra::Shader::ImageAtomicOperation::Exch: return OperationCode::AtomicImageExchange; + default: + break; } + break; default: break; } @@ -490,11 +497,12 @@ u32 ShaderIR::DecodeImage(NodeBlock& bb, u32 pc) { return pc; } -Image& ShaderIR::GetImage(Tegra::Shader::Image image, Tegra::Shader::ImageType type) { +ImageEntry& ShaderIR::GetImage(Tegra::Shader::Image image, Tegra::Shader::ImageType type) { const auto offset = static_cast<u32>(image.index.Value()); - const auto it = std::find_if(std::begin(used_images), std::end(used_images), - [offset](const Image& entry) { return entry.offset == offset; }); + const auto it = + std::find_if(std::begin(used_images), std::end(used_images), + [offset](const ImageEntry& entry) { return entry.offset == offset; }); if (it != std::end(used_images)) { ASSERT(!it->is_bindless && it->type == type); return *it; @@ -504,7 +512,7 @@ Image& ShaderIR::GetImage(Tegra::Shader::Image image, Tegra::Shader::ImageType t return used_images.emplace_back(next_index, offset, type); } -Image& ShaderIR::GetBindlessImage(Tegra::Shader::Register reg, Tegra::Shader::ImageType type) { +ImageEntry& ShaderIR::GetBindlessImage(Tegra::Shader::Register reg, Tegra::Shader::ImageType type) { const Node image_register = GetRegister(reg); const auto result = TrackCbuf(image_register, global_code, static_cast<s64>(global_code.size())); @@ -513,7 +521,7 @@ Image& ShaderIR::GetBindlessImage(Tegra::Shader::Register reg, Tegra::Shader::Im const auto offset = std::get<2>(result); const auto it = std::find_if(std::begin(used_images), std::end(used_images), - [buffer, offset](const Image& entry) { + [buffer, offset](const ImageEntry& entry) { return entry.buffer == buffer && entry.offset == offset; }); if (it != std::end(used_images)) { diff --git a/src/video_core/shader/decode/memory.cpp b/src/video_core/shader/decode/memory.cpp index e2bba88dd..50f4e7d35 100644 --- a/src/video_core/shader/decode/memory.cpp +++ b/src/video_core/shader/decode/memory.cpp @@ -47,7 +47,7 @@ OperationCode GetAtomOperation(AtomicOp op) { case AtomicOp::Exch: return OperationCode::AtomicIExchange; default: - UNIMPLEMENTED_MSG("op={}", static_cast<int>(op)); + UNIMPLEMENTED_MSG("op={}", op); return OperationCode::AtomicIAdd; } } @@ -83,7 +83,7 @@ u32 GetMemorySize(Tegra::Shader::UniformType uniform_type) { case Tegra::Shader::UniformType::UnsignedQuad: return 128; default: - UNIMPLEMENTED_MSG("Unimplemented size={}!", static_cast<u32>(uniform_type)); + UNIMPLEMENTED_MSG("Unimplemented size={}!", uniform_type); return 32; } } @@ -175,12 +175,12 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { break; } default: - UNIMPLEMENTED_MSG("Unhandled type: {}", static_cast<unsigned>(instr.ld_c.type.Value())); + UNIMPLEMENTED_MSG("Unhandled type: {}", instr.ld_c.type.Value()); } break; } case OpCode::Id::LD_L: - LOG_DEBUG(HW_GPU, "LD_L cache management mode: {}", static_cast<u64>(instr.ld_l.unknown)); + LOG_DEBUG(HW_GPU, "LD_L cache management mode: {}", instr.ld_l.unknown); [[fallthrough]]; case OpCode::Id::LD_S: { const auto GetAddress = [&](s32 offset) { @@ -224,7 +224,7 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { } default: UNIMPLEMENTED_MSG("{} Unhandled type: {}", opcode->get().GetName(), - static_cast<u32>(instr.ldst_sl.type.Value())); + instr.ldst_sl.type.Value()); } break; } @@ -306,8 +306,7 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { break; } case OpCode::Id::ST_L: - LOG_DEBUG(HW_GPU, "ST_L cache management mode: {}", - static_cast<u64>(instr.st_l.cache_management.Value())); + LOG_DEBUG(HW_GPU, "ST_L cache management mode: {}", instr.st_l.cache_management.Value()); [[fallthrough]]; case OpCode::Id::ST_S: { const auto GetAddress = [&](s32 offset) { @@ -340,7 +339,7 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { } default: UNIMPLEMENTED_MSG("{} unhandled type: {}", opcode->get().GetName(), - static_cast<u32>(instr.ldst_sl.type.Value())); + instr.ldst_sl.type.Value()); } break; } @@ -387,7 +386,7 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { } case OpCode::Id::RED: { UNIMPLEMENTED_IF_MSG(instr.red.type != GlobalAtomicType::U32, "type={}", - static_cast<int>(instr.red.type.Value())); + instr.red.type.Value()); const auto [real_address, base_address, descriptor] = TrackGlobalMemory(bb, instr, true, true); if (!real_address || !base_address) { @@ -403,12 +402,12 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { UNIMPLEMENTED_IF_MSG(instr.atom.operation == AtomicOp::Inc || instr.atom.operation == AtomicOp::Dec || instr.atom.operation == AtomicOp::SafeAdd, - "operation={}", static_cast<int>(instr.atom.operation.Value())); + "operation={}", instr.atom.operation.Value()); UNIMPLEMENTED_IF_MSG(instr.atom.type == GlobalAtomicType::S64 || instr.atom.type == GlobalAtomicType::U64 || instr.atom.type == GlobalAtomicType::F16x2_FTZ_RN || instr.atom.type == GlobalAtomicType::F32_FTZ_RN, - "type={}", static_cast<int>(instr.atom.type.Value())); + "type={}", instr.atom.type.Value()); const auto [real_address, base_address, descriptor] = TrackGlobalMemory(bb, instr, true, true); @@ -428,10 +427,10 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { case OpCode::Id::ATOMS: { UNIMPLEMENTED_IF_MSG(instr.atoms.operation == AtomicOp::Inc || instr.atoms.operation == AtomicOp::Dec, - "operation={}", static_cast<int>(instr.atoms.operation.Value())); + "operation={}", instr.atoms.operation.Value()); UNIMPLEMENTED_IF_MSG(instr.atoms.type == AtomicType::S64 || instr.atoms.type == AtomicType::U64, - "type={}", static_cast<int>(instr.atoms.type.Value())); + "type={}", instr.atoms.type.Value()); const bool is_signed = instr.atoms.type == AtomicType::S32 || instr.atoms.type == AtomicType::S64; const s32 offset = instr.atoms.GetImmediateOffset(); diff --git a/src/video_core/shader/decode/other.cpp b/src/video_core/shader/decode/other.cpp index 29a7cfbfe..5f88537bc 100644 --- a/src/video_core/shader/decode/other.cpp +++ b/src/video_core/shader/decode/other.cpp @@ -34,14 +34,13 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { break; } case OpCode::Id::EXIT: { - const Tegra::Shader::ConditionCode cc = instr.flow_condition_code; - UNIMPLEMENTED_IF_MSG(cc != Tegra::Shader::ConditionCode::T, "EXIT condition code used: {}", - static_cast<u32>(cc)); + const ConditionCode cc = instr.flow_condition_code; + UNIMPLEMENTED_IF_MSG(cc != ConditionCode::T, "EXIT condition code used: {}", cc); switch (instr.flow.cond) { case Tegra::Shader::FlowCondition::Always: bb.push_back(Operation(OperationCode::Exit)); - if (instr.pred.pred_index == static_cast<u64>(Tegra::Shader::Pred::UnusedIndex)) { + if (instr.pred.pred_index == static_cast<u64>(Pred::UnusedIndex)) { // If this is an unconditional exit then just end processing here, // otherwise we have to account for the possibility of the condition // not being met, so continue processing the next instruction. @@ -56,17 +55,15 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { break; default: - UNIMPLEMENTED_MSG("Unhandled flow condition: {}", - static_cast<u32>(instr.flow.cond.Value())); + UNIMPLEMENTED_MSG("Unhandled flow condition: {}", instr.flow.cond.Value()); } break; } case OpCode::Id::KIL: { UNIMPLEMENTED_IF(instr.flow.cond != Tegra::Shader::FlowCondition::Always); - const Tegra::Shader::ConditionCode cc = instr.flow_condition_code; - UNIMPLEMENTED_IF_MSG(cc != Tegra::Shader::ConditionCode::T, "KIL condition code used: {}", - static_cast<u32>(cc)); + const ConditionCode cc = instr.flow_condition_code; + UNIMPLEMENTED_IF_MSG(cc != ConditionCode::T, "KIL condition code used: {}", cc); bb.push_back(Operation(OperationCode::Discard)); break; @@ -79,6 +76,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { case SystemVariable::InvocationId: return Operation(OperationCode::InvocationId); case SystemVariable::Ydirection: + uses_y_negate = true; return Operation(OperationCode::YNegate); case SystemVariable::InvocationInfo: LOG_WARNING(HW_GPU, "S2R instruction with InvocationInfo is incomplete"); @@ -90,11 +88,11 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { UNIMPLEMENTED_MSG("S2R WscaleFactorZ is not implemented"); return Immediate(0U); case SystemVariable::Tid: { - Node value = Immediate(0); - value = BitfieldInsert(value, Operation(OperationCode::LocalInvocationIdX), 0, 9); - value = BitfieldInsert(value, Operation(OperationCode::LocalInvocationIdY), 16, 9); - value = BitfieldInsert(value, Operation(OperationCode::LocalInvocationIdZ), 26, 5); - return value; + Node val = Immediate(0); + val = BitfieldInsert(val, Operation(OperationCode::LocalInvocationIdX), 0, 9); + val = BitfieldInsert(val, Operation(OperationCode::LocalInvocationIdY), 16, 9); + val = BitfieldInsert(val, Operation(OperationCode::LocalInvocationIdZ), 26, 5); + return val; } case SystemVariable::TidX: return Operation(OperationCode::LocalInvocationIdX); @@ -130,8 +128,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { return Immediate(0u); } default: - UNIMPLEMENTED_MSG("Unhandled system move: {}", - static_cast<u32>(instr.sys20.Value())); + UNIMPLEMENTED_MSG("Unhandled system move: {}", instr.sys20.Value()); return Immediate(0u); } }(); @@ -181,8 +178,8 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { } const Node branch = Operation(OperationCode::BranchIndirect, operand); - const Tegra::Shader::ConditionCode cc = instr.flow_condition_code; - if (cc != Tegra::Shader::ConditionCode::T) { + const ConditionCode cc = instr.flow_condition_code; + if (cc != ConditionCode::T) { bb.push_back(Conditional(GetConditionCode(cc), {branch})); } else { bb.push_back(branch); @@ -218,9 +215,8 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { break; } case OpCode::Id::SYNC: { - const Tegra::Shader::ConditionCode cc = instr.flow_condition_code; - UNIMPLEMENTED_IF_MSG(cc != Tegra::Shader::ConditionCode::T, "SYNC condition code used: {}", - static_cast<u32>(cc)); + const ConditionCode cc = instr.flow_condition_code; + UNIMPLEMENTED_IF_MSG(cc != ConditionCode::T, "SYNC condition code used: {}", cc); if (decompiled) { break; @@ -231,9 +227,8 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { break; } case OpCode::Id::BRK: { - const Tegra::Shader::ConditionCode cc = instr.flow_condition_code; - UNIMPLEMENTED_IF_MSG(cc != Tegra::Shader::ConditionCode::T, "BRK condition code used: {}", - static_cast<u32>(cc)); + const ConditionCode cc = instr.flow_condition_code; + UNIMPLEMENTED_IF_MSG(cc != ConditionCode::T, "BRK condition code used: {}", cc); if (decompiled) { break; } @@ -306,7 +301,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { case Tegra::Shader::MembarType::GL: return OperationCode::MemoryBarrierGlobal; default: - UNIMPLEMENTED_MSG("MEMBAR type={}", static_cast<int>(instr.membar.type.Value())); + UNIMPLEMENTED_MSG("MEMBAR type={}", instr.membar.type.Value()); return OperationCode::MemoryBarrierGlobal; } }(); diff --git a/src/video_core/shader/decode/shift.cpp b/src/video_core/shader/decode/shift.cpp index d4ffa8014..a53819c15 100644 --- a/src/video_core/shader/decode/shift.cpp +++ b/src/video_core/shader/decode/shift.cpp @@ -125,7 +125,7 @@ u32 ShaderIR::DecodeShift(NodeBlock& bb, u32 pc) { case OpCode::Id::SHF_LEFT_IMM: { UNIMPLEMENTED_IF(instr.generates_cc); UNIMPLEMENTED_IF_MSG(instr.shf.xmode != ShfXmode::None, "xmode={}", - static_cast<int>(instr.shf.xmode.Value())); + instr.shf.xmode.Value()); if (instr.is_b_imm) { op_b = Immediate(static_cast<u32>(instr.shf.immediate)); diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp index 29ebf65ba..833fa2a39 100644 --- a/src/video_core/shader/decode/texture.cpp +++ b/src/video_core/shader/decode/texture.cpp @@ -34,7 +34,7 @@ static std::size_t GetCoordCount(TextureType texture_type) { case TextureType::TextureCube: return 3; default: - UNIMPLEMENTED_MSG("Unhandled texture type: {}", static_cast<u32>(texture_type)); + UNIMPLEMENTED_MSG("Unhandled texture type: {}", texture_type); return 0; } } @@ -141,7 +141,7 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { SamplerInfo info; info.is_shadow = is_depth_compare; - const std::optional<Sampler> sampler = GetSampler(instr.sampler, info); + const std::optional<SamplerEntry> sampler = GetSampler(instr.sampler, info); Node4 values; for (u32 element = 0; element < values.size(); ++element) { @@ -173,9 +173,9 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { SamplerInfo info; info.type = texture_type; info.is_array = is_array; - const std::optional<Sampler> sampler = is_bindless - ? GetBindlessSampler(base_reg, info, index_var) - : GetSampler(instr.sampler, info); + const std::optional<SamplerEntry> sampler = + is_bindless ? GetBindlessSampler(base_reg, info, index_var) + : GetSampler(instr.sampler, info); Node4 values; if (!sampler) { std::generate(values.begin(), values.end(), [this] { return Immediate(0); }); @@ -217,9 +217,9 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { [[fallthrough]]; case OpCode::Id::TXQ: { Node index_var; - const std::optional<Sampler> sampler = is_bindless - ? GetBindlessSampler(instr.gpr8, {}, index_var) - : GetSampler(instr.sampler, {}); + const std::optional<SamplerEntry> sampler = + is_bindless ? GetBindlessSampler(instr.gpr8, {}, index_var) + : GetSampler(instr.sampler, {}); if (!sampler) { u32 indexer = 0; @@ -255,8 +255,7 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { break; } default: - UNIMPLEMENTED_MSG("Unhandled texture query type: {}", - static_cast<u32>(instr.txq.query_type.Value())); + UNIMPLEMENTED_MSG("Unhandled texture query type: {}", instr.txq.query_type.Value()); } break; } @@ -273,7 +272,7 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { info.type = texture_type; info.is_array = is_array; Node index_var; - const std::optional<Sampler> sampler = + const std::optional<SamplerEntry> sampler = is_bindless ? GetBindlessSampler(instr.gpr20, info, index_var) : GetSampler(instr.sampler, info); @@ -292,33 +291,36 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { break; } - std::vector<Node> coords; - - // TODO: Add coordinates for different samplers once other texture types are implemented. - switch (texture_type) { - case TextureType::Texture1D: - coords.push_back(GetRegister(instr.gpr8)); - break; - case TextureType::Texture2D: - coords.push_back(GetRegister(instr.gpr8.Value() + 0)); - coords.push_back(GetRegister(instr.gpr8.Value() + 1)); - break; - default: - UNIMPLEMENTED_MSG("Unhandled texture type {}", static_cast<int>(texture_type)); + const u64 base_index = is_array ? 1 : 0; + const u64 num_components = [texture_type] { + switch (texture_type) { + case TextureType::Texture1D: + return 1; + case TextureType::Texture2D: + return 2; + case TextureType::TextureCube: + return 3; + default: + UNIMPLEMENTED_MSG("Unhandled texture type {}", texture_type); + return 2; + } + }(); + // TODO: What's the array component used for? - // Fallback to interpreting as a 2D texture for now - coords.push_back(GetRegister(instr.gpr8.Value() + 0)); - coords.push_back(GetRegister(instr.gpr8.Value() + 1)); + std::vector<Node> coords; + coords.reserve(num_components); + for (u64 component = 0; component < num_components; ++component) { + coords.push_back(GetRegister(instr.gpr8.Value() + base_index + component)); } + u32 indexer = 0; for (u32 element = 0; element < 2; ++element) { if (!instr.tmml.IsComponentEnabled(element)) { continue; } - auto params = coords; MetaTexture meta{*sampler, {}, {}, {}, {}, {}, {}, {}, {}, element, index_var}; - const Node value = Operation(OperationCode::TextureQueryLod, meta, std::move(params)); - SetTemporary(bb, indexer++, value); + Node value = Operation(OperationCode::TextureQueryLod, meta, coords); + SetTemporary(bb, indexer++, std::move(value)); } for (u32 i = 0; i < indexer; ++i) { SetRegister(bb, instr.gpr0.Value() + i, GetTemporary(i)); @@ -377,14 +379,15 @@ ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo( return info; } -std::optional<Sampler> ShaderIR::GetSampler(Tegra::Shader::Sampler sampler, - SamplerInfo sampler_info) { +std::optional<SamplerEntry> ShaderIR::GetSampler(Tegra::Shader::Sampler sampler, + SamplerInfo sampler_info) { const u32 offset = static_cast<u32>(sampler.index.Value()); const auto info = GetSamplerInfo(sampler_info, registry.ObtainBoundSampler(offset)); // If this sampler has already been used, return the existing mapping. - const auto it = std::find_if(used_samplers.begin(), used_samplers.end(), - [offset](const Sampler& entry) { return entry.offset == offset; }); + const auto it = + std::find_if(used_samplers.begin(), used_samplers.end(), + [offset](const SamplerEntry& entry) { return entry.offset == offset; }); if (it != used_samplers.end()) { ASSERT(!it->is_bindless && it->type == info.type && it->is_array == info.is_array && it->is_shadow == info.is_shadow && it->is_buffer == info.is_buffer); @@ -397,8 +400,8 @@ std::optional<Sampler> ShaderIR::GetSampler(Tegra::Shader::Sampler sampler, *info.is_shadow, *info.is_buffer, false); } -std::optional<Sampler> ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg, SamplerInfo info, - Node& index_var) { +std::optional<SamplerEntry> ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg, + SamplerInfo info, Node& index_var) { const Node sampler_register = GetRegister(reg); const auto [base_node, tracked_sampler_info] = TrackBindlessSampler(sampler_register, global_code, static_cast<s64>(global_code.size())); @@ -414,7 +417,7 @@ std::optional<Sampler> ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg, // If this sampler has already been used, return the existing mapping. const auto it = std::find_if(used_samplers.begin(), used_samplers.end(), - [buffer, offset](const Sampler& entry) { + [buffer, offset](const SamplerEntry& entry) { return entry.buffer == buffer && entry.offset == offset; }); if (it != used_samplers.end()) { @@ -434,11 +437,12 @@ std::optional<Sampler> ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg, info = GetSamplerInfo(info, registry.ObtainSeparateSampler(indices, offsets)); // Try to use an already created sampler if it exists - const auto it = std::find_if( - used_samplers.begin(), used_samplers.end(), [indices, offsets](const Sampler& entry) { - return offsets == std::pair{entry.offset, entry.secondary_offset} && - indices == std::pair{entry.buffer, entry.secondary_buffer}; - }); + const auto it = + std::find_if(used_samplers.begin(), used_samplers.end(), + [indices, offsets](const SamplerEntry& entry) { + return offsets == std::pair{entry.offset, entry.secondary_offset} && + indices == std::pair{entry.buffer, entry.secondary_buffer}; + }); if (it != used_samplers.end()) { ASSERT(it->is_separated && it->type == info.type && it->is_array == info.is_array && it->is_shadow == info.is_shadow && it->is_buffer == info.is_buffer); @@ -458,7 +462,7 @@ std::optional<Sampler> ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg, // If this sampler has already been used, return the existing mapping. const auto it = std::find_if( used_samplers.begin(), used_samplers.end(), - [base_offset](const Sampler& entry) { return entry.offset == base_offset; }); + [base_offset](const SamplerEntry& entry) { return entry.offset == base_offset; }); if (it != used_samplers.end()) { ASSERT(!it->is_bindless && it->type == info.type && it->is_array == info.is_array && it->is_shadow == info.is_shadow && it->is_buffer == info.is_buffer && @@ -553,7 +557,6 @@ Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type, const bool is_shadow = depth_compare != nullptr; const bool is_bindless = bindless_reg.has_value(); - UNIMPLEMENTED_IF(texture_type == TextureType::TextureCube && is_array && is_shadow); ASSERT_MSG(texture_type != TextureType::Texture3D || !is_array || !is_shadow, "Illegal texture type"); @@ -564,9 +567,9 @@ Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type, info.is_buffer = false; Node index_var; - const std::optional<Sampler> sampler = is_bindless - ? GetBindlessSampler(*bindless_reg, info, index_var) - : GetSampler(instr.sampler, info); + const std::optional<SamplerEntry> sampler = + is_bindless ? GetBindlessSampler(*bindless_reg, info, index_var) + : GetSampler(instr.sampler, info); if (!sampler) { return {Immediate(0), Immediate(0), Immediate(0), Immediate(0)}; } @@ -593,7 +596,7 @@ Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type, lod = GetRegister(instr.gpr20.Value() + bias_offset); break; default: - UNIMPLEMENTED_MSG("Unimplemented process mode={}", static_cast<u32>(process_mode)); + UNIMPLEMENTED_MSG("Unimplemented process mode={}", process_mode); break; } @@ -723,7 +726,7 @@ Node4 ShaderIR::GetTld4Code(Instruction instr, TextureType texture_type, bool de info.is_shadow = depth_compare; Node index_var; - const std::optional<Sampler> sampler = + const std::optional<SamplerEntry> sampler = is_bindless ? GetBindlessSampler(parameter_register++, info, index_var) : GetSampler(instr.sampler, info); Node4 values; @@ -763,7 +766,7 @@ Node4 ShaderIR::GetTld4Code(Instruction instr, TextureType texture_type, bool de Node4 ShaderIR::GetTldCode(Tegra::Shader::Instruction instr) { const auto texture_type{instr.tld.texture_type}; - const bool is_array{instr.tld.is_array}; + const bool is_array{instr.tld.is_array != 0}; const bool lod_enabled{instr.tld.GetTextureProcessMode() == TextureProcessMode::LL}; const std::size_t coord_count{GetCoordCount(texture_type)}; @@ -782,7 +785,7 @@ Node4 ShaderIR::GetTldCode(Tegra::Shader::Instruction instr) { // const Node aoffi_register{is_aoffi ? GetRegister(gpr20_cursor++) : nullptr}; // const Node multisample{is_multisample ? GetRegister(gpr20_cursor++) : nullptr}; - const std::optional<Sampler> sampler = GetSampler(instr.sampler, {}); + const std::optional<SamplerEntry> sampler = GetSampler(instr.sampler, {}); Node4 values; for (u32 element = 0; element < values.size(); ++element) { @@ -799,7 +802,7 @@ Node4 ShaderIR::GetTldsCode(Instruction instr, TextureType texture_type, bool is info.type = texture_type; info.is_array = is_array; info.is_shadow = false; - const std::optional<Sampler> sampler = GetSampler(instr.sampler, info); + const std::optional<SamplerEntry> sampler = GetSampler(instr.sampler, info); const std::size_t type_coord_count = GetCoordCount(texture_type); const bool lod_enabled = instr.tlds.GetTextureProcessMode() == TextureProcessMode::LL; diff --git a/src/video_core/shader/decode/warp.cpp b/src/video_core/shader/decode/warp.cpp index 11b77f795..37433d783 100644 --- a/src/video_core/shader/decode/warp.cpp +++ b/src/video_core/shader/decode/warp.cpp @@ -27,7 +27,7 @@ OperationCode GetOperationCode(VoteOperation vote_op) { case VoteOperation::Eq: return OperationCode::VoteEqual; default: - UNREACHABLE_MSG("Invalid vote operation={}", static_cast<u64>(vote_op)); + UNREACHABLE_MSG("Invalid vote operation={}", vote_op); return OperationCode::VoteAll; } } diff --git a/src/video_core/shader/expr.h b/src/video_core/shader/expr.h index 4e8264367..cda284c72 100644 --- a/src/video_core/shader/expr.h +++ b/src/video_core/shader/expr.h @@ -76,7 +76,7 @@ public: class ExprPredicate final { public: - explicit ExprPredicate(u32 predicate) : predicate{predicate} {} + explicit ExprPredicate(u32 predicate_) : predicate{predicate_} {} bool operator==(const ExprPredicate& b) const { return predicate == b.predicate; @@ -91,7 +91,7 @@ public: class ExprCondCode final { public: - explicit ExprCondCode(ConditionCode cc) : cc{cc} {} + explicit ExprCondCode(ConditionCode condition_code) : cc{condition_code} {} bool operator==(const ExprCondCode& b) const { return cc == b.cc; @@ -121,7 +121,7 @@ public: class ExprGprEqual final { public: - ExprGprEqual(u32 gpr, u32 value) : gpr{gpr}, value{value} {} + explicit ExprGprEqual(u32 gpr_, u32 value_) : gpr{gpr_}, value{value_} {} bool operator==(const ExprGprEqual& b) const { return gpr == b.gpr && value == b.value; diff --git a/src/video_core/shader/memory_util.cpp b/src/video_core/shader/memory_util.cpp index 5071c83ca..e18ccba8e 100644 --- a/src/video_core/shader/memory_util.cpp +++ b/src/video_core/shader/memory_util.cpp @@ -16,11 +16,10 @@ namespace VideoCommon::Shader { -GPUVAddr GetShaderAddress(Core::System& system, +GPUVAddr GetShaderAddress(Tegra::Engines::Maxwell3D& maxwell3d, Tegra::Engines::Maxwell3D::Regs::ShaderProgram program) { - const auto& gpu{system.GPU().Maxwell3D()}; - const auto& shader_config{gpu.regs.shader_config[static_cast<std::size_t>(program)]}; - return gpu.regs.code_address.CodeAddress() + shader_config.offset; + const auto& shader_config{maxwell3d.regs.shader_config[static_cast<std::size_t>(program)]}; + return maxwell3d.regs.code_address.CodeAddress() + shader_config.offset; } bool IsSchedInstruction(std::size_t offset, std::size_t main_offset) { diff --git a/src/video_core/shader/memory_util.h b/src/video_core/shader/memory_util.h index be90d24fd..4624d38e6 100644 --- a/src/video_core/shader/memory_util.h +++ b/src/video_core/shader/memory_util.h @@ -11,10 +11,6 @@ #include "video_core/engines/maxwell_3d.h" #include "video_core/engines/shader_type.h" -namespace Core { -class System; -} - namespace Tegra { class MemoryManager; } @@ -27,7 +23,7 @@ constexpr u32 STAGE_MAIN_OFFSET = 10; constexpr u32 KERNEL_MAIN_OFFSET = 0; /// Gets the address for the specified shader stage program -GPUVAddr GetShaderAddress(Core::System& system, +GPUVAddr GetShaderAddress(Tegra::Engines::Maxwell3D& maxwell3d, Tegra::Engines::Maxwell3D::Regs::ShaderProgram program); /// Gets if the current instruction offset is a scheduler instruction diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h index 8f230d57a..b54d33763 100644 --- a/src/video_core/shader/node.h +++ b/src/video_core/shader/node.h @@ -282,26 +282,27 @@ struct SeparateSamplerNode; using TrackSamplerData = std::variant<BindlessSamplerNode, SeparateSamplerNode, ArraySamplerNode>; using TrackSampler = std::shared_ptr<TrackSamplerData>; -struct Sampler { +struct SamplerEntry { /// Bound samplers constructor - constexpr explicit Sampler(u32 index, u32 offset, Tegra::Shader::TextureType type, - bool is_array, bool is_shadow, bool is_buffer, bool is_indexed) - : index{index}, offset{offset}, type{type}, is_array{is_array}, is_shadow{is_shadow}, - is_buffer{is_buffer}, is_indexed{is_indexed} {} + explicit SamplerEntry(u32 index_, u32 offset_, Tegra::Shader::TextureType type_, bool is_array_, + bool is_shadow_, bool is_buffer_, bool is_indexed_) + : index{index_}, offset{offset_}, type{type_}, is_array{is_array_}, is_shadow{is_shadow_}, + is_buffer{is_buffer_}, is_indexed{is_indexed_} {} /// Separate sampler constructor - constexpr explicit Sampler(u32 index, std::pair<u32, u32> offsets, std::pair<u32, u32> buffers, - Tegra::Shader::TextureType type, bool is_array, bool is_shadow, - bool is_buffer) - : index{index}, offset{offsets.first}, secondary_offset{offsets.second}, - buffer{buffers.first}, secondary_buffer{buffers.second}, type{type}, is_array{is_array}, - is_shadow{is_shadow}, is_buffer{is_buffer}, is_separated{true} {} + explicit SamplerEntry(u32 index_, std::pair<u32, u32> offsets, std::pair<u32, u32> buffers, + Tegra::Shader::TextureType type_, bool is_array_, bool is_shadow_, + bool is_buffer_) + : index{index_}, offset{offsets.first}, secondary_offset{offsets.second}, + buffer{buffers.first}, secondary_buffer{buffers.second}, type{type_}, is_array{is_array_}, + is_shadow{is_shadow_}, is_buffer{is_buffer_}, is_separated{true} {} /// Bindless samplers constructor - constexpr explicit Sampler(u32 index, u32 offset, u32 buffer, Tegra::Shader::TextureType type, - bool is_array, bool is_shadow, bool is_buffer, bool is_indexed) - : index{index}, offset{offset}, buffer{buffer}, type{type}, is_array{is_array}, - is_shadow{is_shadow}, is_buffer{is_buffer}, is_bindless{true}, is_indexed{is_indexed} {} + explicit SamplerEntry(u32 index_, u32 offset_, u32 buffer_, Tegra::Shader::TextureType type_, + bool is_array_, bool is_shadow_, bool is_buffer_, bool is_indexed_) + : index{index_}, offset{offset_}, buffer{buffer_}, type{type_}, is_array{is_array_}, + is_shadow{is_shadow_}, is_buffer{is_buffer_}, is_bindless{true}, is_indexed{is_indexed_} { + } u32 index = 0; ///< Emulated index given for the this sampler. u32 offset = 0; ///< Offset in the const buffer from where the sampler is being read. @@ -338,15 +339,15 @@ struct BindlessSamplerNode { u32 offset; }; -struct Image { +struct ImageEntry { public: /// Bound images constructor - constexpr explicit Image(u32 index, u32 offset, Tegra::Shader::ImageType type) - : index{index}, offset{offset}, type{type} {} + explicit ImageEntry(u32 index_, u32 offset_, Tegra::Shader::ImageType type_) + : index{index_}, offset{offset_}, type{type_} {} /// Bindless samplers constructor - constexpr explicit Image(u32 index, u32 offset, u32 buffer, Tegra::Shader::ImageType type) - : index{index}, offset{offset}, buffer{buffer}, type{type}, is_bindless{true} {} + explicit ImageEntry(u32 index_, u32 offset_, u32 buffer_, Tegra::Shader::ImageType type_) + : index{index_}, offset{offset_}, buffer{buffer_}, type{type_}, is_bindless{true} {} void MarkWrite() { is_written = true; @@ -377,7 +378,7 @@ struct GlobalMemoryBase { u32 cbuf_index = 0; u32 cbuf_offset = 0; - bool operator<(const GlobalMemoryBase& rhs) const { + [[nodiscard]] bool operator<(const GlobalMemoryBase& rhs) const { return std::tie(cbuf_index, cbuf_offset) < std::tie(rhs.cbuf_index, rhs.cbuf_offset); } }; @@ -389,7 +390,7 @@ struct MetaArithmetic { /// Parameters describing a texture sampler struct MetaTexture { - Sampler sampler; + SamplerEntry sampler; Node array; Node depth_compare; std::vector<Node> aoffi; @@ -403,7 +404,7 @@ struct MetaTexture { }; struct MetaImage { - const Image& image; + const ImageEntry& image; std::vector<Node> values; u32 element{}; }; @@ -414,7 +415,7 @@ using Meta = class AmendNode { public: - std::optional<std::size_t> GetAmendIndex() const { + [[nodiscard]] std::optional<std::size_t> GetAmendIndex() const { if (amend_index == amend_null_index) { return std::nullopt; } @@ -437,34 +438,34 @@ private: /// Holds any kind of operation that can be done in the IR class OperationNode final : public AmendNode { public: - explicit OperationNode(OperationCode code) : OperationNode(code, Meta{}) {} + explicit OperationNode(OperationCode code_) : OperationNode(code_, Meta{}) {} - explicit OperationNode(OperationCode code, Meta meta) - : OperationNode(code, std::move(meta), std::vector<Node>{}) {} + explicit OperationNode(OperationCode code_, Meta meta_) + : OperationNode(code_, std::move(meta_), std::vector<Node>{}) {} - explicit OperationNode(OperationCode code, std::vector<Node> operands) - : OperationNode(code, Meta{}, std::move(operands)) {} + explicit OperationNode(OperationCode code_, std::vector<Node> operands_) + : OperationNode(code_, Meta{}, std::move(operands_)) {} - explicit OperationNode(OperationCode code, Meta meta, std::vector<Node> operands) - : code{code}, meta{std::move(meta)}, operands{std::move(operands)} {} + explicit OperationNode(OperationCode code_, Meta meta_, std::vector<Node> operands_) + : code{code_}, meta{std::move(meta_)}, operands{std::move(operands_)} {} template <typename... Args> - explicit OperationNode(OperationCode code, Meta meta, Args&&... operands) - : code{code}, meta{std::move(meta)}, operands{operands...} {} + explicit OperationNode(OperationCode code_, Meta meta_, Args&&... operands_) + : code{code_}, meta{std::move(meta_)}, operands{operands_...} {} - OperationCode GetCode() const { + [[nodiscard]] OperationCode GetCode() const { return code; } - const Meta& GetMeta() const { + [[nodiscard]] const Meta& GetMeta() const { return meta; } - std::size_t GetOperandsCount() const { + [[nodiscard]] std::size_t GetOperandsCount() const { return operands.size(); } - const Node& operator[](std::size_t operand_index) const { + [[nodiscard]] const Node& operator[](std::size_t operand_index) const { return operands.at(operand_index); } @@ -477,14 +478,14 @@ private: /// Encloses inside any kind of node that returns a boolean conditionally-executed code class ConditionalNode final : public AmendNode { public: - explicit ConditionalNode(Node condition, std::vector<Node>&& code) - : condition{std::move(condition)}, code{std::move(code)} {} + explicit ConditionalNode(Node condition_, std::vector<Node>&& code_) + : condition{std::move(condition_)}, code{std::move(code_)} {} - const Node& GetCondition() const { + [[nodiscard]] const Node& GetCondition() const { return condition; } - const std::vector<Node>& GetCode() const { + [[nodiscard]] const std::vector<Node>& GetCode() const { return code; } @@ -496,9 +497,9 @@ private: /// A general purpose register class GprNode final { public: - explicit constexpr GprNode(Tegra::Shader::Register index) : index{index} {} + explicit constexpr GprNode(Tegra::Shader::Register index_) : index{index_} {} - u32 GetIndex() const { + [[nodiscard]] constexpr u32 GetIndex() const { return static_cast<u32>(index); } @@ -509,9 +510,9 @@ private: /// A custom variable class CustomVarNode final { public: - explicit constexpr CustomVarNode(u32 index) : index{index} {} + explicit constexpr CustomVarNode(u32 index_) : index{index_} {} - constexpr u32 GetIndex() const { + [[nodiscard]] constexpr u32 GetIndex() const { return index; } @@ -522,9 +523,9 @@ private: /// A 32-bits value that represents an immediate value class ImmediateNode final { public: - explicit constexpr ImmediateNode(u32 value) : value{value} {} + explicit constexpr ImmediateNode(u32 value_) : value{value_} {} - u32 GetValue() const { + [[nodiscard]] constexpr u32 GetValue() const { return value; } @@ -535,9 +536,9 @@ private: /// One of Maxwell's internal flags class InternalFlagNode final { public: - explicit constexpr InternalFlagNode(InternalFlag flag) : flag{flag} {} + explicit constexpr InternalFlagNode(InternalFlag flag_) : flag{flag_} {} - InternalFlag GetFlag() const { + [[nodiscard]] constexpr InternalFlag GetFlag() const { return flag; } @@ -548,14 +549,14 @@ private: /// A predicate register, it can be negated without additional nodes class PredicateNode final { public: - explicit constexpr PredicateNode(Tegra::Shader::Pred index, bool negated) - : index{index}, negated{negated} {} + explicit constexpr PredicateNode(Tegra::Shader::Pred index_, bool negated_) + : index{index_}, negated{negated_} {} - Tegra::Shader::Pred GetIndex() const { + [[nodiscard]] constexpr Tegra::Shader::Pred GetIndex() const { return index; } - bool IsNegated() const { + [[nodiscard]] constexpr bool IsNegated() const { return negated; } @@ -568,30 +569,30 @@ private: class AbufNode final { public: // Initialize for standard attributes (index is explicit). - explicit AbufNode(Tegra::Shader::Attribute::Index index, u32 element, Node buffer = {}) - : buffer{std::move(buffer)}, index{index}, element{element} {} + explicit AbufNode(Tegra::Shader::Attribute::Index index_, u32 element_, Node buffer_ = {}) + : buffer{std::move(buffer_)}, index{index_}, element{element_} {} // Initialize for physical attributes (index is a variable value). - explicit AbufNode(Node physical_address, Node buffer = {}) - : physical_address{std::move(physical_address)}, buffer{std::move(buffer)} {} + explicit AbufNode(Node physical_address_, Node buffer_ = {}) + : physical_address{std::move(physical_address_)}, buffer{std::move(buffer_)} {} - Tegra::Shader::Attribute::Index GetIndex() const { + [[nodiscard]] Tegra::Shader::Attribute::Index GetIndex() const { return index; } - u32 GetElement() const { + [[nodiscard]] u32 GetElement() const { return element; } - const Node& GetBuffer() const { + [[nodiscard]] const Node& GetBuffer() const { return buffer; } - bool IsPhysicalBuffer() const { + [[nodiscard]] bool IsPhysicalBuffer() const { return static_cast<bool>(physical_address); } - const Node& GetPhysicalAddress() const { + [[nodiscard]] const Node& GetPhysicalAddress() const { return physical_address; } @@ -605,9 +606,9 @@ private: /// Patch memory (used to communicate tessellation stages). class PatchNode final { public: - explicit PatchNode(u32 offset) : offset{offset} {} + explicit constexpr PatchNode(u32 offset_) : offset{offset_} {} - u32 GetOffset() const { + [[nodiscard]] constexpr u32 GetOffset() const { return offset; } @@ -618,13 +619,13 @@ private: /// Constant buffer node, usually mapped to uniform buffers in GLSL class CbufNode final { public: - explicit CbufNode(u32 index, Node offset) : index{index}, offset{std::move(offset)} {} + explicit CbufNode(u32 index_, Node offset_) : index{index_}, offset{std::move(offset_)} {} - u32 GetIndex() const { + [[nodiscard]] u32 GetIndex() const { return index; } - const Node& GetOffset() const { + [[nodiscard]] const Node& GetOffset() const { return offset; } @@ -636,9 +637,9 @@ private: /// Local memory node class LmemNode final { public: - explicit LmemNode(Node address) : address{std::move(address)} {} + explicit LmemNode(Node address_) : address{std::move(address_)} {} - const Node& GetAddress() const { + [[nodiscard]] const Node& GetAddress() const { return address; } @@ -649,9 +650,9 @@ private: /// Shared memory node class SmemNode final { public: - explicit SmemNode(Node address) : address{std::move(address)} {} + explicit SmemNode(Node address_) : address{std::move(address_)} {} - const Node& GetAddress() const { + [[nodiscard]] const Node& GetAddress() const { return address; } @@ -662,19 +663,19 @@ private: /// Global memory node class GmemNode final { public: - explicit GmemNode(Node real_address, Node base_address, const GlobalMemoryBase& descriptor) - : real_address{std::move(real_address)}, base_address{std::move(base_address)}, - descriptor{descriptor} {} + explicit GmemNode(Node real_address_, Node base_address_, const GlobalMemoryBase& descriptor_) + : real_address{std::move(real_address_)}, base_address{std::move(base_address_)}, + descriptor{descriptor_} {} - const Node& GetRealAddress() const { + [[nodiscard]] const Node& GetRealAddress() const { return real_address; } - const Node& GetBaseAddress() const { + [[nodiscard]] const Node& GetBaseAddress() const { return base_address; } - const GlobalMemoryBase& GetDescriptor() const { + [[nodiscard]] const GlobalMemoryBase& GetDescriptor() const { return descriptor; } @@ -687,9 +688,9 @@ private: /// Commentary, can be dropped class CommentNode final { public: - explicit CommentNode(std::string text) : text{std::move(text)} {} + explicit CommentNode(std::string text_) : text{std::move(text_)} {} - const std::string& GetText() const { + [[nodiscard]] const std::string& GetText() const { return text; } diff --git a/src/video_core/shader/node_helper.cpp b/src/video_core/shader/node_helper.cpp index 7bf4ff387..6a5b6940d 100644 --- a/src/video_core/shader/node_helper.cpp +++ b/src/video_core/shader/node_helper.cpp @@ -107,7 +107,7 @@ OperationCode SignedToUnsignedCode(OperationCode operation_code, bool is_signed) UNREACHABLE_MSG("Can't apply absolute to an unsigned integer"); return {}; default: - UNREACHABLE_MSG("Unknown signed operation with code={}", static_cast<u32>(operation_code)); + UNREACHABLE_MSG("Unknown signed operation with code={}", operation_code); return {}; } } diff --git a/src/video_core/shader/registry.cpp b/src/video_core/shader/registry.cpp index cdf274e54..148d91fcb 100644 --- a/src/video_core/shader/registry.cpp +++ b/src/video_core/shader/registry.cpp @@ -24,44 +24,45 @@ GraphicsInfo MakeGraphicsInfo(ShaderType shader_stage, ConstBufferEngineInterfac if (shader_stage == ShaderType::Compute) { return {}; } - auto& graphics = static_cast<Tegra::Engines::Maxwell3D&>(engine); - - GraphicsInfo info; - info.tfb_layouts = graphics.regs.tfb_layouts; - info.tfb_varying_locs = graphics.regs.tfb_varying_locs; - info.primitive_topology = graphics.regs.draw.topology; - info.tessellation_primitive = graphics.regs.tess_mode.prim; - info.tessellation_spacing = graphics.regs.tess_mode.spacing; - info.tfb_enabled = graphics.regs.tfb_enabled; - info.tessellation_clockwise = graphics.regs.tess_mode.cw; - return info; + + auto& graphics = dynamic_cast<Tegra::Engines::Maxwell3D&>(engine); + + return { + .tfb_layouts = graphics.regs.tfb_layouts, + .tfb_varying_locs = graphics.regs.tfb_varying_locs, + .primitive_topology = graphics.regs.draw.topology, + .tessellation_primitive = graphics.regs.tess_mode.prim, + .tessellation_spacing = graphics.regs.tess_mode.spacing, + .tfb_enabled = graphics.regs.tfb_enabled != 0, + .tessellation_clockwise = graphics.regs.tess_mode.cw.Value() != 0, + }; } ComputeInfo MakeComputeInfo(ShaderType shader_stage, ConstBufferEngineInterface& engine) { if (shader_stage != ShaderType::Compute) { return {}; } - auto& compute = static_cast<Tegra::Engines::KeplerCompute&>(engine); + + auto& compute = dynamic_cast<Tegra::Engines::KeplerCompute&>(engine); const auto& launch = compute.launch_description; - ComputeInfo info; - info.workgroup_size = {launch.block_dim_x, launch.block_dim_y, launch.block_dim_z}; - info.local_memory_size_in_words = launch.local_pos_alloc; - info.shared_memory_size_in_words = launch.shared_alloc; - return info; + return { + .workgroup_size = {launch.block_dim_x, launch.block_dim_y, launch.block_dim_z}, + .shared_memory_size_in_words = launch.shared_alloc, + .local_memory_size_in_words = launch.local_pos_alloc, + }; } } // Anonymous namespace -Registry::Registry(Tegra::Engines::ShaderType shader_stage, const SerializedRegistryInfo& info) +Registry::Registry(ShaderType shader_stage, const SerializedRegistryInfo& info) : stage{shader_stage}, stored_guest_driver_profile{info.guest_driver_profile}, bound_buffer{info.bound_buffer}, graphics_info{info.graphics}, compute_info{info.compute} {} -Registry::Registry(Tegra::Engines::ShaderType shader_stage, - Tegra::Engines::ConstBufferEngineInterface& engine) - : stage{shader_stage}, engine{&engine}, bound_buffer{engine.GetBoundBuffer()}, - graphics_info{MakeGraphicsInfo(shader_stage, engine)}, compute_info{MakeComputeInfo( - shader_stage, engine)} {} +Registry::Registry(ShaderType shader_stage, ConstBufferEngineInterface& engine_) + : stage{shader_stage}, engine{&engine_}, bound_buffer{engine_.GetBoundBuffer()}, + graphics_info{MakeGraphicsInfo(shader_stage, engine_)}, compute_info{MakeComputeInfo( + shader_stage, engine_)} {} Registry::~Registry() = default; @@ -113,8 +114,7 @@ std::optional<Tegra::Engines::SamplerDescriptor> Registry::ObtainSeparateSampler return value; } -std::optional<Tegra::Engines::SamplerDescriptor> Registry::ObtainBindlessSampler(u32 buffer, - u32 offset) { +std::optional<SamplerDescriptor> Registry::ObtainBindlessSampler(u32 buffer, u32 offset) { const std::pair key = {buffer, offset}; const auto iter = bindless_samplers.find(key); if (iter != bindless_samplers.end()) { diff --git a/src/video_core/shader/registry.h b/src/video_core/shader/registry.h index 231206765..4bebefdde 100644 --- a/src/video_core/shader/registry.h +++ b/src/video_core/shader/registry.h @@ -94,7 +94,7 @@ public: explicit Registry(Tegra::Engines::ShaderType shader_stage, const SerializedRegistryInfo& info); explicit Registry(Tegra::Engines::ShaderType shader_stage, - Tegra::Engines::ConstBufferEngineInterface& engine); + Tegra::Engines::ConstBufferEngineInterface& engine_); ~Registry(); diff --git a/src/video_core/shader/shader_ir.cpp b/src/video_core/shader/shader_ir.cpp index 29d794b34..a4987ffc6 100644 --- a/src/video_core/shader/shader_ir.cpp +++ b/src/video_core/shader/shader_ir.cpp @@ -25,9 +25,10 @@ using Tegra::Shader::PredCondition; using Tegra::Shader::PredOperation; using Tegra::Shader::Register; -ShaderIR::ShaderIR(const ProgramCode& program_code, u32 main_offset, CompilerSettings settings, - Registry& registry) - : program_code{program_code}, main_offset{main_offset}, settings{settings}, registry{registry} { +ShaderIR::ShaderIR(const ProgramCode& program_code_, u32 main_offset_, CompilerSettings settings_, + Registry& registry_) + : program_code{program_code_}, main_offset{main_offset_}, settings{settings_}, registry{ + registry_} { Decode(); PostDecode(); } @@ -170,7 +171,7 @@ Node ShaderIR::ConvertIntegerSize(Node value, Register::Size size, bool is_signe // Default - do nothing return value; default: - UNREACHABLE_MSG("Unimplemented conversion size: {}", static_cast<u32>(size)); + UNREACHABLE_MSG("Unimplemented conversion size: {}", size); return value; } } @@ -335,15 +336,15 @@ OperationCode ShaderIR::GetPredicateCombiner(PredOperation operation) { return operation_table[index]; } -Node ShaderIR::GetConditionCode(Tegra::Shader::ConditionCode cc) const { +Node ShaderIR::GetConditionCode(ConditionCode cc) const { switch (cc) { - case Tegra::Shader::ConditionCode::NEU: + case ConditionCode::NEU: return GetInternalFlag(InternalFlag::Zero, true); - case Tegra::Shader::ConditionCode::FCSM_TR: + case ConditionCode::FCSM_TR: UNIMPLEMENTED_MSG("EXIT.FCSM_TR is not implemented"); return MakeNode<PredicateNode>(Pred::NeverExecute, false); default: - UNIMPLEMENTED_MSG("Unimplemented condition code: {}", static_cast<u32>(cc)); + UNIMPLEMENTED_MSG("Unimplemented condition code: {}", cc); return MakeNode<PredicateNode>(Pred::NeverExecute, false); } } @@ -451,8 +452,8 @@ void ShaderIR::MarkAttributeUsage(Attribute::Index index, u64 element) { } std::size_t ShaderIR::DeclareAmend(Node new_amend) { - const std::size_t id = amend_code.size(); - amend_code.push_back(new_amend); + const auto id = amend_code.size(); + amend_code.push_back(std::move(new_amend)); return id; } diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h index 3a98b2104..1cd7c14d7 100644 --- a/src/video_core/shader/shader_ir.h +++ b/src/video_core/shader/shader_ir.h @@ -29,8 +29,8 @@ struct ShaderBlock; constexpr u32 MAX_PROGRAM_LENGTH = 0x1000; struct ConstBuffer { - constexpr explicit ConstBuffer(u32 max_offset, bool is_indirect) - : max_offset{max_offset}, is_indirect{is_indirect} {} + constexpr explicit ConstBuffer(u32 max_offset_, bool is_indirect_) + : max_offset{max_offset_}, is_indirect{is_indirect_} {} constexpr ConstBuffer() = default; @@ -66,8 +66,8 @@ struct GlobalMemoryUsage { class ShaderIR final { public: - explicit ShaderIR(const ProgramCode& program_code, u32 main_offset, CompilerSettings settings, - Registry& registry); + explicit ShaderIR(const ProgramCode& program_code_, u32 main_offset_, + CompilerSettings settings_, Registry& registry_); ~ShaderIR(); const std::map<u32, NodeBlock>& GetBasicBlocks() const { @@ -94,11 +94,11 @@ public: return used_cbufs; } - const std::list<Sampler>& GetSamplers() const { + const std::list<SamplerEntry>& GetSamplers() const { return used_samplers; } - const std::list<Image>& GetImages() const { + const std::list<ImageEntry>& GetImages() const { return used_images; } @@ -139,6 +139,10 @@ public: return uses_legacy_varyings; } + bool UsesYNegate() const { + return uses_y_negate; + } + bool UsesWarps() const { return uses_warps; } @@ -334,17 +338,17 @@ private: std::optional<Tegra::Engines::SamplerDescriptor> sampler); /// Accesses a texture sampler. - std::optional<Sampler> GetSampler(Tegra::Shader::Sampler sampler, SamplerInfo info); + std::optional<SamplerEntry> GetSampler(Tegra::Shader::Sampler sampler, SamplerInfo info); /// Accesses a texture sampler for a bindless texture. - std::optional<Sampler> GetBindlessSampler(Tegra::Shader::Register reg, SamplerInfo info, - Node& index_var); + std::optional<SamplerEntry> GetBindlessSampler(Tegra::Shader::Register reg, SamplerInfo info, + Node& index_var); /// Accesses an image. - Image& GetImage(Tegra::Shader::Image image, Tegra::Shader::ImageType type); + ImageEntry& GetImage(Tegra::Shader::Image image, Tegra::Shader::ImageType type); /// Access a bindless image sampler. - Image& GetBindlessImage(Tegra::Shader::Register reg, Tegra::Shader::ImageType type); + ImageEntry& GetBindlessImage(Tegra::Shader::Register reg, Tegra::Shader::ImageType type); /// Extracts a sequence of bits from a node Node BitfieldExtract(Node value, u32 offset, u32 bits); @@ -454,8 +458,8 @@ private: std::set<Tegra::Shader::Attribute::Index> used_input_attributes; std::set<Tegra::Shader::Attribute::Index> used_output_attributes; std::map<u32, ConstBuffer> used_cbufs; - std::list<Sampler> used_samplers; - std::list<Image> used_images; + std::list<SamplerEntry> used_samplers; + std::list<ImageEntry> used_images; std::array<bool, Tegra::Engines::Maxwell3D::Regs::NumClipDistances> used_clip_distances{}; std::map<GlobalMemoryBase, GlobalMemoryUsage> used_global_memory; bool uses_layer{}; @@ -465,6 +469,7 @@ private: bool uses_instance_id{}; bool uses_vertex_id{}; bool uses_legacy_varyings{}; + bool uses_y_negate{}; bool uses_warps{}; bool uses_indexed_samplers{}; diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp index d5ed81442..6be3ea92b 100644 --- a/src/video_core/shader/track.cpp +++ b/src/video_core/shader/track.cpp @@ -205,12 +205,12 @@ std::optional<u32> ShaderIR::TrackImmediate(Node tracked, const NodeBlock& code, const auto result = TrackRegister(&std::get<GprNode>(*tracked), code, cursor - 1); const auto& found = result.first; if (!found) { - return {}; + return std::nullopt; } if (const auto immediate = std::get_if<ImmediateNode>(&*found)) { return immediate->GetValue(); } - return {}; + return std::nullopt; } std::pair<Node, s64> ShaderIR::TrackRegister(const GprNode* tracked, const NodeBlock& code, diff --git a/src/video_core/surface.cpp b/src/video_core/surface.cpp index 1688267bb..6308aef94 100644 --- a/src/video_core/surface.cpp +++ b/src/video_core/surface.cpp @@ -28,7 +28,7 @@ SurfaceTarget SurfaceTargetFromTextureType(Tegra::Texture::TextureType texture_t case Tegra::Texture::TextureType::Texture2DArray: return SurfaceTarget::Texture2DArray; default: - LOG_CRITICAL(HW_GPU, "Unimplemented texture_type={}", static_cast<u32>(texture_type)); + LOG_CRITICAL(HW_GPU, "Unimplemented texture_type={}", texture_type); UNREACHABLE(); return SurfaceTarget::Texture2D; } @@ -47,7 +47,7 @@ bool SurfaceTargetIsLayered(SurfaceTarget target) { case SurfaceTarget::TextureCubeArray: return true; default: - LOG_CRITICAL(HW_GPU, "Unimplemented surface_target={}", static_cast<u32>(target)); + LOG_CRITICAL(HW_GPU, "Unimplemented surface_target={}", target); UNREACHABLE(); return false; } @@ -66,7 +66,7 @@ bool SurfaceTargetIsArray(SurfaceTarget target) { case SurfaceTarget::TextureCubeArray: return true; default: - LOG_CRITICAL(HW_GPU, "Unimplemented surface_target={}", static_cast<u32>(target)); + LOG_CRITICAL(HW_GPU, "Unimplemented surface_target={}", target); UNREACHABLE(); return false; } @@ -85,7 +85,7 @@ PixelFormat PixelFormatFromDepthFormat(Tegra::DepthFormat format) { case Tegra::DepthFormat::D32_FLOAT_S8X24_UINT: return PixelFormat::D32_FLOAT_S8_UINT; default: - UNIMPLEMENTED_MSG("Unimplemented format={}", static_cast<u32>(format)); + UNIMPLEMENTED_MSG("Unimplemented format={}", format); return PixelFormat::S8_UINT_D24_UNORM; } } @@ -183,7 +183,7 @@ PixelFormat PixelFormatFromRenderTargetFormat(Tegra::RenderTargetFormat format) case Tegra::RenderTargetFormat::R8_UINT: return PixelFormat::R8_UINT; default: - UNIMPLEMENTED_MSG("Unimplemented format={}", static_cast<int>(format)); + UNIMPLEMENTED_MSG("Unimplemented format={}", format); return PixelFormat::A8B8G8R8_UNORM; } } @@ -197,7 +197,7 @@ PixelFormat PixelFormatFromGPUPixelFormat(Tegra::FramebufferConfig::PixelFormat case Tegra::FramebufferConfig::PixelFormat::B8G8R8A8_UNORM: return PixelFormat::B8G8R8A8_UNORM; default: - UNIMPLEMENTED_MSG("Unimplemented format={}", static_cast<u32>(format)); + UNIMPLEMENTED_MSG("Unimplemented format={}", format); return PixelFormat::A8B8G8R8_UNORM; } } @@ -280,7 +280,7 @@ bool IsPixelFormatSRGB(PixelFormat format) { } std::pair<u32, u32> GetASTCBlockSize(PixelFormat format) { - return {GetDefaultBlockWidth(format), GetDefaultBlockHeight(format)}; + return {DefaultBlockWidth(format), DefaultBlockHeight(format)}; } } // namespace VideoCore::Surface diff --git a/src/video_core/surface.h b/src/video_core/surface.h index cfd12fa61..c40ab89d0 100644 --- a/src/video_core/surface.h +++ b/src/video_core/surface.h @@ -120,7 +120,7 @@ enum class PixelFormat { Max = MaxDepthStencilFormat, Invalid = 255, }; -static constexpr std::size_t MaxPixelFormat = static_cast<std::size_t>(PixelFormat::Max); +constexpr std::size_t MaxPixelFormat = static_cast<std::size_t>(PixelFormat::Max); enum class SurfaceType { ColorTexture = 0, @@ -140,117 +140,7 @@ enum class SurfaceTarget { TextureCubeArray, }; -constexpr std::array<u32, MaxPixelFormat> compression_factor_shift_table = {{ - 0, // A8B8G8R8_UNORM - 0, // A8B8G8R8_SNORM - 0, // A8B8G8R8_SINT - 0, // A8B8G8R8_UINT - 0, // R5G6B5_UNORM - 0, // B5G6R5_UNORM - 0, // A1R5G5B5_UNORM - 0, // A2B10G10R10_UNORM - 0, // A2B10G10R10_UINT - 0, // A1B5G5R5_UNORM - 0, // R8_UNORM - 0, // R8_SNORM - 0, // R8_SINT - 0, // R8_UINT - 0, // R16G16B16A16_FLOAT - 0, // R16G16B16A16_UNORM - 0, // R16G16B16A16_SNORM - 0, // R16G16B16A16_SINT - 0, // R16G16B16A16_UINT - 0, // B10G11R11_FLOAT - 0, // R32G32B32A32_UINT - 2, // BC1_RGBA_UNORM - 2, // BC2_UNORM - 2, // BC3_UNORM - 2, // BC4_UNORM - 2, // BC4_SNORM - 2, // BC5_UNORM - 2, // BC5_SNORM - 2, // BC7_UNORM - 2, // BC6H_UFLOAT - 2, // BC6H_SFLOAT - 2, // ASTC_2D_4X4_UNORM - 0, // B8G8R8A8_UNORM - 0, // R32G32B32A32_FLOAT - 0, // R32G32B32A32_SINT - 0, // R32G32_FLOAT - 0, // R32G32_SINT - 0, // R32_FLOAT - 0, // R16_FLOAT - 0, // R16_UNORM - 0, // R16_SNORM - 0, // R16_UINT - 0, // R16_SINT - 0, // R16G16_UNORM - 0, // R16G16_FLOAT - 0, // R16G16_UINT - 0, // R16G16_SINT - 0, // R16G16_SNORM - 0, // R32G32B32_FLOAT - 0, // A8B8G8R8_SRGB - 0, // R8G8_UNORM - 0, // R8G8_SNORM - 0, // R8G8_SINT - 0, // R8G8_UINT - 0, // R32G32_UINT - 0, // R16G16B16X16_FLOAT - 0, // R32_UINT - 0, // R32_SINT - 2, // ASTC_2D_8X8_UNORM - 2, // ASTC_2D_8X5_UNORM - 2, // ASTC_2D_5X4_UNORM - 0, // B8G8R8A8_SRGB - 2, // BC1_RGBA_SRGB - 2, // BC2_SRGB - 2, // BC3_SRGB - 2, // BC7_SRGB - 0, // A4B4G4R4_UNORM - 2, // ASTC_2D_4X4_SRGB - 2, // ASTC_2D_8X8_SRGB - 2, // ASTC_2D_8X5_SRGB - 2, // ASTC_2D_5X4_SRGB - 2, // ASTC_2D_5X5_UNORM - 2, // ASTC_2D_5X5_SRGB - 2, // ASTC_2D_10X8_UNORM - 2, // ASTC_2D_10X8_SRGB - 2, // ASTC_2D_6X6_UNORM - 2, // ASTC_2D_6X6_SRGB - 2, // ASTC_2D_10X10_UNORM - 2, // ASTC_2D_10X10_SRGB - 2, // ASTC_2D_12X12_UNORM - 2, // ASTC_2D_12X12_SRGB - 2, // ASTC_2D_8X6_UNORM - 2, // ASTC_2D_8X6_SRGB - 2, // ASTC_2D_6X5_UNORM - 2, // ASTC_2D_6X5_SRGB - 0, // E5B9G9R9_FLOAT - 0, // D32_FLOAT - 0, // D16_UNORM - 0, // D24_UNORM_S8_UINT - 0, // S8_UINT_D24_UNORM - 0, // D32_FLOAT_S8_UINT -}}; - -/** - * Gets the compression factor for the specified PixelFormat. This applies to just the - * "compressed width" and "compressed height", not the overall compression factor of a - * compressed image. This is used for maintaining proper surface sizes for compressed - * texture formats. - */ -inline constexpr u32 GetCompressionFactorShift(PixelFormat format) { - DEBUG_ASSERT(format != PixelFormat::Invalid); - DEBUG_ASSERT(static_cast<std::size_t>(format) < compression_factor_shift_table.size()); - return compression_factor_shift_table[static_cast<std::size_t>(format)]; -} - -inline constexpr u32 GetCompressionFactor(PixelFormat format) { - return 1U << GetCompressionFactorShift(format); -} - -constexpr std::array<u32, MaxPixelFormat> block_width_table = {{ +constexpr std::array<u32, MaxPixelFormat> BLOCK_WIDTH_TABLE = {{ 1, // A8B8G8R8_UNORM 1, // A8B8G8R8_SNORM 1, // A8B8G8R8_SINT @@ -344,15 +234,12 @@ constexpr std::array<u32, MaxPixelFormat> block_width_table = {{ 1, // D32_FLOAT_S8_UINT }}; -static constexpr u32 GetDefaultBlockWidth(PixelFormat format) { - if (format == PixelFormat::Invalid) - return 0; - - ASSERT(static_cast<std::size_t>(format) < block_width_table.size()); - return block_width_table[static_cast<std::size_t>(format)]; +constexpr u32 DefaultBlockWidth(PixelFormat format) { + ASSERT(static_cast<std::size_t>(format) < BLOCK_WIDTH_TABLE.size()); + return BLOCK_WIDTH_TABLE[static_cast<std::size_t>(format)]; } -constexpr std::array<u32, MaxPixelFormat> block_height_table = {{ +constexpr std::array<u32, MaxPixelFormat> BLOCK_HEIGHT_TABLE = {{ 1, // A8B8G8R8_UNORM 1, // A8B8G8R8_SNORM 1, // A8B8G8R8_SINT @@ -446,15 +333,12 @@ constexpr std::array<u32, MaxPixelFormat> block_height_table = {{ 1, // D32_FLOAT_S8_UINT }}; -static constexpr u32 GetDefaultBlockHeight(PixelFormat format) { - if (format == PixelFormat::Invalid) - return 0; - - ASSERT(static_cast<std::size_t>(format) < block_height_table.size()); - return block_height_table[static_cast<std::size_t>(format)]; +constexpr u32 DefaultBlockHeight(PixelFormat format) { + ASSERT(static_cast<std::size_t>(format) < BLOCK_HEIGHT_TABLE.size()); + return BLOCK_HEIGHT_TABLE[static_cast<std::size_t>(format)]; } -constexpr std::array<u32, MaxPixelFormat> bpp_table = {{ +constexpr std::array<u32, MaxPixelFormat> BITS_PER_BLOCK_TABLE = {{ 32, // A8B8G8R8_UNORM 32, // A8B8G8R8_SNORM 32, // A8B8G8R8_SINT @@ -548,20 +432,14 @@ constexpr std::array<u32, MaxPixelFormat> bpp_table = {{ 64, // D32_FLOAT_S8_UINT }}; -static constexpr u32 GetFormatBpp(PixelFormat format) { - if (format == PixelFormat::Invalid) - return 0; - - ASSERT(static_cast<std::size_t>(format) < bpp_table.size()); - return bpp_table[static_cast<std::size_t>(format)]; +constexpr u32 BitsPerBlock(PixelFormat format) { + ASSERT(static_cast<std::size_t>(format) < BITS_PER_BLOCK_TABLE.size()); + return BITS_PER_BLOCK_TABLE[static_cast<std::size_t>(format)]; } /// Returns the sizer in bytes of the specified pixel format -static constexpr u32 GetBytesPerPixel(PixelFormat pixel_format) { - if (pixel_format == PixelFormat::Invalid) { - return 0; - } - return GetFormatBpp(pixel_format) / CHAR_BIT; +constexpr u32 BytesPerBlock(PixelFormat pixel_format) { + return BitsPerBlock(pixel_format) / CHAR_BIT; } SurfaceTarget SurfaceTargetFromTextureType(Tegra::Texture::TextureType texture_type); diff --git a/src/video_core/texture_cache/accelerated_swizzle.cpp b/src/video_core/texture_cache/accelerated_swizzle.cpp new file mode 100644 index 000000000..15585caeb --- /dev/null +++ b/src/video_core/texture_cache/accelerated_swizzle.cpp @@ -0,0 +1,70 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <array> +#include <bit> + +#include "common/alignment.h" +#include "common/common_types.h" +#include "common/div_ceil.h" +#include "video_core/surface.h" +#include "video_core/texture_cache/accelerated_swizzle.h" +#include "video_core/texture_cache/util.h" +#include "video_core/textures/decoders.h" + +namespace VideoCommon::Accelerated { + +using Tegra::Texture::GOB_SIZE_SHIFT; +using Tegra::Texture::GOB_SIZE_X; +using Tegra::Texture::GOB_SIZE_X_SHIFT; +using Tegra::Texture::GOB_SIZE_Y_SHIFT; +using VideoCore::Surface::BytesPerBlock; + +BlockLinearSwizzle2DParams MakeBlockLinearSwizzle2DParams(const SwizzleParameters& swizzle, + const ImageInfo& info) { + const Extent3D block = swizzle.block; + const Extent3D num_tiles = swizzle.num_tiles; + const u32 bytes_per_block = BytesPerBlock(info.format); + const u32 stride_alignment = CalculateLevelStrideAlignment(info, swizzle.level); + const u32 stride = Common::AlignUpLog2(num_tiles.width, stride_alignment) * bytes_per_block; + const u32 gobs_in_x = Common::DivCeilLog2(stride, GOB_SIZE_X_SHIFT); + return BlockLinearSwizzle2DParams{ + .origin{0, 0, 0}, + .destination{0, 0, 0}, + .bytes_per_block_log2 = static_cast<u32>(std::countr_zero(bytes_per_block)), + .layer_stride = info.layer_stride, + .block_size = gobs_in_x << (GOB_SIZE_SHIFT + block.height + block.depth), + .x_shift = GOB_SIZE_SHIFT + block.height + block.depth, + .block_height = block.height, + .block_height_mask = (1U << block.height) - 1, + }; +} + +BlockLinearSwizzle3DParams MakeBlockLinearSwizzle3DParams(const SwizzleParameters& swizzle, + const ImageInfo& info) { + const Extent3D block = swizzle.block; + const Extent3D num_tiles = swizzle.num_tiles; + const u32 bytes_per_block = BytesPerBlock(info.format); + const u32 stride_alignment = CalculateLevelStrideAlignment(info, swizzle.level); + const u32 stride = Common::AlignUpLog2(num_tiles.width, stride_alignment) * bytes_per_block; + + const u32 gobs_in_x = (stride + GOB_SIZE_X - 1) >> GOB_SIZE_X_SHIFT; + const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block.height + block.depth); + const u32 slice_size = + Common::DivCeilLog2(num_tiles.height, block.height + GOB_SIZE_Y_SHIFT) * block_size; + return BlockLinearSwizzle3DParams{ + .origin{0, 0, 0}, + .destination{0, 0, 0}, + .bytes_per_block_log2 = static_cast<u32>(std::countr_zero(bytes_per_block)), + .slice_size = slice_size, + .block_size = block_size, + .x_shift = GOB_SIZE_SHIFT + block.height + block.depth, + .block_height = block.height, + .block_height_mask = (1U << block.height) - 1, + .block_depth = block.depth, + .block_depth_mask = (1U << block.depth) - 1, + }; +} + +} // namespace VideoCommon::Accelerated
\ No newline at end of file diff --git a/src/video_core/texture_cache/accelerated_swizzle.h b/src/video_core/texture_cache/accelerated_swizzle.h new file mode 100644 index 000000000..6ec5c78c4 --- /dev/null +++ b/src/video_core/texture_cache/accelerated_swizzle.h @@ -0,0 +1,45 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <array> + +#include "common/common_types.h" +#include "video_core/texture_cache/image_info.h" +#include "video_core/texture_cache/types.h" + +namespace VideoCommon::Accelerated { + +struct BlockLinearSwizzle2DParams { + std::array<u32, 3> origin; + std::array<s32, 3> destination; + u32 bytes_per_block_log2; + u32 layer_stride; + u32 block_size; + u32 x_shift; + u32 block_height; + u32 block_height_mask; +}; + +struct BlockLinearSwizzle3DParams { + std::array<u32, 3> origin; + std::array<s32, 3> destination; + u32 bytes_per_block_log2; + u32 slice_size; + u32 block_size; + u32 x_shift; + u32 block_height; + u32 block_height_mask; + u32 block_depth; + u32 block_depth_mask; +}; + +[[nodiscard]] BlockLinearSwizzle2DParams MakeBlockLinearSwizzle2DParams( + const SwizzleParameters& swizzle, const ImageInfo& info); + +[[nodiscard]] BlockLinearSwizzle3DParams MakeBlockLinearSwizzle3DParams( + const SwizzleParameters& swizzle, const ImageInfo& info); + +} // namespace VideoCommon::Accelerated diff --git a/src/video_core/texture_cache/copy_params.h b/src/video_core/texture_cache/copy_params.h deleted file mode 100644 index 9c21a0649..000000000 --- a/src/video_core/texture_cache/copy_params.h +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include "common/common_types.h" - -namespace VideoCommon { - -struct CopyParams { - constexpr CopyParams(u32 source_x, u32 source_y, u32 source_z, u32 dest_x, u32 dest_y, - u32 dest_z, u32 source_level, u32 dest_level, u32 width, u32 height, - u32 depth) - : source_x{source_x}, source_y{source_y}, source_z{source_z}, dest_x{dest_x}, - dest_y{dest_y}, dest_z{dest_z}, source_level{source_level}, - dest_level{dest_level}, width{width}, height{height}, depth{depth} {} - - constexpr CopyParams(u32 width, u32 height, u32 depth, u32 level) - : source_x{}, source_y{}, source_z{}, dest_x{}, dest_y{}, dest_z{}, source_level{level}, - dest_level{level}, width{width}, height{height}, depth{depth} {} - - u32 source_x; - u32 source_y; - u32 source_z; - u32 dest_x; - u32 dest_y; - u32 dest_z; - u32 source_level; - u32 dest_level; - u32 width; - u32 height; - u32 depth; -}; - -} // namespace VideoCommon diff --git a/src/video_core/texture_cache/decode_bc4.cpp b/src/video_core/texture_cache/decode_bc4.cpp new file mode 100644 index 000000000..017327975 --- /dev/null +++ b/src/video_core/texture_cache/decode_bc4.cpp @@ -0,0 +1,97 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <algorithm> +#include <array> +#include <span> + +#include "common/assert.h" +#include "common/common_types.h" +#include "video_core/texture_cache/decode_bc4.h" +#include "video_core/texture_cache/types.h" + +namespace VideoCommon { + +// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_compression_rgtc.txt +[[nodiscard]] constexpr u32 DecompressBlock(u64 bits, u32 x, u32 y) { + const u32 code_offset = 16 + 3 * (4 * y + x); + const u32 code = (bits >> code_offset) & 7; + const u32 red0 = (bits >> 0) & 0xff; + const u32 red1 = (bits >> 8) & 0xff; + if (red0 > red1) { + switch (code) { + case 0: + return red0; + case 1: + return red1; + case 2: + return (6 * red0 + 1 * red1) / 7; + case 3: + return (5 * red0 + 2 * red1) / 7; + case 4: + return (4 * red0 + 3 * red1) / 7; + case 5: + return (3 * red0 + 4 * red1) / 7; + case 6: + return (2 * red0 + 5 * red1) / 7; + case 7: + return (1 * red0 + 6 * red1) / 7; + } + } else { + switch (code) { + case 0: + return red0; + case 1: + return red1; + case 2: + return (4 * red0 + 1 * red1) / 5; + case 3: + return (3 * red0 + 2 * red1) / 5; + case 4: + return (2 * red0 + 3 * red1) / 5; + case 5: + return (1 * red0 + 4 * red1) / 5; + case 6: + return 0; + case 7: + return 0xff; + } + } + return 0; +} + +void DecompressBC4(std::span<const u8> input, Extent3D extent, std::span<u8> output) { + UNIMPLEMENTED_IF_MSG(extent.width % 4 != 0, "Unaligned width={}", extent.width); + UNIMPLEMENTED_IF_MSG(extent.height % 4 != 0, "Unaligned height={}", extent.height); + static constexpr u32 BLOCK_SIZE = 4; + size_t input_offset = 0; + for (u32 slice = 0; slice < extent.depth; ++slice) { + for (u32 block_y = 0; block_y < extent.height / 4; ++block_y) { + for (u32 block_x = 0; block_x < extent.width / 4; ++block_x) { + u64 bits; + std::memcpy(&bits, &input[input_offset], sizeof(bits)); + input_offset += sizeof(bits); + + for (u32 y = 0; y < BLOCK_SIZE; ++y) { + for (u32 x = 0; x < BLOCK_SIZE; ++x) { + const u32 linear_z = slice; + const u32 linear_y = block_y * BLOCK_SIZE + y; + const u32 linear_x = block_x * BLOCK_SIZE + x; + const u32 offset_z = linear_z * extent.width * extent.height; + const u32 offset_y = linear_y * extent.width; + const u32 offset_x = linear_x; + const u32 output_offset = (offset_z + offset_y + offset_x) * 4ULL; + const u32 color = DecompressBlock(bits, x, y); + output[output_offset + 0] = static_cast<u8>(color); + output[output_offset + 1] = 0; + output[output_offset + 2] = 0; + output[output_offset + 3] = 0xff; + } + } + } + } + } +} + +} // namespace VideoCommon diff --git a/src/video_core/texture_cache/decode_bc4.h b/src/video_core/texture_cache/decode_bc4.h new file mode 100644 index 000000000..63fb23508 --- /dev/null +++ b/src/video_core/texture_cache/decode_bc4.h @@ -0,0 +1,16 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <span> + +#include "common/common_types.h" +#include "video_core/texture_cache/types.h" + +namespace VideoCommon { + +void DecompressBC4(std::span<const u8> data, Extent3D extent, std::span<u8> output); + +} // namespace VideoCommon diff --git a/src/video_core/texture_cache/descriptor_table.h b/src/video_core/texture_cache/descriptor_table.h new file mode 100644 index 000000000..3a03b786f --- /dev/null +++ b/src/video_core/texture_cache/descriptor_table.h @@ -0,0 +1,82 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <algorithm> +#include <vector> + +#include "common/common_types.h" +#include "common/div_ceil.h" +#include "common/logging/log.h" +#include "video_core/memory_manager.h" +#include "video_core/rasterizer_interface.h" + +namespace VideoCommon { + +template <typename Descriptor> +class DescriptorTable { +public: + explicit DescriptorTable(Tegra::MemoryManager& gpu_memory_) : gpu_memory{gpu_memory_} {} + + [[nodiscard]] bool Synchornize(GPUVAddr gpu_addr, u32 limit) { + [[likely]] if (current_gpu_addr == gpu_addr && current_limit == limit) { + return false; + } + Refresh(gpu_addr, limit); + return true; + } + + void Invalidate() noexcept { + std::ranges::fill(read_descriptors, 0); + } + + [[nodiscard]] std::pair<Descriptor, bool> Read(u32 index) { + DEBUG_ASSERT(index <= current_limit); + const GPUVAddr gpu_addr = current_gpu_addr + index * sizeof(Descriptor); + std::pair<Descriptor, bool> result; + gpu_memory.ReadBlockUnsafe(gpu_addr, &result.first, sizeof(Descriptor)); + if (IsDescriptorRead(index)) { + result.second = result.first != descriptors[index]; + } else { + MarkDescriptorAsRead(index); + result.second = true; + } + if (result.second) { + descriptors[index] = result.first; + } + return result; + } + + [[nodiscard]] u32 Limit() const noexcept { + return current_limit; + } + +private: + void Refresh(GPUVAddr gpu_addr, u32 limit) { + current_gpu_addr = gpu_addr; + current_limit = limit; + + const size_t num_descriptors = static_cast<size_t>(limit) + 1; + read_descriptors.clear(); + read_descriptors.resize(Common::DivCeil(num_descriptors, 64U), 0); + descriptors.resize(num_descriptors); + } + + void MarkDescriptorAsRead(u32 index) noexcept { + read_descriptors[index / 64] |= 1ULL << (index % 64); + } + + [[nodiscard]] bool IsDescriptorRead(u32 index) const noexcept { + return (read_descriptors[index / 64] & (1ULL << (index % 64))) != 0; + } + + Tegra::MemoryManager& gpu_memory; + GPUVAddr current_gpu_addr{}; + u32 current_limit{}; + std::vector<u64> read_descriptors; + std::vector<Descriptor> descriptors; +}; + +} // namespace VideoCommon diff --git a/src/video_core/texture_cache/format_lookup_table.cpp b/src/video_core/texture_cache/format_lookup_table.cpp index 7d5a75648..ddfb726fe 100644 --- a/src/video_core/texture_cache/format_lookup_table.cpp +++ b/src/video_core/texture_cache/format_lookup_table.cpp @@ -2,7 +2,6 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include <array> #include "common/common_types.h" #include "common/logging/log.h" #include "video_core/texture_cache/format_lookup_table.h" @@ -20,198 +19,207 @@ constexpr auto UNORM = ComponentType::UNORM; constexpr auto SINT = ComponentType::SINT; constexpr auto UINT = ComponentType::UINT; constexpr auto FLOAT = ComponentType::FLOAT; -constexpr bool C = false; // Normal color -constexpr bool S = true; // Srgb - -struct Table { - constexpr Table(TextureFormat texture_format, bool is_srgb, ComponentType red_component, - ComponentType green_component, ComponentType blue_component, - ComponentType alpha_component, PixelFormat pixel_format) - : texture_format{texture_format}, pixel_format{pixel_format}, red_component{red_component}, - green_component{green_component}, blue_component{blue_component}, - alpha_component{alpha_component}, is_srgb{is_srgb} {} - - TextureFormat texture_format; - PixelFormat pixel_format; - ComponentType red_component; - ComponentType green_component; - ComponentType blue_component; - ComponentType alpha_component; - bool is_srgb; -}; -constexpr std::array<Table, 86> DefinitionTable = {{ - {TextureFormat::A8R8G8B8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::A8B8G8R8_UNORM}, - {TextureFormat::A8R8G8B8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::A8B8G8R8_SNORM}, - {TextureFormat::A8R8G8B8, C, UINT, UINT, UINT, UINT, PixelFormat::A8B8G8R8_UINT}, - {TextureFormat::A8R8G8B8, C, SINT, SINT, SINT, SINT, PixelFormat::A8B8G8R8_SINT}, - {TextureFormat::A8R8G8B8, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::A8B8G8R8_SRGB}, - - {TextureFormat::B5G6R5, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::B5G6R5_UNORM}, - - {TextureFormat::A2B10G10R10, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::A2B10G10R10_UNORM}, - {TextureFormat::A2B10G10R10, C, UINT, UINT, UINT, UINT, PixelFormat::A2B10G10R10_UINT}, - - {TextureFormat::A1B5G5R5, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::A1B5G5R5_UNORM}, - - {TextureFormat::A4B4G4R4, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::A4B4G4R4_UNORM}, - - {TextureFormat::R8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::R8_UNORM}, - {TextureFormat::R8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::R8_SNORM}, - {TextureFormat::R8, C, UINT, UINT, UINT, UINT, PixelFormat::R8_UINT}, - {TextureFormat::R8, C, SINT, SINT, SINT, SINT, PixelFormat::R8_SINT}, - - {TextureFormat::R8G8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::R8G8_UNORM}, - {TextureFormat::R8G8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::R8G8_SNORM}, - {TextureFormat::R8G8, C, UINT, UINT, UINT, UINT, PixelFormat::R8G8_UINT}, - {TextureFormat::R8G8, C, SINT, SINT, SINT, SINT, PixelFormat::R8G8_SINT}, - - {TextureFormat::R16G16B16A16, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::R16G16B16A16_SNORM}, - {TextureFormat::R16G16B16A16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::R16G16B16A16_UNORM}, - {TextureFormat::R16G16B16A16, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::R16G16B16A16_FLOAT}, - {TextureFormat::R16G16B16A16, C, UINT, UINT, UINT, UINT, PixelFormat::R16G16B16A16_UINT}, - {TextureFormat::R16G16B16A16, C, SINT, SINT, SINT, SINT, PixelFormat::R16G16B16A16_SINT}, - - {TextureFormat::R16G16, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::R16G16_FLOAT}, - {TextureFormat::R16G16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::R16G16_UNORM}, - {TextureFormat::R16G16, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::R16G16_SNORM}, - {TextureFormat::R16G16, C, UINT, UINT, UINT, UINT, PixelFormat::R16G16_UINT}, - {TextureFormat::R16G16, C, SINT, SINT, SINT, SINT, PixelFormat::R16G16_SINT}, - - {TextureFormat::R16, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::R16_FLOAT}, - {TextureFormat::R16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::R16_UNORM}, - {TextureFormat::R16, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::R16_SNORM}, - {TextureFormat::R16, C, UINT, UINT, UINT, UINT, PixelFormat::R16_UINT}, - {TextureFormat::R16, C, SINT, SINT, SINT, SINT, PixelFormat::R16_SINT}, - - {TextureFormat::B10G11R11, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::B10G11R11_FLOAT}, - - {TextureFormat::R32G32B32A32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::R32G32B32A32_FLOAT}, - {TextureFormat::R32G32B32A32, C, UINT, UINT, UINT, UINT, PixelFormat::R32G32B32A32_UINT}, - {TextureFormat::R32G32B32A32, C, SINT, SINT, SINT, SINT, PixelFormat::R32G32B32A32_SINT}, - - {TextureFormat::R32G32B32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::R32G32B32_FLOAT}, - - {TextureFormat::R32G32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::R32G32_FLOAT}, - {TextureFormat::R32G32, C, UINT, UINT, UINT, UINT, PixelFormat::R32G32_UINT}, - {TextureFormat::R32G32, C, SINT, SINT, SINT, SINT, PixelFormat::R32G32_SINT}, - - {TextureFormat::R32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::R32_FLOAT}, - {TextureFormat::R32, C, UINT, UINT, UINT, UINT, PixelFormat::R32_UINT}, - {TextureFormat::R32, C, SINT, SINT, SINT, SINT, PixelFormat::R32_SINT}, - - {TextureFormat::E5B9G9R9, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::E5B9G9R9_FLOAT}, - - {TextureFormat::D32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::D32_FLOAT}, - {TextureFormat::D16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::D16_UNORM}, - {TextureFormat::S8D24, C, UINT, UNORM, UNORM, UNORM, PixelFormat::S8_UINT_D24_UNORM}, - {TextureFormat::R8G24, C, UINT, UNORM, UNORM, UNORM, PixelFormat::S8_UINT_D24_UNORM}, - {TextureFormat::D32S8, C, FLOAT, UINT, UNORM, UNORM, PixelFormat::D32_FLOAT_S8_UINT}, - - {TextureFormat::BC1_RGBA, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC1_RGBA_UNORM}, - {TextureFormat::BC1_RGBA, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC1_RGBA_SRGB}, - - {TextureFormat::BC2, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC2_UNORM}, - {TextureFormat::BC2, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC2_SRGB}, - - {TextureFormat::BC3, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC3_UNORM}, - {TextureFormat::BC3, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC3_SRGB}, - - {TextureFormat::BC4, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC4_UNORM}, - {TextureFormat::BC4, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::BC4_SNORM}, - - {TextureFormat::BC5, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC5_UNORM}, - {TextureFormat::BC5, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::BC5_SNORM}, - - {TextureFormat::BC7, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC7_UNORM}, - {TextureFormat::BC7, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC7_SRGB}, - - {TextureFormat::BC6H_SFLOAT, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::BC6H_SFLOAT}, - {TextureFormat::BC6H_UFLOAT, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::BC6H_UFLOAT}, - - {TextureFormat::ASTC_2D_4X4, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_4X4_UNORM}, - {TextureFormat::ASTC_2D_4X4, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_4X4_SRGB}, - - {TextureFormat::ASTC_2D_5X4, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_5X4_UNORM}, - {TextureFormat::ASTC_2D_5X4, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_5X4_SRGB}, - - {TextureFormat::ASTC_2D_5X5, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_5X5_UNORM}, - {TextureFormat::ASTC_2D_5X5, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_5X5_SRGB}, - - {TextureFormat::ASTC_2D_8X8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_8X8_UNORM}, - {TextureFormat::ASTC_2D_8X8, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_8X8_SRGB}, - - {TextureFormat::ASTC_2D_8X5, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_8X5_UNORM}, - {TextureFormat::ASTC_2D_8X5, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_8X5_SRGB}, - - {TextureFormat::ASTC_2D_10X8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_10X8_UNORM}, - {TextureFormat::ASTC_2D_10X8, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_10X8_SRGB}, - - {TextureFormat::ASTC_2D_6X6, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_6X6_UNORM}, - {TextureFormat::ASTC_2D_6X6, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_6X6_SRGB}, - - {TextureFormat::ASTC_2D_10X10, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_10X10_UNORM}, - {TextureFormat::ASTC_2D_10X10, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_10X10_SRGB}, - - {TextureFormat::ASTC_2D_12X12, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_12X12_UNORM}, - {TextureFormat::ASTC_2D_12X12, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_12X12_SRGB}, - - {TextureFormat::ASTC_2D_8X6, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_8X6_UNORM}, - {TextureFormat::ASTC_2D_8X6, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_8X6_SRGB}, +constexpr bool LINEAR = false; +constexpr bool SRGB = true; + +constexpr u32 Hash(TextureFormat format, ComponentType red_component, ComponentType green_component, + ComponentType blue_component, ComponentType alpha_component, bool is_srgb) { + u32 hash = is_srgb ? 1 : 0; + hash |= static_cast<u32>(red_component) << 1; + hash |= static_cast<u32>(green_component) << 4; + hash |= static_cast<u32>(blue_component) << 7; + hash |= static_cast<u32>(alpha_component) << 10; + hash |= static_cast<u32>(format) << 13; + return hash; +} - {TextureFormat::ASTC_2D_6X5, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_6X5_UNORM}, - {TextureFormat::ASTC_2D_6X5, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_6X5_SRGB}, -}}; +constexpr u32 Hash(TextureFormat format, ComponentType component, bool is_srgb = LINEAR) { + return Hash(format, component, component, component, component, is_srgb); +} } // Anonymous namespace -FormatLookupTable::FormatLookupTable() { - table.fill(static_cast<u8>(PixelFormat::Invalid)); - - for (const auto& entry : DefinitionTable) { - table[CalculateIndex(entry.texture_format, entry.is_srgb != 0, entry.red_component, - entry.green_component, entry.blue_component, entry.alpha_component)] = - static_cast<u8>(entry.pixel_format); - } -} - -PixelFormat FormatLookupTable::GetPixelFormat(TextureFormat format, bool is_srgb, - ComponentType red_component, - ComponentType green_component, - ComponentType blue_component, - ComponentType alpha_component) const noexcept { - const auto pixel_format = static_cast<PixelFormat>(table[CalculateIndex( - format, is_srgb, red_component, green_component, blue_component, alpha_component)]); - // [[likely]] - if (pixel_format != PixelFormat::Invalid) { - return pixel_format; +PixelFormat PixelFormatFromTextureInfo(TextureFormat format, ComponentType red, ComponentType green, + ComponentType blue, ComponentType alpha, + bool is_srgb) noexcept { + switch (Hash(format, red, green, blue, alpha, is_srgb)) { + case Hash(TextureFormat::A8R8G8B8, UNORM): + return PixelFormat::A8B8G8R8_UNORM; + case Hash(TextureFormat::A8R8G8B8, SNORM): + return PixelFormat::A8B8G8R8_SNORM; + case Hash(TextureFormat::A8R8G8B8, UINT): + return PixelFormat::A8B8G8R8_UINT; + case Hash(TextureFormat::A8R8G8B8, SINT): + return PixelFormat::A8B8G8R8_SINT; + case Hash(TextureFormat::A8R8G8B8, UNORM, SRGB): + return PixelFormat::A8B8G8R8_SRGB; + case Hash(TextureFormat::B5G6R5, UNORM): + return PixelFormat::B5G6R5_UNORM; + case Hash(TextureFormat::A2B10G10R10, UNORM): + return PixelFormat::A2B10G10R10_UNORM; + case Hash(TextureFormat::A2B10G10R10, UINT): + return PixelFormat::A2B10G10R10_UINT; + case Hash(TextureFormat::A1B5G5R5, UNORM): + return PixelFormat::A1B5G5R5_UNORM; + case Hash(TextureFormat::A4B4G4R4, UNORM): + return PixelFormat::A4B4G4R4_UNORM; + case Hash(TextureFormat::R8, UNORM): + return PixelFormat::R8_UNORM; + case Hash(TextureFormat::R8, SNORM): + return PixelFormat::R8_SNORM; + case Hash(TextureFormat::R8, UINT): + return PixelFormat::R8_UINT; + case Hash(TextureFormat::R8, SINT): + return PixelFormat::R8_SINT; + case Hash(TextureFormat::R8G8, UNORM): + return PixelFormat::R8G8_UNORM; + case Hash(TextureFormat::R8G8, SNORM): + return PixelFormat::R8G8_SNORM; + case Hash(TextureFormat::R8G8, UINT): + return PixelFormat::R8G8_UINT; + case Hash(TextureFormat::R8G8, SINT): + return PixelFormat::R8G8_SINT; + case Hash(TextureFormat::R16G16B16A16, FLOAT): + return PixelFormat::R16G16B16A16_FLOAT; + case Hash(TextureFormat::R16G16B16A16, UNORM): + return PixelFormat::R16G16B16A16_UNORM; + case Hash(TextureFormat::R16G16B16A16, SNORM): + return PixelFormat::R16G16B16A16_SNORM; + case Hash(TextureFormat::R16G16B16A16, UINT): + return PixelFormat::R16G16B16A16_UINT; + case Hash(TextureFormat::R16G16B16A16, SINT): + return PixelFormat::R16G16B16A16_SINT; + case Hash(TextureFormat::R16G16, FLOAT): + return PixelFormat::R16G16_FLOAT; + case Hash(TextureFormat::R16G16, UNORM): + return PixelFormat::R16G16_UNORM; + case Hash(TextureFormat::R16G16, SNORM): + return PixelFormat::R16G16_SNORM; + case Hash(TextureFormat::R16G16, UINT): + return PixelFormat::R16G16_UINT; + case Hash(TextureFormat::R16G16, SINT): + return PixelFormat::R16G16_SINT; + case Hash(TextureFormat::R16, FLOAT): + return PixelFormat::R16_FLOAT; + case Hash(TextureFormat::R16, UNORM): + return PixelFormat::R16_UNORM; + case Hash(TextureFormat::R16, SNORM): + return PixelFormat::R16_SNORM; + case Hash(TextureFormat::R16, UINT): + return PixelFormat::R16_UINT; + case Hash(TextureFormat::R16, SINT): + return PixelFormat::R16_SINT; + case Hash(TextureFormat::B10G11R11, FLOAT): + return PixelFormat::B10G11R11_FLOAT; + case Hash(TextureFormat::R32G32B32A32, FLOAT): + return PixelFormat::R32G32B32A32_FLOAT; + case Hash(TextureFormat::R32G32B32A32, UINT): + return PixelFormat::R32G32B32A32_UINT; + case Hash(TextureFormat::R32G32B32A32, SINT): + return PixelFormat::R32G32B32A32_SINT; + case Hash(TextureFormat::R32G32B32, FLOAT): + return PixelFormat::R32G32B32_FLOAT; + case Hash(TextureFormat::R32G32, FLOAT): + return PixelFormat::R32G32_FLOAT; + case Hash(TextureFormat::R32G32, UINT): + return PixelFormat::R32G32_UINT; + case Hash(TextureFormat::R32G32, SINT): + return PixelFormat::R32G32_SINT; + case Hash(TextureFormat::R32, FLOAT): + return PixelFormat::R32_FLOAT; + case Hash(TextureFormat::R32, UINT): + return PixelFormat::R32_UINT; + case Hash(TextureFormat::R32, SINT): + return PixelFormat::R32_SINT; + case Hash(TextureFormat::E5B9G9R9, FLOAT): + return PixelFormat::E5B9G9R9_FLOAT; + case Hash(TextureFormat::D32, FLOAT): + return PixelFormat::D32_FLOAT; + case Hash(TextureFormat::D16, UNORM): + return PixelFormat::D16_UNORM; + case Hash(TextureFormat::S8D24, UINT, UNORM, UNORM, UNORM, LINEAR): + return PixelFormat::S8_UINT_D24_UNORM; + case Hash(TextureFormat::R8G24, UINT, UNORM, UNORM, UNORM, LINEAR): + return PixelFormat::S8_UINT_D24_UNORM; + case Hash(TextureFormat::D32S8, FLOAT, UINT, UNORM, UNORM, LINEAR): + return PixelFormat::D32_FLOAT_S8_UINT; + case Hash(TextureFormat::BC1_RGBA, UNORM, LINEAR): + return PixelFormat::BC1_RGBA_UNORM; + case Hash(TextureFormat::BC1_RGBA, UNORM, SRGB): + return PixelFormat::BC1_RGBA_SRGB; + case Hash(TextureFormat::BC2, UNORM, LINEAR): + return PixelFormat::BC2_UNORM; + case Hash(TextureFormat::BC2, UNORM, SRGB): + return PixelFormat::BC2_SRGB; + case Hash(TextureFormat::BC3, UNORM, LINEAR): + return PixelFormat::BC3_UNORM; + case Hash(TextureFormat::BC3, UNORM, SRGB): + return PixelFormat::BC3_SRGB; + case Hash(TextureFormat::BC4, UNORM): + return PixelFormat::BC4_UNORM; + case Hash(TextureFormat::BC4, SNORM): + return PixelFormat::BC4_SNORM; + case Hash(TextureFormat::BC5, UNORM): + return PixelFormat::BC5_UNORM; + case Hash(TextureFormat::BC5, SNORM): + return PixelFormat::BC5_SNORM; + case Hash(TextureFormat::BC7, UNORM, LINEAR): + return PixelFormat::BC7_UNORM; + case Hash(TextureFormat::BC7, UNORM, SRGB): + return PixelFormat::BC7_SRGB; + case Hash(TextureFormat::BC6H_SFLOAT, FLOAT): + return PixelFormat::BC6H_SFLOAT; + case Hash(TextureFormat::BC6H_UFLOAT, FLOAT): + return PixelFormat::BC6H_UFLOAT; + case Hash(TextureFormat::ASTC_2D_4X4, UNORM, LINEAR): + return PixelFormat::ASTC_2D_4X4_UNORM; + case Hash(TextureFormat::ASTC_2D_4X4, UNORM, SRGB): + return PixelFormat::ASTC_2D_4X4_SRGB; + case Hash(TextureFormat::ASTC_2D_5X4, UNORM, LINEAR): + return PixelFormat::ASTC_2D_5X4_UNORM; + case Hash(TextureFormat::ASTC_2D_5X4, UNORM, SRGB): + return PixelFormat::ASTC_2D_5X4_SRGB; + case Hash(TextureFormat::ASTC_2D_5X5, UNORM, LINEAR): + return PixelFormat::ASTC_2D_5X5_UNORM; + case Hash(TextureFormat::ASTC_2D_5X5, UNORM, SRGB): + return PixelFormat::ASTC_2D_5X5_SRGB; + case Hash(TextureFormat::ASTC_2D_8X8, UNORM, LINEAR): + return PixelFormat::ASTC_2D_8X8_UNORM; + case Hash(TextureFormat::ASTC_2D_8X8, UNORM, SRGB): + return PixelFormat::ASTC_2D_8X8_SRGB; + case Hash(TextureFormat::ASTC_2D_8X5, UNORM, LINEAR): + return PixelFormat::ASTC_2D_8X5_UNORM; + case Hash(TextureFormat::ASTC_2D_8X5, UNORM, SRGB): + return PixelFormat::ASTC_2D_8X5_SRGB; + case Hash(TextureFormat::ASTC_2D_10X8, UNORM, LINEAR): + return PixelFormat::ASTC_2D_10X8_UNORM; + case Hash(TextureFormat::ASTC_2D_10X8, UNORM, SRGB): + return PixelFormat::ASTC_2D_10X8_SRGB; + case Hash(TextureFormat::ASTC_2D_6X6, UNORM, LINEAR): + return PixelFormat::ASTC_2D_6X6_UNORM; + case Hash(TextureFormat::ASTC_2D_6X6, UNORM, SRGB): + return PixelFormat::ASTC_2D_6X6_SRGB; + case Hash(TextureFormat::ASTC_2D_10X10, UNORM, LINEAR): + return PixelFormat::ASTC_2D_10X10_UNORM; + case Hash(TextureFormat::ASTC_2D_10X10, UNORM, SRGB): + return PixelFormat::ASTC_2D_10X10_SRGB; + case Hash(TextureFormat::ASTC_2D_12X12, UNORM, LINEAR): + return PixelFormat::ASTC_2D_12X12_UNORM; + case Hash(TextureFormat::ASTC_2D_12X12, UNORM, SRGB): + return PixelFormat::ASTC_2D_12X12_SRGB; + case Hash(TextureFormat::ASTC_2D_8X6, UNORM, LINEAR): + return PixelFormat::ASTC_2D_8X6_UNORM; + case Hash(TextureFormat::ASTC_2D_8X6, UNORM, SRGB): + return PixelFormat::ASTC_2D_8X6_SRGB; + case Hash(TextureFormat::ASTC_2D_6X5, UNORM, LINEAR): + return PixelFormat::ASTC_2D_6X5_UNORM; + case Hash(TextureFormat::ASTC_2D_6X5, UNORM, SRGB): + return PixelFormat::ASTC_2D_6X5_SRGB; } UNIMPLEMENTED_MSG("texture format={} srgb={} components={{{} {} {} {}}}", - static_cast<int>(format), is_srgb, static_cast<int>(red_component), - static_cast<int>(green_component), static_cast<int>(blue_component), - static_cast<int>(alpha_component)); + static_cast<int>(format), is_srgb, static_cast<int>(red), + static_cast<int>(green), static_cast<int>(blue), static_cast<int>(alpha)); return PixelFormat::A8B8G8R8_UNORM; } -void FormatLookupTable::Set(TextureFormat format, bool is_srgb, ComponentType red_component, - ComponentType green_component, ComponentType blue_component, - ComponentType alpha_component, PixelFormat pixel_format) {} - -std::size_t FormatLookupTable::CalculateIndex(TextureFormat format, bool is_srgb, - ComponentType red_component, - ComponentType green_component, - ComponentType blue_component, - ComponentType alpha_component) noexcept { - const auto format_index = static_cast<std::size_t>(format); - const auto red_index = static_cast<std::size_t>(red_component); - const auto green_index = static_cast<std::size_t>(green_component); - const auto blue_index = static_cast<std::size_t>(blue_component); - const auto alpha_index = static_cast<std::size_t>(alpha_component); - const std::size_t srgb_index = is_srgb ? 1 : 0; - - return format_index * PerFormat + - srgb_index * PerComponent * PerComponent * PerComponent * PerComponent + - alpha_index * PerComponent * PerComponent * PerComponent + - blue_index * PerComponent * PerComponent + green_index * PerComponent + red_index; -} - } // namespace VideoCommon diff --git a/src/video_core/texture_cache/format_lookup_table.h b/src/video_core/texture_cache/format_lookup_table.h index aa77e0a5a..729533999 100644 --- a/src/video_core/texture_cache/format_lookup_table.h +++ b/src/video_core/texture_cache/format_lookup_table.h @@ -4,48 +4,14 @@ #pragma once -#include <array> -#include <limits> #include "video_core/surface.h" #include "video_core/textures/texture.h" namespace VideoCommon { -class FormatLookupTable { -public: - explicit FormatLookupTable(); - - VideoCore::Surface::PixelFormat GetPixelFormat( - Tegra::Texture::TextureFormat format, bool is_srgb, - Tegra::Texture::ComponentType red_component, Tegra::Texture::ComponentType green_component, - Tegra::Texture::ComponentType blue_component, - Tegra::Texture::ComponentType alpha_component) const noexcept; - -private: - static_assert(VideoCore::Surface::MaxPixelFormat <= std::numeric_limits<u8>::max()); - - static constexpr std::size_t NumTextureFormats = 128; - - static constexpr std::size_t PerComponent = 8; - static constexpr std::size_t PerComponents2 = PerComponent * PerComponent; - static constexpr std::size_t PerComponents3 = PerComponents2 * PerComponent; - static constexpr std::size_t PerComponents4 = PerComponents3 * PerComponent; - static constexpr std::size_t PerFormat = PerComponents4 * 2; - - static std::size_t CalculateIndex(Tegra::Texture::TextureFormat format, bool is_srgb, - Tegra::Texture::ComponentType red_component, - Tegra::Texture::ComponentType green_component, - Tegra::Texture::ComponentType blue_component, - Tegra::Texture::ComponentType alpha_component) noexcept; - - void Set(Tegra::Texture::TextureFormat format, bool is_srgb, - Tegra::Texture::ComponentType red_component, - Tegra::Texture::ComponentType green_component, - Tegra::Texture::ComponentType blue_component, - Tegra::Texture::ComponentType alpha_component, - VideoCore::Surface::PixelFormat pixel_format); - - std::array<u8, NumTextureFormats * PerFormat> table; -}; +VideoCore::Surface::PixelFormat PixelFormatFromTextureInfo( + Tegra::Texture::TextureFormat format, Tegra::Texture::ComponentType red_component, + Tegra::Texture::ComponentType green_component, Tegra::Texture::ComponentType blue_component, + Tegra::Texture::ComponentType alpha_component, bool is_srgb) noexcept; } // namespace VideoCommon diff --git a/src/video_core/texture_cache/formatter.cpp b/src/video_core/texture_cache/formatter.cpp new file mode 100644 index 000000000..d10ba4ccd --- /dev/null +++ b/src/video_core/texture_cache/formatter.cpp @@ -0,0 +1,95 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <algorithm> +#include <string> + +#include "video_core/texture_cache/formatter.h" +#include "video_core/texture_cache/image_base.h" +#include "video_core/texture_cache/image_info.h" +#include "video_core/texture_cache/image_view_base.h" +#include "video_core/texture_cache/render_targets.h" + +namespace VideoCommon { + +std::string Name(const ImageBase& image) { + const GPUVAddr gpu_addr = image.gpu_addr; + const ImageInfo& info = image.info; + const u32 width = info.size.width; + const u32 height = info.size.height; + const u32 depth = info.size.depth; + const u32 num_layers = image.info.resources.layers; + const u32 num_levels = image.info.resources.levels; + std::string resource; + if (num_layers > 1) { + resource += fmt::format(":L{}", num_layers); + } + if (num_levels > 1) { + resource += fmt::format(":M{}", num_levels); + } + switch (image.info.type) { + case ImageType::e1D: + return fmt::format("Image 1D 0x{:x} {}{}", gpu_addr, width, resource); + case ImageType::e2D: + return fmt::format("Image 2D 0x{:x} {}x{}{}", gpu_addr, width, height, resource); + case ImageType::e3D: + return fmt::format("Image 2D 0x{:x} {}x{}x{}{}", gpu_addr, width, height, depth, resource); + case ImageType::Linear: + return fmt::format("Image Linear 0x{:x} {}x{}", gpu_addr, width, height); + case ImageType::Buffer: + return fmt::format("Buffer 0x{:x} {}", image.gpu_addr, image.info.size.width); + } + return "Invalid"; +} + +std::string Name(const ImageViewBase& image_view, std::optional<ImageViewType> type) { + const u32 width = image_view.size.width; + const u32 height = image_view.size.height; + const u32 depth = image_view.size.depth; + const u32 num_levels = image_view.range.extent.levels; + const u32 num_layers = image_view.range.extent.layers; + + const std::string level = num_levels > 1 ? fmt::format(":{}", num_levels) : ""; + switch (type.value_or(image_view.type)) { + case ImageViewType::e1D: + return fmt::format("ImageView 1D {}{}", width, level); + case ImageViewType::e2D: + return fmt::format("ImageView 2D {}x{}{}", width, height, level); + case ImageViewType::Cube: + return fmt::format("ImageView Cube {}x{}{}", width, height, level); + case ImageViewType::e3D: + return fmt::format("ImageView 3D {}x{}x{}{}", width, height, depth, level); + case ImageViewType::e1DArray: + return fmt::format("ImageView 1DArray {}{}|{}", width, level, num_layers); + case ImageViewType::e2DArray: + return fmt::format("ImageView 2DArray {}x{}{}|{}", width, height, level, num_layers); + case ImageViewType::CubeArray: + return fmt::format("ImageView CubeArray {}x{}{}|{}", width, height, level, num_layers); + case ImageViewType::Rect: + return fmt::format("ImageView Rect {}x{}{}", width, height, level); + case ImageViewType::Buffer: + return fmt::format("BufferView {}", width); + } + return "Invalid"; +} + +std::string Name(const RenderTargets& render_targets) { + std::string_view debug_prefix; + const auto num_color = std::ranges::count_if( + render_targets.color_buffer_ids, [](ImageViewId id) { return static_cast<bool>(id); }); + if (render_targets.depth_buffer_id) { + debug_prefix = num_color > 0 ? "R" : "Z"; + } else { + debug_prefix = num_color > 0 ? "C" : "X"; + } + const Extent2D size = render_targets.size; + if (num_color > 0) { + return fmt::format("Framebuffer {}{} {}x{}", debug_prefix, num_color, size.width, + size.height); + } else { + return fmt::format("Framebuffer {} {}x{}", debug_prefix, size.width, size.height); + } +} + +} // namespace VideoCommon diff --git a/src/video_core/texture_cache/formatter.h b/src/video_core/texture_cache/formatter.h new file mode 100644 index 000000000..a48413983 --- /dev/null +++ b/src/video_core/texture_cache/formatter.h @@ -0,0 +1,263 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <string> + +#include <fmt/format.h> + +#include "video_core/surface.h" +#include "video_core/texture_cache/types.h" + +template <> +struct fmt::formatter<VideoCore::Surface::PixelFormat> : fmt::formatter<fmt::string_view> { + template <typename FormatContext> + auto format(VideoCore::Surface::PixelFormat format, FormatContext& ctx) { + using VideoCore::Surface::PixelFormat; + const string_view name = [format] { + switch (format) { + case PixelFormat::A8B8G8R8_UNORM: + return "A8B8G8R8_UNORM"; + case PixelFormat::A8B8G8R8_SNORM: + return "A8B8G8R8_SNORM"; + case PixelFormat::A8B8G8R8_SINT: + return "A8B8G8R8_SINT"; + case PixelFormat::A8B8G8R8_UINT: + return "A8B8G8R8_UINT"; + case PixelFormat::R5G6B5_UNORM: + return "R5G6B5_UNORM"; + case PixelFormat::B5G6R5_UNORM: + return "B5G6R5_UNORM"; + case PixelFormat::A1R5G5B5_UNORM: + return "A1R5G5B5_UNORM"; + case PixelFormat::A2B10G10R10_UNORM: + return "A2B10G10R10_UNORM"; + case PixelFormat::A2B10G10R10_UINT: + return "A2B10G10R10_UINT"; + case PixelFormat::A1B5G5R5_UNORM: + return "A1B5G5R5_UNORM"; + case PixelFormat::R8_UNORM: + return "R8_UNORM"; + case PixelFormat::R8_SNORM: + return "R8_SNORM"; + case PixelFormat::R8_SINT: + return "R8_SINT"; + case PixelFormat::R8_UINT: + return "R8_UINT"; + case PixelFormat::R16G16B16A16_FLOAT: + return "R16G16B16A16_FLOAT"; + case PixelFormat::R16G16B16A16_UNORM: + return "R16G16B16A16_UNORM"; + case PixelFormat::R16G16B16A16_SNORM: + return "R16G16B16A16_SNORM"; + case PixelFormat::R16G16B16A16_SINT: + return "R16G16B16A16_SINT"; + case PixelFormat::R16G16B16A16_UINT: + return "R16G16B16A16_UINT"; + case PixelFormat::B10G11R11_FLOAT: + return "B10G11R11_FLOAT"; + case PixelFormat::R32G32B32A32_UINT: + return "R32G32B32A32_UINT"; + case PixelFormat::BC1_RGBA_UNORM: + return "BC1_RGBA_UNORM"; + case PixelFormat::BC2_UNORM: + return "BC2_UNORM"; + case PixelFormat::BC3_UNORM: + return "BC3_UNORM"; + case PixelFormat::BC4_UNORM: + return "BC4_UNORM"; + case PixelFormat::BC4_SNORM: + return "BC4_SNORM"; + case PixelFormat::BC5_UNORM: + return "BC5_UNORM"; + case PixelFormat::BC5_SNORM: + return "BC5_SNORM"; + case PixelFormat::BC7_UNORM: + return "BC7_UNORM"; + case PixelFormat::BC6H_UFLOAT: + return "BC6H_UFLOAT"; + case PixelFormat::BC6H_SFLOAT: + return "BC6H_SFLOAT"; + case PixelFormat::ASTC_2D_4X4_UNORM: + return "ASTC_2D_4X4_UNORM"; + case PixelFormat::B8G8R8A8_UNORM: + return "B8G8R8A8_UNORM"; + case PixelFormat::R32G32B32A32_FLOAT: + return "R32G32B32A32_FLOAT"; + case PixelFormat::R32G32B32A32_SINT: + return "R32G32B32A32_SINT"; + case PixelFormat::R32G32_FLOAT: + return "R32G32_FLOAT"; + case PixelFormat::R32G32_SINT: + return "R32G32_SINT"; + case PixelFormat::R32_FLOAT: + return "R32_FLOAT"; + case PixelFormat::R16_FLOAT: + return "R16_FLOAT"; + case PixelFormat::R16_UNORM: + return "R16_UNORM"; + case PixelFormat::R16_SNORM: + return "R16_SNORM"; + case PixelFormat::R16_UINT: + return "R16_UINT"; + case PixelFormat::R16_SINT: + return "R16_SINT"; + case PixelFormat::R16G16_UNORM: + return "R16G16_UNORM"; + case PixelFormat::R16G16_FLOAT: + return "R16G16_FLOAT"; + case PixelFormat::R16G16_UINT: + return "R16G16_UINT"; + case PixelFormat::R16G16_SINT: + return "R16G16_SINT"; + case PixelFormat::R16G16_SNORM: + return "R16G16_SNORM"; + case PixelFormat::R32G32B32_FLOAT: + return "R32G32B32_FLOAT"; + case PixelFormat::A8B8G8R8_SRGB: + return "A8B8G8R8_SRGB"; + case PixelFormat::R8G8_UNORM: + return "R8G8_UNORM"; + case PixelFormat::R8G8_SNORM: + return "R8G8_SNORM"; + case PixelFormat::R8G8_SINT: + return "R8G8_SINT"; + case PixelFormat::R8G8_UINT: + return "R8G8_UINT"; + case PixelFormat::R32G32_UINT: + return "R32G32_UINT"; + case PixelFormat::R16G16B16X16_FLOAT: + return "R16G16B16X16_FLOAT"; + case PixelFormat::R32_UINT: + return "R32_UINT"; + case PixelFormat::R32_SINT: + return "R32_SINT"; + case PixelFormat::ASTC_2D_8X8_UNORM: + return "ASTC_2D_8X8_UNORM"; + case PixelFormat::ASTC_2D_8X5_UNORM: + return "ASTC_2D_8X5_UNORM"; + case PixelFormat::ASTC_2D_5X4_UNORM: + return "ASTC_2D_5X4_UNORM"; + case PixelFormat::B8G8R8A8_SRGB: + return "B8G8R8A8_SRGB"; + case PixelFormat::BC1_RGBA_SRGB: + return "BC1_RGBA_SRGB"; + case PixelFormat::BC2_SRGB: + return "BC2_SRGB"; + case PixelFormat::BC3_SRGB: + return "BC3_SRGB"; + case PixelFormat::BC7_SRGB: + return "BC7_SRGB"; + case PixelFormat::A4B4G4R4_UNORM: + return "A4B4G4R4_UNORM"; + case PixelFormat::ASTC_2D_4X4_SRGB: + return "ASTC_2D_4X4_SRGB"; + case PixelFormat::ASTC_2D_8X8_SRGB: + return "ASTC_2D_8X8_SRGB"; + case PixelFormat::ASTC_2D_8X5_SRGB: + return "ASTC_2D_8X5_SRGB"; + case PixelFormat::ASTC_2D_5X4_SRGB: + return "ASTC_2D_5X4_SRGB"; + case PixelFormat::ASTC_2D_5X5_UNORM: + return "ASTC_2D_5X5_UNORM"; + case PixelFormat::ASTC_2D_5X5_SRGB: + return "ASTC_2D_5X5_SRGB"; + case PixelFormat::ASTC_2D_10X8_UNORM: + return "ASTC_2D_10X8_UNORM"; + case PixelFormat::ASTC_2D_10X8_SRGB: + return "ASTC_2D_10X8_SRGB"; + case PixelFormat::ASTC_2D_6X6_UNORM: + return "ASTC_2D_6X6_UNORM"; + case PixelFormat::ASTC_2D_6X6_SRGB: + return "ASTC_2D_6X6_SRGB"; + case PixelFormat::ASTC_2D_10X10_UNORM: + return "ASTC_2D_10X10_UNORM"; + case PixelFormat::ASTC_2D_10X10_SRGB: + return "ASTC_2D_10X10_SRGB"; + case PixelFormat::ASTC_2D_12X12_UNORM: + return "ASTC_2D_12X12_UNORM"; + case PixelFormat::ASTC_2D_12X12_SRGB: + return "ASTC_2D_12X12_SRGB"; + case PixelFormat::ASTC_2D_8X6_UNORM: + return "ASTC_2D_8X6_UNORM"; + case PixelFormat::ASTC_2D_8X6_SRGB: + return "ASTC_2D_8X6_SRGB"; + case PixelFormat::ASTC_2D_6X5_UNORM: + return "ASTC_2D_6X5_UNORM"; + case PixelFormat::ASTC_2D_6X5_SRGB: + return "ASTC_2D_6X5_SRGB"; + case PixelFormat::E5B9G9R9_FLOAT: + return "E5B9G9R9_FLOAT"; + case PixelFormat::D32_FLOAT: + return "D32_FLOAT"; + case PixelFormat::D16_UNORM: + return "D16_UNORM"; + case PixelFormat::D24_UNORM_S8_UINT: + return "D24_UNORM_S8_UINT"; + case PixelFormat::S8_UINT_D24_UNORM: + return "S8_UINT_D24_UNORM"; + case PixelFormat::D32_FLOAT_S8_UINT: + return "D32_FLOAT_S8_UINT"; + case PixelFormat::MaxDepthStencilFormat: + case PixelFormat::Invalid: + return "Invalid"; + } + return "Invalid"; + }(); + return formatter<string_view>::format(name, ctx); + } +}; + +template <> +struct fmt::formatter<VideoCommon::ImageType> : fmt::formatter<fmt::string_view> { + template <typename FormatContext> + auto format(VideoCommon::ImageType type, FormatContext& ctx) { + const string_view name = [type] { + using VideoCommon::ImageType; + switch (type) { + case ImageType::e1D: + return "1D"; + case ImageType::e2D: + return "2D"; + case ImageType::e3D: + return "3D"; + case ImageType::Linear: + return "Linear"; + case ImageType::Buffer: + return "Buffer"; + } + return "Invalid"; + }(); + return formatter<string_view>::format(name, ctx); + } +}; + +template <> +struct fmt::formatter<VideoCommon::Extent3D> { + constexpr auto parse(fmt::format_parse_context& ctx) { + return ctx.begin(); + } + + template <typename FormatContext> + auto format(const VideoCommon::Extent3D& extent, FormatContext& ctx) { + return fmt::format_to(ctx.out(), "{{{}, {}, {}}}", extent.width, extent.height, + extent.depth); + } +}; + +namespace VideoCommon { + +struct ImageBase; +struct ImageViewBase; +struct RenderTargets; + +[[nodiscard]] std::string Name(const ImageBase& image); + +[[nodiscard]] std::string Name(const ImageViewBase& image_view, + std::optional<ImageViewType> type = std::nullopt); + +[[nodiscard]] std::string Name(const RenderTargets& render_targets); + +} // namespace VideoCommon diff --git a/src/video_core/texture_cache/image_base.cpp b/src/video_core/texture_cache/image_base.cpp new file mode 100644 index 000000000..959b3f115 --- /dev/null +++ b/src/video_core/texture_cache/image_base.cpp @@ -0,0 +1,218 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <algorithm> +#include <optional> +#include <utility> +#include <vector> + +#include "common/common_types.h" +#include "video_core/surface.h" +#include "video_core/texture_cache/formatter.h" +#include "video_core/texture_cache/image_base.h" +#include "video_core/texture_cache/image_view_info.h" +#include "video_core/texture_cache/util.h" + +namespace VideoCommon { + +using VideoCore::Surface::DefaultBlockHeight; +using VideoCore::Surface::DefaultBlockWidth; + +namespace { +/// Returns the base layer and mip level offset +[[nodiscard]] std::pair<s32, s32> LayerMipOffset(s32 diff, u32 layer_stride) { + if (layer_stride == 0) { + return {0, diff}; + } else { + return {diff / layer_stride, diff % layer_stride}; + } +} + +[[nodiscard]] bool ValidateLayers(const SubresourceLayers& layers, const ImageInfo& info) { + return layers.base_level < info.resources.levels && + layers.base_layer + layers.num_layers <= info.resources.layers; +} + +[[nodiscard]] bool ValidateCopy(const ImageCopy& copy, const ImageInfo& dst, const ImageInfo& src) { + const Extent3D src_size = MipSize(src.size, copy.src_subresource.base_level); + const Extent3D dst_size = MipSize(dst.size, copy.dst_subresource.base_level); + if (!ValidateLayers(copy.src_subresource, src)) { + return false; + } + if (!ValidateLayers(copy.dst_subresource, dst)) { + return false; + } + if (copy.src_offset.x + copy.extent.width > src_size.width || + copy.src_offset.y + copy.extent.height > src_size.height || + copy.src_offset.z + copy.extent.depth > src_size.depth) { + return false; + } + if (copy.dst_offset.x + copy.extent.width > dst_size.width || + copy.dst_offset.y + copy.extent.height > dst_size.height || + copy.dst_offset.z + copy.extent.depth > dst_size.depth) { + return false; + } + return true; +} +} // Anonymous namespace + +ImageBase::ImageBase(const ImageInfo& info_, GPUVAddr gpu_addr_, VAddr cpu_addr_) + : info{info_}, guest_size_bytes{CalculateGuestSizeInBytes(info)}, + unswizzled_size_bytes{CalculateUnswizzledSizeBytes(info)}, + converted_size_bytes{CalculateConvertedSizeBytes(info)}, gpu_addr{gpu_addr_}, + cpu_addr{cpu_addr_}, cpu_addr_end{cpu_addr + guest_size_bytes}, + mip_level_offsets{CalculateMipLevelOffsets(info)} { + if (info.type == ImageType::e3D) { + slice_offsets = CalculateSliceOffsets(info); + slice_subresources = CalculateSliceSubresources(info); + } +} + +std::optional<SubresourceBase> ImageBase::TryFindBase(GPUVAddr other_addr) const noexcept { + if (other_addr < gpu_addr) { + // Subresource address can't be lower than the base + return std::nullopt; + } + const u32 diff = static_cast<u32>(other_addr - gpu_addr); + if (diff > guest_size_bytes) { + // This can happen when two CPU addresses are used for different GPU addresses + return std::nullopt; + } + if (info.type != ImageType::e3D) { + const auto [layer, mip_offset] = LayerMipOffset(diff, info.layer_stride); + const auto end = mip_level_offsets.begin() + info.resources.levels; + const auto it = std::find(mip_level_offsets.begin(), end, mip_offset); + if (layer > info.resources.layers || it == end) { + return std::nullopt; + } + return SubresourceBase{ + .level = static_cast<s32>(std::distance(mip_level_offsets.begin(), it)), + .layer = layer, + }; + } else { + // TODO: Consider using binary_search after a threshold + const auto it = std::ranges::find(slice_offsets, diff); + if (it == slice_offsets.cend()) { + return std::nullopt; + } + return slice_subresources[std::distance(slice_offsets.begin(), it)]; + } +} + +ImageViewId ImageBase::FindView(const ImageViewInfo& view_info) const noexcept { + const auto it = std::ranges::find(image_view_infos, view_info); + if (it == image_view_infos.end()) { + return ImageViewId{}; + } + return image_view_ids[std::distance(image_view_infos.begin(), it)]; +} + +void ImageBase::InsertView(const ImageViewInfo& view_info, ImageViewId image_view_id) { + image_view_infos.push_back(view_info); + image_view_ids.push_back(image_view_id); +} + +void AddImageAlias(ImageBase& lhs, ImageBase& rhs, ImageId lhs_id, ImageId rhs_id) { + static constexpr auto OPTIONS = RelaxedOptions::Size | RelaxedOptions::Format; + ASSERT(lhs.info.type == rhs.info.type); + std::optional<SubresourceBase> base; + if (lhs.info.type == ImageType::Linear) { + base = SubresourceBase{.level = 0, .layer = 0}; + } else { + // We are passing relaxed formats as an option, having broken views or not won't matter + static constexpr bool broken_views = false; + base = FindSubresource(rhs.info, lhs, rhs.gpu_addr, OPTIONS, broken_views); + } + if (!base) { + LOG_ERROR(HW_GPU, "Image alias should have been flipped"); + return; + } + const PixelFormat lhs_format = lhs.info.format; + const PixelFormat rhs_format = rhs.info.format; + const Extent2D lhs_block{ + .width = DefaultBlockWidth(lhs_format), + .height = DefaultBlockHeight(lhs_format), + }; + const Extent2D rhs_block{ + .width = DefaultBlockWidth(rhs_format), + .height = DefaultBlockHeight(rhs_format), + }; + const bool is_lhs_compressed = lhs_block.width > 1 || lhs_block.height > 1; + const bool is_rhs_compressed = rhs_block.width > 1 || rhs_block.height > 1; + if (is_lhs_compressed && is_rhs_compressed) { + LOG_ERROR(HW_GPU, "Compressed to compressed image aliasing is not implemented"); + return; + } + const s32 lhs_mips = lhs.info.resources.levels; + const s32 rhs_mips = rhs.info.resources.levels; + const s32 num_mips = std::min(lhs_mips - base->level, rhs_mips); + AliasedImage lhs_alias; + AliasedImage rhs_alias; + lhs_alias.id = rhs_id; + rhs_alias.id = lhs_id; + lhs_alias.copies.reserve(num_mips); + rhs_alias.copies.reserve(num_mips); + for (s32 mip_level = 0; mip_level < num_mips; ++mip_level) { + Extent3D lhs_size = MipSize(lhs.info.size, base->level + mip_level); + Extent3D rhs_size = MipSize(rhs.info.size, mip_level); + if (is_lhs_compressed) { + lhs_size.width /= lhs_block.width; + lhs_size.height /= lhs_block.height; + } + if (is_rhs_compressed) { + rhs_size.width /= rhs_block.width; + rhs_size.height /= rhs_block.height; + } + const Extent3D copy_size{ + .width = std::min(lhs_size.width, rhs_size.width), + .height = std::min(lhs_size.height, rhs_size.height), + .depth = std::min(lhs_size.depth, rhs_size.depth), + }; + if (copy_size.width == 0 || copy_size.height == 0) { + LOG_WARNING(HW_GPU, "Copy size is smaller than block size. Mip cannot be aliased."); + continue; + } + const bool is_lhs_3d = lhs.info.type == ImageType::e3D; + const bool is_rhs_3d = rhs.info.type == ImageType::e3D; + const Offset3D lhs_offset{0, 0, 0}; + const Offset3D rhs_offset{0, 0, is_rhs_3d ? base->layer : 0}; + const s32 lhs_layers = is_lhs_3d ? 1 : lhs.info.resources.layers - base->layer; + const s32 rhs_layers = is_rhs_3d ? 1 : rhs.info.resources.layers; + const s32 num_layers = std::min(lhs_layers, rhs_layers); + const SubresourceLayers lhs_subresource{ + .base_level = mip_level, + .base_layer = 0, + .num_layers = num_layers, + }; + const SubresourceLayers rhs_subresource{ + .base_level = base->level + mip_level, + .base_layer = is_rhs_3d ? 0 : base->layer, + .num_layers = num_layers, + }; + [[maybe_unused]] const ImageCopy& to_lhs_copy = lhs_alias.copies.emplace_back(ImageCopy{ + .src_subresource = lhs_subresource, + .dst_subresource = rhs_subresource, + .src_offset = lhs_offset, + .dst_offset = rhs_offset, + .extent = copy_size, + }); + [[maybe_unused]] const ImageCopy& to_rhs_copy = rhs_alias.copies.emplace_back(ImageCopy{ + .src_subresource = rhs_subresource, + .dst_subresource = lhs_subresource, + .src_offset = rhs_offset, + .dst_offset = lhs_offset, + .extent = copy_size, + }); + ASSERT_MSG(ValidateCopy(to_lhs_copy, lhs.info, rhs.info), "Invalid RHS to LHS copy"); + ASSERT_MSG(ValidateCopy(to_rhs_copy, rhs.info, lhs.info), "Invalid LHS to RHS copy"); + } + ASSERT(lhs_alias.copies.empty() == rhs_alias.copies.empty()); + if (lhs_alias.copies.empty()) { + return; + } + lhs.aliased_images.push_back(std::move(lhs_alias)); + rhs.aliased_images.push_back(std::move(rhs_alias)); +} + +} // namespace VideoCommon diff --git a/src/video_core/texture_cache/image_base.h b/src/video_core/texture_cache/image_base.h new file mode 100644 index 000000000..b7f3b7e43 --- /dev/null +++ b/src/video_core/texture_cache/image_base.h @@ -0,0 +1,83 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <array> +#include <optional> +#include <vector> + +#include "common/common_funcs.h" +#include "common/common_types.h" +#include "video_core/texture_cache/image_info.h" +#include "video_core/texture_cache/image_view_info.h" +#include "video_core/texture_cache/types.h" + +namespace VideoCommon { + +enum class ImageFlagBits : u32 { + AcceleratedUpload = 1 << 0, ///< Upload can be accelerated in the GPU + Converted = 1 << 1, ///< Guest format is not supported natively and it has to be converted + CpuModified = 1 << 2, ///< Contents have been modified from the CPU + GpuModified = 1 << 3, ///< Contents have been modified from the GPU + Tracked = 1 << 4, ///< Writes and reads are being hooked from the CPU JIT + Strong = 1 << 5, ///< Exists in the image table, the dimensions are can be trusted + Registered = 1 << 6, ///< True when the image is registered + Picked = 1 << 7, ///< Temporary flag to mark the image as picked +}; +DECLARE_ENUM_FLAG_OPERATORS(ImageFlagBits) + +struct ImageViewInfo; + +struct AliasedImage { + std::vector<ImageCopy> copies; + ImageId id; +}; + +struct ImageBase { + explicit ImageBase(const ImageInfo& info, GPUVAddr gpu_addr, VAddr cpu_addr); + + [[nodiscard]] std::optional<SubresourceBase> TryFindBase(GPUVAddr other_addr) const noexcept; + + [[nodiscard]] ImageViewId FindView(const ImageViewInfo& view_info) const noexcept; + + void InsertView(const ImageViewInfo& view_info, ImageViewId image_view_id); + + [[nodiscard]] bool Overlaps(VAddr overlap_cpu_addr, size_t overlap_size) const noexcept { + const VAddr overlap_end = overlap_cpu_addr + overlap_size; + return cpu_addr < overlap_end && overlap_cpu_addr < cpu_addr_end; + } + + ImageInfo info; + + u32 guest_size_bytes = 0; + u32 unswizzled_size_bytes = 0; + u32 converted_size_bytes = 0; + ImageFlagBits flags = ImageFlagBits::CpuModified; + + GPUVAddr gpu_addr = 0; + VAddr cpu_addr = 0; + VAddr cpu_addr_end = 0; + + u64 modification_tick = 0; + u64 frame_tick = 0; + + std::array<u32, MAX_MIP_LEVELS> mip_level_offsets{}; + + std::vector<ImageViewInfo> image_view_infos; + std::vector<ImageViewId> image_view_ids; + + std::vector<u32> slice_offsets; + std::vector<SubresourceBase> slice_subresources; + + std::vector<AliasedImage> aliased_images; +}; + +struct ImageAllocBase { + std::vector<ImageId> images; +}; + +void AddImageAlias(ImageBase& lhs, ImageBase& rhs, ImageId lhs_id, ImageId rhs_id); + +} // namespace VideoCommon diff --git a/src/video_core/texture_cache/image_info.cpp b/src/video_core/texture_cache/image_info.cpp new file mode 100644 index 000000000..64fd7010a --- /dev/null +++ b/src/video_core/texture_cache/image_info.cpp @@ -0,0 +1,189 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "common/assert.h" +#include "video_core/surface.h" +#include "video_core/texture_cache/format_lookup_table.h" +#include "video_core/texture_cache/image_info.h" +#include "video_core/texture_cache/samples_helper.h" +#include "video_core/texture_cache/types.h" +#include "video_core/texture_cache/util.h" +#include "video_core/textures/texture.h" + +namespace VideoCommon { + +using Tegra::Texture::TextureType; +using Tegra::Texture::TICEntry; +using VideoCore::Surface::PixelFormat; + +ImageInfo::ImageInfo(const TICEntry& config) noexcept { + format = PixelFormatFromTextureInfo(config.format, config.r_type, config.g_type, config.b_type, + config.a_type, config.srgb_conversion); + num_samples = NumSamples(config.msaa_mode); + resources.levels = config.max_mip_level + 1; + if (config.IsPitchLinear()) { + pitch = config.Pitch(); + } else if (config.IsBlockLinear()) { + block = Extent3D{ + .width = config.block_width, + .height = config.block_height, + .depth = config.block_depth, + }; + } + tile_width_spacing = config.tile_width_spacing; + if (config.texture_type != TextureType::Texture2D && + config.texture_type != TextureType::Texture2DNoMipmap) { + ASSERT(!config.IsPitchLinear()); + } + switch (config.texture_type) { + case TextureType::Texture1D: + ASSERT(config.BaseLayer() == 0); + type = ImageType::e1D; + size.width = config.Width(); + break; + case TextureType::Texture1DArray: + UNIMPLEMENTED_IF(config.BaseLayer() != 0); + type = ImageType::e1D; + size.width = config.Width(); + resources.layers = config.Depth(); + break; + case TextureType::Texture2D: + case TextureType::Texture2DNoMipmap: + ASSERT(config.Depth() == 1); + type = config.IsPitchLinear() ? ImageType::Linear : ImageType::e2D; + size.width = config.Width(); + size.height = config.Height(); + resources.layers = config.BaseLayer() + 1; + break; + case TextureType::Texture2DArray: + type = ImageType::e2D; + size.width = config.Width(); + size.height = config.Height(); + resources.layers = config.BaseLayer() + config.Depth(); + break; + case TextureType::TextureCubemap: + ASSERT(config.Depth() == 1); + type = ImageType::e2D; + size.width = config.Width(); + size.height = config.Height(); + resources.layers = config.BaseLayer() + 6; + break; + case TextureType::TextureCubeArray: + UNIMPLEMENTED_IF(config.load_store_hint != 0); + type = ImageType::e2D; + size.width = config.Width(); + size.height = config.Height(); + resources.layers = config.BaseLayer() + config.Depth() * 6; + break; + case TextureType::Texture3D: + ASSERT(config.BaseLayer() == 0); + type = ImageType::e3D; + size.width = config.Width(); + size.height = config.Height(); + size.depth = config.Depth(); + break; + case TextureType::Texture1DBuffer: + type = ImageType::Buffer; + size.width = config.Width(); + break; + default: + UNREACHABLE_MSG("Invalid texture_type={}", static_cast<int>(config.texture_type.Value())); + break; + } + if (type != ImageType::Linear) { + // FIXME: Call this without passing *this + layer_stride = CalculateLayerStride(*this); + maybe_unaligned_layer_stride = CalculateLayerSize(*this); + } +} + +ImageInfo::ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs, size_t index) noexcept { + const auto& rt = regs.rt[index]; + format = VideoCore::Surface::PixelFormatFromRenderTargetFormat(rt.format); + if (rt.tile_mode.is_pitch_linear) { + ASSERT(rt.tile_mode.is_3d == 0); + type = ImageType::Linear; + pitch = rt.width; + size = Extent3D{ + .width = pitch / BytesPerBlock(format), + .height = rt.height, + .depth = 1, + }; + return; + } + size.width = rt.width; + size.height = rt.height; + layer_stride = rt.layer_stride * 4; + maybe_unaligned_layer_stride = layer_stride; + num_samples = NumSamples(regs.multisample_mode); + block = Extent3D{ + .width = rt.tile_mode.block_width, + .height = rt.tile_mode.block_height, + .depth = rt.tile_mode.block_depth, + }; + if (rt.tile_mode.is_3d) { + type = ImageType::e3D; + size.depth = rt.depth; + } else { + type = ImageType::e2D; + resources.layers = rt.depth; + } +} + +ImageInfo::ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs) noexcept { + format = VideoCore::Surface::PixelFormatFromDepthFormat(regs.zeta.format); + size.width = regs.zeta_width; + size.height = regs.zeta_height; + resources.levels = 1; + layer_stride = regs.zeta.layer_stride * 4; + maybe_unaligned_layer_stride = layer_stride; + num_samples = NumSamples(regs.multisample_mode); + block = Extent3D{ + .width = regs.zeta.tile_mode.block_width, + .height = regs.zeta.tile_mode.block_height, + .depth = regs.zeta.tile_mode.block_depth, + }; + if (regs.zeta.tile_mode.is_pitch_linear) { + ASSERT(regs.zeta.tile_mode.is_3d == 0); + type = ImageType::Linear; + pitch = size.width * BytesPerBlock(format); + } else if (regs.zeta.tile_mode.is_3d) { + ASSERT(regs.zeta.tile_mode.is_pitch_linear == 0); + type = ImageType::e3D; + size.depth = regs.zeta_depth; + } else { + type = ImageType::e2D; + resources.layers = regs.zeta_depth; + } +} + +ImageInfo::ImageInfo(const Tegra::Engines::Fermi2D::Surface& config) noexcept { + UNIMPLEMENTED_IF_MSG(config.layer != 0, "Surface layer is not zero"); + format = VideoCore::Surface::PixelFormatFromRenderTargetFormat(config.format); + if (config.linear == Tegra::Engines::Fermi2D::MemoryLayout::Pitch) { + type = ImageType::Linear; + size = Extent3D{ + .width = config.pitch / VideoCore::Surface::BytesPerBlock(format), + .height = config.height, + .depth = 1, + }; + pitch = config.pitch; + } else { + type = config.block_depth > 0 ? ImageType::e3D : ImageType::e2D; + block = Extent3D{ + .width = config.block_width, + .height = config.block_height, + .depth = config.block_depth, + }; + // 3D blits with more than once slice are not implemented for now + // Render to individual slices + size = Extent3D{ + .width = config.width, + .height = config.height, + .depth = 1, + }; + } +} + +} // namespace VideoCommon diff --git a/src/video_core/texture_cache/image_info.h b/src/video_core/texture_cache/image_info.h new file mode 100644 index 000000000..5049fc36e --- /dev/null +++ b/src/video_core/texture_cache/image_info.h @@ -0,0 +1,38 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include "video_core/engines/fermi_2d.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/surface.h" +#include "video_core/texture_cache/types.h" + +namespace VideoCommon { + +using Tegra::Texture::TICEntry; +using VideoCore::Surface::PixelFormat; + +struct ImageInfo { + explicit ImageInfo() = default; + explicit ImageInfo(const TICEntry& config) noexcept; + explicit ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs, size_t index) noexcept; + explicit ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs) noexcept; + explicit ImageInfo(const Tegra::Engines::Fermi2D::Surface& config) noexcept; + + PixelFormat format = PixelFormat::Invalid; + ImageType type = ImageType::e1D; + SubresourceExtent resources; + Extent3D size{1, 1, 1}; + union { + Extent3D block{0, 0, 0}; + u32 pitch; + }; + u32 layer_stride = 0; + u32 maybe_unaligned_layer_stride = 0; + u32 num_samples = 1; + u32 tile_width_spacing = 0; +}; + +} // namespace VideoCommon diff --git a/src/video_core/texture_cache/image_view_base.cpp b/src/video_core/texture_cache/image_view_base.cpp new file mode 100644 index 000000000..18f72e508 --- /dev/null +++ b/src/video_core/texture_cache/image_view_base.cpp @@ -0,0 +1,41 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <algorithm> + +#include "common/assert.h" +#include "core/settings.h" +#include "video_core/compatible_formats.h" +#include "video_core/surface.h" +#include "video_core/texture_cache/formatter.h" +#include "video_core/texture_cache/image_info.h" +#include "video_core/texture_cache/image_view_base.h" +#include "video_core/texture_cache/image_view_info.h" +#include "video_core/texture_cache/types.h" + +namespace VideoCommon { + +ImageViewBase::ImageViewBase(const ImageViewInfo& info, const ImageInfo& image_info, + ImageId image_id_) + : image_id{image_id_}, format{info.format}, type{info.type}, range{info.range}, + size{ + .width = std::max(image_info.size.width >> range.base.level, 1u), + .height = std::max(image_info.size.height >> range.base.level, 1u), + .depth = std::max(image_info.size.depth >> range.base.level, 1u), + } { + ASSERT_MSG(VideoCore::Surface::IsViewCompatible(image_info.format, info.format, false), + "Image view format {} is incompatible with image format {}", info.format, + image_info.format); + const bool is_async = Settings::values.use_asynchronous_gpu_emulation.GetValue(); + if (image_info.type == ImageType::Linear && is_async) { + flags |= ImageViewFlagBits::PreemtiveDownload; + } + if (image_info.type == ImageType::e3D && info.type != ImageViewType::e3D) { + flags |= ImageViewFlagBits::Slice; + } +} + +ImageViewBase::ImageViewBase(const NullImageParams&) {} + +} // namespace VideoCommon diff --git a/src/video_core/texture_cache/image_view_base.h b/src/video_core/texture_cache/image_view_base.h new file mode 100644 index 000000000..73954167e --- /dev/null +++ b/src/video_core/texture_cache/image_view_base.h @@ -0,0 +1,47 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include "common/common_funcs.h" +#include "video_core/surface.h" +#include "video_core/texture_cache/types.h" + +namespace VideoCommon { + +using VideoCore::Surface::PixelFormat; + +struct ImageViewInfo; +struct ImageInfo; + +struct NullImageParams {}; + +enum class ImageViewFlagBits : u16 { + PreemtiveDownload = 1 << 0, + Strong = 1 << 1, + Slice = 1 << 2, +}; +DECLARE_ENUM_FLAG_OPERATORS(ImageViewFlagBits) + +struct ImageViewBase { + explicit ImageViewBase(const ImageViewInfo& info, const ImageInfo& image_info, + ImageId image_id); + explicit ImageViewBase(const NullImageParams&); + + [[nodiscard]] bool IsBuffer() const noexcept { + return type == ImageViewType::Buffer; + } + + ImageId image_id{}; + PixelFormat format{}; + ImageViewType type{}; + SubresourceRange range; + Extent3D size{0, 0, 0}; + ImageViewFlagBits flags{}; + + u64 invalidation_tick = 0; + u64 modification_tick = 0; +}; + +} // namespace VideoCommon diff --git a/src/video_core/texture_cache/image_view_info.cpp b/src/video_core/texture_cache/image_view_info.cpp new file mode 100644 index 000000000..faf5b151f --- /dev/null +++ b/src/video_core/texture_cache/image_view_info.cpp @@ -0,0 +1,88 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <limits> + +#include "common/assert.h" +#include "video_core/texture_cache/image_view_info.h" +#include "video_core/texture_cache/texture_cache.h" +#include "video_core/texture_cache/types.h" +#include "video_core/textures/texture.h" + +namespace VideoCommon { + +namespace { + +constexpr u8 RENDER_TARGET_SWIZZLE = std::numeric_limits<u8>::max(); + +[[nodiscard]] u8 CastSwizzle(SwizzleSource source) { + const u8 casted = static_cast<u8>(source); + ASSERT(static_cast<SwizzleSource>(casted) == source); + return casted; +} + +} // Anonymous namespace + +ImageViewInfo::ImageViewInfo(const TICEntry& config, s32 base_layer) noexcept + : format{PixelFormatFromTIC(config)}, x_source{CastSwizzle(config.x_source)}, + y_source{CastSwizzle(config.y_source)}, z_source{CastSwizzle(config.z_source)}, + w_source{CastSwizzle(config.w_source)} { + range.base = SubresourceBase{ + .level = static_cast<s32>(config.res_min_mip_level), + .layer = base_layer, + }; + range.extent.levels = config.res_max_mip_level - config.res_min_mip_level + 1; + + switch (config.texture_type) { + case TextureType::Texture1D: + ASSERT(config.Height() == 1); + ASSERT(config.Depth() == 1); + type = ImageViewType::e1D; + break; + case TextureType::Texture2D: + case TextureType::Texture2DNoMipmap: + ASSERT(config.Depth() == 1); + type = config.normalized_coords ? ImageViewType::e2D : ImageViewType::Rect; + break; + case TextureType::Texture3D: + type = ImageViewType::e3D; + break; + case TextureType::TextureCubemap: + ASSERT(config.Depth() == 1); + type = ImageViewType::Cube; + range.extent.layers = 6; + break; + case TextureType::Texture1DArray: + type = ImageViewType::e1DArray; + range.extent.layers = config.Depth(); + break; + case TextureType::Texture2DArray: + type = ImageViewType::e2DArray; + range.extent.layers = config.Depth(); + break; + case TextureType::Texture1DBuffer: + type = ImageViewType::Buffer; + break; + case TextureType::TextureCubeArray: + type = ImageViewType::CubeArray; + range.extent.layers = config.Depth() * 6; + break; + default: + UNREACHABLE_MSG("Invalid texture_type={}", static_cast<int>(config.texture_type.Value())); + break; + } +} + +ImageViewInfo::ImageViewInfo(ImageViewType type_, PixelFormat format_, + SubresourceRange range_) noexcept + : type{type_}, format{format_}, range{range_}, x_source{RENDER_TARGET_SWIZZLE}, + y_source{RENDER_TARGET_SWIZZLE}, z_source{RENDER_TARGET_SWIZZLE}, + w_source{RENDER_TARGET_SWIZZLE} {} + +bool ImageViewInfo::IsRenderTarget() const noexcept { + return x_source == RENDER_TARGET_SWIZZLE && y_source == RENDER_TARGET_SWIZZLE && + z_source == RENDER_TARGET_SWIZZLE && w_source == RENDER_TARGET_SWIZZLE; +} + +} // namespace VideoCommon diff --git a/src/video_core/texture_cache/image_view_info.h b/src/video_core/texture_cache/image_view_info.h new file mode 100644 index 000000000..0c1f99117 --- /dev/null +++ b/src/video_core/texture_cache/image_view_info.h @@ -0,0 +1,50 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <array> +#include <type_traits> + +#include "video_core/surface.h" +#include "video_core/texture_cache/types.h" +#include "video_core/textures/texture.h" + +namespace VideoCommon { + +using Tegra::Texture::SwizzleSource; +using Tegra::Texture::TICEntry; +using VideoCore::Surface::PixelFormat; + +/// Properties used to determine a image view +struct ImageViewInfo { + explicit ImageViewInfo() noexcept = default; + explicit ImageViewInfo(const TICEntry& config, s32 base_layer) noexcept; + explicit ImageViewInfo(ImageViewType type, PixelFormat format, + SubresourceRange range = {}) noexcept; + + auto operator<=>(const ImageViewInfo&) const noexcept = default; + + [[nodiscard]] bool IsRenderTarget() const noexcept; + + [[nodiscard]] std::array<SwizzleSource, 4> Swizzle() const noexcept { + return std::array{ + static_cast<SwizzleSource>(x_source), + static_cast<SwizzleSource>(y_source), + static_cast<SwizzleSource>(z_source), + static_cast<SwizzleSource>(w_source), + }; + } + + ImageViewType type{}; + PixelFormat format{}; + SubresourceRange range; + u8 x_source = static_cast<u8>(SwizzleSource::R); + u8 y_source = static_cast<u8>(SwizzleSource::G); + u8 z_source = static_cast<u8>(SwizzleSource::B); + u8 w_source = static_cast<u8>(SwizzleSource::A); +}; +static_assert(std::has_unique_object_representations_v<ImageViewInfo>); + +} // namespace VideoCommon diff --git a/src/video_core/texture_cache/render_targets.h b/src/video_core/texture_cache/render_targets.h new file mode 100644 index 000000000..9b9544b07 --- /dev/null +++ b/src/video_core/texture_cache/render_targets.h @@ -0,0 +1,51 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <algorithm> +#include <span> +#include <utility> + +#include "common/bit_cast.h" +#include "video_core/texture_cache/types.h" + +namespace VideoCommon { + +/// Framebuffer properties used to lookup a framebuffer +struct RenderTargets { + constexpr auto operator<=>(const RenderTargets&) const noexcept = default; + + constexpr bool Contains(std::span<const ImageViewId> elements) const noexcept { + const auto contains = [elements](ImageViewId item) { + return std::ranges::find(elements, item) != elements.end(); + }; + return std::ranges::any_of(color_buffer_ids, contains) || contains(depth_buffer_id); + } + + std::array<ImageViewId, NUM_RT> color_buffer_ids; + ImageViewId depth_buffer_id; + std::array<u8, NUM_RT> draw_buffers{}; + Extent2D size; +}; + +} // namespace VideoCommon + +namespace std { + +template <> +struct hash<VideoCommon::RenderTargets> { + size_t operator()(const VideoCommon::RenderTargets& rt) const noexcept { + using VideoCommon::ImageViewId; + size_t value = std::hash<ImageViewId>{}(rt.depth_buffer_id); + for (const ImageViewId color_buffer_id : rt.color_buffer_ids) { + value ^= std::hash<ImageViewId>{}(color_buffer_id); + } + value ^= Common::BitCast<u64>(rt.draw_buffers); + value ^= Common::BitCast<u64>(rt.size); + return value; + } +}; + +} // namespace std diff --git a/src/video_core/texture_cache/samples_helper.h b/src/video_core/texture_cache/samples_helper.h new file mode 100644 index 000000000..04539a43c --- /dev/null +++ b/src/video_core/texture_cache/samples_helper.h @@ -0,0 +1,55 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <utility> + +#include "common/assert.h" +#include "video_core/textures/texture.h" + +namespace VideoCommon { + +[[nodiscard]] inline std::pair<int, int> SamplesLog2(int num_samples) { + switch (num_samples) { + case 1: + return {0, 0}; + case 2: + return {1, 0}; + case 4: + return {1, 1}; + case 8: + return {2, 1}; + case 16: + return {2, 2}; + } + UNREACHABLE_MSG("Invalid number of samples={}", num_samples); + return {1, 1}; +} + +[[nodiscard]] inline int NumSamples(Tegra::Texture::MsaaMode msaa_mode) { + using Tegra::Texture::MsaaMode; + switch (msaa_mode) { + case MsaaMode::Msaa1x1: + return 1; + case MsaaMode::Msaa2x1: + case MsaaMode::Msaa2x1_D3D: + return 2; + case MsaaMode::Msaa2x2: + case MsaaMode::Msaa2x2_VC4: + case MsaaMode::Msaa2x2_VC12: + return 4; + case MsaaMode::Msaa4x2: + case MsaaMode::Msaa4x2_D3D: + case MsaaMode::Msaa4x2_VC8: + case MsaaMode::Msaa4x2_VC24: + return 8; + case MsaaMode::Msaa4x4: + return 16; + } + UNREACHABLE_MSG("Invalid MSAA mode={}", static_cast<int>(msaa_mode)); + return 1; +} + +} // namespace VideoCommon diff --git a/src/video_core/texture_cache/slot_vector.h b/src/video_core/texture_cache/slot_vector.h new file mode 100644 index 000000000..eae3be6ea --- /dev/null +++ b/src/video_core/texture_cache/slot_vector.h @@ -0,0 +1,156 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <array> +#include <concepts> +#include <numeric> +#include <type_traits> +#include <utility> +#include <vector> + +#include "common/assert.h" +#include "common/common_types.h" + +namespace VideoCommon { + +struct SlotId { + static constexpr u32 INVALID_INDEX = std::numeric_limits<u32>::max(); + + constexpr auto operator<=>(const SlotId&) const noexcept = default; + + constexpr explicit operator bool() const noexcept { + return index != INVALID_INDEX; + } + + u32 index = INVALID_INDEX; +}; + +template <class T> +requires std::is_nothrow_move_assignable_v<T>&& + std::is_nothrow_move_constructible_v<T> class SlotVector { +public: + ~SlotVector() noexcept { + size_t index = 0; + for (u64 bits : stored_bitset) { + for (size_t bit = 0; bits; ++bit, bits >>= 1) { + if ((bits & 1) != 0) { + values[index + bit].object.~T(); + } + } + index += 64; + } + delete[] values; + } + + [[nodiscard]] T& operator[](SlotId id) noexcept { + ValidateIndex(id); + return values[id.index].object; + } + + [[nodiscard]] const T& operator[](SlotId id) const noexcept { + ValidateIndex(id); + return values[id.index].object; + } + + template <typename... Args> + [[nodiscard]] SlotId insert(Args&&... args) noexcept { + const u32 index = FreeValueIndex(); + new (&values[index].object) T(std::forward<Args>(args)...); + SetStorageBit(index); + + return SlotId{index}; + } + + void erase(SlotId id) noexcept { + values[id.index].object.~T(); + free_list.push_back(id.index); + ResetStorageBit(id.index); + } + +private: + struct NonTrivialDummy { + NonTrivialDummy() noexcept {} + }; + + union Entry { + Entry() noexcept : dummy{} {} + ~Entry() noexcept {} + + NonTrivialDummy dummy; + T object; + }; + + void SetStorageBit(u32 index) noexcept { + stored_bitset[index / 64] |= u64(1) << (index % 64); + } + + void ResetStorageBit(u32 index) noexcept { + stored_bitset[index / 64] &= ~(u64(1) << (index % 64)); + } + + bool ReadStorageBit(u32 index) noexcept { + return ((stored_bitset[index / 64] >> (index % 64)) & 1) != 0; + } + + void ValidateIndex(SlotId id) const noexcept { + DEBUG_ASSERT(id); + DEBUG_ASSERT(id.index / 64 < stored_bitset.size()); + DEBUG_ASSERT(((stored_bitset[id.index / 64] >> (id.index % 64)) & 1) != 0); + } + + [[nodiscard]] u32 FreeValueIndex() noexcept { + if (free_list.empty()) { + Reserve(values_capacity ? (values_capacity << 1) : 1); + } + const u32 free_index = free_list.back(); + free_list.pop_back(); + return free_index; + } + + void Reserve(size_t new_capacity) noexcept { + Entry* const new_values = new Entry[new_capacity]; + size_t index = 0; + for (u64 bits : stored_bitset) { + for (size_t bit = 0; bits; ++bit, bits >>= 1) { + const size_t i = index + bit; + if ((bits & 1) == 0) { + continue; + } + T& old_value = values[i].object; + new (&new_values[i].object) T(std::move(old_value)); + old_value.~T(); + } + index += 64; + } + + stored_bitset.resize((new_capacity + 63) / 64); + + const size_t old_free_size = free_list.size(); + free_list.resize(old_free_size + (new_capacity - values_capacity)); + std::iota(free_list.begin() + old_free_size, free_list.end(), + static_cast<u32>(values_capacity)); + + delete[] values; + values = new_values; + values_capacity = new_capacity; + } + + Entry* values = nullptr; + size_t values_capacity = 0; + size_t values_size = 0; + + std::vector<u64> stored_bitset; + std::vector<u32> free_list; +}; + +} // namespace VideoCommon + +template <> +struct std::hash<VideoCommon::SlotId> { + size_t operator()(const VideoCommon::SlotId& id) const noexcept { + return std::hash<u32>{}(id.index); + } +}; diff --git a/src/video_core/texture_cache/surface_base.cpp b/src/video_core/texture_cache/surface_base.cpp deleted file mode 100644 index dfcf36e0b..000000000 --- a/src/video_core/texture_cache/surface_base.cpp +++ /dev/null @@ -1,294 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include "common/algorithm.h" -#include "common/assert.h" -#include "common/common_types.h" -#include "common/microprofile.h" -#include "video_core/memory_manager.h" -#include "video_core/texture_cache/surface_base.h" -#include "video_core/texture_cache/surface_params.h" -#include "video_core/textures/convert.h" - -namespace VideoCommon { - -MICROPROFILE_DEFINE(GPU_Load_Texture, "GPU", "Texture Load", MP_RGB(128, 192, 128)); -MICROPROFILE_DEFINE(GPU_Flush_Texture, "GPU", "Texture Flush", MP_RGB(128, 192, 128)); - -using Tegra::Texture::ConvertFromGuestToHost; -using VideoCore::MortonSwizzleMode; -using VideoCore::Surface::IsPixelFormatASTC; -using VideoCore::Surface::PixelFormat; - -StagingCache::StagingCache() = default; - -StagingCache::~StagingCache() = default; - -SurfaceBaseImpl::SurfaceBaseImpl(GPUVAddr gpu_addr, const SurfaceParams& params, - bool is_astc_supported) - : params{params}, gpu_addr{gpu_addr}, mipmap_sizes(params.num_levels), - mipmap_offsets(params.num_levels) { - is_converted = IsPixelFormatASTC(params.pixel_format) && !is_astc_supported; - host_memory_size = params.GetHostSizeInBytes(is_converted); - - std::size_t offset = 0; - for (u32 level = 0; level < params.num_levels; ++level) { - const std::size_t mipmap_size{params.GetGuestMipmapSize(level)}; - mipmap_sizes[level] = mipmap_size; - mipmap_offsets[level] = offset; - offset += mipmap_size; - } - layer_size = offset; - if (params.is_layered) { - if (params.is_tiled) { - layer_size = - SurfaceParams::AlignLayered(layer_size, params.block_height, params.block_depth); - } - guest_memory_size = layer_size * params.depth; - } else { - guest_memory_size = layer_size; - } -} - -MatchTopologyResult SurfaceBaseImpl::MatchesTopology(const SurfaceParams& rhs) const { - const u32 src_bpp{params.GetBytesPerPixel()}; - const u32 dst_bpp{rhs.GetBytesPerPixel()}; - const bool ib1 = params.IsBuffer(); - const bool ib2 = rhs.IsBuffer(); - if (std::tie(src_bpp, params.is_tiled, ib1) == std::tie(dst_bpp, rhs.is_tiled, ib2)) { - const bool cb1 = params.IsCompressed(); - const bool cb2 = rhs.IsCompressed(); - if (cb1 == cb2) { - return MatchTopologyResult::FullMatch; - } - return MatchTopologyResult::CompressUnmatch; - } - return MatchTopologyResult::None; -} - -MatchStructureResult SurfaceBaseImpl::MatchesStructure(const SurfaceParams& rhs) const { - // Buffer surface Check - if (params.IsBuffer()) { - const std::size_t wd1 = params.width * params.GetBytesPerPixel(); - const std::size_t wd2 = rhs.width * rhs.GetBytesPerPixel(); - if (wd1 == wd2) { - return MatchStructureResult::FullMatch; - } - return MatchStructureResult::None; - } - - // Linear Surface check - if (!params.is_tiled) { - if (std::tie(params.height, params.pitch) == std::tie(rhs.height, rhs.pitch)) { - if (params.width == rhs.width) { - return MatchStructureResult::FullMatch; - } else { - return MatchStructureResult::SemiMatch; - } - } - return MatchStructureResult::None; - } - - // Tiled Surface check - if (std::tie(params.depth, params.block_width, params.block_height, params.block_depth, - params.tile_width_spacing, params.num_levels) == - std::tie(rhs.depth, rhs.block_width, rhs.block_height, rhs.block_depth, - rhs.tile_width_spacing, rhs.num_levels)) { - if (std::tie(params.width, params.height) == std::tie(rhs.width, rhs.height)) { - return MatchStructureResult::FullMatch; - } - const u32 ws = SurfaceParams::ConvertWidth(rhs.GetBlockAlignedWidth(), params.pixel_format, - rhs.pixel_format); - const u32 hs = - SurfaceParams::ConvertHeight(rhs.height, params.pixel_format, rhs.pixel_format); - const u32 w1 = params.GetBlockAlignedWidth(); - if (std::tie(w1, params.height) == std::tie(ws, hs)) { - return MatchStructureResult::SemiMatch; - } - } - return MatchStructureResult::None; -} - -std::optional<std::pair<u32, u32>> SurfaceBaseImpl::GetLayerMipmap( - const GPUVAddr candidate_gpu_addr) const { - if (gpu_addr == candidate_gpu_addr) { - return {{0, 0}}; - } - if (candidate_gpu_addr < gpu_addr) { - return {}; - } - const auto relative_address{static_cast<GPUVAddr>(candidate_gpu_addr - gpu_addr)}; - const auto layer{static_cast<u32>(relative_address / layer_size)}; - if (layer >= params.depth) { - return {}; - } - const GPUVAddr mipmap_address = relative_address - layer_size * layer; - const auto mipmap_it = - Common::BinaryFind(mipmap_offsets.begin(), mipmap_offsets.end(), mipmap_address); - if (mipmap_it == mipmap_offsets.end()) { - return {}; - } - const auto level{static_cast<u32>(std::distance(mipmap_offsets.begin(), mipmap_it))}; - return std::make_pair(layer, level); -} - -std::vector<CopyParams> SurfaceBaseImpl::BreakDownLayered(const SurfaceParams& in_params) const { - const u32 layers{params.depth}; - const u32 mipmaps{params.num_levels}; - std::vector<CopyParams> result; - result.reserve(static_cast<std::size_t>(layers) * static_cast<std::size_t>(mipmaps)); - - for (u32 layer = 0; layer < layers; layer++) { - for (u32 level = 0; level < mipmaps; level++) { - const u32 width = SurfaceParams::IntersectWidth(params, in_params, level, level); - const u32 height = SurfaceParams::IntersectHeight(params, in_params, level, level); - result.emplace_back(0, 0, layer, 0, 0, layer, level, level, width, height, 1); - } - } - return result; -} - -std::vector<CopyParams> SurfaceBaseImpl::BreakDownNonLayered(const SurfaceParams& in_params) const { - const u32 mipmaps{params.num_levels}; - std::vector<CopyParams> result; - result.reserve(mipmaps); - - for (u32 level = 0; level < mipmaps; level++) { - const u32 width = SurfaceParams::IntersectWidth(params, in_params, level, level); - const u32 height = SurfaceParams::IntersectHeight(params, in_params, level, level); - const u32 depth{std::min(params.GetMipDepth(level), in_params.GetMipDepth(level))}; - result.emplace_back(width, height, depth, level); - } - return result; -} - -void SurfaceBaseImpl::SwizzleFunc(MortonSwizzleMode mode, u8* memory, const SurfaceParams& params, - u8* buffer, u32 level) { - const u32 width{params.GetMipWidth(level)}; - const u32 height{params.GetMipHeight(level)}; - const u32 block_height{params.GetMipBlockHeight(level)}; - const u32 block_depth{params.GetMipBlockDepth(level)}; - - std::size_t guest_offset{mipmap_offsets[level]}; - if (params.is_layered) { - std::size_t host_offset = 0; - const std::size_t guest_stride = layer_size; - const std::size_t host_stride = params.GetHostLayerSize(level); - for (u32 layer = 0; layer < params.depth; ++layer) { - MortonSwizzle(mode, params.pixel_format, width, block_height, height, block_depth, 1, - params.tile_width_spacing, buffer + host_offset, memory + guest_offset); - guest_offset += guest_stride; - host_offset += host_stride; - } - } else { - MortonSwizzle(mode, params.pixel_format, width, block_height, height, block_depth, - params.GetMipDepth(level), params.tile_width_spacing, buffer, - memory + guest_offset); - } -} - -void SurfaceBaseImpl::LoadBuffer(Tegra::MemoryManager& memory_manager, - StagingCache& staging_cache) { - MICROPROFILE_SCOPE(GPU_Load_Texture); - auto& staging_buffer = staging_cache.GetBuffer(0); - u8* host_ptr; - // Use an extra temporal buffer - auto& tmp_buffer = staging_cache.GetBuffer(1); - tmp_buffer.resize(guest_memory_size); - host_ptr = tmp_buffer.data(); - memory_manager.ReadBlockUnsafe(gpu_addr, host_ptr, guest_memory_size); - - if (params.is_tiled) { - ASSERT_MSG(params.block_width == 0, "Block width is defined as {} on texture target {}", - params.block_width, static_cast<u32>(params.target)); - for (u32 level = 0; level < params.num_levels; ++level) { - const std::size_t host_offset{params.GetHostMipmapLevelOffset(level, false)}; - SwizzleFunc(MortonSwizzleMode::MortonToLinear, host_ptr, params, - staging_buffer.data() + host_offset, level); - } - } else { - ASSERT_MSG(params.num_levels == 1, "Linear mipmap loading is not implemented"); - const u32 bpp{params.GetBytesPerPixel()}; - const u32 block_width{params.GetDefaultBlockWidth()}; - const u32 block_height{params.GetDefaultBlockHeight()}; - const u32 width{(params.width + block_width - 1) / block_width}; - const u32 height{(params.height + block_height - 1) / block_height}; - const u32 copy_size{width * bpp}; - if (params.pitch == copy_size) { - std::memcpy(staging_buffer.data(), host_ptr, params.GetHostSizeInBytes(false)); - } else { - const u8* start{host_ptr}; - u8* write_to{staging_buffer.data()}; - for (u32 h = height; h > 0; --h) { - std::memcpy(write_to, start, copy_size); - start += params.pitch; - write_to += copy_size; - } - } - } - - if (!is_converted && params.pixel_format != PixelFormat::S8_UINT_D24_UNORM) { - return; - } - - for (u32 level = params.num_levels; level--;) { - const std::size_t in_host_offset{params.GetHostMipmapLevelOffset(level, false)}; - const std::size_t out_host_offset{params.GetHostMipmapLevelOffset(level, is_converted)}; - u8* const in_buffer = staging_buffer.data() + in_host_offset; - u8* const out_buffer = staging_buffer.data() + out_host_offset; - ConvertFromGuestToHost(in_buffer, out_buffer, params.pixel_format, - params.GetMipWidth(level), params.GetMipHeight(level), - params.GetMipDepth(level), true, true); - } -} - -void SurfaceBaseImpl::FlushBuffer(Tegra::MemoryManager& memory_manager, - StagingCache& staging_cache) { - MICROPROFILE_SCOPE(GPU_Flush_Texture); - auto& staging_buffer = staging_cache.GetBuffer(0); - u8* host_ptr; - - // Use an extra temporal buffer - auto& tmp_buffer = staging_cache.GetBuffer(1); - tmp_buffer.resize(guest_memory_size); - host_ptr = tmp_buffer.data(); - - if (params.target == SurfaceTarget::Texture3D) { - // Special case for 3D texture segments - memory_manager.ReadBlockUnsafe(gpu_addr, host_ptr, guest_memory_size); - } - - if (params.is_tiled) { - ASSERT_MSG(params.block_width == 0, "Block width is defined as {}", params.block_width); - for (u32 level = 0; level < params.num_levels; ++level) { - const std::size_t host_offset{params.GetHostMipmapLevelOffset(level, false)}; - SwizzleFunc(MortonSwizzleMode::LinearToMorton, host_ptr, params, - staging_buffer.data() + host_offset, level); - } - } else if (params.IsBuffer()) { - // Buffers don't have pitch or any fancy layout property. We can just memcpy them to guest - // memory. - std::memcpy(host_ptr, staging_buffer.data(), guest_memory_size); - } else { - ASSERT(params.target == SurfaceTarget::Texture2D); - ASSERT(params.num_levels == 1); - - const u32 bpp{params.GetBytesPerPixel()}; - const u32 copy_size{params.width * bpp}; - if (params.pitch == copy_size) { - std::memcpy(host_ptr, staging_buffer.data(), guest_memory_size); - } else { - u8* start{host_ptr}; - const u8* read_to{staging_buffer.data()}; - for (u32 h = params.height; h > 0; --h) { - std::memcpy(start, read_to, copy_size); - start += params.pitch; - read_to += copy_size; - } - } - } - memory_manager.WriteBlockUnsafe(gpu_addr, host_ptr, guest_memory_size); -} - -} // namespace VideoCommon diff --git a/src/video_core/texture_cache/surface_base.h b/src/video_core/texture_cache/surface_base.h deleted file mode 100644 index 173f2edba..000000000 --- a/src/video_core/texture_cache/surface_base.h +++ /dev/null @@ -1,333 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <optional> -#include <tuple> -#include <unordered_map> -#include <vector> - -#include "common/common_types.h" -#include "video_core/gpu.h" -#include "video_core/morton.h" -#include "video_core/texture_cache/copy_params.h" -#include "video_core/texture_cache/surface_params.h" -#include "video_core/texture_cache/surface_view.h" - -namespace Tegra { -class MemoryManager; -} - -namespace VideoCommon { - -using VideoCore::MortonSwizzleMode; -using VideoCore::Surface::SurfaceTarget; - -enum class MatchStructureResult : u32 { - FullMatch = 0, - SemiMatch = 1, - None = 2, -}; - -enum class MatchTopologyResult : u32 { - FullMatch = 0, - CompressUnmatch = 1, - None = 2, -}; - -class StagingCache { -public: - explicit StagingCache(); - ~StagingCache(); - - std::vector<u8>& GetBuffer(std::size_t index) { - return staging_buffer[index]; - } - - const std::vector<u8>& GetBuffer(std::size_t index) const { - return staging_buffer[index]; - } - - void SetSize(std::size_t size) { - staging_buffer.resize(size); - } - -private: - std::vector<std::vector<u8>> staging_buffer; -}; - -class SurfaceBaseImpl { -public: - void LoadBuffer(Tegra::MemoryManager& memory_manager, StagingCache& staging_cache); - - void FlushBuffer(Tegra::MemoryManager& memory_manager, StagingCache& staging_cache); - - GPUVAddr GetGpuAddr() const { - return gpu_addr; - } - - bool Overlaps(const VAddr start, const VAddr end) const { - return (cpu_addr < end) && (cpu_addr_end > start); - } - - bool IsInside(const GPUVAddr other_start, const GPUVAddr other_end) const { - const GPUVAddr gpu_addr_end = gpu_addr + guest_memory_size; - return gpu_addr <= other_start && other_end <= gpu_addr_end; - } - - // Use only when recycling a surface - void SetGpuAddr(const GPUVAddr new_addr) { - gpu_addr = new_addr; - } - - VAddr GetCpuAddr() const { - return cpu_addr; - } - - VAddr GetCpuAddrEnd() const { - return cpu_addr_end; - } - - void SetCpuAddr(const VAddr new_addr) { - cpu_addr = new_addr; - cpu_addr_end = new_addr + guest_memory_size; - } - - const SurfaceParams& GetSurfaceParams() const { - return params; - } - - std::size_t GetSizeInBytes() const { - return guest_memory_size; - } - - std::size_t GetHostSizeInBytes() const { - return host_memory_size; - } - - std::size_t GetMipmapSize(const u32 level) const { - return mipmap_sizes[level]; - } - - bool IsLinear() const { - return !params.is_tiled; - } - - bool IsConverted() const { - return is_converted; - } - - bool MatchFormat(VideoCore::Surface::PixelFormat pixel_format) const { - return params.pixel_format == pixel_format; - } - - VideoCore::Surface::PixelFormat GetFormat() const { - return params.pixel_format; - } - - bool MatchTarget(VideoCore::Surface::SurfaceTarget target) const { - return params.target == target; - } - - MatchTopologyResult MatchesTopology(const SurfaceParams& rhs) const; - - MatchStructureResult MatchesStructure(const SurfaceParams& rhs) const; - - bool MatchesSubTexture(const SurfaceParams& rhs, const GPUVAddr other_gpu_addr) const { - return std::tie(gpu_addr, params.target, params.num_levels) == - std::tie(other_gpu_addr, rhs.target, rhs.num_levels) && - params.target == SurfaceTarget::Texture2D && params.num_levels == 1; - } - - std::optional<std::pair<u32, u32>> GetLayerMipmap(const GPUVAddr candidate_gpu_addr) const; - - std::vector<CopyParams> BreakDown(const SurfaceParams& in_params) const { - return params.is_layered ? BreakDownLayered(in_params) : BreakDownNonLayered(in_params); - } - -protected: - explicit SurfaceBaseImpl(GPUVAddr gpu_addr, const SurfaceParams& params, - bool is_astc_supported); - ~SurfaceBaseImpl() = default; - - virtual void DecorateSurfaceName() = 0; - - const SurfaceParams params; - std::size_t layer_size; - std::size_t guest_memory_size; - std::size_t host_memory_size; - GPUVAddr gpu_addr{}; - VAddr cpu_addr{}; - VAddr cpu_addr_end{}; - bool is_converted{}; - - std::vector<std::size_t> mipmap_sizes; - std::vector<std::size_t> mipmap_offsets; - -private: - void SwizzleFunc(MortonSwizzleMode mode, u8* memory, const SurfaceParams& params, u8* buffer, - u32 level); - - std::vector<CopyParams> BreakDownLayered(const SurfaceParams& in_params) const; - - std::vector<CopyParams> BreakDownNonLayered(const SurfaceParams& in_params) const; -}; - -template <typename TView> -class SurfaceBase : public SurfaceBaseImpl { -public: - virtual void UploadTexture(const std::vector<u8>& staging_buffer) = 0; - - virtual void DownloadTexture(std::vector<u8>& staging_buffer) = 0; - - void MarkAsModified(bool is_modified_, u64 tick) { - is_modified = is_modified_ || is_target; - modification_tick = tick; - } - - void MarkAsRenderTarget(bool is_target_, u32 index_) { - is_target = is_target_; - index = index_; - } - - void SetMemoryMarked(bool is_memory_marked_) { - is_memory_marked = is_memory_marked_; - } - - bool IsMemoryMarked() const { - return is_memory_marked; - } - - void SetSyncPending(bool is_sync_pending_) { - is_sync_pending = is_sync_pending_; - } - - bool IsSyncPending() const { - return is_sync_pending; - } - - void MarkAsPicked(bool is_picked_) { - is_picked = is_picked_; - } - - bool IsModified() const { - return is_modified; - } - - bool IsProtected() const { - // Only 3D slices are to be protected - return is_target && params.target == SurfaceTarget::Texture3D; - } - - bool IsRenderTarget() const { - return is_target; - } - - u32 GetRenderTarget() const { - return index; - } - - bool IsRegistered() const { - return is_registered; - } - - bool IsPicked() const { - return is_picked; - } - - void MarkAsRegistered(bool is_reg) { - is_registered = is_reg; - } - - u64 GetModificationTick() const { - return modification_tick; - } - - TView EmplaceOverview(const SurfaceParams& overview_params) { - const u32 num_layers{(params.is_layered && !overview_params.is_layered) ? 1 : params.depth}; - return GetView(ViewParams(overview_params.target, 0, num_layers, 0, params.num_levels)); - } - - TView Emplace3DView(u32 slice, u32 depth, u32 base_level, u32 num_levels) { - return GetView(ViewParams(VideoCore::Surface::SurfaceTarget::Texture3D, slice, depth, - base_level, num_levels)); - } - - std::optional<TView> EmplaceIrregularView(const SurfaceParams& view_params, - const GPUVAddr view_addr, - const std::size_t candidate_size, const u32 mipmap, - const u32 layer) { - const auto layer_mipmap{GetLayerMipmap(view_addr + candidate_size)}; - if (!layer_mipmap) { - return {}; - } - const auto [end_layer, end_mipmap] = *layer_mipmap; - if (layer != end_layer) { - if (mipmap == 0 && end_mipmap == 0) { - return GetView(ViewParams(view_params.target, layer, end_layer - layer, 0, 1)); - } - return {}; - } else { - return GetView(ViewParams(view_params.target, layer, 1, mipmap, end_mipmap - mipmap)); - } - } - - std::optional<TView> EmplaceView(const SurfaceParams& view_params, const GPUVAddr view_addr, - const std::size_t candidate_size) { - if (params.target == SurfaceTarget::Texture3D || - view_params.target == SurfaceTarget::Texture3D || - (params.num_levels == 1 && !params.is_layered)) { - return {}; - } - const auto layer_mipmap{GetLayerMipmap(view_addr)}; - if (!layer_mipmap) { - return {}; - } - const auto [layer, mipmap] = *layer_mipmap; - if (GetMipmapSize(mipmap) != candidate_size) { - return EmplaceIrregularView(view_params, view_addr, candidate_size, mipmap, layer); - } - return GetView(ViewParams(view_params.target, layer, 1, mipmap, 1)); - } - - TView GetMainView() const { - return main_view; - } - -protected: - explicit SurfaceBase(const GPUVAddr gpu_addr, const SurfaceParams& params, - bool is_astc_supported) - : SurfaceBaseImpl(gpu_addr, params, is_astc_supported) {} - - ~SurfaceBase() = default; - - virtual TView CreateView(const ViewParams& view_key) = 0; - - TView main_view; - std::unordered_map<ViewParams, TView> views; - -private: - TView GetView(const ViewParams& key) { - const auto [entry, is_cache_miss] = views.try_emplace(key); - auto& view{entry->second}; - if (is_cache_miss) { - view = CreateView(key); - } - return view; - } - - static constexpr u32 NO_RT = 0xFFFFFFFF; - - bool is_modified{}; - bool is_target{}; - bool is_registered{}; - bool is_picked{}; - bool is_memory_marked{}; - bool is_sync_pending{}; - u32 index{NO_RT}; - u64 modification_tick{}; -}; - -} // namespace VideoCommon diff --git a/src/video_core/texture_cache/surface_params.cpp b/src/video_core/texture_cache/surface_params.cpp deleted file mode 100644 index e614a92df..000000000 --- a/src/video_core/texture_cache/surface_params.cpp +++ /dev/null @@ -1,445 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include <algorithm> -#include <string> -#include <tuple> - -#include "common/alignment.h" -#include "common/bit_util.h" -#include "core/core.h" -#include "video_core/engines/shader_bytecode.h" -#include "video_core/surface.h" -#include "video_core/texture_cache/format_lookup_table.h" -#include "video_core/texture_cache/surface_params.h" - -namespace VideoCommon { - -using VideoCore::Surface::PixelFormat; -using VideoCore::Surface::PixelFormatFromDepthFormat; -using VideoCore::Surface::PixelFormatFromRenderTargetFormat; -using VideoCore::Surface::SurfaceTarget; -using VideoCore::Surface::SurfaceTargetFromTextureType; -using VideoCore::Surface::SurfaceType; - -namespace { - -SurfaceTarget TextureTypeToSurfaceTarget(Tegra::Shader::TextureType type, bool is_array) { - switch (type) { - case Tegra::Shader::TextureType::Texture1D: - return is_array ? SurfaceTarget::Texture1DArray : SurfaceTarget::Texture1D; - case Tegra::Shader::TextureType::Texture2D: - return is_array ? SurfaceTarget::Texture2DArray : SurfaceTarget::Texture2D; - case Tegra::Shader::TextureType::Texture3D: - ASSERT(!is_array); - return SurfaceTarget::Texture3D; - case Tegra::Shader::TextureType::TextureCube: - return is_array ? SurfaceTarget::TextureCubeArray : SurfaceTarget::TextureCubemap; - default: - UNREACHABLE(); - return SurfaceTarget::Texture2D; - } -} - -SurfaceTarget ImageTypeToSurfaceTarget(Tegra::Shader::ImageType type) { - switch (type) { - case Tegra::Shader::ImageType::Texture1D: - return SurfaceTarget::Texture1D; - case Tegra::Shader::ImageType::TextureBuffer: - return SurfaceTarget::TextureBuffer; - case Tegra::Shader::ImageType::Texture1DArray: - return SurfaceTarget::Texture1DArray; - case Tegra::Shader::ImageType::Texture2D: - return SurfaceTarget::Texture2D; - case Tegra::Shader::ImageType::Texture2DArray: - return SurfaceTarget::Texture2DArray; - case Tegra::Shader::ImageType::Texture3D: - return SurfaceTarget::Texture3D; - default: - UNREACHABLE(); - return SurfaceTarget::Texture2D; - } -} - -constexpr u32 GetMipmapSize(bool uncompressed, u32 mip_size, u32 tile) { - return uncompressed ? mip_size : std::max(1U, (mip_size + tile - 1) / tile); -} - -} // Anonymous namespace - -SurfaceParams SurfaceParams::CreateForTexture(const FormatLookupTable& lookup_table, - const Tegra::Texture::TICEntry& tic, - const VideoCommon::Shader::Sampler& entry) { - SurfaceParams params; - params.is_tiled = tic.IsTiled(); - params.srgb_conversion = tic.IsSrgbConversionEnabled(); - params.block_width = params.is_tiled ? tic.BlockWidth() : 0; - params.block_height = params.is_tiled ? tic.BlockHeight() : 0; - params.block_depth = params.is_tiled ? tic.BlockDepth() : 0; - params.tile_width_spacing = params.is_tiled ? (1 << tic.tile_width_spacing.Value()) : 1; - params.pixel_format = lookup_table.GetPixelFormat( - tic.format, params.srgb_conversion, tic.r_type, tic.g_type, tic.b_type, tic.a_type); - params.type = GetFormatType(params.pixel_format); - if (entry.is_shadow && params.type == SurfaceType::ColorTexture) { - switch (params.pixel_format) { - case PixelFormat::R16_UNORM: - case PixelFormat::R16_FLOAT: - params.pixel_format = PixelFormat::D16_UNORM; - break; - case PixelFormat::R32_FLOAT: - params.pixel_format = PixelFormat::D32_FLOAT; - break; - default: - UNIMPLEMENTED_MSG("Unimplemented shadow convert format: {}", - static_cast<u32>(params.pixel_format)); - } - params.type = GetFormatType(params.pixel_format); - } - // TODO: on 1DBuffer we should use the tic info. - if (tic.IsBuffer()) { - params.target = SurfaceTarget::TextureBuffer; - params.width = tic.Width(); - params.pitch = params.width * params.GetBytesPerPixel(); - params.height = 1; - params.depth = 1; - params.num_levels = 1; - params.emulated_levels = 1; - params.is_layered = false; - } else { - params.target = TextureTypeToSurfaceTarget(entry.type, entry.is_array); - params.width = tic.Width(); - params.height = tic.Height(); - params.depth = tic.Depth(); - params.pitch = params.is_tiled ? 0 : tic.Pitch(); - if (params.target == SurfaceTarget::TextureCubemap || - params.target == SurfaceTarget::TextureCubeArray) { - params.depth *= 6; - } - params.num_levels = tic.max_mip_level + 1; - params.emulated_levels = std::min(params.num_levels, params.MaxPossibleMipmap()); - params.is_layered = params.IsLayered(); - } - return params; -} - -SurfaceParams SurfaceParams::CreateForImage(const FormatLookupTable& lookup_table, - const Tegra::Texture::TICEntry& tic, - const VideoCommon::Shader::Image& entry) { - SurfaceParams params; - params.is_tiled = tic.IsTiled(); - params.srgb_conversion = tic.IsSrgbConversionEnabled(); - params.block_width = params.is_tiled ? tic.BlockWidth() : 0; - params.block_height = params.is_tiled ? tic.BlockHeight() : 0; - params.block_depth = params.is_tiled ? tic.BlockDepth() : 0; - params.tile_width_spacing = params.is_tiled ? (1 << tic.tile_width_spacing.Value()) : 1; - params.pixel_format = lookup_table.GetPixelFormat( - tic.format, params.srgb_conversion, tic.r_type, tic.g_type, tic.b_type, tic.a_type); - params.type = GetFormatType(params.pixel_format); - params.target = ImageTypeToSurfaceTarget(entry.type); - // TODO: on 1DBuffer we should use the tic info. - if (tic.IsBuffer()) { - params.target = SurfaceTarget::TextureBuffer; - params.width = tic.Width(); - params.pitch = params.width * params.GetBytesPerPixel(); - params.height = 1; - params.depth = 1; - params.num_levels = 1; - params.emulated_levels = 1; - params.is_layered = false; - } else { - params.width = tic.Width(); - params.height = tic.Height(); - params.depth = tic.Depth(); - params.pitch = params.is_tiled ? 0 : tic.Pitch(); - if (params.target == SurfaceTarget::TextureCubemap || - params.target == SurfaceTarget::TextureCubeArray) { - params.depth *= 6; - } - params.num_levels = tic.max_mip_level + 1; - params.emulated_levels = std::min(params.num_levels, params.MaxPossibleMipmap()); - params.is_layered = params.IsLayered(); - } - return params; -} - -SurfaceParams SurfaceParams::CreateForDepthBuffer(Core::System& system) { - const auto& regs = system.GPU().Maxwell3D().regs; - - const auto block_depth = std::min(regs.zeta.memory_layout.block_depth.Value(), 5U); - const bool is_layered = regs.zeta_layers > 1 && block_depth == 0; - const auto pixel_format = PixelFormatFromDepthFormat(regs.zeta.format); - - return { - .is_tiled = regs.zeta.memory_layout.type == - Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout::BlockLinear, - .srgb_conversion = false, - .is_layered = is_layered, - .block_width = std::min(regs.zeta.memory_layout.block_width.Value(), 5U), - .block_height = std::min(regs.zeta.memory_layout.block_height.Value(), 5U), - .block_depth = block_depth, - .tile_width_spacing = 1, - .width = regs.zeta_width, - .height = regs.zeta_height, - .depth = is_layered ? regs.zeta_layers.Value() : 1U, - .pitch = 0, - .num_levels = 1, - .emulated_levels = 1, - .pixel_format = pixel_format, - .type = GetFormatType(pixel_format), - .target = is_layered ? SurfaceTarget::Texture2DArray : SurfaceTarget::Texture2D, - }; -} - -SurfaceParams SurfaceParams::CreateForFramebuffer(Core::System& system, std::size_t index) { - const auto& config{system.GPU().Maxwell3D().regs.rt[index]}; - SurfaceParams params; - params.is_tiled = - config.memory_layout.type == Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout::BlockLinear; - params.srgb_conversion = config.format == Tegra::RenderTargetFormat::B8G8R8A8_SRGB || - config.format == Tegra::RenderTargetFormat::A8B8G8R8_SRGB; - params.block_width = config.memory_layout.block_width; - params.block_height = config.memory_layout.block_height; - params.block_depth = config.memory_layout.block_depth; - params.tile_width_spacing = 1; - params.pixel_format = PixelFormatFromRenderTargetFormat(config.format); - params.type = GetFormatType(params.pixel_format); - if (params.is_tiled) { - params.pitch = 0; - params.width = config.width; - } else { - const u32 bpp = GetFormatBpp(params.pixel_format) / CHAR_BIT; - params.pitch = config.width; - params.width = params.pitch / bpp; - } - params.height = config.height; - params.num_levels = 1; - params.emulated_levels = 1; - - if (config.memory_layout.is_3d != 0) { - params.depth = config.layers.Value(); - params.is_layered = false; - params.target = SurfaceTarget::Texture3D; - } else if (config.layers > 1) { - params.depth = config.layers.Value(); - params.is_layered = true; - params.target = SurfaceTarget::Texture2DArray; - } else { - params.depth = 1; - params.is_layered = false; - params.target = SurfaceTarget::Texture2D; - } - return params; -} - -SurfaceParams SurfaceParams::CreateForFermiCopySurface( - const Tegra::Engines::Fermi2D::Regs::Surface& config) { - const bool is_tiled = !config.linear; - const auto pixel_format = PixelFormatFromRenderTargetFormat(config.format); - - SurfaceParams params{ - .is_tiled = is_tiled, - .srgb_conversion = config.format == Tegra::RenderTargetFormat::B8G8R8A8_SRGB || - config.format == Tegra::RenderTargetFormat::A8B8G8R8_SRGB, - .block_width = is_tiled ? std::min(config.BlockWidth(), 5U) : 0U, - .block_height = is_tiled ? std::min(config.BlockHeight(), 5U) : 0U, - .block_depth = is_tiled ? std::min(config.BlockDepth(), 5U) : 0U, - .tile_width_spacing = 1, - .width = config.width, - .height = config.height, - .depth = 1, - .pitch = config.pitch, - .num_levels = 1, - .emulated_levels = 1, - .pixel_format = pixel_format, - .type = GetFormatType(pixel_format), - // TODO(Rodrigo): Try to guess texture arrays from parameters - .target = SurfaceTarget::Texture2D, - }; - - params.is_layered = params.IsLayered(); - return params; -} - -VideoCore::Surface::SurfaceTarget SurfaceParams::ExpectedTarget( - const VideoCommon::Shader::Sampler& entry) { - return TextureTypeToSurfaceTarget(entry.type, entry.is_array); -} - -VideoCore::Surface::SurfaceTarget SurfaceParams::ExpectedTarget( - const VideoCommon::Shader::Image& entry) { - return ImageTypeToSurfaceTarget(entry.type); -} - -bool SurfaceParams::IsLayered() const { - switch (target) { - case SurfaceTarget::Texture1DArray: - case SurfaceTarget::Texture2DArray: - case SurfaceTarget::TextureCubemap: - case SurfaceTarget::TextureCubeArray: - return true; - default: - return false; - } -} - -// Auto block resizing algorithm from: -// https://cgit.freedesktop.org/mesa/mesa/tree/src/gallium/drivers/nouveau/nv50/nv50_miptree.c -u32 SurfaceParams::GetMipBlockHeight(u32 level) const { - if (level == 0) { - return this->block_height; - } - - const u32 height_new{GetMipHeight(level)}; - const u32 default_block_height{GetDefaultBlockHeight()}; - const u32 blocks_in_y{(height_new + default_block_height - 1) / default_block_height}; - const u32 block_height_new = Common::Log2Ceil32(blocks_in_y); - return std::clamp(block_height_new, 3U, 7U) - 3U; -} - -u32 SurfaceParams::GetMipBlockDepth(u32 level) const { - if (level == 0) { - return this->block_depth; - } - if (is_layered) { - return 0; - } - - const u32 depth_new{GetMipDepth(level)}; - const u32 block_depth_new = Common::Log2Ceil32(depth_new); - if (block_depth_new > 4) { - return 5 - (GetMipBlockHeight(level) >= 2); - } - return block_depth_new; -} - -std::size_t SurfaceParams::GetGuestMipmapLevelOffset(u32 level) const { - std::size_t offset = 0; - for (u32 i = 0; i < level; i++) { - offset += GetInnerMipmapMemorySize(i, false, false); - } - return offset; -} - -std::size_t SurfaceParams::GetHostMipmapLevelOffset(u32 level, bool is_converted) const { - std::size_t offset = 0; - if (is_converted) { - for (u32 i = 0; i < level; ++i) { - offset += GetConvertedMipmapSize(i) * GetNumLayers(); - } - } else { - for (u32 i = 0; i < level; ++i) { - offset += GetInnerMipmapMemorySize(i, true, false) * GetNumLayers(); - } - } - return offset; -} - -std::size_t SurfaceParams::GetConvertedMipmapSize(u32 level) const { - constexpr std::size_t rgba8_bpp = 4ULL; - const std::size_t mip_width = GetMipWidth(level); - const std::size_t mip_height = GetMipHeight(level); - const std::size_t mip_depth = is_layered ? 1 : GetMipDepth(level); - return mip_width * mip_height * mip_depth * rgba8_bpp; -} - -std::size_t SurfaceParams::GetLayerSize(bool as_host_size, bool uncompressed) const { - std::size_t size = 0; - for (u32 level = 0; level < num_levels; ++level) { - size += GetInnerMipmapMemorySize(level, as_host_size, uncompressed); - } - if (is_tiled && is_layered) { - return Common::AlignBits(size, Tegra::Texture::GOB_SIZE_SHIFT + block_height + block_depth); - } - return size; -} - -std::size_t SurfaceParams::GetInnerMipmapMemorySize(u32 level, bool as_host_size, - bool uncompressed) const { - const u32 width{GetMipmapSize(uncompressed, GetMipWidth(level), GetDefaultBlockWidth())}; - const u32 height{GetMipmapSize(uncompressed, GetMipHeight(level), GetDefaultBlockHeight())}; - const u32 depth{is_layered ? 1U : GetMipDepth(level)}; - if (is_tiled) { - return Tegra::Texture::CalculateSize(!as_host_size, GetBytesPerPixel(), width, height, - depth, GetMipBlockHeight(level), - GetMipBlockDepth(level)); - } else if (as_host_size || IsBuffer()) { - return GetBytesPerPixel() * width * height * depth; - } else { - // Linear Texture Case - return pitch * height * depth; - } -} - -bool SurfaceParams::operator==(const SurfaceParams& rhs) const { - return std::tie(is_tiled, block_width, block_height, block_depth, tile_width_spacing, width, - height, depth, pitch, num_levels, pixel_format, type, target) == - std::tie(rhs.is_tiled, rhs.block_width, rhs.block_height, rhs.block_depth, - rhs.tile_width_spacing, rhs.width, rhs.height, rhs.depth, rhs.pitch, - rhs.num_levels, rhs.pixel_format, rhs.type, rhs.target); -} - -std::string SurfaceParams::TargetName() const { - switch (target) { - case SurfaceTarget::Texture1D: - return "1D"; - case SurfaceTarget::TextureBuffer: - return "TexBuffer"; - case SurfaceTarget::Texture2D: - return "2D"; - case SurfaceTarget::Texture3D: - return "3D"; - case SurfaceTarget::Texture1DArray: - return "1DArray"; - case SurfaceTarget::Texture2DArray: - return "2DArray"; - case SurfaceTarget::TextureCubemap: - return "Cube"; - case SurfaceTarget::TextureCubeArray: - return "CubeArray"; - default: - LOG_CRITICAL(HW_GPU, "Unimplemented surface_target={}", static_cast<u32>(target)); - UNREACHABLE(); - return fmt::format("TUK({})", static_cast<u32>(target)); - } -} - -u32 SurfaceParams::GetBlockSize() const { - const u32 x = 64U << block_width; - const u32 y = 8U << block_height; - const u32 z = 1U << block_depth; - return x * y * z; -} - -std::pair<u32, u32> SurfaceParams::GetBlockXY() const { - const u32 x_pixels = 64U / GetBytesPerPixel(); - const u32 x = x_pixels << block_width; - const u32 y = 8U << block_height; - return {x, y}; -} - -std::tuple<u32, u32, u32> SurfaceParams::GetBlockOffsetXYZ(u32 offset) const { - const auto div_ceil = [](const u32 x, const u32 y) { return ((x + y - 1) / y); }; - const u32 block_size = GetBlockSize(); - const u32 block_index = offset / block_size; - const u32 gob_offset = offset % block_size; - const u32 gob_index = gob_offset / static_cast<u32>(Tegra::Texture::GOB_SIZE); - const u32 x_gob_pixels = 64U / GetBytesPerPixel(); - const u32 x_block_pixels = x_gob_pixels << block_width; - const u32 y_block_pixels = 8U << block_height; - const u32 z_block_pixels = 1U << block_depth; - const u32 x_blocks = div_ceil(width, x_block_pixels); - const u32 y_blocks = div_ceil(height, y_block_pixels); - const u32 z_blocks = div_ceil(depth, z_block_pixels); - const u32 base_x = block_index % x_blocks; - const u32 base_y = (block_index / x_blocks) % y_blocks; - const u32 base_z = (block_index / (x_blocks * y_blocks)) % z_blocks; - u32 x = base_x * x_block_pixels; - u32 y = base_y * y_block_pixels; - u32 z = base_z * z_block_pixels; - z += gob_index >> block_height; - y += (gob_index * 8U) % y_block_pixels; - return {x, y, z}; -} - -} // namespace VideoCommon diff --git a/src/video_core/texture_cache/surface_params.h b/src/video_core/texture_cache/surface_params.h deleted file mode 100644 index 118aa689e..000000000 --- a/src/video_core/texture_cache/surface_params.h +++ /dev/null @@ -1,293 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <utility> - -#include "common/alignment.h" -#include "common/bit_util.h" -#include "common/cityhash.h" -#include "common/common_types.h" -#include "video_core/engines/fermi_2d.h" -#include "video_core/engines/maxwell_3d.h" -#include "video_core/shader/shader_ir.h" -#include "video_core/surface.h" -#include "video_core/textures/decoders.h" - -namespace VideoCommon { - -class FormatLookupTable; - -class SurfaceParams { -public: - /// Creates SurfaceCachedParams from a texture configuration. - static SurfaceParams CreateForTexture(const FormatLookupTable& lookup_table, - const Tegra::Texture::TICEntry& tic, - const VideoCommon::Shader::Sampler& entry); - - /// Creates SurfaceCachedParams from an image configuration. - static SurfaceParams CreateForImage(const FormatLookupTable& lookup_table, - const Tegra::Texture::TICEntry& tic, - const VideoCommon::Shader::Image& entry); - - /// Creates SurfaceCachedParams for a depth buffer configuration. - static SurfaceParams CreateForDepthBuffer(Core::System& system); - - /// Creates SurfaceCachedParams from a framebuffer configuration. - static SurfaceParams CreateForFramebuffer(Core::System& system, std::size_t index); - - /// Creates SurfaceCachedParams from a Fermi2D surface configuration. - static SurfaceParams CreateForFermiCopySurface( - const Tegra::Engines::Fermi2D::Regs::Surface& config); - - /// Obtains the texture target from a shader's sampler entry. - static VideoCore::Surface::SurfaceTarget ExpectedTarget( - const VideoCommon::Shader::Sampler& entry); - - /// Obtains the texture target from a shader's sampler entry. - static VideoCore::Surface::SurfaceTarget ExpectedTarget( - const VideoCommon::Shader::Image& entry); - - std::size_t Hash() const { - return static_cast<std::size_t>( - Common::CityHash64(reinterpret_cast<const char*>(this), sizeof(*this))); - } - - bool operator==(const SurfaceParams& rhs) const; - - bool operator!=(const SurfaceParams& rhs) const { - return !operator==(rhs); - } - - std::size_t GetGuestSizeInBytes() const { - return GetInnerMemorySize(false, false, false); - } - - std::size_t GetHostSizeInBytes(bool is_converted) const { - if (!is_converted) { - return GetInnerMemorySize(true, false, false); - } - // ASTC is uncompressed in software, in emulated as RGBA8 - std::size_t host_size_in_bytes = 0; - for (u32 level = 0; level < num_levels; ++level) { - host_size_in_bytes += GetConvertedMipmapSize(level) * GetNumLayers(); - } - return host_size_in_bytes; - } - - u32 GetBlockAlignedWidth() const { - return Common::AlignUp(width, 64 / GetBytesPerPixel()); - } - - /// Returns the width of a given mipmap level. - u32 GetMipWidth(u32 level) const { - return std::max(1U, width >> level); - } - - /// Returns the height of a given mipmap level. - u32 GetMipHeight(u32 level) const { - return std::max(1U, height >> level); - } - - /// Returns the depth of a given mipmap level. - u32 GetMipDepth(u32 level) const { - return is_layered ? depth : std::max(1U, depth >> level); - } - - /// Returns the block height of a given mipmap level. - u32 GetMipBlockHeight(u32 level) const; - - /// Returns the block depth of a given mipmap level. - u32 GetMipBlockDepth(u32 level) const; - - /// Returns the best possible row/pitch alignment for the surface. - u32 GetRowAlignment(u32 level, bool is_converted) const { - const u32 bpp = is_converted ? 4 : GetBytesPerPixel(); - return 1U << Common::CountTrailingZeroes32(GetMipWidth(level) * bpp); - } - - /// Returns the offset in bytes in guest memory of a given mipmap level. - std::size_t GetGuestMipmapLevelOffset(u32 level) const; - - /// Returns the offset in bytes in host memory (linear) of a given mipmap level. - std::size_t GetHostMipmapLevelOffset(u32 level, bool is_converted) const; - - /// Returns the size in bytes in guest memory of a given mipmap level. - std::size_t GetGuestMipmapSize(u32 level) const { - return GetInnerMipmapMemorySize(level, false, false); - } - - /// Returns the size in bytes in host memory (linear) of a given mipmap level. - std::size_t GetHostMipmapSize(u32 level) const { - return GetInnerMipmapMemorySize(level, true, false) * GetNumLayers(); - } - - std::size_t GetConvertedMipmapSize(u32 level) const; - - /// Get this texture Tegra Block size in guest memory layout - u32 GetBlockSize() const; - - /// Get X, Y coordinates max sizes of a single block. - std::pair<u32, u32> GetBlockXY() const; - - /// Get the offset in x, y, z coordinates from a memory offset - std::tuple<u32, u32, u32> GetBlockOffsetXYZ(u32 offset) const; - - /// Returns the size of a layer in bytes in guest memory. - std::size_t GetGuestLayerSize() const { - return GetLayerSize(false, false); - } - - /// Returns the size of a layer in bytes in host memory for a given mipmap level. - std::size_t GetHostLayerSize(u32 level) const { - ASSERT(target != VideoCore::Surface::SurfaceTarget::Texture3D); - return GetInnerMipmapMemorySize(level, true, false); - } - - /// Returns the max possible mipmap that the texture can have in host gpu - u32 MaxPossibleMipmap() const { - const u32 max_mipmap_w = Common::Log2Ceil32(width) + 1U; - const u32 max_mipmap_h = Common::Log2Ceil32(height) + 1U; - const u32 max_mipmap = std::max(max_mipmap_w, max_mipmap_h); - if (target != VideoCore::Surface::SurfaceTarget::Texture3D) - return max_mipmap; - return std::max(max_mipmap, Common::Log2Ceil32(depth) + 1U); - } - - /// Returns if the guest surface is a compressed surface. - bool IsCompressed() const { - return GetDefaultBlockHeight() > 1 || GetDefaultBlockWidth() > 1; - } - - /// Returns the default block width. - u32 GetDefaultBlockWidth() const { - return VideoCore::Surface::GetDefaultBlockWidth(pixel_format); - } - - /// Returns the default block height. - u32 GetDefaultBlockHeight() const { - return VideoCore::Surface::GetDefaultBlockHeight(pixel_format); - } - - /// Returns the bits per pixel. - u32 GetBitsPerPixel() const { - return VideoCore::Surface::GetFormatBpp(pixel_format); - } - - /// Returns the bytes per pixel. - u32 GetBytesPerPixel() const { - return VideoCore::Surface::GetBytesPerPixel(pixel_format); - } - - /// Returns true if the pixel format is a depth and/or stencil format. - bool IsPixelFormatZeta() const { - return pixel_format >= VideoCore::Surface::PixelFormat::MaxColorFormat && - pixel_format < VideoCore::Surface::PixelFormat::MaxDepthStencilFormat; - } - - /// Returns is the surface is a TextureBuffer type of surface. - bool IsBuffer() const { - return target == VideoCore::Surface::SurfaceTarget::TextureBuffer; - } - - /// Returns the number of layers in the surface. - std::size_t GetNumLayers() const { - return is_layered ? depth : 1; - } - - /// Returns the debug name of the texture for use in graphic debuggers. - std::string TargetName() const; - - // Helper used for out of class size calculations - static std::size_t AlignLayered(const std::size_t out_size, const u32 block_height, - const u32 block_depth) { - return Common::AlignBits(out_size, - Tegra::Texture::GOB_SIZE_SHIFT + block_height + block_depth); - } - - /// Converts a width from a type of surface into another. This helps represent the - /// equivalent value between compressed/non-compressed textures. - static u32 ConvertWidth(u32 width, VideoCore::Surface::PixelFormat pixel_format_from, - VideoCore::Surface::PixelFormat pixel_format_to) { - const u32 bw1 = VideoCore::Surface::GetDefaultBlockWidth(pixel_format_from); - const u32 bw2 = VideoCore::Surface::GetDefaultBlockWidth(pixel_format_to); - return (width * bw2 + bw1 - 1) / bw1; - } - - /// Converts a height from a type of surface into another. This helps represent the - /// equivalent value between compressed/non-compressed textures. - static u32 ConvertHeight(u32 height, VideoCore::Surface::PixelFormat pixel_format_from, - VideoCore::Surface::PixelFormat pixel_format_to) { - const u32 bh1 = VideoCore::Surface::GetDefaultBlockHeight(pixel_format_from); - const u32 bh2 = VideoCore::Surface::GetDefaultBlockHeight(pixel_format_to); - return (height * bh2 + bh1 - 1) / bh1; - } - - // Finds the maximun possible width between 2 2D layers of different formats - static u32 IntersectWidth(const SurfaceParams& src_params, const SurfaceParams& dst_params, - const u32 src_level, const u32 dst_level) { - const u32 bw1 = src_params.GetDefaultBlockWidth(); - const u32 bw2 = dst_params.GetDefaultBlockWidth(); - const u32 t_src_width = (src_params.GetMipWidth(src_level) * bw2 + bw1 - 1) / bw1; - const u32 t_dst_width = (dst_params.GetMipWidth(dst_level) * bw1 + bw2 - 1) / bw2; - return std::min(t_src_width, t_dst_width); - } - - // Finds the maximun possible height between 2 2D layers of different formats - static u32 IntersectHeight(const SurfaceParams& src_params, const SurfaceParams& dst_params, - const u32 src_level, const u32 dst_level) { - const u32 bh1 = src_params.GetDefaultBlockHeight(); - const u32 bh2 = dst_params.GetDefaultBlockHeight(); - const u32 t_src_height = (src_params.GetMipHeight(src_level) * bh2 + bh1 - 1) / bh1; - const u32 t_dst_height = (dst_params.GetMipHeight(dst_level) * bh1 + bh2 - 1) / bh2; - return std::min(t_src_height, t_dst_height); - } - - bool is_tiled; - bool srgb_conversion; - bool is_layered; - u32 block_width; - u32 block_height; - u32 block_depth; - u32 tile_width_spacing; - u32 width; - u32 height; - u32 depth; - u32 pitch; - u32 num_levels; - u32 emulated_levels; - VideoCore::Surface::PixelFormat pixel_format; - VideoCore::Surface::SurfaceType type; - VideoCore::Surface::SurfaceTarget target; - -private: - /// Returns the size of a given mipmap level inside a layer. - std::size_t GetInnerMipmapMemorySize(u32 level, bool as_host_size, bool uncompressed) const; - - /// Returns the size of all mipmap levels and aligns as needed. - std::size_t GetInnerMemorySize(bool as_host_size, bool layer_only, bool uncompressed) const { - return GetLayerSize(as_host_size, uncompressed) * - (layer_only ? 1U : (is_layered ? depth : 1U)); - } - - /// Returns the size of a layer - std::size_t GetLayerSize(bool as_host_size, bool uncompressed) const; - - /// Returns true if these parameters are from a layered surface. - bool IsLayered() const; -}; - -} // namespace VideoCommon - -namespace std { - -template <> -struct hash<VideoCommon::SurfaceParams> { - std::size_t operator()(const VideoCommon::SurfaceParams& k) const noexcept { - return k.Hash(); - } -}; - -} // namespace std diff --git a/src/video_core/texture_cache/surface_view.cpp b/src/video_core/texture_cache/surface_view.cpp deleted file mode 100644 index 6b5f5984b..000000000 --- a/src/video_core/texture_cache/surface_view.cpp +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include <tuple> - -#include "common/common_types.h" -#include "video_core/texture_cache/surface_view.h" - -namespace VideoCommon { - -std::size_t ViewParams::Hash() const { - return static_cast<std::size_t>(base_layer) ^ (static_cast<std::size_t>(num_layers) << 16) ^ - (static_cast<std::size_t>(base_level) << 24) ^ - (static_cast<std::size_t>(num_levels) << 32) ^ (static_cast<std::size_t>(target) << 36); -} - -bool ViewParams::operator==(const ViewParams& rhs) const { - return std::tie(base_layer, num_layers, base_level, num_levels, target) == - std::tie(rhs.base_layer, rhs.num_layers, rhs.base_level, rhs.num_levels, rhs.target); -} - -bool ViewParams::operator!=(const ViewParams& rhs) const { - return !operator==(rhs); -} - -} // namespace VideoCommon diff --git a/src/video_core/texture_cache/surface_view.h b/src/video_core/texture_cache/surface_view.h deleted file mode 100644 index 90a8bb0ae..000000000 --- a/src/video_core/texture_cache/surface_view.h +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <functional> - -#include "common/common_types.h" -#include "video_core/surface.h" -#include "video_core/texture_cache/surface_params.h" - -namespace VideoCommon { - -struct ViewParams { - constexpr explicit ViewParams(VideoCore::Surface::SurfaceTarget target, u32 base_layer, - u32 num_layers, u32 base_level, u32 num_levels) - : target{target}, base_layer{base_layer}, num_layers{num_layers}, base_level{base_level}, - num_levels{num_levels} {} - - std::size_t Hash() const; - - bool operator==(const ViewParams& rhs) const; - bool operator!=(const ViewParams& rhs) const; - - bool IsLayered() const { - switch (target) { - case VideoCore::Surface::SurfaceTarget::Texture1DArray: - case VideoCore::Surface::SurfaceTarget::Texture2DArray: - case VideoCore::Surface::SurfaceTarget::TextureCubemap: - case VideoCore::Surface::SurfaceTarget::TextureCubeArray: - return true; - default: - return false; - } - } - - VideoCore::Surface::SurfaceTarget target{}; - u32 base_layer{}; - u32 num_layers{}; - u32 base_level{}; - u32 num_levels{}; -}; - -class ViewBase { -public: - constexpr explicit ViewBase(const ViewParams& params) : params{params} {} - - constexpr const ViewParams& GetViewParams() const { - return params; - } - -protected: - ViewParams params; -}; - -} // namespace VideoCommon - -namespace std { - -template <> -struct hash<VideoCommon::ViewParams> { - std::size_t operator()(const VideoCommon::ViewParams& k) const noexcept { - return k.Hash(); - } -}; - -} // namespace std diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 96c4e4cc2..b1da69971 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -6,1304 +6,1446 @@ #include <algorithm> #include <array> -#include <list> +#include <bit> #include <memory> #include <mutex> -#include <set> -#include <tuple> +#include <optional> +#include <span> +#include <type_traits> #include <unordered_map> +#include <utility> #include <vector> #include <boost/container/small_vector.hpp> -#include <boost/icl/interval_map.hpp> -#include <boost/range/iterator_range.hpp> -#include "common/assert.h" +#include "common/alignment.h" +#include "common/common_funcs.h" #include "common/common_types.h" -#include "common/math_util.h" -#include "core/core.h" -#include "core/memory.h" -#include "core/settings.h" +#include "common/logging/log.h" #include "video_core/compatible_formats.h" +#include "video_core/delayed_destruction_ring.h" #include "video_core/dirty_flags.h" #include "video_core/engines/fermi_2d.h" +#include "video_core/engines/kepler_compute.h" #include "video_core/engines/maxwell_3d.h" -#include "video_core/gpu.h" #include "video_core/memory_manager.h" #include "video_core/rasterizer_interface.h" #include "video_core/surface.h" -#include "video_core/texture_cache/copy_params.h" +#include "video_core/texture_cache/descriptor_table.h" #include "video_core/texture_cache/format_lookup_table.h" -#include "video_core/texture_cache/surface_base.h" -#include "video_core/texture_cache/surface_params.h" -#include "video_core/texture_cache/surface_view.h" - -namespace Tegra::Texture { -struct FullTextureInfo; -} - -namespace VideoCore { -class RasterizerInterface; -} +#include "video_core/texture_cache/formatter.h" +#include "video_core/texture_cache/image_base.h" +#include "video_core/texture_cache/image_info.h" +#include "video_core/texture_cache/image_view_base.h" +#include "video_core/texture_cache/image_view_info.h" +#include "video_core/texture_cache/render_targets.h" +#include "video_core/texture_cache/samples_helper.h" +#include "video_core/texture_cache/slot_vector.h" +#include "video_core/texture_cache/types.h" +#include "video_core/texture_cache/util.h" +#include "video_core/textures/texture.h" namespace VideoCommon { -using VideoCore::Surface::FormatCompatibility; +using Tegra::Texture::SwizzleSource; +using Tegra::Texture::TextureType; +using Tegra::Texture::TICEntry; +using Tegra::Texture::TSCEntry; +using VideoCore::Surface::GetFormatType; +using VideoCore::Surface::IsCopyCompatible; using VideoCore::Surface::PixelFormat; -using VideoCore::Surface::SurfaceTarget; -using RenderTargetConfig = Tegra::Engines::Maxwell3D::Regs::RenderTargetConfig; +using VideoCore::Surface::PixelFormatFromDepthFormat; +using VideoCore::Surface::PixelFormatFromRenderTargetFormat; +using VideoCore::Surface::SurfaceType; -template <typename TSurface, typename TView> +template <class P> class TextureCache { - using VectorSurface = boost::container::small_vector<TSurface, 1>; + /// Address shift for caching images into a hash table + static constexpr u64 PAGE_BITS = 20; + + /// Enables debugging features to the texture cache + static constexpr bool ENABLE_VALIDATION = P::ENABLE_VALIDATION; + /// Implement blits as copies between framebuffers + static constexpr bool FRAMEBUFFER_BLITS = P::FRAMEBUFFER_BLITS; + /// True when some copies have to be emulated + static constexpr bool HAS_EMULATED_COPIES = P::HAS_EMULATED_COPIES; + + /// Image view ID for null descriptors + static constexpr ImageViewId NULL_IMAGE_VIEW_ID{0}; + /// Sampler ID for bugged sampler ids + static constexpr SamplerId NULL_SAMPLER_ID{0}; + + using Runtime = typename P::Runtime; + using Image = typename P::Image; + using ImageAlloc = typename P::ImageAlloc; + using ImageView = typename P::ImageView; + using Sampler = typename P::Sampler; + using Framebuffer = typename P::Framebuffer; + + struct BlitImages { + ImageId dst_id; + ImageId src_id; + PixelFormat dst_format; + PixelFormat src_format; + }; + + template <typename T> + struct IdentityHash { + [[nodiscard]] size_t operator()(T value) const noexcept { + return static_cast<size_t>(value); + } + }; public: - void InvalidateRegion(VAddr addr, std::size_t size) { - std::lock_guard lock{mutex}; + explicit TextureCache(Runtime&, VideoCore::RasterizerInterface&, Tegra::Engines::Maxwell3D&, + Tegra::Engines::KeplerCompute&, Tegra::MemoryManager&); - for (const auto& surface : GetSurfacesInRegion(addr, size)) { - Unregister(surface); - } - } + /// Notify the cache that a new frame has been queued + void TickFrame(); - void OnCPUWrite(VAddr addr, std::size_t size) { - std::lock_guard lock{mutex}; + /// Return a constant reference to the given image view id + [[nodiscard]] const ImageView& GetImageView(ImageViewId id) const noexcept; - for (const auto& surface : GetSurfacesInRegion(addr, size)) { - if (surface->IsMemoryMarked()) { - UnmarkMemory(surface); - surface->SetSyncPending(true); - marked_for_unregister.emplace_back(surface); - } - } - } + /// Return a reference to the given image view id + [[nodiscard]] ImageView& GetImageView(ImageViewId id) noexcept; - void SyncGuestHost() { - std::lock_guard lock{mutex}; + /// Fill image_view_ids with the graphics images in indices + void FillGraphicsImageViews(std::span<const u32> indices, + std::span<ImageViewId> image_view_ids); - for (const auto& surface : marked_for_unregister) { - if (surface->IsRegistered()) { - surface->SetSyncPending(false); - Unregister(surface); - } - } - marked_for_unregister.clear(); - } + /// Fill image_view_ids with the compute images in indices + void FillComputeImageViews(std::span<const u32> indices, std::span<ImageViewId> image_view_ids); - /** - * Guarantees that rendertargets don't unregister themselves if the - * collide. Protection is currently only done on 3D slices. - */ - void GuardRenderTargets(bool new_guard) { - guard_render_targets = new_guard; - } + /// Get the sampler from the graphics descriptor table in the specified index + Sampler* GetGraphicsSampler(u32 index); - void GuardSamplers(bool new_guard) { - guard_samplers = new_guard; - } + /// Get the sampler from the compute descriptor table in the specified index + Sampler* GetComputeSampler(u32 index); - void FlushRegion(VAddr addr, std::size_t size) { - std::lock_guard lock{mutex}; + /// Refresh the state for graphics image view and sampler descriptors + void SynchronizeGraphicsDescriptors(); - auto surfaces = GetSurfacesInRegion(addr, size); - if (surfaces.empty()) { - return; - } - std::sort(surfaces.begin(), surfaces.end(), [](const TSurface& a, const TSurface& b) { - return a->GetModificationTick() < b->GetModificationTick(); - }); - for (const auto& surface : surfaces) { - mutex.unlock(); - FlushSurface(surface); - mutex.lock(); - } - } + /// Refresh the state for compute image view and sampler descriptors + void SynchronizeComputeDescriptors(); - bool MustFlushRegion(VAddr addr, std::size_t size) { - std::lock_guard lock{mutex}; + /// Update bound render targets and upload memory if necessary + /// @param is_clear True when the render targets are being used for clears + void UpdateRenderTargets(bool is_clear); - const auto surfaces = GetSurfacesInRegion(addr, size); - return std::any_of(surfaces.cbegin(), surfaces.cend(), - [](const TSurface& surface) { return surface->IsModified(); }); - } + /// Find a framebuffer with the currently bound render targets + /// UpdateRenderTargets should be called before this + Framebuffer* GetFramebuffer(); - TView GetTextureSurface(const Tegra::Texture::TICEntry& tic, - const VideoCommon::Shader::Sampler& entry) { - std::lock_guard lock{mutex}; - const auto gpu_addr{tic.Address()}; - if (!gpu_addr) { - return GetNullSurface(SurfaceParams::ExpectedTarget(entry)); - } + /// Mark images in a range as modified from the CPU + void WriteMemory(VAddr cpu_addr, size_t size); - const std::optional<VAddr> cpu_addr = - system.GPU().MemoryManager().GpuToCpuAddress(gpu_addr); - if (!cpu_addr) { - return GetNullSurface(SurfaceParams::ExpectedTarget(entry)); - } + /// Download contents of host images to guest memory in a region + void DownloadMemory(VAddr cpu_addr, size_t size); - if (!IsTypeCompatible(tic.texture_type, entry)) { - return GetNullSurface(SurfaceParams::ExpectedTarget(entry)); - } + /// Remove images in a region + void UnmapMemory(VAddr cpu_addr, size_t size); - const auto params{SurfaceParams::CreateForTexture(format_lookup_table, tic, entry)}; - const auto [surface, view] = GetSurface(gpu_addr, *cpu_addr, params, true, false); - if (guard_samplers) { - sampled_textures.push_back(surface); - } - return view; - } + /// Blit an image with the given parameters + void BlitImage(const Tegra::Engines::Fermi2D::Surface& dst, + const Tegra::Engines::Fermi2D::Surface& src, + const Tegra::Engines::Fermi2D::Config& copy); - TView GetImageSurface(const Tegra::Texture::TICEntry& tic, - const VideoCommon::Shader::Image& entry) { - std::lock_guard lock{mutex}; - const auto gpu_addr{tic.Address()}; - if (!gpu_addr) { - return GetNullSurface(SurfaceParams::ExpectedTarget(entry)); - } - const std::optional<VAddr> cpu_addr = - system.GPU().MemoryManager().GpuToCpuAddress(gpu_addr); - if (!cpu_addr) { - return GetNullSurface(SurfaceParams::ExpectedTarget(entry)); - } - const auto params{SurfaceParams::CreateForImage(format_lookup_table, tic, entry)}; - const auto [surface, view] = GetSurface(gpu_addr, *cpu_addr, params, true, false); - if (guard_samplers) { - sampled_textures.push_back(surface); - } - return view; - } + /// Invalidate the contents of the color buffer index + /// These contents become unspecified, the cache can assume aggressive optimizations. + void InvalidateColorBuffer(size_t index); - bool TextureBarrier() { - const bool any_rt = - std::any_of(sampled_textures.begin(), sampled_textures.end(), - [](const auto& surface) { return surface->IsRenderTarget(); }); - sampled_textures.clear(); - return any_rt; - } + /// Invalidate the contents of the depth buffer + /// These contents become unspecified, the cache can assume aggressive optimizations. + void InvalidateDepthBuffer(); - TView GetDepthBufferSurface(bool preserve_contents) { - std::lock_guard lock{mutex}; - auto& maxwell3d = system.GPU().Maxwell3D(); - if (!maxwell3d.dirty.flags[VideoCommon::Dirty::ZetaBuffer]) { - return depth_buffer.view; - } - maxwell3d.dirty.flags[VideoCommon::Dirty::ZetaBuffer] = false; + /// Try to find a cached image view in the given CPU address + [[nodiscard]] ImageView* TryFindFramebufferImageView(VAddr cpu_addr); - const auto& regs{maxwell3d.regs}; - const auto gpu_addr{regs.zeta.Address()}; - if (!gpu_addr || !regs.zeta_enable) { - SetEmptyDepthBuffer(); - return {}; - } - const std::optional<VAddr> cpu_addr = - system.GPU().MemoryManager().GpuToCpuAddress(gpu_addr); - if (!cpu_addr) { - SetEmptyDepthBuffer(); - return {}; - } - const auto depth_params{SurfaceParams::CreateForDepthBuffer(system)}; - auto surface_view = GetSurface(gpu_addr, *cpu_addr, depth_params, preserve_contents, true); - if (depth_buffer.target) - depth_buffer.target->MarkAsRenderTarget(false, NO_RT); - depth_buffer.target = surface_view.first; - depth_buffer.view = surface_view.second; - if (depth_buffer.target) - depth_buffer.target->MarkAsRenderTarget(true, DEPTH_RT); - return surface_view.second; - } - - TView GetColorBufferSurface(std::size_t index, bool preserve_contents) { - std::lock_guard lock{mutex}; - ASSERT(index < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets); - auto& maxwell3d = system.GPU().Maxwell3D(); - if (!maxwell3d.dirty.flags[VideoCommon::Dirty::ColorBuffer0 + index]) { - return render_targets[index].view; - } - maxwell3d.dirty.flags[VideoCommon::Dirty::ColorBuffer0 + index] = false; + /// Return true when there are uncommitted images to be downloaded + [[nodiscard]] bool HasUncommittedFlushes() const noexcept; - const auto& regs{maxwell3d.regs}; - if (index >= regs.rt_control.count || regs.rt[index].Address() == 0 || - regs.rt[index].format == Tegra::RenderTargetFormat::NONE) { - SetEmptyColorBuffer(index); - return {}; - } + /// Return true when the caller should wait for async downloads + [[nodiscard]] bool ShouldWaitAsyncFlushes() const noexcept; - const auto& config{regs.rt[index]}; - const auto gpu_addr{config.Address()}; - if (!gpu_addr) { - SetEmptyColorBuffer(index); - return {}; - } + /// Commit asynchronous downloads + void CommitAsyncFlushes(); - const std::optional<VAddr> cpu_addr = - system.GPU().MemoryManager().GpuToCpuAddress(gpu_addr); - if (!cpu_addr) { - SetEmptyColorBuffer(index); - return {}; - } + /// Pop asynchronous downloads + void PopAsyncFlushes(); + + /// Return true when a CPU region is modified from the GPU + [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); - auto surface_view = - GetSurface(gpu_addr, *cpu_addr, SurfaceParams::CreateForFramebuffer(system, index), - preserve_contents, true); - if (render_targets[index].target) { - auto& surface = render_targets[index].target; - surface->MarkAsRenderTarget(false, NO_RT); - const auto& cr_params = surface->GetSurfaceParams(); - if (!cr_params.is_tiled && Settings::values.use_asynchronous_gpu_emulation.GetValue()) { - AsyncFlushSurface(surface); + std::mutex mutex; + +private: + /// Iterate over all page indices in a range + template <typename Func> + static void ForEachPage(VAddr addr, size_t size, Func&& func) { + static constexpr bool RETURNS_BOOL = std::is_same_v<std::invoke_result<Func, u64>, bool>; + const u64 page_end = (addr + size - 1) >> PAGE_BITS; + for (u64 page = addr >> PAGE_BITS; page <= page_end; ++page) { + if constexpr (RETURNS_BOOL) { + if (func(page)) { + break; + } + } else { + func(page); } } - render_targets[index].target = surface_view.first; - render_targets[index].view = surface_view.second; - if (render_targets[index].target) - render_targets[index].target->MarkAsRenderTarget(true, static_cast<u32>(index)); - return surface_view.second; } - void MarkColorBufferInUse(std::size_t index) { - if (auto& render_target = render_targets[index].target) { - render_target->MarkAsModified(true, Tick()); - } - } + /// Fills image_view_ids in the image views in indices + void FillImageViews(DescriptorTable<TICEntry>& table, + std::span<ImageViewId> cached_image_view_ids, std::span<const u32> indices, + std::span<ImageViewId> image_view_ids); - void MarkDepthBufferInUse() { - if (depth_buffer.target) { - depth_buffer.target->MarkAsModified(true, Tick()); - } - } + /// Find or create an image view in the guest descriptor table + ImageViewId VisitImageView(DescriptorTable<TICEntry>& table, + std::span<ImageViewId> cached_image_view_ids, u32 index); - void SetEmptyDepthBuffer() { - if (depth_buffer.target == nullptr) { - return; - } - depth_buffer.target->MarkAsRenderTarget(false, NO_RT); - depth_buffer.target = nullptr; - depth_buffer.view = nullptr; - } + /// Find or create a framebuffer with the given render target parameters + FramebufferId GetFramebufferId(const RenderTargets& key); - void SetEmptyColorBuffer(std::size_t index) { - if (render_targets[index].target == nullptr) { - return; - } - render_targets[index].target->MarkAsRenderTarget(false, NO_RT); - render_targets[index].target = nullptr; - render_targets[index].view = nullptr; - } - - void DoFermiCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src_config, - const Tegra::Engines::Fermi2D::Regs::Surface& dst_config, - const Tegra::Engines::Fermi2D::Config& copy_config) { - std::lock_guard lock{mutex}; - SurfaceParams src_params = SurfaceParams::CreateForFermiCopySurface(src_config); - SurfaceParams dst_params = SurfaceParams::CreateForFermiCopySurface(dst_config); - const GPUVAddr src_gpu_addr = src_config.Address(); - const GPUVAddr dst_gpu_addr = dst_config.Address(); - DeduceBestBlit(src_params, dst_params, src_gpu_addr, dst_gpu_addr); - - const auto& memory_manager = system.GPU().MemoryManager(); - const std::optional<VAddr> dst_cpu_addr = memory_manager.GpuToCpuAddress(dst_gpu_addr); - const std::optional<VAddr> src_cpu_addr = memory_manager.GpuToCpuAddress(src_gpu_addr); - std::pair dst_surface = GetSurface(dst_gpu_addr, *dst_cpu_addr, dst_params, true, false); - TView src_surface = GetSurface(src_gpu_addr, *src_cpu_addr, src_params, true, false).second; - ImageBlit(src_surface, dst_surface.second, copy_config); - dst_surface.first->MarkAsModified(true, Tick()); - } - - TSurface TryFindFramebufferSurface(VAddr addr) const { - if (!addr) { - return nullptr; - } - const VAddr page = addr >> registry_page_bits; - const auto it = registry.find(page); - if (it == registry.end()) { - return nullptr; - } - const auto& list = it->second; - const auto found = std::find_if(list.begin(), list.end(), [addr](const auto& surface) { - return surface->GetCpuAddr() == addr; - }); - return found != list.end() ? *found : nullptr; - } + /// Refresh the contents (pixel data) of an image + void RefreshContents(Image& image); - u64 Tick() { - return ++ticks; - } + /// Upload data from guest to an image + template <typename StagingBuffer> + void UploadImageContents(Image& image, StagingBuffer& staging_buffer); - void CommitAsyncFlushes() { - committed_flushes.push_back(uncommitted_flushes); - uncommitted_flushes.reset(); - } + /// Find or create an image view from a guest descriptor + [[nodiscard]] ImageViewId FindImageView(const TICEntry& config); - bool HasUncommittedFlushes() const { - return uncommitted_flushes != nullptr; - } + /// Create a new image view from a guest descriptor + [[nodiscard]] ImageViewId CreateImageView(const TICEntry& config); - bool ShouldWaitAsyncFlushes() const { - return !committed_flushes.empty() && committed_flushes.front() != nullptr; - } + /// Find or create an image from the given parameters + [[nodiscard]] ImageId FindOrInsertImage(const ImageInfo& info, GPUVAddr gpu_addr, + RelaxedOptions options = RelaxedOptions{}); - void PopAsyncFlushes() { - if (committed_flushes.empty()) { - return; - } - auto& flush_list = committed_flushes.front(); - if (!flush_list) { - committed_flushes.pop_front(); - return; - } - for (TSurface& surface : *flush_list) { - FlushSurface(surface); - } - committed_flushes.pop_front(); - } + /// Find an image from the given parameters + [[nodiscard]] ImageId FindImage(const ImageInfo& info, GPUVAddr gpu_addr, + RelaxedOptions options); -protected: - explicit TextureCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - bool is_astc_supported) - : system{system}, is_astc_supported{is_astc_supported}, rasterizer{rasterizer} { - for (std::size_t i = 0; i < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; i++) { - SetEmptyColorBuffer(i); - } + /// Create an image from the given parameters + [[nodiscard]] ImageId InsertImage(const ImageInfo& info, GPUVAddr gpu_addr, + RelaxedOptions options); - SetEmptyDepthBuffer(); - staging_cache.SetSize(2); + /// Create a new image and join perfectly matching existing images + /// Remove joined images from the cache + [[nodiscard]] ImageId JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VAddr cpu_addr); - const auto make_siblings = [this](PixelFormat a, PixelFormat b) { - siblings_table[static_cast<std::size_t>(a)] = b; - siblings_table[static_cast<std::size_t>(b)] = a; - }; - std::fill(siblings_table.begin(), siblings_table.end(), PixelFormat::Invalid); - make_siblings(PixelFormat::D16_UNORM, PixelFormat::R16_UNORM); - make_siblings(PixelFormat::D32_FLOAT, PixelFormat::R32_FLOAT); - make_siblings(PixelFormat::D32_FLOAT_S8_UINT, PixelFormat::R32G32_FLOAT); + /// Return a blit image pair from the given guest blit parameters + [[nodiscard]] BlitImages GetBlitImages(const Tegra::Engines::Fermi2D::Surface& dst, + const Tegra::Engines::Fermi2D::Surface& src); - sampled_textures.reserve(64); - } + /// Find or create a sampler from a guest descriptor sampler + [[nodiscard]] SamplerId FindSampler(const TSCEntry& config); - ~TextureCache() = default; + /// Find or create an image view for the given color buffer index + [[nodiscard]] ImageViewId FindColorBuffer(size_t index, bool is_clear); - virtual TSurface CreateSurface(GPUVAddr gpu_addr, const SurfaceParams& params) = 0; + /// Find or create an image view for the depth buffer + [[nodiscard]] ImageViewId FindDepthBuffer(bool is_clear); - virtual void ImageCopy(TSurface& src_surface, TSurface& dst_surface, - const CopyParams& copy_params) = 0; + /// Find or create a view for a render target with the given image parameters + [[nodiscard]] ImageViewId FindRenderTargetView(const ImageInfo& info, GPUVAddr gpu_addr, + bool is_clear); - virtual void ImageBlit(TView& src_view, TView& dst_view, - const Tegra::Engines::Fermi2D::Config& copy_config) = 0; + /// Iterates over all the images in a region calling func + template <typename Func> + void ForEachImageInRegion(VAddr cpu_addr, size_t size, Func&& func); - // Depending on the backend, a buffer copy can be slow as it means deoptimizing the texture - // and reading it from a separate buffer. - virtual void BufferCopy(TSurface& src_surface, TSurface& dst_surface) = 0; + /// Find or create an image view in the given image with the passed parameters + [[nodiscard]] ImageViewId FindOrEmplaceImageView(ImageId image_id, const ImageViewInfo& info); - void ManageRenderTargetUnregister(TSurface& surface) { - auto& dirty = system.GPU().Maxwell3D().dirty; - const u32 index = surface->GetRenderTarget(); - if (index == DEPTH_RT) { - dirty.flags[VideoCommon::Dirty::ZetaBuffer] = true; - } else { - dirty.flags[VideoCommon::Dirty::ColorBuffer0 + index] = true; - } - dirty.flags[VideoCommon::Dirty::RenderTargets] = true; + /// Register image in the page table + void RegisterImage(ImageId image); + + /// Unregister image from the page table + void UnregisterImage(ImageId image); + + /// Track CPU reads and writes for image + void TrackImage(ImageBase& image); + + /// Stop tracking CPU reads and writes for image + void UntrackImage(ImageBase& image); + + /// Delete image from the cache + void DeleteImage(ImageId image); + + /// Remove image views references from the cache + void RemoveImageViewReferences(std::span<const ImageViewId> removed_views); + + /// Remove framebuffers using the given image views from the cache + void RemoveFramebuffers(std::span<const ImageViewId> removed_views); + + /// Mark an image as modified from the GPU + void MarkModification(ImageBase& image) noexcept; + + /// Synchronize image aliases, copying data if needed + void SynchronizeAliases(ImageId image_id); + + /// Prepare an image to be used + void PrepareImage(ImageId image_id, bool is_modification, bool invalidate); + + /// Prepare an image view to be used + void PrepareImageView(ImageViewId image_view_id, bool is_modification, bool invalidate); + + /// Execute copies from one image to the other, even if they are incompatible + void CopyImage(ImageId dst_id, ImageId src_id, std::span<const ImageCopy> copies); + + /// Bind an image view as render target, downloading resources preemtively if needed + void BindRenderTarget(ImageViewId* old_id, ImageViewId new_id); + + /// Create a render target from a given image and image view parameters + [[nodiscard]] std::pair<FramebufferId, ImageViewId> RenderTargetFromImage( + ImageId, const ImageViewInfo& view_info); + + /// Returns true if the current clear parameters clear the whole image of a given image view + [[nodiscard]] bool IsFullClear(ImageViewId id); + + Runtime& runtime; + VideoCore::RasterizerInterface& rasterizer; + Tegra::Engines::Maxwell3D& maxwell3d; + Tegra::Engines::KeplerCompute& kepler_compute; + Tegra::MemoryManager& gpu_memory; + + DescriptorTable<TICEntry> graphics_image_table{gpu_memory}; + DescriptorTable<TSCEntry> graphics_sampler_table{gpu_memory}; + std::vector<SamplerId> graphics_sampler_ids; + std::vector<ImageViewId> graphics_image_view_ids; + + DescriptorTable<TICEntry> compute_image_table{gpu_memory}; + DescriptorTable<TSCEntry> compute_sampler_table{gpu_memory}; + std::vector<SamplerId> compute_sampler_ids; + std::vector<ImageViewId> compute_image_view_ids; + + RenderTargets render_targets; + + std::unordered_map<TICEntry, ImageViewId> image_views; + std::unordered_map<TSCEntry, SamplerId> samplers; + std::unordered_map<RenderTargets, FramebufferId> framebuffers; + + std::unordered_map<u64, std::vector<ImageId>, IdentityHash<u64>> page_table; + + bool has_deleted_images = false; + + SlotVector<Image> slot_images; + SlotVector<ImageView> slot_image_views; + SlotVector<ImageAlloc> slot_image_allocs; + SlotVector<Sampler> slot_samplers; + SlotVector<Framebuffer> slot_framebuffers; + + // TODO: This data structure is not optimal and it should be reworked + std::vector<ImageId> uncommitted_downloads; + std::queue<std::vector<ImageId>> committed_downloads; + + static constexpr size_t TICKS_TO_DESTROY = 6; + DelayedDestructionRing<Image, TICKS_TO_DESTROY> sentenced_images; + DelayedDestructionRing<ImageView, TICKS_TO_DESTROY> sentenced_image_view; + DelayedDestructionRing<Framebuffer, TICKS_TO_DESTROY> sentenced_framebuffers; + + std::unordered_map<GPUVAddr, ImageAllocId> image_allocs_table; + + u64 modification_tick = 0; + u64 frame_tick = 0; +}; + +template <class P> +TextureCache<P>::TextureCache(Runtime& runtime_, VideoCore::RasterizerInterface& rasterizer_, + Tegra::Engines::Maxwell3D& maxwell3d_, + Tegra::Engines::KeplerCompute& kepler_compute_, + Tegra::MemoryManager& gpu_memory_) + : runtime{runtime_}, rasterizer{rasterizer_}, maxwell3d{maxwell3d_}, + kepler_compute{kepler_compute_}, gpu_memory{gpu_memory_} { + // Configure null sampler + TSCEntry sampler_descriptor{}; + sampler_descriptor.min_filter.Assign(Tegra::Texture::TextureFilter::Linear); + sampler_descriptor.mag_filter.Assign(Tegra::Texture::TextureFilter::Linear); + sampler_descriptor.mipmap_filter.Assign(Tegra::Texture::TextureMipmapFilter::Linear); + sampler_descriptor.cubemap_anisotropy.Assign(1); + + // Make sure the first index is reserved for the null resources + // This way the null resource becomes a compile time constant + void(slot_image_views.insert(runtime, NullImageParams{})); + void(slot_samplers.insert(runtime, sampler_descriptor)); +} + +template <class P> +void TextureCache<P>::TickFrame() { + // Tick sentenced resources in this order to ensure they are destroyed in the right order + sentenced_images.Tick(); + sentenced_framebuffers.Tick(); + sentenced_image_view.Tick(); + ++frame_tick; +} + +template <class P> +const typename P::ImageView& TextureCache<P>::GetImageView(ImageViewId id) const noexcept { + return slot_image_views[id]; +} + +template <class P> +typename P::ImageView& TextureCache<P>::GetImageView(ImageViewId id) noexcept { + return slot_image_views[id]; +} + +template <class P> +void TextureCache<P>::FillGraphicsImageViews(std::span<const u32> indices, + std::span<ImageViewId> image_view_ids) { + FillImageViews(graphics_image_table, graphics_image_view_ids, indices, image_view_ids); +} + +template <class P> +void TextureCache<P>::FillComputeImageViews(std::span<const u32> indices, + std::span<ImageViewId> image_view_ids) { + FillImageViews(compute_image_table, compute_image_view_ids, indices, image_view_ids); +} + +template <class P> +typename P::Sampler* TextureCache<P>::GetGraphicsSampler(u32 index) { + [[unlikely]] if (index > graphics_sampler_table.Limit()) { + LOG_ERROR(HW_GPU, "Invalid sampler index={}", index); + return &slot_samplers[NULL_SAMPLER_ID]; + } + const auto [descriptor, is_new] = graphics_sampler_table.Read(index); + SamplerId& id = graphics_sampler_ids[index]; + [[unlikely]] if (is_new) { + id = FindSampler(descriptor); } + return &slot_samplers[id]; +} - void Register(TSurface surface) { - const GPUVAddr gpu_addr = surface->GetGpuAddr(); - const std::size_t size = surface->GetSizeInBytes(); - const std::optional<VAddr> cpu_addr = - system.GPU().MemoryManager().GpuToCpuAddress(gpu_addr); - if (!cpu_addr) { - LOG_CRITICAL(HW_GPU, "Failed to register surface with unmapped gpu_address 0x{:016x}", - gpu_addr); - return; - } - surface->SetCpuAddr(*cpu_addr); - RegisterInnerCache(surface); - surface->MarkAsRegistered(true); - surface->SetMemoryMarked(true); - rasterizer.UpdatePagesCachedCount(*cpu_addr, size, 1); +template <class P> +typename P::Sampler* TextureCache<P>::GetComputeSampler(u32 index) { + [[unlikely]] if (index > compute_sampler_table.Limit()) { + LOG_ERROR(HW_GPU, "Invalid sampler index={}", index); + return &slot_samplers[NULL_SAMPLER_ID]; } + const auto [descriptor, is_new] = compute_sampler_table.Read(index); + SamplerId& id = compute_sampler_ids[index]; + [[unlikely]] if (is_new) { + id = FindSampler(descriptor); + } + return &slot_samplers[id]; +} - void UnmarkMemory(TSurface surface) { - if (!surface->IsMemoryMarked()) { - return; - } - const std::size_t size = surface->GetSizeInBytes(); - const VAddr cpu_addr = surface->GetCpuAddr(); - rasterizer.UpdatePagesCachedCount(cpu_addr, size, -1); - surface->SetMemoryMarked(false); +template <class P> +void TextureCache<P>::SynchronizeGraphicsDescriptors() { + using SamplerIndex = Tegra::Engines::Maxwell3D::Regs::SamplerIndex; + const bool linked_tsc = maxwell3d.regs.sampler_index == SamplerIndex::ViaHeaderIndex; + const u32 tic_limit = maxwell3d.regs.tic.limit; + const u32 tsc_limit = linked_tsc ? tic_limit : maxwell3d.regs.tsc.limit; + if (graphics_sampler_table.Synchornize(maxwell3d.regs.tsc.Address(), tsc_limit)) { + graphics_sampler_ids.resize(tsc_limit + 1, CORRUPT_ID); + } + if (graphics_image_table.Synchornize(maxwell3d.regs.tic.Address(), tic_limit)) { + graphics_image_view_ids.resize(tic_limit + 1, CORRUPT_ID); } +} - void Unregister(TSurface surface) { - if (guard_render_targets && surface->IsProtected()) { - return; - } - if (!guard_render_targets && surface->IsRenderTarget()) { - ManageRenderTargetUnregister(surface); - } - UnmarkMemory(surface); - if (surface->IsSyncPending()) { - marked_for_unregister.remove(surface); - surface->SetSyncPending(false); - } - UnregisterInnerCache(surface); - surface->MarkAsRegistered(false); - ReserveSurface(surface->GetSurfaceParams(), surface); +template <class P> +void TextureCache<P>::SynchronizeComputeDescriptors() { + const bool linked_tsc = kepler_compute.launch_description.linked_tsc; + const u32 tic_limit = kepler_compute.regs.tic.limit; + const u32 tsc_limit = linked_tsc ? tic_limit : kepler_compute.regs.tsc.limit; + const GPUVAddr tsc_gpu_addr = kepler_compute.regs.tsc.Address(); + if (compute_sampler_table.Synchornize(tsc_gpu_addr, tsc_limit)) { + compute_sampler_ids.resize(tsc_limit + 1, CORRUPT_ID); + } + if (compute_image_table.Synchornize(kepler_compute.regs.tic.Address(), tic_limit)) { + compute_image_view_ids.resize(tic_limit + 1, CORRUPT_ID); } +} - TSurface GetUncachedSurface(const GPUVAddr gpu_addr, const SurfaceParams& params) { - if (const auto surface = TryGetReservedSurface(params); surface) { - surface->SetGpuAddr(gpu_addr); - return surface; - } - // No reserved surface available, create a new one and reserve it - auto new_surface{CreateSurface(gpu_addr, params)}; - return new_surface; +template <class P> +void TextureCache<P>::UpdateRenderTargets(bool is_clear) { + using namespace VideoCommon::Dirty; + auto& flags = maxwell3d.dirty.flags; + if (!flags[Dirty::RenderTargets]) { + return; } + flags[Dirty::RenderTargets] = false; - Core::System& system; - const bool is_astc_supported; + // Render target control is used on all render targets, so force look ups when this one is up + const bool force = flags[Dirty::RenderTargetControl]; + flags[Dirty::RenderTargetControl] = false; -private: - enum class RecycleStrategy : u32 { - Ignore = 0, - Flush = 1, - BufferCopy = 3, - }; + for (size_t index = 0; index < NUM_RT; ++index) { + ImageViewId& color_buffer_id = render_targets.color_buffer_ids[index]; + if (flags[Dirty::ColorBuffer0 + index] || force) { + flags[Dirty::ColorBuffer0 + index] = false; + BindRenderTarget(&color_buffer_id, FindColorBuffer(index, is_clear)); + } + PrepareImageView(color_buffer_id, true, is_clear && IsFullClear(color_buffer_id)); + } + if (flags[Dirty::ZetaBuffer] || force) { + flags[Dirty::ZetaBuffer] = false; + BindRenderTarget(&render_targets.depth_buffer_id, FindDepthBuffer(is_clear)); + } + const ImageViewId depth_buffer_id = render_targets.depth_buffer_id; + PrepareImageView(depth_buffer_id, true, is_clear && IsFullClear(depth_buffer_id)); - enum class DeductionType : u32 { - DeductionComplete, - DeductionIncomplete, - DeductionFailed, + for (size_t index = 0; index < NUM_RT; ++index) { + render_targets.draw_buffers[index] = static_cast<u8>(maxwell3d.regs.rt_control.Map(index)); + } + render_targets.size = Extent2D{ + maxwell3d.regs.render_area.width, + maxwell3d.regs.render_area.height, }; +} - struct Deduction { - DeductionType type{DeductionType::DeductionFailed}; - TSurface surface{}; +template <class P> +typename P::Framebuffer* TextureCache<P>::GetFramebuffer() { + return &slot_framebuffers[GetFramebufferId(render_targets)]; +} - bool Failed() const { - return type == DeductionType::DeductionFailed; - } +template <class P> +void TextureCache<P>::FillImageViews(DescriptorTable<TICEntry>& table, + std::span<ImageViewId> cached_image_view_ids, + std::span<const u32> indices, + std::span<ImageViewId> image_view_ids) { + ASSERT(indices.size() <= image_view_ids.size()); + do { + has_deleted_images = false; + std::ranges::transform(indices, image_view_ids.begin(), [&](u32 index) { + return VisitImageView(table, cached_image_view_ids, index); + }); + } while (has_deleted_images); +} - bool Incomplete() const { - return type == DeductionType::DeductionIncomplete; - } +template <class P> +ImageViewId TextureCache<P>::VisitImageView(DescriptorTable<TICEntry>& table, + std::span<ImageViewId> cached_image_view_ids, + u32 index) { + if (index > table.Limit()) { + LOG_ERROR(HW_GPU, "Invalid image view index={}", index); + return NULL_IMAGE_VIEW_ID; + } + const auto [descriptor, is_new] = table.Read(index); + ImageViewId& image_view_id = cached_image_view_ids[index]; + if (is_new) { + image_view_id = FindImageView(descriptor); + } + if (image_view_id != NULL_IMAGE_VIEW_ID) { + PrepareImageView(image_view_id, false, false); + } + return image_view_id; +} - bool IsDepth() const { - return surface->GetSurfaceParams().IsPixelFormatZeta(); - } - }; +template <class P> +FramebufferId TextureCache<P>::GetFramebufferId(const RenderTargets& key) { + const auto [pair, is_new] = framebuffers.try_emplace(key); + FramebufferId& framebuffer_id = pair->second; + if (!is_new) { + return framebuffer_id; + } + std::array<ImageView*, NUM_RT> color_buffers; + std::ranges::transform(key.color_buffer_ids, color_buffers.begin(), + [this](ImageViewId id) { return id ? &slot_image_views[id] : nullptr; }); + ImageView* const depth_buffer = + key.depth_buffer_id ? &slot_image_views[key.depth_buffer_id] : nullptr; + framebuffer_id = slot_framebuffers.insert(runtime, color_buffers, depth_buffer, key); + return framebuffer_id; +} - /** - * Takes care of selecting a proper strategy to deal with a texture recycle. - * - * @param overlaps The overlapping surfaces registered in the cache. - * @param params The parameters on the new surface. - * @param gpu_addr The starting address of the new surface. - * @param untopological Indicates to the recycler that the texture has no way - * to match the overlaps due to topological reasons. - **/ - RecycleStrategy PickStrategy(VectorSurface& overlaps, const SurfaceParams& params, - const GPUVAddr gpu_addr, const MatchTopologyResult untopological) { - if (Settings::IsGPULevelExtreme()) { - return RecycleStrategy::Flush; - } - // 3D Textures decision - if (params.target == SurfaceTarget::Texture3D) { - return RecycleStrategy::Flush; - } - for (const auto& s : overlaps) { - const auto& s_params = s->GetSurfaceParams(); - if (s_params.target == SurfaceTarget::Texture3D) { - return RecycleStrategy::Flush; - } - } - // Untopological decision - if (untopological == MatchTopologyResult::CompressUnmatch) { - return RecycleStrategy::Flush; - } - if (untopological == MatchTopologyResult::FullMatch && !params.is_tiled) { - return RecycleStrategy::Flush; - } - return RecycleStrategy::Ignore; - } - - /** - * Used to decide what to do with textures we can't resolve in the cache It has 2 implemented - * strategies: Ignore and Flush. - * - * - Ignore: Just unregisters all the overlaps and loads the new texture. - * - Flush: Flushes all the overlaps into memory and loads the new surface from that data. - * - * @param overlaps The overlapping surfaces registered in the cache. - * @param params The parameters for the new surface. - * @param gpu_addr The starting address of the new surface. - * @param preserve_contents Indicates that the new surface should be loaded from memory or left - * blank. - * @param untopological Indicates to the recycler that the texture has no way to match the - * overlaps due to topological reasons. - **/ - std::pair<TSurface, TView> RecycleSurface(VectorSurface& overlaps, const SurfaceParams& params, - const GPUVAddr gpu_addr, const bool preserve_contents, - const MatchTopologyResult untopological) { - const bool do_load = preserve_contents && Settings::IsGPULevelExtreme(); - for (auto& surface : overlaps) { - Unregister(surface); - } - switch (PickStrategy(overlaps, params, gpu_addr, untopological)) { - case RecycleStrategy::Ignore: { - return InitializeSurface(gpu_addr, params, do_load); - } - case RecycleStrategy::Flush: { - std::sort(overlaps.begin(), overlaps.end(), - [](const TSurface& a, const TSurface& b) -> bool { - return a->GetModificationTick() < b->GetModificationTick(); - }); - for (auto& surface : overlaps) { - FlushSurface(surface); - } - return InitializeSurface(gpu_addr, params, preserve_contents); +template <class P> +void TextureCache<P>::WriteMemory(VAddr cpu_addr, size_t size) { + ForEachImageInRegion(cpu_addr, size, [this](ImageId image_id, Image& image) { + if (True(image.flags & ImageFlagBits::CpuModified)) { + return; } - case RecycleStrategy::BufferCopy: { - auto new_surface = GetUncachedSurface(gpu_addr, params); - BufferCopy(overlaps[0], new_surface); - return {new_surface, new_surface->GetMainView()}; + image.flags |= ImageFlagBits::CpuModified; + UntrackImage(image); + }); +} + +template <class P> +void TextureCache<P>::DownloadMemory(VAddr cpu_addr, size_t size) { + std::vector<ImageId> images; + ForEachImageInRegion(cpu_addr, size, [this, &images](ImageId image_id, ImageBase& image) { + // Skip images that were not modified from the GPU + if (False(image.flags & ImageFlagBits::GpuModified)) { + return; } - default: { - UNIMPLEMENTED_MSG("Unimplemented Texture Cache Recycling Strategy!"); - return InitializeSurface(gpu_addr, params, do_load); + // Skip images that .are. modified from the CPU + // We don't want to write sensitive data from the guest + if (True(image.flags & ImageFlagBits::CpuModified)) { + return; } + if (image.info.num_samples > 1) { + LOG_WARNING(HW_GPU, "MSAA image downloads are not implemented"); + return; } + image.flags &= ~ImageFlagBits::GpuModified; + images.push_back(image_id); + }); + if (images.empty()) { + return; } + std::ranges::sort(images, [this](ImageId lhs, ImageId rhs) { + return slot_images[lhs].modification_tick < slot_images[rhs].modification_tick; + }); + for (const ImageId image_id : images) { + Image& image = slot_images[image_id]; + auto map = runtime.DownloadStagingBuffer(image.unswizzled_size_bytes); + const auto copies = FullDownloadCopies(image.info); + image.DownloadMemory(map, copies); + runtime.Finish(); + SwizzleImage(gpu_memory, image.gpu_addr, image.info, copies, map.mapped_span); + } +} - /** - * Takes a single surface and recreates into another that may differ in - * format, target or width alignment. - * - * @param current_surface The registered surface in the cache which we want to convert. - * @param params The new surface params which we'll use to recreate the surface. - * @param is_render Whether or not the surface is a render target. - **/ - std::pair<TSurface, TView> RebuildSurface(TSurface current_surface, const SurfaceParams& params, - bool is_render) { - const auto gpu_addr = current_surface->GetGpuAddr(); - const auto& cr_params = current_surface->GetSurfaceParams(); - TSurface new_surface; - if (cr_params.pixel_format != params.pixel_format && !is_render && - GetSiblingFormat(cr_params.pixel_format) == params.pixel_format) { - SurfaceParams new_params = params; - new_params.pixel_format = cr_params.pixel_format; - new_params.type = cr_params.type; - new_surface = GetUncachedSurface(gpu_addr, new_params); - } else { - new_surface = GetUncachedSurface(gpu_addr, params); - } - const SurfaceParams& final_params = new_surface->GetSurfaceParams(); - if (cr_params.type != final_params.type) { - if (Settings::IsGPULevelExtreme()) { - BufferCopy(current_surface, new_surface); - } - } else { - std::vector<CopyParams> bricks = current_surface->BreakDown(final_params); - for (auto& brick : bricks) { - TryCopyImage(current_surface, new_surface, brick); - } - } - Unregister(current_surface); - Register(new_surface); - new_surface->MarkAsModified(current_surface->IsModified(), Tick()); - return {new_surface, new_surface->GetMainView()}; - } - - /** - * Takes a single surface and checks with the new surface's params if it's an exact - * match, we return the main view of the registered surface. If its formats don't - * match, we rebuild the surface. We call this last method a `Mirage`. If formats - * match but the targets don't, we create an overview View of the registered surface. - * - * @param current_surface The registered surface in the cache which we want to convert. - * @param params The new surface params which we want to check. - * @param is_render Whether or not the surface is a render target. - **/ - std::pair<TSurface, TView> ManageStructuralMatch(TSurface current_surface, - const SurfaceParams& params, bool is_render) { - const bool is_mirage = !current_surface->MatchFormat(params.pixel_format); - const bool matches_target = current_surface->MatchTarget(params.target); - const auto match_check = [&]() -> std::pair<TSurface, TView> { - if (matches_target) { - return {current_surface, current_surface->GetMainView()}; - } - return {current_surface, current_surface->EmplaceOverview(params)}; - }; - if (!is_mirage) { - return match_check(); - } - if (!is_render && GetSiblingFormat(current_surface->GetFormat()) == params.pixel_format) { - return match_check(); - } - return RebuildSurface(current_surface, params, is_render); - } - - /** - * Unlike RebuildSurface where we know whether or not registered surfaces match the candidate - * in some way, we have no guarantees here. We try to see if the overlaps are sublayers/mipmaps - * of the new surface, if they all match we end up recreating a surface for them, - * else we return nothing. - * - * @param overlaps The overlapping surfaces registered in the cache. - * @param params The parameters on the new surface. - * @param gpu_addr The starting address of the new surface. - **/ - std::optional<std::pair<TSurface, TView>> TryReconstructSurface(VectorSurface& overlaps, - const SurfaceParams& params, - GPUVAddr gpu_addr) { - if (params.target == SurfaceTarget::Texture3D) { - return std::nullopt; - } - const auto test_modified = [](TSurface& surface) { return surface->IsModified(); }; - TSurface new_surface = GetUncachedSurface(gpu_addr, params); +template <class P> +void TextureCache<P>::UnmapMemory(VAddr cpu_addr, size_t size) { + std::vector<ImageId> deleted_images; + ForEachImageInRegion(cpu_addr, size, [&](ImageId id, Image&) { deleted_images.push_back(id); }); + for (const ImageId id : deleted_images) { + Image& image = slot_images[id]; + if (True(image.flags & ImageFlagBits::Tracked)) { + UntrackImage(image); + } + UnregisterImage(id); + DeleteImage(id); + } +} - if (std::none_of(overlaps.begin(), overlaps.end(), test_modified)) { - LoadSurface(new_surface); - for (const auto& surface : overlaps) { - Unregister(surface); - } - Register(new_surface); - return {{new_surface, new_surface->GetMainView()}}; - } +template <class P> +void TextureCache<P>::BlitImage(const Tegra::Engines::Fermi2D::Surface& dst, + const Tegra::Engines::Fermi2D::Surface& src, + const Tegra::Engines::Fermi2D::Config& copy) { + const BlitImages images = GetBlitImages(dst, src); + const ImageId dst_id = images.dst_id; + const ImageId src_id = images.src_id; + PrepareImage(src_id, false, false); + PrepareImage(dst_id, true, false); + + ImageBase& dst_image = slot_images[dst_id]; + const ImageBase& src_image = slot_images[src_id]; + + // TODO: Deduplicate + const std::optional dst_base = dst_image.TryFindBase(dst.Address()); + const SubresourceRange dst_range{.base = dst_base.value(), .extent = {1, 1}}; + const ImageViewInfo dst_view_info(ImageViewType::e2D, images.dst_format, dst_range); + const auto [dst_framebuffer_id, dst_view_id] = RenderTargetFromImage(dst_id, dst_view_info); + const auto [src_samples_x, src_samples_y] = SamplesLog2(src_image.info.num_samples); + const std::array src_region{ + Offset2D{.x = copy.src_x0 >> src_samples_x, .y = copy.src_y0 >> src_samples_y}, + Offset2D{.x = copy.src_x1 >> src_samples_x, .y = copy.src_y1 >> src_samples_y}, + }; - std::size_t passed_tests = 0; - for (auto& surface : overlaps) { - const SurfaceParams& src_params = surface->GetSurfaceParams(); - const auto mipmap_layer{new_surface->GetLayerMipmap(surface->GetGpuAddr())}; - if (!mipmap_layer) { - continue; - } - const auto [base_layer, base_mipmap] = *mipmap_layer; - if (new_surface->GetMipmapSize(base_mipmap) != surface->GetMipmapSize(0)) { - continue; - } - ++passed_tests; - - // Copy all mipmaps and layers - const u32 block_width = params.GetDefaultBlockWidth(); - const u32 block_height = params.GetDefaultBlockHeight(); - for (u32 mipmap = base_mipmap; mipmap < base_mipmap + src_params.num_levels; ++mipmap) { - const u32 width = SurfaceParams::IntersectWidth(src_params, params, 0, mipmap); - const u32 height = SurfaceParams::IntersectHeight(src_params, params, 0, mipmap); - if (width < block_width || height < block_height) { - // Current APIs forbid copying small compressed textures, avoid errors - break; - } - const CopyParams copy_params(0, 0, 0, 0, 0, base_layer, 0, mipmap, width, height, - src_params.depth); - TryCopyImage(surface, new_surface, copy_params); - } - } - if (passed_tests == 0) { - return std::nullopt; - } - if (Settings::IsGPULevelExtreme() && passed_tests != overlaps.size()) { - // In Accurate GPU all tests should pass, else we recycle - return std::nullopt; - } + const std::optional src_base = src_image.TryFindBase(src.Address()); + const SubresourceRange src_range{.base = src_base.value(), .extent = {1, 1}}; + const ImageViewInfo src_view_info(ImageViewType::e2D, images.src_format, src_range); + const auto [src_framebuffer_id, src_view_id] = RenderTargetFromImage(src_id, src_view_info); + const auto [dst_samples_x, dst_samples_y] = SamplesLog2(dst_image.info.num_samples); + const std::array dst_region{ + Offset2D{.x = copy.dst_x0 >> dst_samples_x, .y = copy.dst_y0 >> dst_samples_y}, + Offset2D{.x = copy.dst_x1 >> dst_samples_x, .y = copy.dst_y1 >> dst_samples_y}, + }; - const bool modified = std::any_of(overlaps.begin(), overlaps.end(), test_modified); - for (const auto& surface : overlaps) { - Unregister(surface); - } + // Always call this after src_framebuffer_id was queried, as the address might be invalidated. + Framebuffer* const dst_framebuffer = &slot_framebuffers[dst_framebuffer_id]; + if constexpr (FRAMEBUFFER_BLITS) { + // OpenGL blits from framebuffers, not images + Framebuffer* const src_framebuffer = &slot_framebuffers[src_framebuffer_id]; + runtime.BlitFramebuffer(dst_framebuffer, src_framebuffer, dst_region, src_region, + copy.filter, copy.operation); + } else { + // Vulkan can blit images, but it lacks format reinterpretations + // Provide a framebuffer in case it's necessary + ImageView& dst_view = slot_image_views[dst_view_id]; + ImageView& src_view = slot_image_views[src_view_id]; + runtime.BlitImage(dst_framebuffer, dst_view, src_view, dst_region, src_region, copy.filter, + copy.operation); + } +} - new_surface->MarkAsModified(modified, Tick()); - Register(new_surface); - return {{new_surface, new_surface->GetMainView()}}; - } - - /** - * Takes care of managing 3D textures and its slices. Does HLE methods for reconstructing the 3D - * textures within the GPU if possible. Falls back to LLE when it isn't possible to use any of - * the HLE methods. - * - * @param overlaps The overlapping surfaces registered in the cache. - * @param params The parameters on the new surface. - * @param gpu_addr The starting address of the new surface. - * @param cpu_addr The starting address of the new surface on physical memory. - * @param preserve_contents Indicates that the new surface should be loaded from memory or - * left blank. - */ - std::optional<std::pair<TSurface, TView>> Manage3DSurfaces(VectorSurface& overlaps, - const SurfaceParams& params, - GPUVAddr gpu_addr, VAddr cpu_addr, - bool preserve_contents) { - if (params.target != SurfaceTarget::Texture3D) { - for (const auto& surface : overlaps) { - if (!surface->MatchTarget(params.target)) { - if (overlaps.size() == 1 && surface->GetCpuAddr() == cpu_addr) { - if (Settings::IsGPULevelExtreme()) { - return std::nullopt; - } - Unregister(surface); - return InitializeSurface(gpu_addr, params, preserve_contents); - } - return std::nullopt; - } - if (surface->GetCpuAddr() != cpu_addr) { - continue; - } - if (surface->MatchesStructure(params) == MatchStructureResult::FullMatch) { - return std::make_pair(surface, surface->GetMainView()); - } - } - return InitializeSurface(gpu_addr, params, preserve_contents); - } +template <class P> +void TextureCache<P>::InvalidateColorBuffer(size_t index) { + ImageViewId& color_buffer_id = render_targets.color_buffer_ids[index]; + color_buffer_id = FindColorBuffer(index, false); + if (!color_buffer_id) { + LOG_ERROR(HW_GPU, "Invalidating invalid color buffer in index={}", index); + return; + } + // When invalidating a color buffer, the old contents are no longer relevant + ImageView& color_buffer = slot_image_views[color_buffer_id]; + Image& image = slot_images[color_buffer.image_id]; + image.flags &= ~ImageFlagBits::CpuModified; + image.flags &= ~ImageFlagBits::GpuModified; - if (params.num_levels > 1) { - // We can't handle mipmaps in 3D textures yet, better fallback to LLE approach - return std::nullopt; - } + runtime.InvalidateColorBuffer(color_buffer, index); +} - if (overlaps.size() == 1) { - const auto& surface = overlaps[0]; - const SurfaceParams& overlap_params = surface->GetSurfaceParams(); - // Don't attempt to render to textures with more than one level for now - // The texture has to be to the right or the sample address if we want to render to it - if (overlap_params.num_levels == 1 && cpu_addr >= surface->GetCpuAddr()) { - const u32 offset = static_cast<u32>(cpu_addr - surface->GetCpuAddr()); - const u32 slice = std::get<2>(params.GetBlockOffsetXYZ(offset)); - if (slice < overlap_params.depth) { - auto view = surface->Emplace3DView(slice, params.depth, 0, 1); - return std::make_pair(std::move(surface), std::move(view)); - } - } - } +template <class P> +void TextureCache<P>::InvalidateDepthBuffer() { + ImageViewId& depth_buffer_id = render_targets.depth_buffer_id; + depth_buffer_id = FindDepthBuffer(false); + if (!depth_buffer_id) { + LOG_ERROR(HW_GPU, "Invalidating invalid depth buffer"); + return; + } + // When invalidating the depth buffer, the old contents are no longer relevant + ImageBase& image = slot_images[slot_image_views[depth_buffer_id].image_id]; + image.flags &= ~ImageFlagBits::CpuModified; + image.flags &= ~ImageFlagBits::GpuModified; - TSurface new_surface = GetUncachedSurface(gpu_addr, params); - bool modified = false; + ImageView& depth_buffer = slot_image_views[depth_buffer_id]; + runtime.InvalidateDepthBuffer(depth_buffer); +} - for (auto& surface : overlaps) { - const SurfaceParams& src_params = surface->GetSurfaceParams(); - if (src_params.target != SurfaceTarget::Texture2D || - src_params.height != params.height || - src_params.block_depth != params.block_depth || - src_params.block_height != params.block_height) { - return std::nullopt; - } - modified |= surface->IsModified(); - - const u32 offset = static_cast<u32>(surface->GetCpuAddr() - cpu_addr); - const u32 slice = std::get<2>(params.GetBlockOffsetXYZ(offset)); - const u32 width = params.width; - const u32 height = params.height; - const CopyParams copy_params(0, 0, 0, 0, 0, slice, 0, 0, width, height, 1); - TryCopyImage(surface, new_surface, copy_params); +template <class P> +typename P::ImageView* TextureCache<P>::TryFindFramebufferImageView(VAddr cpu_addr) { + // TODO: Properly implement this + const auto it = page_table.find(cpu_addr >> PAGE_BITS); + if (it == page_table.end()) { + return nullptr; + } + const auto& image_ids = it->second; + for (const ImageId image_id : image_ids) { + const ImageBase& image = slot_images[image_id]; + if (image.cpu_addr != cpu_addr) { + continue; } - for (const auto& surface : overlaps) { - Unregister(surface); + if (image.image_view_ids.empty()) { + continue; } - new_surface->MarkAsModified(modified, Tick()); - Register(new_surface); - - TView view = new_surface->GetMainView(); - return std::make_pair(std::move(new_surface), std::move(view)); - } - - /** - * Gets the starting address and parameters of a candidate surface and tries - * to find a matching surface within the cache. This is done in 3 big steps: - * - * 1. Check the 1st Level Cache in order to find an exact match, if we fail, we move to step 2. - * - * 2. Check if there are any overlaps at all, if there are none, we just load the texture from - * memory else we move to step 3. - * - * 3. Consists of figuring out the relationship between the candidate texture and the - * overlaps. We divide the scenarios depending if there's 1 or many overlaps. If - * there's many, we just try to reconstruct a new surface out of them based on the - * candidate's parameters, if we fail, we recycle. When there's only 1 overlap then we - * have to check if the candidate is a view (layer/mipmap) of the overlap or if the - * registered surface is a mipmap/layer of the candidate. In this last case we reconstruct - * a new surface. - * - * @param gpu_addr The starting address of the candidate surface. - * @param params The parameters on the candidate surface. - * @param preserve_contents Indicates that the new surface should be loaded from memory or - * left blank. - * @param is_render Whether or not the surface is a render target. - **/ - std::pair<TSurface, TView> GetSurface(const GPUVAddr gpu_addr, const VAddr cpu_addr, - const SurfaceParams& params, bool preserve_contents, - bool is_render) { - // Step 1 - // Check Level 1 Cache for a fast structural match. If candidate surface - // matches at certain level we are pretty much done. - if (const auto iter = l1_cache.find(cpu_addr); iter != l1_cache.end()) { - TSurface& current_surface = iter->second; - const auto topological_result = current_surface->MatchesTopology(params); - if (topological_result != MatchTopologyResult::FullMatch) { - VectorSurface overlaps{current_surface}; - return RecycleSurface(overlaps, params, gpu_addr, preserve_contents, - topological_result); - } + return &slot_image_views[image.image_view_ids.at(0)]; + } + return nullptr; +} - const auto struct_result = current_surface->MatchesStructure(params); - if (struct_result != MatchStructureResult::None) { - const auto& old_params = current_surface->GetSurfaceParams(); - const bool not_3d = params.target != SurfaceTarget::Texture3D && - old_params.target != SurfaceTarget::Texture3D; - if (not_3d || current_surface->MatchTarget(params.target)) { - if (struct_result == MatchStructureResult::FullMatch) { - return ManageStructuralMatch(current_surface, params, is_render); - } else { - return RebuildSurface(current_surface, params, is_render); - } - } - } - } +template <class P> +bool TextureCache<P>::HasUncommittedFlushes() const noexcept { + return !uncommitted_downloads.empty(); +} - // Step 2 - // Obtain all possible overlaps in the memory region - const std::size_t candidate_size = params.GetGuestSizeInBytes(); - auto overlaps{GetSurfacesInRegion(cpu_addr, candidate_size)}; +template <class P> +bool TextureCache<P>::ShouldWaitAsyncFlushes() const noexcept { + return !committed_downloads.empty() && !committed_downloads.front().empty(); +} - // If none are found, we are done. we just load the surface and create it. - if (overlaps.empty()) { - return InitializeSurface(gpu_addr, params, preserve_contents); - } +template <class P> +void TextureCache<P>::CommitAsyncFlushes() { + // This is intentionally passing the value by copy + committed_downloads.push(uncommitted_downloads); + uncommitted_downloads.clear(); +} - // Step 3 - // Now we need to figure the relationship between the texture and its overlaps - // we do a topological test to ensure we can find some relationship. If it fails - // immediately recycle the texture - for (const auto& surface : overlaps) { - const auto topological_result = surface->MatchesTopology(params); - if (topological_result != MatchTopologyResult::FullMatch) { - return RecycleSurface(overlaps, params, gpu_addr, preserve_contents, - topological_result); - } - } +template <class P> +void TextureCache<P>::PopAsyncFlushes() { + if (committed_downloads.empty()) { + return; + } + const std::span<const ImageId> download_ids = committed_downloads.front(); + if (download_ids.empty()) { + committed_downloads.pop(); + return; + } + size_t total_size_bytes = 0; + for (const ImageId image_id : download_ids) { + total_size_bytes += slot_images[image_id].unswizzled_size_bytes; + } + auto download_map = runtime.DownloadStagingBuffer(total_size_bytes); + const size_t original_offset = download_map.offset; + for (const ImageId image_id : download_ids) { + Image& image = slot_images[image_id]; + const auto copies = FullDownloadCopies(image.info); + image.DownloadMemory(download_map, copies); + download_map.offset += image.unswizzled_size_bytes; + } + // Wait for downloads to finish + runtime.Finish(); + + download_map.offset = original_offset; + std::span<u8> download_span = download_map.mapped_span; + for (const ImageId image_id : download_ids) { + const ImageBase& image = slot_images[image_id]; + const auto copies = FullDownloadCopies(image.info); + SwizzleImage(gpu_memory, image.gpu_addr, image.info, copies, download_span); + download_map.offset += image.unswizzled_size_bytes; + download_span = download_span.subspan(image.unswizzled_size_bytes); + } + committed_downloads.pop(); +} - // Manage 3D textures - if (params.block_depth > 0) { - auto surface = - Manage3DSurfaces(overlaps, params, gpu_addr, cpu_addr, preserve_contents); - if (surface) { - return *surface; - } +template <class P> +bool TextureCache<P>::IsRegionGpuModified(VAddr addr, size_t size) { + bool is_modified = false; + ForEachImageInRegion(addr, size, [&is_modified](ImageId, ImageBase& image) { + if (False(image.flags & ImageFlagBits::GpuModified)) { + return false; } + is_modified = true; + return true; + }); + return is_modified; +} - // Split cases between 1 overlap or many. - if (overlaps.size() == 1) { - TSurface current_surface = overlaps[0]; - // First check if the surface is within the overlap. If not, it means - // two things either the candidate surface is a supertexture of the overlap - // or they don't match in any known way. - if (!current_surface->IsInside(gpu_addr, gpu_addr + candidate_size)) { - const std::optional view = TryReconstructSurface(overlaps, params, gpu_addr); - if (view) { - return *view; - } - return RecycleSurface(overlaps, params, gpu_addr, preserve_contents, - MatchTopologyResult::FullMatch); - } - // Now we check if the candidate is a mipmap/layer of the overlap - std::optional<TView> view = - current_surface->EmplaceView(params, gpu_addr, candidate_size); - if (view) { - const bool is_mirage = !current_surface->MatchFormat(params.pixel_format); - if (is_mirage) { - // On a mirage view, we need to recreate the surface under this new view - // and then obtain a view again. - SurfaceParams new_params = current_surface->GetSurfaceParams(); - const u32 wh = SurfaceParams::ConvertWidth( - new_params.width, new_params.pixel_format, params.pixel_format); - const u32 hh = SurfaceParams::ConvertHeight( - new_params.height, new_params.pixel_format, params.pixel_format); - new_params.width = wh; - new_params.height = hh; - new_params.pixel_format = params.pixel_format; - std::pair<TSurface, TView> pair = - RebuildSurface(current_surface, new_params, is_render); - std::optional<TView> mirage_view = - pair.first->EmplaceView(params, gpu_addr, candidate_size); - if (mirage_view) - return {pair.first, *mirage_view}; - return RecycleSurface(overlaps, params, gpu_addr, preserve_contents, - MatchTopologyResult::FullMatch); - } - return {current_surface, *view}; - } - } else { - // If there are many overlaps, odds are they are subtextures of the candidate - // surface. We try to construct a new surface based on the candidate parameters, - // using the overlaps. If a single overlap fails, this will fail. - std::optional<std::pair<TSurface, TView>> view = - TryReconstructSurface(overlaps, params, gpu_addr); - if (view) { - return *view; - } - } - // We failed all the tests, recycle the overlaps into a new texture. - return RecycleSurface(overlaps, params, gpu_addr, preserve_contents, - MatchTopologyResult::FullMatch); - } - - /** - * Gets the starting address and parameters of a candidate surface and tries to find a - * matching surface within the cache that's similar to it. If there are many textures - * or the texture found if entirely incompatible, it will fail. If no texture is found, the - * blit will be unsuccessful. - * - * @param gpu_addr The starting address of the candidate surface. - * @param params The parameters on the candidate surface. - **/ - Deduction DeduceSurface(const GPUVAddr gpu_addr, const SurfaceParams& params) { - const std::optional<VAddr> cpu_addr = - system.GPU().MemoryManager().GpuToCpuAddress(gpu_addr); - - if (!cpu_addr) { - Deduction result{}; - result.type = DeductionType::DeductionFailed; - return result; - } +template <class P> +void TextureCache<P>::RefreshContents(Image& image) { + if (False(image.flags & ImageFlagBits::CpuModified)) { + // Only upload modified images + return; + } + image.flags &= ~ImageFlagBits::CpuModified; + TrackImage(image); - if (const auto iter = l1_cache.find(*cpu_addr); iter != l1_cache.end()) { - TSurface& current_surface = iter->second; - const auto topological_result = current_surface->MatchesTopology(params); - if (topological_result != MatchTopologyResult::FullMatch) { - Deduction result{}; - result.type = DeductionType::DeductionFailed; - return result; - } - const auto struct_result = current_surface->MatchesStructure(params); - if (struct_result != MatchStructureResult::None && - current_surface->MatchTarget(params.target)) { - Deduction result{}; - result.type = DeductionType::DeductionComplete; - result.surface = current_surface; - return result; - } - } + if (image.info.num_samples > 1) { + LOG_WARNING(HW_GPU, "MSAA image uploads are not implemented"); + return; + } + auto staging = runtime.UploadStagingBuffer(MapSizeBytes(image)); + UploadImageContents(image, staging); + runtime.InsertUploadMemoryBarrier(); +} - const std::size_t candidate_size = params.GetGuestSizeInBytes(); - auto overlaps{GetSurfacesInRegion(*cpu_addr, candidate_size)}; +template <class P> +template <typename StagingBuffer> +void TextureCache<P>::UploadImageContents(Image& image, StagingBuffer& staging) { + const std::span<u8> mapped_span = staging.mapped_span; + const GPUVAddr gpu_addr = image.gpu_addr; + + if (True(image.flags & ImageFlagBits::AcceleratedUpload)) { + gpu_memory.ReadBlockUnsafe(gpu_addr, mapped_span.data(), mapped_span.size_bytes()); + const auto uploads = FullUploadSwizzles(image.info); + runtime.AccelerateImageUpload(image, staging, uploads); + } else if (True(image.flags & ImageFlagBits::Converted)) { + std::vector<u8> unswizzled_data(image.unswizzled_size_bytes); + auto copies = UnswizzleImage(gpu_memory, gpu_addr, image.info, unswizzled_data); + ConvertImage(unswizzled_data, image.info, mapped_span, copies); + image.UploadMemory(staging, copies); + } else if (image.info.type == ImageType::Buffer) { + const std::array copies{UploadBufferCopy(gpu_memory, gpu_addr, image, mapped_span)}; + image.UploadMemory(staging, copies); + } else { + const auto copies = UnswizzleImage(gpu_memory, gpu_addr, image.info, mapped_span); + image.UploadMemory(staging, copies); + } +} - if (overlaps.empty()) { - Deduction result{}; - result.type = DeductionType::DeductionIncomplete; - return result; - } +template <class P> +ImageViewId TextureCache<P>::FindImageView(const TICEntry& config) { + if (!IsValidAddress(gpu_memory, config)) { + return NULL_IMAGE_VIEW_ID; + } + const auto [pair, is_new] = image_views.try_emplace(config); + ImageViewId& image_view_id = pair->second; + if (is_new) { + image_view_id = CreateImageView(config); + } + return image_view_id; +} - if (overlaps.size() > 1) { - Deduction result{}; - result.type = DeductionType::DeductionFailed; - return result; - } else { - Deduction result{}; - result.type = DeductionType::DeductionComplete; - result.surface = overlaps[0]; - return result; - } +template <class P> +ImageViewId TextureCache<P>::CreateImageView(const TICEntry& config) { + const ImageInfo info(config); + const GPUVAddr image_gpu_addr = config.Address() - config.BaseLayer() * info.layer_stride; + const ImageId image_id = FindOrInsertImage(info, image_gpu_addr); + if (!image_id) { + return NULL_IMAGE_VIEW_ID; } + ImageBase& image = slot_images[image_id]; + const SubresourceBase base = image.TryFindBase(config.Address()).value(); + ASSERT(base.level == 0); + const ImageViewInfo view_info(config, base.layer); + const ImageViewId image_view_id = FindOrEmplaceImageView(image_id, view_info); + ImageViewBase& image_view = slot_image_views[image_view_id]; + image_view.flags |= ImageViewFlagBits::Strong; + image.flags |= ImageFlagBits::Strong; + return image_view_id; +} - /** - * Gets a null surface based on a target texture. - * @param target The target of the null surface. - */ - TView GetNullSurface(SurfaceTarget target) { - const u32 i_target = static_cast<u32>(target); - if (const auto it = invalid_cache.find(i_target); it != invalid_cache.end()) { - return it->second->GetMainView(); - } - SurfaceParams params{}; - params.target = target; - params.is_tiled = false; - params.srgb_conversion = false; - params.is_layered = - target == SurfaceTarget::Texture1DArray || target == SurfaceTarget::Texture2DArray || - target == SurfaceTarget::TextureCubemap || target == SurfaceTarget::TextureCubeArray; - params.block_width = 0; - params.block_height = 0; - params.block_depth = 0; - params.tile_width_spacing = 1; - params.width = 1; - params.height = 1; - params.depth = 1; - if (target == SurfaceTarget::TextureCubemap || target == SurfaceTarget::TextureCubeArray) { - params.depth = 6; - } - params.pitch = 4; - params.num_levels = 1; - params.emulated_levels = 1; - params.pixel_format = VideoCore::Surface::PixelFormat::R8_UNORM; - params.type = VideoCore::Surface::SurfaceType::ColorTexture; - auto surface = CreateSurface(0ULL, params); - invalid_memory.resize(surface->GetHostSizeInBytes(), 0U); - surface->UploadTexture(invalid_memory); - surface->MarkAsModified(false, Tick()); - invalid_cache.emplace(i_target, surface); - return surface->GetMainView(); - } - - /** - * Gets the a source and destination starting address and parameters, - * and tries to deduce if they are supposed to be depth textures. If so, their - * parameters are modified and fixed into so. - * - * @param src_params The parameters of the candidate surface. - * @param dst_params The parameters of the destination surface. - * @param src_gpu_addr The starting address of the candidate surface. - * @param dst_gpu_addr The starting address of the destination surface. - **/ - void DeduceBestBlit(SurfaceParams& src_params, SurfaceParams& dst_params, - const GPUVAddr src_gpu_addr, const GPUVAddr dst_gpu_addr) { - auto deduced_src = DeduceSurface(src_gpu_addr, src_params); - auto deduced_dst = DeduceSurface(dst_gpu_addr, dst_params); - if (deduced_src.Failed() || deduced_dst.Failed()) { - return; +template <class P> +ImageId TextureCache<P>::FindOrInsertImage(const ImageInfo& info, GPUVAddr gpu_addr, + RelaxedOptions options) { + if (const ImageId image_id = FindImage(info, gpu_addr, options); image_id) { + return image_id; + } + return InsertImage(info, gpu_addr, options); +} + +template <class P> +ImageId TextureCache<P>::FindImage(const ImageInfo& info, GPUVAddr gpu_addr, + RelaxedOptions options) { + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); + if (!cpu_addr) { + return ImageId{}; + } + const bool broken_views = runtime.HasBrokenTextureViewFormats(); + ImageId image_id; + const auto lambda = [&](ImageId existing_image_id, ImageBase& existing_image) { + if (info.type == ImageType::Linear || existing_image.info.type == ImageType::Linear) { + const bool strict_size = False(options & RelaxedOptions::Size) && + True(existing_image.flags & ImageFlagBits::Strong); + const ImageInfo& existing = existing_image.info; + if (existing_image.gpu_addr == gpu_addr && existing.type == info.type && + existing.pitch == info.pitch && + IsPitchLinearSameSize(existing, info, strict_size) && + IsViewCompatible(existing.format, info.format, broken_views)) { + image_id = existing_image_id; + return true; + } + } else if (IsSubresource(info, existing_image, gpu_addr, options, broken_views)) { + image_id = existing_image_id; + return true; } + return false; + }; + ForEachImageInRegion(*cpu_addr, CalculateGuestSizeInBytes(info), lambda); + return image_id; +} - const bool incomplete_src = deduced_src.Incomplete(); - const bool incomplete_dst = deduced_dst.Incomplete(); +template <class P> +ImageId TextureCache<P>::InsertImage(const ImageInfo& info, GPUVAddr gpu_addr, + RelaxedOptions options) { + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); + ASSERT_MSG(cpu_addr, "Tried to insert an image to an invalid gpu_addr=0x{:x}", gpu_addr); + const ImageId image_id = JoinImages(info, gpu_addr, *cpu_addr); + const Image& image = slot_images[image_id]; + // Using "image.gpu_addr" instead of "gpu_addr" is important because it might be different + const auto [it, is_new] = image_allocs_table.try_emplace(image.gpu_addr); + if (is_new) { + it->second = slot_image_allocs.insert(); + } + slot_image_allocs[it->second].images.push_back(image_id); + return image_id; +} - if (incomplete_src && incomplete_dst) { +template <class P> +ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VAddr cpu_addr) { + ImageInfo new_info = info; + const size_t size_bytes = CalculateGuestSizeInBytes(new_info); + const bool broken_views = runtime.HasBrokenTextureViewFormats(); + std::vector<ImageId> overlap_ids; + std::vector<ImageId> left_aliased_ids; + std::vector<ImageId> right_aliased_ids; + ForEachImageInRegion(cpu_addr, size_bytes, [&](ImageId overlap_id, ImageBase& overlap) { + if (info.type != overlap.info.type) { return; } - - const bool any_incomplete = incomplete_src || incomplete_dst; - - if (!any_incomplete) { - if (!(deduced_src.IsDepth() && deduced_dst.IsDepth())) { - return; - } - } else { - if (incomplete_src && !(deduced_dst.IsDepth())) { - return; - } - - if (incomplete_dst && !(deduced_src.IsDepth())) { - return; + if (info.type == ImageType::Linear) { + if (info.pitch == overlap.info.pitch && gpu_addr == overlap.gpu_addr) { + // Alias linear images with the same pitch + left_aliased_ids.push_back(overlap_id); } + return; } - - const auto inherit_format = [](SurfaceParams& to, TSurface from) { - const SurfaceParams& params = from->GetSurfaceParams(); - to.pixel_format = params.pixel_format; - to.type = params.type; - }; - // Now we got the cases where one or both is Depth and the other is not known - if (!incomplete_src) { - inherit_format(src_params, deduced_src.surface); - } else { - inherit_format(src_params, deduced_dst.surface); + static constexpr bool strict_size = true; + const std::optional<OverlapResult> solution = + ResolveOverlap(new_info, gpu_addr, cpu_addr, overlap, strict_size, broken_views); + if (solution) { + gpu_addr = solution->gpu_addr; + cpu_addr = solution->cpu_addr; + new_info.resources = solution->resources; + overlap_ids.push_back(overlap_id); + return; } - if (!incomplete_dst) { - inherit_format(dst_params, deduced_dst.surface); + static constexpr auto options = RelaxedOptions::Size | RelaxedOptions::Format; + const ImageBase new_image_base(new_info, gpu_addr, cpu_addr); + if (IsSubresource(new_info, overlap, gpu_addr, options, broken_views)) { + left_aliased_ids.push_back(overlap_id); + } else if (IsSubresource(overlap.info, new_image_base, overlap.gpu_addr, options, + broken_views)) { + right_aliased_ids.push_back(overlap_id); + } + }); + const ImageId new_image_id = slot_images.insert(runtime, new_info, gpu_addr, cpu_addr); + Image& new_image = slot_images[new_image_id]; + + // TODO: Only upload what we need + RefreshContents(new_image); + + for (const ImageId overlap_id : overlap_ids) { + Image& overlap = slot_images[overlap_id]; + if (overlap.info.num_samples != new_image.info.num_samples) { + LOG_WARNING(HW_GPU, "Copying between images with different samples is not implemented"); } else { - inherit_format(dst_params, deduced_src.surface); + const SubresourceBase base = new_image.TryFindBase(overlap.gpu_addr).value(); + const auto copies = MakeShrinkImageCopies(new_info, overlap.info, base); + runtime.CopyImage(new_image, overlap, copies); + } + if (True(overlap.flags & ImageFlagBits::Tracked)) { + UntrackImage(overlap); } + UnregisterImage(overlap_id); + DeleteImage(overlap_id); + } + ImageBase& new_image_base = new_image; + for (const ImageId aliased_id : right_aliased_ids) { + ImageBase& aliased = slot_images[aliased_id]; + AddImageAlias(new_image_base, aliased, new_image_id, aliased_id); } + for (const ImageId aliased_id : left_aliased_ids) { + ImageBase& aliased = slot_images[aliased_id]; + AddImageAlias(aliased, new_image_base, aliased_id, new_image_id); + } + RegisterImage(new_image_id); + return new_image_id; +} - std::pair<TSurface, TView> InitializeSurface(GPUVAddr gpu_addr, const SurfaceParams& params, - bool preserve_contents) { - auto new_surface{GetUncachedSurface(gpu_addr, params)}; - Register(new_surface); - if (preserve_contents) { - LoadSurface(new_surface); - } - return {new_surface, new_surface->GetMainView()}; +template <class P> +typename TextureCache<P>::BlitImages TextureCache<P>::GetBlitImages( + const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Surface& src) { + static constexpr auto FIND_OPTIONS = RelaxedOptions::Format | RelaxedOptions::Samples; + const GPUVAddr dst_addr = dst.Address(); + const GPUVAddr src_addr = src.Address(); + ImageInfo dst_info(dst); + ImageInfo src_info(src); + ImageId dst_id; + ImageId src_id; + do { + has_deleted_images = false; + dst_id = FindImage(dst_info, dst_addr, FIND_OPTIONS); + src_id = FindImage(src_info, src_addr, FIND_OPTIONS); + const ImageBase* const dst_image = dst_id ? &slot_images[dst_id] : nullptr; + const ImageBase* const src_image = src_id ? &slot_images[src_id] : nullptr; + DeduceBlitImages(dst_info, src_info, dst_image, src_image); + if (GetFormatType(dst_info.format) != GetFormatType(src_info.format)) { + continue; + } + if (!dst_id) { + dst_id = InsertImage(dst_info, dst_addr, RelaxedOptions{}); + } + if (!src_id) { + src_id = InsertImage(src_info, src_addr, RelaxedOptions{}); + } + } while (has_deleted_images); + return BlitImages{ + .dst_id = dst_id, + .src_id = src_id, + .dst_format = dst_info.format, + .src_format = src_info.format, + }; +} + +template <class P> +SamplerId TextureCache<P>::FindSampler(const TSCEntry& config) { + if (std::ranges::all_of(config.raw, [](u64 value) { return value == 0; })) { + return NULL_SAMPLER_ID; } + const auto [pair, is_new] = samplers.try_emplace(config); + if (is_new) { + pair->second = slot_samplers.insert(runtime, config); + } + return pair->second; +} - void LoadSurface(const TSurface& surface) { - staging_cache.GetBuffer(0).resize(surface->GetHostSizeInBytes()); - surface->LoadBuffer(system.GPU().MemoryManager(), staging_cache); - surface->UploadTexture(staging_cache.GetBuffer(0)); - surface->MarkAsModified(false, Tick()); +template <class P> +ImageViewId TextureCache<P>::FindColorBuffer(size_t index, bool is_clear) { + const auto& regs = maxwell3d.regs; + if (index >= regs.rt_control.count) { + return ImageViewId{}; + } + const auto& rt = regs.rt[index]; + const GPUVAddr gpu_addr = rt.Address(); + if (gpu_addr == 0) { + return ImageViewId{}; } + if (rt.format == Tegra::RenderTargetFormat::NONE) { + return ImageViewId{}; + } + const ImageInfo info(regs, index); + return FindRenderTargetView(info, gpu_addr, is_clear); +} - void FlushSurface(const TSurface& surface) { - if (!surface->IsModified()) { - return; - } - staging_cache.GetBuffer(0).resize(surface->GetHostSizeInBytes()); - surface->DownloadTexture(staging_cache.GetBuffer(0)); - surface->FlushBuffer(system.GPU().MemoryManager(), staging_cache); - surface->MarkAsModified(false, Tick()); - } - - void RegisterInnerCache(TSurface& surface) { - const VAddr cpu_addr = surface->GetCpuAddr(); - VAddr start = cpu_addr >> registry_page_bits; - const VAddr end = (surface->GetCpuAddrEnd() - 1) >> registry_page_bits; - l1_cache[cpu_addr] = surface; - while (start <= end) { - registry[start].push_back(surface); - start++; - } +template <class P> +ImageViewId TextureCache<P>::FindDepthBuffer(bool is_clear) { + const auto& regs = maxwell3d.regs; + if (!regs.zeta_enable) { + return ImageViewId{}; + } + const GPUVAddr gpu_addr = regs.zeta.Address(); + if (gpu_addr == 0) { + return ImageViewId{}; } + const ImageInfo info(regs); + return FindRenderTargetView(info, gpu_addr, is_clear); +} - void UnregisterInnerCache(TSurface& surface) { - const VAddr cpu_addr = surface->GetCpuAddr(); - VAddr start = cpu_addr >> registry_page_bits; - const VAddr end = (surface->GetCpuAddrEnd() - 1) >> registry_page_bits; - l1_cache.erase(cpu_addr); - while (start <= end) { - auto& reg{registry[start]}; - reg.erase(std::find(reg.begin(), reg.end(), surface)); - start++; - } +template <class P> +ImageViewId TextureCache<P>::FindRenderTargetView(const ImageInfo& info, GPUVAddr gpu_addr, + bool is_clear) { + const auto options = is_clear ? RelaxedOptions::Samples : RelaxedOptions{}; + const ImageId image_id = FindOrInsertImage(info, gpu_addr, options); + if (!image_id) { + return NULL_IMAGE_VIEW_ID; + } + Image& image = slot_images[image_id]; + const ImageViewType view_type = RenderTargetImageViewType(info); + SubresourceBase base; + if (image.info.type == ImageType::Linear) { + base = SubresourceBase{.level = 0, .layer = 0}; + } else { + base = image.TryFindBase(gpu_addr).value(); } + const s32 layers = image.info.type == ImageType::e3D ? info.size.depth : info.resources.layers; + const SubresourceRange range{ + .base = base, + .extent = {.levels = 1, .layers = layers}, + }; + return FindOrEmplaceImageView(image_id, ImageViewInfo(view_type, info.format, range)); +} - VectorSurface GetSurfacesInRegion(const VAddr cpu_addr, const std::size_t size) { - if (size == 0) { - return {}; +template <class P> +template <typename Func> +void TextureCache<P>::ForEachImageInRegion(VAddr cpu_addr, size_t size, Func&& func) { + using FuncReturn = typename std::invoke_result<Func, ImageId, Image&>::type; + static constexpr bool BOOL_BREAK = std::is_same_v<FuncReturn, bool>; + boost::container::small_vector<ImageId, 32> images; + ForEachPage(cpu_addr, size, [this, &images, cpu_addr, size, func](u64 page) { + const auto it = page_table.find(page); + if (it == page_table.end()) { + if constexpr (BOOL_BREAK) { + return false; + } else { + return; + } } - const VAddr cpu_addr_end = cpu_addr + size; - const VAddr end = (cpu_addr_end - 1) >> registry_page_bits; - VectorSurface surfaces; - for (VAddr start = cpu_addr >> registry_page_bits; start <= end; ++start) { - const auto it = registry.find(start); - if (it == registry.end()) { + for (const ImageId image_id : it->second) { + Image& image = slot_images[image_id]; + if (True(image.flags & ImageFlagBits::Picked)) { + continue; + } + if (!image.Overlaps(cpu_addr, size)) { continue; } - for (auto& surface : it->second) { - if (surface->IsPicked() || !surface->Overlaps(cpu_addr, cpu_addr_end)) { - continue; + image.flags |= ImageFlagBits::Picked; + images.push_back(image_id); + if constexpr (BOOL_BREAK) { + if (func(image_id, image)) { + return true; } - surface->MarkAsPicked(true); - surfaces.push_back(surface); + } else { + func(image_id, image); } } - for (auto& surface : surfaces) { - surface->MarkAsPicked(false); + if constexpr (BOOL_BREAK) { + return false; } - return surfaces; + }); + for (const ImageId image_id : images) { + slot_images[image_id].flags &= ~ImageFlagBits::Picked; } +} - void ReserveSurface(const SurfaceParams& params, TSurface surface) { - surface_reserve[params].push_back(std::move(surface)); +template <class P> +ImageViewId TextureCache<P>::FindOrEmplaceImageView(ImageId image_id, const ImageViewInfo& info) { + Image& image = slot_images[image_id]; + if (const ImageViewId image_view_id = image.FindView(info); image_view_id) { + return image_view_id; } + const ImageViewId image_view_id = slot_image_views.insert(runtime, info, image_id, image); + image.InsertView(info, image_view_id); + return image_view_id; +} + +template <class P> +void TextureCache<P>::RegisterImage(ImageId image_id) { + ImageBase& image = slot_images[image_id]; + ASSERT_MSG(False(image.flags & ImageFlagBits::Registered), + "Trying to register an already registered image"); + image.flags |= ImageFlagBits::Registered; + ForEachPage(image.cpu_addr, image.guest_size_bytes, + [this, image_id](u64 page) { page_table[page].push_back(image_id); }); +} - TSurface TryGetReservedSurface(const SurfaceParams& params) { - auto search{surface_reserve.find(params)}; - if (search == surface_reserve.end()) { - return {}; +template <class P> +void TextureCache<P>::UnregisterImage(ImageId image_id) { + Image& image = slot_images[image_id]; + ASSERT_MSG(True(image.flags & ImageFlagBits::Registered), + "Trying to unregister an already registered image"); + image.flags &= ~ImageFlagBits::Registered; + ForEachPage(image.cpu_addr, image.guest_size_bytes, [this, image_id](u64 page) { + const auto page_it = page_table.find(page); + if (page_it == page_table.end()) { + UNREACHABLE_MSG("Unregistering unregistered page=0x{:x}", page << PAGE_BITS); + return; } - for (auto& surface : search->second) { - if (!surface->IsRegistered()) { - return surface; - } + std::vector<ImageId>& image_ids = page_it->second; + const auto vector_it = std::ranges::find(image_ids, image_id); + if (vector_it == image_ids.end()) { + UNREACHABLE_MSG("Unregistering unregistered image in page=0x{:x}", page << PAGE_BITS); + return; } - return {}; - } + image_ids.erase(vector_it); + }); +} - /// Try to do an image copy logging when formats are incompatible. - void TryCopyImage(TSurface& src, TSurface& dst, const CopyParams& copy) { - const SurfaceParams& src_params = src->GetSurfaceParams(); - const SurfaceParams& dst_params = dst->GetSurfaceParams(); - if (!format_compatibility.TestCopy(src_params.pixel_format, dst_params.pixel_format)) { - LOG_ERROR(HW_GPU, "Illegal copy between formats={{{}, {}}}", - static_cast<int>(dst_params.pixel_format), - static_cast<int>(src_params.pixel_format)); - return; +template <class P> +void TextureCache<P>::TrackImage(ImageBase& image) { + ASSERT(False(image.flags & ImageFlagBits::Tracked)); + image.flags |= ImageFlagBits::Tracked; + rasterizer.UpdatePagesCachedCount(image.cpu_addr, image.guest_size_bytes, 1); +} + +template <class P> +void TextureCache<P>::UntrackImage(ImageBase& image) { + ASSERT(True(image.flags & ImageFlagBits::Tracked)); + image.flags &= ~ImageFlagBits::Tracked; + rasterizer.UpdatePagesCachedCount(image.cpu_addr, image.guest_size_bytes, -1); +} + +template <class P> +void TextureCache<P>::DeleteImage(ImageId image_id) { + ImageBase& image = slot_images[image_id]; + const GPUVAddr gpu_addr = image.gpu_addr; + const auto alloc_it = image_allocs_table.find(gpu_addr); + if (alloc_it == image_allocs_table.end()) { + UNREACHABLE_MSG("Trying to delete an image alloc that does not exist in address 0x{:x}", + gpu_addr); + return; + } + const ImageAllocId alloc_id = alloc_it->second; + std::vector<ImageId>& alloc_images = slot_image_allocs[alloc_id].images; + const auto alloc_image_it = std::ranges::find(alloc_images, image_id); + if (alloc_image_it == alloc_images.end()) { + UNREACHABLE_MSG("Trying to delete an image that does not exist"); + return; + } + ASSERT_MSG(False(image.flags & ImageFlagBits::Tracked), "Image was not untracked"); + ASSERT_MSG(False(image.flags & ImageFlagBits::Registered), "Image was not unregistered"); + + // Mark render targets as dirty + auto& dirty = maxwell3d.dirty.flags; + dirty[Dirty::RenderTargets] = true; + dirty[Dirty::ZetaBuffer] = true; + for (size_t rt = 0; rt < NUM_RT; ++rt) { + dirty[Dirty::ColorBuffer0 + rt] = true; + } + const std::span<const ImageViewId> image_view_ids = image.image_view_ids; + for (const ImageViewId image_view_id : image_view_ids) { + std::ranges::replace(render_targets.color_buffer_ids, image_view_id, ImageViewId{}); + if (render_targets.depth_buffer_id == image_view_id) { + render_targets.depth_buffer_id = ImageViewId{}; } - ImageCopy(src, dst, copy); } + RemoveImageViewReferences(image_view_ids); + RemoveFramebuffers(image_view_ids); + + for (const AliasedImage& alias : image.aliased_images) { + ImageBase& other_image = slot_images[alias.id]; + [[maybe_unused]] const size_t num_removed_aliases = + std::erase_if(other_image.aliased_images, [image_id](const AliasedImage& other_alias) { + return other_alias.id == image_id; + }); + ASSERT_MSG(num_removed_aliases == 1, "Invalid number of removed aliases: {}", + num_removed_aliases); + } + for (const ImageViewId image_view_id : image_view_ids) { + sentenced_image_view.Push(std::move(slot_image_views[image_view_id])); + slot_image_views.erase(image_view_id); + } + sentenced_images.Push(std::move(slot_images[image_id])); + slot_images.erase(image_id); - constexpr PixelFormat GetSiblingFormat(PixelFormat format) const { - return siblings_table[static_cast<std::size_t>(format)]; + alloc_images.erase(alloc_image_it); + if (alloc_images.empty()) { + image_allocs_table.erase(alloc_it); + } + if constexpr (ENABLE_VALIDATION) { + std::ranges::fill(graphics_image_view_ids, CORRUPT_ID); + std::ranges::fill(compute_image_view_ids, CORRUPT_ID); } + graphics_image_table.Invalidate(); + compute_image_table.Invalidate(); + has_deleted_images = true; +} - /// Returns true the shader sampler entry is compatible with the TIC texture type. - static bool IsTypeCompatible(Tegra::Texture::TextureType tic_type, - const VideoCommon::Shader::Sampler& entry) { - const auto shader_type = entry.type; - switch (tic_type) { - case Tegra::Texture::TextureType::Texture1D: - case Tegra::Texture::TextureType::Texture1DArray: - return shader_type == Tegra::Shader::TextureType::Texture1D; - case Tegra::Texture::TextureType::Texture1DBuffer: - // TODO(Rodrigo): Assume as valid for now - return true; - case Tegra::Texture::TextureType::Texture2D: - case Tegra::Texture::TextureType::Texture2DNoMipmap: - return shader_type == Tegra::Shader::TextureType::Texture2D; - case Tegra::Texture::TextureType::Texture2DArray: - return shader_type == Tegra::Shader::TextureType::Texture2D || - shader_type == Tegra::Shader::TextureType::TextureCube; - case Tegra::Texture::TextureType::Texture3D: - return shader_type == Tegra::Shader::TextureType::Texture3D; - case Tegra::Texture::TextureType::TextureCubeArray: - case Tegra::Texture::TextureType::TextureCubemap: - if (shader_type == Tegra::Shader::TextureType::TextureCube) { - return true; - } - return shader_type == Tegra::Shader::TextureType::Texture2D && entry.is_array; +template <class P> +void TextureCache<P>::RemoveImageViewReferences(std::span<const ImageViewId> removed_views) { + auto it = image_views.begin(); + while (it != image_views.end()) { + const auto found = std::ranges::find(removed_views, it->second); + if (found != removed_views.end()) { + it = image_views.erase(it); + } else { + ++it; } - UNREACHABLE(); - return true; } +} - struct FramebufferTargetInfo { - TSurface target; - TView view; - }; - - void AsyncFlushSurface(TSurface& surface) { - if (!uncommitted_flushes) { - uncommitted_flushes = std::make_shared<std::list<TSurface>>(); +template <class P> +void TextureCache<P>::RemoveFramebuffers(std::span<const ImageViewId> removed_views) { + auto it = framebuffers.begin(); + while (it != framebuffers.end()) { + if (it->first.Contains(removed_views)) { + it = framebuffers.erase(it); + } else { + ++it; } - uncommitted_flushes->push_back(surface); } +} - VideoCore::RasterizerInterface& rasterizer; - - FormatLookupTable format_lookup_table; - FormatCompatibility format_compatibility; - - u64 ticks{}; - - // Guards the cache for protection conflicts. - bool guard_render_targets{}; - bool guard_samplers{}; - - // The siblings table is for formats that can inter exchange with one another - // without causing issues. This is only valid when a conflict occurs on a non - // rendering use. - std::array<PixelFormat, static_cast<std::size_t>(PixelFormat::Max)> siblings_table; - - // The internal Cache is different for the Texture Cache. It's based on buckets - // of 1MB. This fits better for the purpose of this cache as textures are normaly - // large in size. - static constexpr u64 registry_page_bits{20}; - static constexpr u64 registry_page_size{1 << registry_page_bits}; - std::unordered_map<VAddr, std::vector<TSurface>> registry; +template <class P> +void TextureCache<P>::MarkModification(ImageBase& image) noexcept { + image.flags |= ImageFlagBits::GpuModified; + image.modification_tick = ++modification_tick; +} - static constexpr u32 DEPTH_RT = 8; - static constexpr u32 NO_RT = 0xFFFFFFFF; +template <class P> +void TextureCache<P>::SynchronizeAliases(ImageId image_id) { + boost::container::small_vector<const AliasedImage*, 1> aliased_images; + ImageBase& image = slot_images[image_id]; + u64 most_recent_tick = image.modification_tick; + for (const AliasedImage& aliased : image.aliased_images) { + ImageBase& aliased_image = slot_images[aliased.id]; + if (image.modification_tick < aliased_image.modification_tick) { + most_recent_tick = std::max(most_recent_tick, aliased_image.modification_tick); + aliased_images.push_back(&aliased); + } + } + if (aliased_images.empty()) { + return; + } + image.modification_tick = most_recent_tick; + std::ranges::sort(aliased_images, [this](const AliasedImage* lhs, const AliasedImage* rhs) { + const ImageBase& lhs_image = slot_images[lhs->id]; + const ImageBase& rhs_image = slot_images[rhs->id]; + return lhs_image.modification_tick < rhs_image.modification_tick; + }); + for (const AliasedImage* const aliased : aliased_images) { + CopyImage(image_id, aliased->id, aliased->copies); + } +} - // The L1 Cache is used for fast texture lookup before checking the overlaps - // This avoids calculating size and other stuffs. - std::unordered_map<VAddr, TSurface> l1_cache; +template <class P> +void TextureCache<P>::PrepareImage(ImageId image_id, bool is_modification, bool invalidate) { + Image& image = slot_images[image_id]; + if (invalidate) { + image.flags &= ~(ImageFlagBits::CpuModified | ImageFlagBits::GpuModified); + if (False(image.flags & ImageFlagBits::Tracked)) { + TrackImage(image); + } + } else { + RefreshContents(image); + SynchronizeAliases(image_id); + } + if (is_modification) { + MarkModification(image); + } + image.frame_tick = frame_tick; +} - /// The surface reserve is a "backup" cache, this is where we put unique surfaces that have - /// previously been used. This is to prevent surfaces from being constantly created and - /// destroyed when used with different surface parameters. - std::unordered_map<SurfaceParams, std::vector<TSurface>> surface_reserve; - std::array<FramebufferTargetInfo, Tegra::Engines::Maxwell3D::Regs::NumRenderTargets> - render_targets; - FramebufferTargetInfo depth_buffer; +template <class P> +void TextureCache<P>::PrepareImageView(ImageViewId image_view_id, bool is_modification, + bool invalidate) { + if (!image_view_id) { + return; + } + const ImageViewBase& image_view = slot_image_views[image_view_id]; + PrepareImage(image_view.image_id, is_modification, invalidate); +} - std::vector<TSurface> sampled_textures; +template <class P> +void TextureCache<P>::CopyImage(ImageId dst_id, ImageId src_id, std::span<const ImageCopy> copies) { + Image& dst = slot_images[dst_id]; + Image& src = slot_images[src_id]; + const auto dst_format_type = GetFormatType(dst.info.format); + const auto src_format_type = GetFormatType(src.info.format); + if (src_format_type == dst_format_type) { + if constexpr (HAS_EMULATED_COPIES) { + if (!runtime.CanImageBeCopied(dst, src)) { + return runtime.EmulateCopyImage(dst, src, copies); + } + } + return runtime.CopyImage(dst, src, copies); + } + UNIMPLEMENTED_IF(dst.info.type != ImageType::e2D); + UNIMPLEMENTED_IF(src.info.type != ImageType::e2D); + for (const ImageCopy& copy : copies) { + UNIMPLEMENTED_IF(copy.dst_subresource.num_layers != 1); + UNIMPLEMENTED_IF(copy.src_subresource.num_layers != 1); + UNIMPLEMENTED_IF(copy.src_offset != Offset3D{}); + UNIMPLEMENTED_IF(copy.dst_offset != Offset3D{}); + + const SubresourceBase dst_base{ + .level = copy.dst_subresource.base_level, + .layer = copy.dst_subresource.base_layer, + }; + const SubresourceBase src_base{ + .level = copy.src_subresource.base_level, + .layer = copy.src_subresource.base_layer, + }; + const SubresourceExtent dst_extent{.levels = 1, .layers = 1}; + const SubresourceExtent src_extent{.levels = 1, .layers = 1}; + const SubresourceRange dst_range{.base = dst_base, .extent = dst_extent}; + const SubresourceRange src_range{.base = src_base, .extent = src_extent}; + const ImageViewInfo dst_view_info(ImageViewType::e2D, dst.info.format, dst_range); + const ImageViewInfo src_view_info(ImageViewType::e2D, src.info.format, src_range); + const auto [dst_framebuffer_id, dst_view_id] = RenderTargetFromImage(dst_id, dst_view_info); + Framebuffer* const dst_framebuffer = &slot_framebuffers[dst_framebuffer_id]; + const ImageViewId src_view_id = FindOrEmplaceImageView(src_id, src_view_info); + ImageView& dst_view = slot_image_views[dst_view_id]; + ImageView& src_view = slot_image_views[src_view_id]; + [[maybe_unused]] const Extent3D expected_size{ + .width = std::min(dst_view.size.width, src_view.size.width), + .height = std::min(dst_view.size.height, src_view.size.height), + .depth = std::min(dst_view.size.depth, src_view.size.depth), + }; + UNIMPLEMENTED_IF(copy.extent != expected_size); - /// This cache stores null surfaces in order to be used as a placeholder - /// for invalid texture calls. - std::unordered_map<u32, TSurface> invalid_cache; - std::vector<u8> invalid_memory; + runtime.ConvertImage(dst_framebuffer, dst_view, src_view); + } +} - std::list<TSurface> marked_for_unregister; +template <class P> +void TextureCache<P>::BindRenderTarget(ImageViewId* old_id, ImageViewId new_id) { + if (*old_id == new_id) { + return; + } + if (*old_id) { + const ImageViewBase& old_view = slot_image_views[*old_id]; + if (True(old_view.flags & ImageViewFlagBits::PreemtiveDownload)) { + uncommitted_downloads.push_back(old_view.image_id); + } + } + *old_id = new_id; +} - std::shared_ptr<std::list<TSurface>> uncommitted_flushes{}; - std::list<std::shared_ptr<std::list<TSurface>>> committed_flushes; +template <class P> +std::pair<FramebufferId, ImageViewId> TextureCache<P>::RenderTargetFromImage( + ImageId image_id, const ImageViewInfo& view_info) { + const ImageViewId view_id = FindOrEmplaceImageView(image_id, view_info); + const ImageBase& image = slot_images[image_id]; + const bool is_color = GetFormatType(image.info.format) == SurfaceType::ColorTexture; + const ImageViewId color_view_id = is_color ? view_id : ImageViewId{}; + const ImageViewId depth_view_id = is_color ? ImageViewId{} : view_id; + const Extent3D extent = MipSize(image.info.size, view_info.range.base.level); + const u32 num_samples = image.info.num_samples; + const auto [samples_x, samples_y] = SamplesLog2(num_samples); + const FramebufferId framebuffer_id = GetFramebufferId(RenderTargets{ + .color_buffer_ids = {color_view_id}, + .depth_buffer_id = depth_view_id, + .size = {extent.width >> samples_x, extent.height >> samples_y}, + }); + return {framebuffer_id, view_id}; +} - StagingCache staging_cache; - std::recursive_mutex mutex; -}; +template <class P> +bool TextureCache<P>::IsFullClear(ImageViewId id) { + if (!id) { + return true; + } + const ImageViewBase& image_view = slot_image_views[id]; + const ImageBase& image = slot_images[image_view.image_id]; + const Extent3D size = image_view.size; + const auto& regs = maxwell3d.regs; + const auto& scissor = regs.scissor_test[0]; + if (image.info.resources.levels > 1 || image.info.resources.layers > 1) { + // Images with multiple resources can't be cleared in a single call + return false; + } + if (regs.clear_flags.scissor == 0) { + // If scissor testing is disabled, the clear is always full + return true; + } + // Make sure the clear covers all texels in the subresource + return scissor.min_x == 0 && scissor.min_y == 0 && scissor.max_x >= size.width && + scissor.max_y >= size.height; +} } // namespace VideoCommon diff --git a/src/video_core/texture_cache/types.h b/src/video_core/texture_cache/types.h new file mode 100644 index 000000000..2ad2d72a6 --- /dev/null +++ b/src/video_core/texture_cache/types.h @@ -0,0 +1,140 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include "common/common_funcs.h" +#include "common/common_types.h" +#include "video_core/texture_cache/slot_vector.h" + +namespace VideoCommon { + +constexpr size_t NUM_RT = 8; +constexpr size_t MAX_MIP_LEVELS = 14; + +constexpr SlotId CORRUPT_ID{0xfffffffe}; + +using ImageId = SlotId; +using ImageViewId = SlotId; +using ImageAllocId = SlotId; +using SamplerId = SlotId; +using FramebufferId = SlotId; + +enum class ImageType : u32 { + e1D, + e2D, + e3D, + Linear, + Buffer, +}; + +enum class ImageViewType : u32 { + e1D, + e2D, + Cube, + e3D, + e1DArray, + e2DArray, + CubeArray, + Rect, + Buffer, +}; +constexpr size_t NUM_IMAGE_VIEW_TYPES = 9; + +enum class RelaxedOptions : u32 { + Size = 1 << 0, + Format = 1 << 1, + Samples = 1 << 2, +}; +DECLARE_ENUM_FLAG_OPERATORS(RelaxedOptions) + +struct Offset2D { + constexpr auto operator<=>(const Offset2D&) const noexcept = default; + + s32 x; + s32 y; +}; + +struct Offset3D { + constexpr auto operator<=>(const Offset3D&) const noexcept = default; + + s32 x; + s32 y; + s32 z; +}; + +struct Extent2D { + constexpr auto operator<=>(const Extent2D&) const noexcept = default; + + u32 width; + u32 height; +}; + +struct Extent3D { + constexpr auto operator<=>(const Extent3D&) const noexcept = default; + + u32 width; + u32 height; + u32 depth; +}; + +struct SubresourceLayers { + s32 base_level = 0; + s32 base_layer = 0; + s32 num_layers = 1; +}; + +struct SubresourceBase { + constexpr auto operator<=>(const SubresourceBase&) const noexcept = default; + + s32 level = 0; + s32 layer = 0; +}; + +struct SubresourceExtent { + constexpr auto operator<=>(const SubresourceExtent&) const noexcept = default; + + s32 levels = 1; + s32 layers = 1; +}; + +struct SubresourceRange { + constexpr auto operator<=>(const SubresourceRange&) const noexcept = default; + + SubresourceBase base; + SubresourceExtent extent; +}; + +struct ImageCopy { + SubresourceLayers src_subresource; + SubresourceLayers dst_subresource; + Offset3D src_offset; + Offset3D dst_offset; + Extent3D extent; +}; + +struct BufferImageCopy { + size_t buffer_offset; + size_t buffer_size; + u32 buffer_row_length; + u32 buffer_image_height; + SubresourceLayers image_subresource; + Offset3D image_offset; + Extent3D image_extent; +}; + +struct BufferCopy { + size_t src_offset; + size_t dst_offset; + size_t size; +}; + +struct SwizzleParameters { + Extent3D num_tiles; + Extent3D block; + size_t buffer_offset; + s32 level; +}; + +} // namespace VideoCommon diff --git a/src/video_core/texture_cache/util.cpp b/src/video_core/texture_cache/util.cpp new file mode 100644 index 000000000..a0bc1f7b6 --- /dev/null +++ b/src/video_core/texture_cache/util.cpp @@ -0,0 +1,1210 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +// This files contains code from Ryujinx +// A copy of the code can be obtained from https://github.com/Ryujinx/Ryujinx +// The sections using code from Ryujinx are marked with a link to the original version + +// MIT License +// +// Copyright (c) Ryujinx Team and Contributors +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +// associated documentation files (the "Software"), to deal in the Software without restriction, +// including without limitation the rights to use, copy, modify, merge, publish, distribute, +// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT +// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// + +#include <algorithm> +#include <array> +#include <numeric> +#include <optional> +#include <span> +#include <vector> + +#include "common/alignment.h" +#include "common/assert.h" +#include "common/bit_util.h" +#include "common/common_types.h" +#include "common/div_ceil.h" +#include "video_core/compatible_formats.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/memory_manager.h" +#include "video_core/surface.h" +#include "video_core/texture_cache/decode_bc4.h" +#include "video_core/texture_cache/format_lookup_table.h" +#include "video_core/texture_cache/formatter.h" +#include "video_core/texture_cache/samples_helper.h" +#include "video_core/texture_cache/util.h" +#include "video_core/textures/astc.h" +#include "video_core/textures/decoders.h" + +namespace VideoCommon { + +namespace { + +using Tegra::Texture::GOB_SIZE; +using Tegra::Texture::GOB_SIZE_SHIFT; +using Tegra::Texture::GOB_SIZE_X; +using Tegra::Texture::GOB_SIZE_X_SHIFT; +using Tegra::Texture::GOB_SIZE_Y; +using Tegra::Texture::GOB_SIZE_Y_SHIFT; +using Tegra::Texture::GOB_SIZE_Z; +using Tegra::Texture::GOB_SIZE_Z_SHIFT; +using Tegra::Texture::MsaaMode; +using Tegra::Texture::SwizzleTexture; +using Tegra::Texture::TextureFormat; +using Tegra::Texture::TextureType; +using Tegra::Texture::TICEntry; +using Tegra::Texture::UnswizzleTexture; +using VideoCore::Surface::BytesPerBlock; +using VideoCore::Surface::DefaultBlockHeight; +using VideoCore::Surface::DefaultBlockWidth; +using VideoCore::Surface::IsCopyCompatible; +using VideoCore::Surface::IsPixelFormatASTC; +using VideoCore::Surface::IsViewCompatible; +using VideoCore::Surface::PixelFormatFromDepthFormat; +using VideoCore::Surface::PixelFormatFromRenderTargetFormat; +using VideoCore::Surface::SurfaceType; + +constexpr u32 CONVERTED_BYTES_PER_BLOCK = BytesPerBlock(PixelFormat::A8B8G8R8_UNORM); + +struct LevelInfo { + Extent3D size; + Extent3D block; + Extent2D tile_size; + u32 bpp_log2; + u32 tile_width_spacing; +}; + +[[nodiscard]] constexpr u32 AdjustTileSize(u32 shift, u32 unit_factor, u32 dimension) { + if (shift == 0) { + return 0; + } + u32 x = unit_factor << (shift - 1); + if (x >= dimension) { + while (--shift) { + x >>= 1; + if (x < dimension) { + break; + } + } + } + return shift; +} + +[[nodiscard]] constexpr u32 AdjustMipSize(u32 size, u32 level) { + return std::max<u32>(size >> level, 1); +} + +[[nodiscard]] constexpr Extent3D AdjustMipSize(Extent3D size, s32 level) { + return Extent3D{ + .width = AdjustMipSize(size.width, level), + .height = AdjustMipSize(size.height, level), + .depth = AdjustMipSize(size.depth, level), + }; +} + +[[nodiscard]] Extent3D AdjustSamplesSize(Extent3D size, s32 num_samples) { + const auto [samples_x, samples_y] = SamplesLog2(num_samples); + return Extent3D{ + .width = size.width >> samples_x, + .height = size.height >> samples_y, + .depth = size.depth, + }; +} + +template <u32 GOB_EXTENT> +[[nodiscard]] constexpr u32 AdjustMipBlockSize(u32 num_tiles, u32 block_size, u32 level) { + do { + while (block_size > 0 && num_tiles <= (1U << (block_size - 1)) * GOB_EXTENT) { + --block_size; + } + } while (level--); + return block_size; +} + +[[nodiscard]] constexpr Extent3D AdjustMipBlockSize(Extent3D num_tiles, Extent3D block_size, + u32 level) { + return { + .width = AdjustMipBlockSize<GOB_SIZE_X>(num_tiles.width, block_size.width, level), + .height = AdjustMipBlockSize<GOB_SIZE_Y>(num_tiles.height, block_size.height, level), + .depth = AdjustMipBlockSize<GOB_SIZE_Z>(num_tiles.depth, block_size.depth, level), + }; +} + +[[nodiscard]] constexpr Extent3D AdjustTileSize(Extent3D size, Extent2D tile_size) { + return { + .width = Common::DivCeil(size.width, tile_size.width), + .height = Common::DivCeil(size.height, tile_size.height), + .depth = size.depth, + }; +} + +[[nodiscard]] constexpr u32 BytesPerBlockLog2(u32 bytes_per_block) { + return std::countl_zero(bytes_per_block) ^ 0x1F; +} + +[[nodiscard]] constexpr u32 BytesPerBlockLog2(PixelFormat format) { + return BytesPerBlockLog2(BytesPerBlock(format)); +} + +[[nodiscard]] constexpr u32 NumBlocks(Extent3D size, Extent2D tile_size) { + const Extent3D num_blocks = AdjustTileSize(size, tile_size); + return num_blocks.width * num_blocks.height * num_blocks.depth; +} + +[[nodiscard]] constexpr u32 AdjustSize(u32 size, u32 level, u32 block_size) { + return Common::DivCeil(AdjustMipSize(size, level), block_size); +} + +[[nodiscard]] constexpr std::pair<int, int> Samples(int num_samples) { + switch (num_samples) { + case 1: + return {1, 1}; + case 2: + return {2, 1}; + case 4: + return {2, 2}; + case 8: + return {4, 2}; + case 16: + return {4, 4}; + } + UNREACHABLE_MSG("Invalid number of samples={}", num_samples); + return {1, 1}; +} + +[[nodiscard]] constexpr Extent2D DefaultBlockSize(PixelFormat format) { + return {DefaultBlockWidth(format), DefaultBlockHeight(format)}; +} + +[[nodiscard]] constexpr Extent3D NumLevelBlocks(const LevelInfo& info, u32 level) { + return Extent3D{ + .width = AdjustSize(info.size.width, level, info.tile_size.width) << info.bpp_log2, + .height = AdjustSize(info.size.height, level, info.tile_size.height), + .depth = AdjustMipSize(info.size.depth, level), + }; +} + +[[nodiscard]] constexpr Extent3D TileShift(const LevelInfo& info, u32 level) { + const Extent3D blocks = NumLevelBlocks(info, level); + return Extent3D{ + .width = AdjustTileSize(info.block.width, GOB_SIZE_X, blocks.width), + .height = AdjustTileSize(info.block.height, GOB_SIZE_Y, blocks.height), + .depth = AdjustTileSize(info.block.depth, GOB_SIZE_Z, blocks.depth), + }; +} + +[[nodiscard]] constexpr Extent2D GobSize(u32 bpp_log2, u32 block_height, u32 tile_width_spacing) { + return Extent2D{ + .width = GOB_SIZE_X_SHIFT - bpp_log2 + tile_width_spacing, + .height = GOB_SIZE_Y_SHIFT + block_height, + }; +} + +[[nodiscard]] constexpr bool IsSmallerThanGobSize(Extent3D num_tiles, Extent2D gob, + u32 block_depth) { + return num_tiles.width <= (1U << gob.width) || num_tiles.height <= (1U << gob.height) || + num_tiles.depth < (1U << block_depth); +} + +[[nodiscard]] constexpr u32 StrideAlignment(Extent3D num_tiles, Extent3D block, Extent2D gob, + u32 bpp_log2) { + if (IsSmallerThanGobSize(num_tiles, gob, block.depth)) { + return GOB_SIZE_X_SHIFT - bpp_log2; + } else { + return gob.width; + } +} + +[[nodiscard]] constexpr u32 StrideAlignment(Extent3D num_tiles, Extent3D block, u32 bpp_log2, + u32 tile_width_spacing) { + const Extent2D gob = GobSize(bpp_log2, block.height, tile_width_spacing); + return StrideAlignment(num_tiles, block, gob, bpp_log2); +} + +[[nodiscard]] constexpr Extent2D NumGobs(const LevelInfo& info, u32 level) { + const Extent3D blocks = NumLevelBlocks(info, level); + const Extent2D gobs{ + .width = Common::DivCeilLog2(blocks.width, GOB_SIZE_X_SHIFT), + .height = Common::DivCeilLog2(blocks.height, GOB_SIZE_Y_SHIFT), + }; + const Extent2D gob = GobSize(info.bpp_log2, info.block.height, info.tile_width_spacing); + const bool is_small = IsSmallerThanGobSize(blocks, gob, info.block.depth); + const u32 alignment = is_small ? 0 : info.tile_width_spacing; + return Extent2D{ + .width = Common::AlignUpLog2(gobs.width, alignment), + .height = gobs.height, + }; +} + +[[nodiscard]] constexpr Extent3D LevelTiles(const LevelInfo& info, u32 level) { + const Extent3D blocks = NumLevelBlocks(info, level); + const Extent3D tile_shift = TileShift(info, level); + const Extent2D gobs = NumGobs(info, level); + return Extent3D{ + .width = Common::DivCeilLog2(gobs.width, tile_shift.width), + .height = Common::DivCeilLog2(gobs.height, tile_shift.height), + .depth = Common::DivCeilLog2(blocks.depth, tile_shift.depth), + }; +} + +[[nodiscard]] constexpr u32 CalculateLevelSize(const LevelInfo& info, u32 level) { + const Extent3D tile_shift = TileShift(info, level); + const Extent3D tiles = LevelTiles(info, level); + const u32 num_tiles = tiles.width * tiles.height * tiles.depth; + const u32 shift = GOB_SIZE_SHIFT + tile_shift.width + tile_shift.height + tile_shift.depth; + return num_tiles << shift; +} + +[[nodiscard]] constexpr std::array<u32, MAX_MIP_LEVELS> CalculateLevelSizes(const LevelInfo& info, + u32 num_levels) { + ASSERT(num_levels <= MAX_MIP_LEVELS); + std::array<u32, MAX_MIP_LEVELS> sizes{}; + for (u32 level = 0; level < num_levels; ++level) { + sizes[level] = CalculateLevelSize(info, level); + } + return sizes; +} + +[[nodiscard]] constexpr LevelInfo MakeLevelInfo(PixelFormat format, Extent3D size, Extent3D block, + u32 num_samples, u32 tile_width_spacing) { + const auto [samples_x, samples_y] = Samples(num_samples); + const u32 bytes_per_block = BytesPerBlock(format); + return { + .size = + { + .width = size.width * samples_x, + .height = size.height * samples_y, + .depth = size.depth, + }, + .block = block, + .tile_size = DefaultBlockSize(format), + .bpp_log2 = BytesPerBlockLog2(bytes_per_block), + .tile_width_spacing = tile_width_spacing, + }; +} + +[[nodiscard]] constexpr LevelInfo MakeLevelInfo(const ImageInfo& info) { + return MakeLevelInfo(info.format, info.size, info.block, info.num_samples, + info.tile_width_spacing); +} + +[[nodiscard]] constexpr u32 CalculateLevelOffset(PixelFormat format, Extent3D size, Extent3D block, + u32 num_samples, u32 tile_width_spacing, + u32 level) { + const LevelInfo info = MakeLevelInfo(format, size, block, num_samples, tile_width_spacing); + u32 offset = 0; + for (u32 current_level = 0; current_level < level; ++current_level) { + offset += CalculateLevelSize(info, current_level); + } + return offset; +} + +[[nodiscard]] constexpr u32 AlignLayerSize(u32 size_bytes, Extent3D size, Extent3D block, + u32 tile_size_y, u32 tile_width_spacing) { + // https://github.com/Ryujinx/Ryujinx/blob/1c9aba6de1520aea5480c032e0ff5664ac1bb36f/Ryujinx.Graphics.Texture/SizeCalculator.cs#L134 + if (tile_width_spacing > 0) { + const u32 alignment_log2 = GOB_SIZE_SHIFT + tile_width_spacing + block.height + block.depth; + return Common::AlignUpLog2(size_bytes, alignment_log2); + } + const u32 aligned_height = Common::AlignUp(size.height, tile_size_y); + while (block.height != 0 && aligned_height <= (1U << (block.height - 1)) * GOB_SIZE_Y) { + --block.height; + } + while (block.depth != 0 && size.depth <= (1U << (block.depth - 1))) { + --block.depth; + } + const u32 block_shift = GOB_SIZE_SHIFT + block.height + block.depth; + const u32 num_blocks = size_bytes >> block_shift; + if (size_bytes != num_blocks << block_shift) { + return (num_blocks + 1) << block_shift; + } + return size_bytes; +} + +[[nodiscard]] std::optional<SubresourceExtent> ResolveOverlapEqualAddress(const ImageInfo& new_info, + const ImageBase& overlap, + bool strict_size) { + const ImageInfo& info = overlap.info; + if (!IsBlockLinearSizeCompatible(new_info, info, 0, 0, strict_size)) { + return std::nullopt; + } + if (new_info.block != info.block) { + return std::nullopt; + } + const SubresourceExtent resources = new_info.resources; + return SubresourceExtent{ + .levels = std::max(resources.levels, info.resources.levels), + .layers = std::max(resources.layers, info.resources.layers), + }; +} + +[[nodiscard]] std::optional<SubresourceExtent> ResolveOverlapRightAddress3D( + const ImageInfo& new_info, GPUVAddr gpu_addr, const ImageBase& overlap, bool strict_size) { + const std::vector<u32> slice_offsets = CalculateSliceOffsets(new_info); + const u32 diff = static_cast<u32>(overlap.gpu_addr - gpu_addr); + const auto it = std::ranges::find(slice_offsets, diff); + if (it == slice_offsets.end()) { + return std::nullopt; + } + const std::vector subresources = CalculateSliceSubresources(new_info); + const SubresourceBase base = subresources[std::distance(slice_offsets.begin(), it)]; + const ImageInfo& info = overlap.info; + if (!IsBlockLinearSizeCompatible(new_info, info, base.level, 0, strict_size)) { + return std::nullopt; + } + const u32 mip_depth = std::max(1U, new_info.size.depth << base.level); + if (mip_depth < info.size.depth + base.layer) { + return std::nullopt; + } + if (MipBlockSize(new_info, base.level) != info.block) { + return std::nullopt; + } + return SubresourceExtent{ + .levels = std::max(new_info.resources.levels, info.resources.levels + base.level), + .layers = 1, + }; +} + +[[nodiscard]] std::optional<SubresourceExtent> ResolveOverlapRightAddress2D( + const ImageInfo& new_info, GPUVAddr gpu_addr, const ImageBase& overlap, bool strict_size) { + const u32 layer_stride = new_info.layer_stride; + const s32 new_size = layer_stride * new_info.resources.layers; + const s32 diff = static_cast<s32>(overlap.gpu_addr - gpu_addr); + if (diff > new_size) { + return std::nullopt; + } + const s32 base_layer = diff / layer_stride; + const s32 mip_offset = diff % layer_stride; + const std::array offsets = CalculateMipLevelOffsets(new_info); + const auto end = offsets.begin() + new_info.resources.levels; + const auto it = std::find(offsets.begin(), end, mip_offset); + if (it == end) { + // Mipmap is not aligned to any valid size + return std::nullopt; + } + const SubresourceBase base{ + .level = static_cast<s32>(std::distance(offsets.begin(), it)), + .layer = base_layer, + }; + const ImageInfo& info = overlap.info; + if (!IsBlockLinearSizeCompatible(new_info, info, base.level, 0, strict_size)) { + return std::nullopt; + } + if (MipBlockSize(new_info, base.level) != info.block) { + return std::nullopt; + } + return SubresourceExtent{ + .levels = std::max(new_info.resources.levels, info.resources.levels + base.level), + .layers = std::max(new_info.resources.layers, info.resources.layers + base.layer), + }; +} + +[[nodiscard]] std::optional<OverlapResult> ResolveOverlapRightAddress(const ImageInfo& new_info, + GPUVAddr gpu_addr, + VAddr cpu_addr, + const ImageBase& overlap, + bool strict_size) { + std::optional<SubresourceExtent> resources; + if (new_info.type != ImageType::e3D) { + resources = ResolveOverlapRightAddress2D(new_info, gpu_addr, overlap, strict_size); + } else { + resources = ResolveOverlapRightAddress3D(new_info, gpu_addr, overlap, strict_size); + } + if (!resources) { + return std::nullopt; + } + return OverlapResult{ + .gpu_addr = gpu_addr, + .cpu_addr = cpu_addr, + .resources = *resources, + }; +} + +[[nodiscard]] std::optional<OverlapResult> ResolveOverlapLeftAddress(const ImageInfo& new_info, + GPUVAddr gpu_addr, + VAddr cpu_addr, + const ImageBase& overlap, + bool strict_size) { + const std::optional<SubresourceBase> base = overlap.TryFindBase(gpu_addr); + if (!base) { + return std::nullopt; + } + const ImageInfo& info = overlap.info; + if (!IsBlockLinearSizeCompatible(new_info, info, base->level, 0, strict_size)) { + return std::nullopt; + } + if (new_info.block != MipBlockSize(info, base->level)) { + return std::nullopt; + } + const SubresourceExtent resources = new_info.resources; + s32 layers = 1; + if (info.type != ImageType::e3D) { + layers = std::max(resources.layers, info.resources.layers + base->layer); + } + return OverlapResult{ + .gpu_addr = overlap.gpu_addr, + .cpu_addr = overlap.cpu_addr, + .resources = + { + .levels = std::max(resources.levels + base->level, info.resources.levels), + .layers = layers, + }, + }; +} + +[[nodiscard]] Extent2D PitchLinearAlignedSize(const ImageInfo& info) { + // https://github.com/Ryujinx/Ryujinx/blob/1c9aba6de1520aea5480c032e0ff5664ac1bb36f/Ryujinx.Graphics.Texture/SizeCalculator.cs#L212 + static constexpr u32 STRIDE_ALIGNMENT = 32; + ASSERT(info.type == ImageType::Linear); + const Extent2D num_tiles{ + .width = Common::DivCeil(info.size.width, DefaultBlockWidth(info.format)), + .height = Common::DivCeil(info.size.height, DefaultBlockHeight(info.format)), + }; + const u32 width_alignment = STRIDE_ALIGNMENT / BytesPerBlock(info.format); + return Extent2D{ + .width = Common::AlignUp(num_tiles.width, width_alignment), + .height = num_tiles.height, + }; +} + +[[nodiscard]] Extent3D BlockLinearAlignedSize(const ImageInfo& info, u32 level) { + // https://github.com/Ryujinx/Ryujinx/blob/1c9aba6de1520aea5480c032e0ff5664ac1bb36f/Ryujinx.Graphics.Texture/SizeCalculator.cs#L176 + ASSERT(info.type != ImageType::Linear); + const Extent3D size = AdjustMipSize(info.size, level); + const Extent3D num_tiles{ + .width = Common::DivCeil(size.width, DefaultBlockWidth(info.format)), + .height = Common::DivCeil(size.height, DefaultBlockHeight(info.format)), + .depth = size.depth, + }; + const u32 bpp_log2 = BytesPerBlockLog2(info.format); + const u32 alignment = StrideAlignment(num_tiles, info.block, bpp_log2, info.tile_width_spacing); + const Extent3D mip_block = AdjustMipBlockSize(num_tiles, info.block, 0); + return Extent3D{ + .width = Common::AlignUpLog2(num_tiles.width, alignment), + .height = Common::AlignUpLog2(num_tiles.height, GOB_SIZE_Y_SHIFT + mip_block.height), + .depth = Common::AlignUpLog2(num_tiles.depth, GOB_SIZE_Z_SHIFT + mip_block.depth), + }; +} + +[[nodiscard]] constexpr u32 NumBlocksPerLayer(const ImageInfo& info, Extent2D tile_size) noexcept { + u32 num_blocks = 0; + for (s32 level = 0; level < info.resources.levels; ++level) { + const Extent3D mip_size = AdjustMipSize(info.size, level); + num_blocks += NumBlocks(mip_size, tile_size); + } + return num_blocks; +} + +[[nodiscard]] u32 NumSlices(const ImageInfo& info) noexcept { + ASSERT(info.type == ImageType::e3D); + u32 num_slices = 0; + for (s32 level = 0; level < info.resources.levels; ++level) { + num_slices += AdjustMipSize(info.size.depth, level); + } + return num_slices; +} + +void SwizzlePitchLinearImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, + const ImageInfo& info, const BufferImageCopy& copy, + std::span<const u8> memory) { + ASSERT(copy.image_offset.z == 0); + ASSERT(copy.image_extent.depth == 1); + ASSERT(copy.image_subresource.base_level == 0); + ASSERT(copy.image_subresource.base_layer == 0); + ASSERT(copy.image_subresource.num_layers == 1); + + const u32 bytes_per_block = BytesPerBlock(info.format); + const u32 row_length = copy.image_extent.width * bytes_per_block; + const u32 guest_offset_x = copy.image_offset.x * bytes_per_block; + + for (u32 line = 0; line < copy.image_extent.height; ++line) { + const u32 host_offset_y = line * info.pitch; + const u32 guest_offset_y = (copy.image_offset.y + line) * info.pitch; + const u32 guest_offset = guest_offset_x + guest_offset_y; + gpu_memory.WriteBlockUnsafe(gpu_addr + guest_offset, memory.data() + host_offset_y, + row_length); + } +} + +void SwizzleBlockLinearImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, + const ImageInfo& info, const BufferImageCopy& copy, + std::span<const u8> input) { + const Extent3D size = info.size; + const LevelInfo level_info = MakeLevelInfo(info); + const Extent2D tile_size = DefaultBlockSize(info.format); + const u32 bytes_per_block = BytesPerBlock(info.format); + + const s32 level = copy.image_subresource.base_level; + const Extent3D level_size = AdjustMipSize(size, level); + const u32 num_blocks_per_layer = NumBlocks(level_size, tile_size); + const u32 host_bytes_per_layer = num_blocks_per_layer * bytes_per_block; + + UNIMPLEMENTED_IF(info.tile_width_spacing > 0); + + UNIMPLEMENTED_IF(copy.image_offset.x != 0); + UNIMPLEMENTED_IF(copy.image_offset.y != 0); + UNIMPLEMENTED_IF(copy.image_offset.z != 0); + UNIMPLEMENTED_IF(copy.image_extent != level_size); + + const Extent3D num_tiles = AdjustTileSize(level_size, tile_size); + const Extent3D block = AdjustMipBlockSize(num_tiles, level_info.block, level); + + size_t host_offset = copy.buffer_offset; + + const u32 num_levels = info.resources.levels; + const std::array sizes = CalculateLevelSizes(level_info, num_levels); + size_t guest_offset = std::reduce(sizes.begin(), sizes.begin() + level, 0); + const size_t layer_stride = + AlignLayerSize(std::reduce(sizes.begin(), sizes.begin() + num_levels, 0), size, + level_info.block, tile_size.height, info.tile_width_spacing); + const size_t subresource_size = sizes[level]; + + const auto dst_data = std::make_unique<u8[]>(subresource_size); + const std::span<u8> dst(dst_data.get(), subresource_size); + + for (s32 layer = 0; layer < info.resources.layers; ++layer) { + const std::span<const u8> src = input.subspan(host_offset); + SwizzleTexture(dst, src, bytes_per_block, num_tiles.width, num_tiles.height, + num_tiles.depth, block.height, block.depth); + + gpu_memory.WriteBlockUnsafe(gpu_addr + guest_offset, dst.data(), dst.size_bytes()); + + host_offset += host_bytes_per_layer; + guest_offset += layer_stride; + } + ASSERT(host_offset - copy.buffer_offset == copy.buffer_size); +} + +} // Anonymous namespace + +u32 CalculateGuestSizeInBytes(const ImageInfo& info) noexcept { + if (info.type == ImageType::Buffer) { + return info.size.width * BytesPerBlock(info.format); + } + if (info.type == ImageType::Linear) { + return info.pitch * Common::DivCeil(info.size.height, DefaultBlockHeight(info.format)); + } + if (info.resources.layers > 1) { + ASSERT(info.layer_stride != 0); + return info.layer_stride * info.resources.layers; + } else { + return CalculateLayerSize(info); + } +} + +u32 CalculateUnswizzledSizeBytes(const ImageInfo& info) noexcept { + if (info.type == ImageType::Buffer) { + return info.size.width * BytesPerBlock(info.format); + } + if (info.num_samples > 1) { + // Multisample images can't be uploaded or downloaded to the host + return 0; + } + if (info.type == ImageType::Linear) { + return info.pitch * Common::DivCeil(info.size.height, DefaultBlockHeight(info.format)); + } + const Extent2D tile_size = DefaultBlockSize(info.format); + return NumBlocksPerLayer(info, tile_size) * info.resources.layers * BytesPerBlock(info.format); +} + +u32 CalculateConvertedSizeBytes(const ImageInfo& info) noexcept { + if (info.type == ImageType::Buffer) { + return info.size.width * BytesPerBlock(info.format); + } + static constexpr Extent2D TILE_SIZE{1, 1}; + return NumBlocksPerLayer(info, TILE_SIZE) * info.resources.layers * CONVERTED_BYTES_PER_BLOCK; +} + +u32 CalculateLayerStride(const ImageInfo& info) noexcept { + ASSERT(info.type != ImageType::Linear); + const u32 layer_size = CalculateLayerSize(info); + const Extent3D size = info.size; + const Extent3D block = info.block; + const u32 tile_size_y = DefaultBlockHeight(info.format); + return AlignLayerSize(layer_size, size, block, tile_size_y, info.tile_width_spacing); +} + +u32 CalculateLayerSize(const ImageInfo& info) noexcept { + ASSERT(info.type != ImageType::Linear); + return CalculateLevelOffset(info.format, info.size, info.block, info.num_samples, + info.tile_width_spacing, info.resources.levels); +} + +std::array<u32, MAX_MIP_LEVELS> CalculateMipLevelOffsets(const ImageInfo& info) noexcept { + ASSERT(info.resources.levels <= static_cast<s32>(MAX_MIP_LEVELS)); + const LevelInfo level_info = MakeLevelInfo(info); + std::array<u32, MAX_MIP_LEVELS> offsets{}; + u32 offset = 0; + for (s32 level = 0; level < info.resources.levels; ++level) { + offsets[level] = offset; + offset += CalculateLevelSize(level_info, level); + } + return offsets; +} + +std::vector<u32> CalculateSliceOffsets(const ImageInfo& info) { + ASSERT(info.type == ImageType::e3D); + std::vector<u32> offsets; + offsets.reserve(NumSlices(info)); + + const LevelInfo level_info = MakeLevelInfo(info); + u32 mip_offset = 0; + for (s32 level = 0; level < info.resources.levels; ++level) { + const Extent3D tile_shift = TileShift(level_info, level); + const Extent3D tiles = LevelTiles(level_info, level); + const u32 gob_size_shift = tile_shift.height + GOB_SIZE_SHIFT; + const u32 slice_size = (tiles.width * tiles.height) << gob_size_shift; + const u32 z_mask = (1U << tile_shift.depth) - 1; + const u32 depth = AdjustMipSize(info.size.depth, level); + for (u32 slice = 0; slice < depth; ++slice) { + const u32 z_low = slice & z_mask; + const u32 z_high = slice & ~z_mask; + offsets.push_back(mip_offset + (z_low << gob_size_shift) + (z_high * slice_size)); + } + mip_offset += CalculateLevelSize(level_info, level); + } + return offsets; +} + +std::vector<SubresourceBase> CalculateSliceSubresources(const ImageInfo& info) { + ASSERT(info.type == ImageType::e3D); + std::vector<SubresourceBase> subresources; + subresources.reserve(NumSlices(info)); + for (s32 level = 0; level < info.resources.levels; ++level) { + const s32 depth = AdjustMipSize(info.size.depth, level); + for (s32 slice = 0; slice < depth; ++slice) { + subresources.emplace_back(SubresourceBase{ + .level = level, + .layer = slice, + }); + } + } + return subresources; +} + +u32 CalculateLevelStrideAlignment(const ImageInfo& info, u32 level) { + const Extent2D tile_size = DefaultBlockSize(info.format); + const Extent3D level_size = AdjustMipSize(info.size, level); + const Extent3D num_tiles = AdjustTileSize(level_size, tile_size); + const Extent3D block = AdjustMipBlockSize(num_tiles, info.block, level); + const u32 bpp_log2 = BytesPerBlockLog2(info.format); + return StrideAlignment(num_tiles, block, bpp_log2, info.tile_width_spacing); +} + +PixelFormat PixelFormatFromTIC(const TICEntry& config) noexcept { + return PixelFormatFromTextureInfo(config.format, config.r_type, config.g_type, config.b_type, + config.a_type, config.srgb_conversion); +} + +ImageViewType RenderTargetImageViewType(const ImageInfo& info) noexcept { + switch (info.type) { + case ImageType::e2D: + return info.resources.layers > 1 ? ImageViewType::e2DArray : ImageViewType::e2D; + case ImageType::e3D: + return ImageViewType::e2DArray; + case ImageType::Linear: + return ImageViewType::e2D; + default: + UNIMPLEMENTED_MSG("Unimplemented image type={}", static_cast<int>(info.type)); + return ImageViewType{}; + } +} + +std::vector<ImageCopy> MakeShrinkImageCopies(const ImageInfo& dst, const ImageInfo& src, + SubresourceBase base) { + ASSERT(dst.resources.levels >= src.resources.levels); + ASSERT(dst.num_samples == src.num_samples); + + const bool is_dst_3d = dst.type == ImageType::e3D; + if (is_dst_3d) { + ASSERT(src.type == ImageType::e3D); + ASSERT(src.resources.levels == 1); + } + + std::vector<ImageCopy> copies; + copies.reserve(src.resources.levels); + for (s32 level = 0; level < src.resources.levels; ++level) { + ImageCopy& copy = copies.emplace_back(); + copy.src_subresource = SubresourceLayers{ + .base_level = level, + .base_layer = 0, + .num_layers = src.resources.layers, + }; + copy.dst_subresource = SubresourceLayers{ + .base_level = base.level + level, + .base_layer = is_dst_3d ? 0 : base.layer, + .num_layers = is_dst_3d ? 1 : src.resources.layers, + }; + copy.src_offset = Offset3D{ + .x = 0, + .y = 0, + .z = 0, + }; + copy.dst_offset = Offset3D{ + .x = 0, + .y = 0, + .z = is_dst_3d ? base.layer : 0, + }; + const Extent3D mip_size = AdjustMipSize(dst.size, base.level + level); + copy.extent = AdjustSamplesSize(mip_size, dst.num_samples); + if (is_dst_3d) { + copy.extent.depth = src.size.depth; + } + } + return copies; +} + +bool IsValidAddress(const Tegra::MemoryManager& gpu_memory, const TICEntry& config) { + if (config.Address() == 0) { + return false; + } + if (config.Address() > (u64(1) << 48)) { + return false; + } + return gpu_memory.GpuToCpuAddress(config.Address()).has_value(); +} + +std::vector<BufferImageCopy> UnswizzleImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, + const ImageInfo& info, std::span<u8> output) { + const size_t guest_size_bytes = CalculateGuestSizeInBytes(info); + const u32 bpp_log2 = BytesPerBlockLog2(info.format); + const Extent3D size = info.size; + + if (info.type == ImageType::Linear) { + gpu_memory.ReadBlockUnsafe(gpu_addr, output.data(), guest_size_bytes); + + ASSERT((info.pitch >> bpp_log2) << bpp_log2 == info.pitch); + return {{ + .buffer_offset = 0, + .buffer_size = guest_size_bytes, + .buffer_row_length = info.pitch >> bpp_log2, + .buffer_image_height = size.height, + .image_subresource = + { + .base_level = 0, + .base_layer = 0, + .num_layers = 1, + }, + .image_offset = {0, 0, 0}, + .image_extent = size, + }}; + } + const auto input_data = std::make_unique<u8[]>(guest_size_bytes); + gpu_memory.ReadBlockUnsafe(gpu_addr, input_data.get(), guest_size_bytes); + const std::span<const u8> input(input_data.get(), guest_size_bytes); + + const LevelInfo level_info = MakeLevelInfo(info); + const s32 num_layers = info.resources.layers; + const s32 num_levels = info.resources.levels; + const Extent2D tile_size = DefaultBlockSize(info.format); + const std::array level_sizes = CalculateLevelSizes(level_info, num_levels); + const Extent2D gob = GobSize(bpp_log2, info.block.height, info.tile_width_spacing); + const u32 layer_size = std::reduce(level_sizes.begin(), level_sizes.begin() + num_levels, 0); + const u32 layer_stride = AlignLayerSize(layer_size, size, level_info.block, tile_size.height, + info.tile_width_spacing); + size_t guest_offset = 0; + u32 host_offset = 0; + std::vector<BufferImageCopy> copies(num_levels); + + for (s32 level = 0; level < num_levels; ++level) { + const Extent3D level_size = AdjustMipSize(size, level); + const u32 num_blocks_per_layer = NumBlocks(level_size, tile_size); + const u32 host_bytes_per_layer = num_blocks_per_layer << bpp_log2; + copies[level] = BufferImageCopy{ + .buffer_offset = host_offset, + .buffer_size = static_cast<size_t>(host_bytes_per_layer) * num_layers, + .buffer_row_length = Common::AlignUp(level_size.width, tile_size.width), + .buffer_image_height = Common::AlignUp(level_size.height, tile_size.height), + .image_subresource = + { + .base_level = level, + .base_layer = 0, + .num_layers = info.resources.layers, + }, + .image_offset = {0, 0, 0}, + .image_extent = level_size, + }; + const Extent3D num_tiles = AdjustTileSize(level_size, tile_size); + const Extent3D block = AdjustMipBlockSize(num_tiles, level_info.block, level); + const u32 stride_alignment = StrideAlignment(num_tiles, info.block, gob, bpp_log2); + size_t guest_layer_offset = 0; + + for (s32 layer = 0; layer < info.resources.layers; ++layer) { + const std::span<u8> dst = output.subspan(host_offset); + const std::span<const u8> src = input.subspan(guest_offset + guest_layer_offset); + UnswizzleTexture(dst, src, 1U << bpp_log2, num_tiles.width, num_tiles.height, + num_tiles.depth, block.height, block.depth, stride_alignment); + guest_layer_offset += layer_stride; + host_offset += host_bytes_per_layer; + } + guest_offset += level_sizes[level]; + } + return copies; +} + +BufferCopy UploadBufferCopy(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, + const ImageBase& image, std::span<u8> output) { + gpu_memory.ReadBlockUnsafe(gpu_addr, output.data(), image.guest_size_bytes); + return BufferCopy{ + .src_offset = 0, + .dst_offset = 0, + .size = image.guest_size_bytes, + }; +} + +void ConvertImage(std::span<const u8> input, const ImageInfo& info, std::span<u8> output, + std::span<BufferImageCopy> copies) { + u32 output_offset = 0; + + const Extent2D tile_size = DefaultBlockSize(info.format); + for (BufferImageCopy& copy : copies) { + const u32 level = copy.image_subresource.base_level; + const Extent3D mip_size = AdjustMipSize(info.size, level); + ASSERT(copy.image_offset == Offset3D{}); + ASSERT(copy.image_subresource.base_layer == 0); + ASSERT(copy.image_extent == mip_size); + ASSERT(copy.buffer_row_length == Common::AlignUp(mip_size.width, tile_size.width)); + ASSERT(copy.buffer_image_height == Common::AlignUp(mip_size.height, tile_size.height)); + + if (IsPixelFormatASTC(info.format)) { + ASSERT(copy.image_extent.depth == 1); + Tegra::Texture::ASTC::Decompress(input.subspan(copy.buffer_offset), + copy.image_extent.width, copy.image_extent.height, + copy.image_subresource.num_layers, tile_size.width, + tile_size.height, output.subspan(output_offset)); + } else { + DecompressBC4(input.subspan(copy.buffer_offset), copy.image_extent, + output.subspan(output_offset)); + } + copy.buffer_offset = output_offset; + copy.buffer_row_length = mip_size.width; + copy.buffer_image_height = mip_size.height; + + output_offset += copy.image_extent.width * copy.image_extent.height * + copy.image_subresource.num_layers * CONVERTED_BYTES_PER_BLOCK; + } +} + +std::vector<BufferImageCopy> FullDownloadCopies(const ImageInfo& info) { + const Extent3D size = info.size; + const u32 bytes_per_block = BytesPerBlock(info.format); + if (info.type == ImageType::Linear) { + ASSERT(info.pitch % bytes_per_block == 0); + return {{ + .buffer_offset = 0, + .buffer_size = static_cast<size_t>(info.pitch) * size.height, + .buffer_row_length = info.pitch / bytes_per_block, + .buffer_image_height = size.height, + .image_subresource = + { + .base_level = 0, + .base_layer = 0, + .num_layers = 1, + }, + .image_offset = {0, 0, 0}, + .image_extent = size, + }}; + } + UNIMPLEMENTED_IF(info.tile_width_spacing > 0); + + const s32 num_layers = info.resources.layers; + const s32 num_levels = info.resources.levels; + const Extent2D tile_size = DefaultBlockSize(info.format); + + u32 host_offset = 0; + + std::vector<BufferImageCopy> copies(num_levels); + for (s32 level = 0; level < num_levels; ++level) { + const Extent3D level_size = AdjustMipSize(size, level); + const u32 num_blocks_per_layer = NumBlocks(level_size, tile_size); + const u32 host_bytes_per_level = num_blocks_per_layer * bytes_per_block * num_layers; + copies[level] = BufferImageCopy{ + .buffer_offset = host_offset, + .buffer_size = host_bytes_per_level, + .buffer_row_length = level_size.width, + .buffer_image_height = level_size.height, + .image_subresource = + { + .base_level = level, + .base_layer = 0, + .num_layers = info.resources.layers, + }, + .image_offset = {0, 0, 0}, + .image_extent = level_size, + }; + host_offset += host_bytes_per_level; + } + return copies; +} + +Extent3D MipSize(Extent3D size, u32 level) { + return AdjustMipSize(size, level); +} + +Extent3D MipBlockSize(const ImageInfo& info, u32 level) { + const LevelInfo level_info = MakeLevelInfo(info); + const Extent2D tile_size = DefaultBlockSize(info.format); + const Extent3D level_size = AdjustMipSize(info.size, level); + const Extent3D num_tiles = AdjustTileSize(level_size, tile_size); + return AdjustMipBlockSize(num_tiles, level_info.block, level); +} + +std::vector<SwizzleParameters> FullUploadSwizzles(const ImageInfo& info) { + const Extent2D tile_size = DefaultBlockSize(info.format); + if (info.type == ImageType::Linear) { + return std::vector{SwizzleParameters{ + .num_tiles = AdjustTileSize(info.size, tile_size), + .block = {}, + .buffer_offset = 0, + .level = 0, + }}; + } + const LevelInfo level_info = MakeLevelInfo(info); + const Extent3D size = info.size; + const s32 num_levels = info.resources.levels; + + u32 guest_offset = 0; + std::vector<SwizzleParameters> params(num_levels); + for (s32 level = 0; level < num_levels; ++level) { + const Extent3D level_size = AdjustMipSize(size, level); + const Extent3D num_tiles = AdjustTileSize(level_size, tile_size); + const Extent3D block = AdjustMipBlockSize(num_tiles, level_info.block, level); + params[level] = SwizzleParameters{ + .num_tiles = num_tiles, + .block = block, + .buffer_offset = guest_offset, + .level = level, + }; + guest_offset += CalculateLevelSize(level_info, level); + } + return params; +} + +void SwizzleImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, const ImageInfo& info, + std::span<const BufferImageCopy> copies, std::span<const u8> memory) { + const bool is_pitch_linear = info.type == ImageType::Linear; + for (const BufferImageCopy& copy : copies) { + if (is_pitch_linear) { + SwizzlePitchLinearImage(gpu_memory, gpu_addr, info, copy, memory); + } else { + SwizzleBlockLinearImage(gpu_memory, gpu_addr, info, copy, memory); + } + } +} + +bool IsBlockLinearSizeCompatible(const ImageInfo& lhs, const ImageInfo& rhs, u32 lhs_level, + u32 rhs_level, bool strict_size) noexcept { + ASSERT(lhs.type != ImageType::Linear); + ASSERT(rhs.type != ImageType::Linear); + if (strict_size) { + const Extent3D lhs_size = AdjustMipSize(lhs.size, lhs_level); + const Extent3D rhs_size = AdjustMipSize(rhs.size, rhs_level); + return lhs_size.width == rhs_size.width && lhs_size.height == rhs_size.height; + } else { + const Extent3D lhs_size = BlockLinearAlignedSize(lhs, lhs_level); + const Extent3D rhs_size = BlockLinearAlignedSize(rhs, rhs_level); + return lhs_size.width == rhs_size.width && lhs_size.height == rhs_size.height; + } +} + +bool IsPitchLinearSameSize(const ImageInfo& lhs, const ImageInfo& rhs, bool strict_size) noexcept { + ASSERT(lhs.type == ImageType::Linear); + ASSERT(rhs.type == ImageType::Linear); + if (strict_size) { + return lhs.size.width == rhs.size.width && lhs.size.height == rhs.size.height; + } else { + const Extent2D lhs_size = PitchLinearAlignedSize(lhs); + const Extent2D rhs_size = PitchLinearAlignedSize(rhs); + return lhs_size == rhs_size; + } +} + +std::optional<OverlapResult> ResolveOverlap(const ImageInfo& new_info, GPUVAddr gpu_addr, + VAddr cpu_addr, const ImageBase& overlap, + bool strict_size, bool broken_views) { + ASSERT(new_info.type != ImageType::Linear); + ASSERT(overlap.info.type != ImageType::Linear); + if (!IsLayerStrideCompatible(new_info, overlap.info)) { + return std::nullopt; + } + if (!IsViewCompatible(overlap.info.format, new_info.format, broken_views)) { + return std::nullopt; + } + if (gpu_addr == overlap.gpu_addr) { + const std::optional solution = ResolveOverlapEqualAddress(new_info, overlap, strict_size); + if (!solution) { + return std::nullopt; + } + return OverlapResult{ + .gpu_addr = gpu_addr, + .cpu_addr = cpu_addr, + .resources = *solution, + }; + } + if (overlap.gpu_addr > gpu_addr) { + return ResolveOverlapRightAddress(new_info, gpu_addr, cpu_addr, overlap, strict_size); + } + // if overlap.gpu_addr < gpu_addr + return ResolveOverlapLeftAddress(new_info, gpu_addr, cpu_addr, overlap, strict_size); +} + +bool IsLayerStrideCompatible(const ImageInfo& lhs, const ImageInfo& rhs) { + // If either of the layer strides is zero, we can assume they are compatible + // These images generally come from rendertargets + if (lhs.layer_stride == 0) { + return true; + } + if (rhs.layer_stride == 0) { + return true; + } + // It's definitely compatible if the layer stride matches + if (lhs.layer_stride == rhs.layer_stride) { + return true; + } + // Although we also have to compare for cases where it can be unaligned + // This can happen if the image doesn't have layers, so the stride is not aligned + if (lhs.maybe_unaligned_layer_stride == rhs.maybe_unaligned_layer_stride) { + return true; + } + return false; +} + +std::optional<SubresourceBase> FindSubresource(const ImageInfo& candidate, const ImageBase& image, + GPUVAddr candidate_addr, RelaxedOptions options, + bool broken_views) { + const std::optional<SubresourceBase> base = image.TryFindBase(candidate_addr); + if (!base) { + return std::nullopt; + } + const ImageInfo& existing = image.info; + if (False(options & RelaxedOptions::Format)) { + if (!IsViewCompatible(existing.format, candidate.format, broken_views)) { + return std::nullopt; + } + } + if (!IsLayerStrideCompatible(existing, candidate)) { + return std::nullopt; + } + if (existing.type != candidate.type) { + return std::nullopt; + } + if (False(options & RelaxedOptions::Samples)) { + if (existing.num_samples != candidate.num_samples) { + return std::nullopt; + } + } + if (existing.resources.levels < candidate.resources.levels + base->level) { + return std::nullopt; + } + if (existing.type == ImageType::e3D) { + const u32 mip_depth = std::max(1U, existing.size.depth << base->level); + if (mip_depth < candidate.size.depth + base->layer) { + return std::nullopt; + } + } else { + if (existing.resources.layers < candidate.resources.layers + base->layer) { + return std::nullopt; + } + } + const bool strict_size = False(options & RelaxedOptions::Size); + if (!IsBlockLinearSizeCompatible(existing, candidate, base->level, 0, strict_size)) { + return std::nullopt; + } + // TODO: compare block sizes + return base; +} + +bool IsSubresource(const ImageInfo& candidate, const ImageBase& image, GPUVAddr candidate_addr, + RelaxedOptions options, bool broken_views) { + return FindSubresource(candidate, image, candidate_addr, options, broken_views).has_value(); +} + +void DeduceBlitImages(ImageInfo& dst_info, ImageInfo& src_info, const ImageBase* dst, + const ImageBase* src) { + if (src && GetFormatType(src->info.format) != SurfaceType::ColorTexture) { + src_info.format = src->info.format; + } + if (dst && GetFormatType(dst->info.format) != SurfaceType::ColorTexture) { + dst_info.format = dst->info.format; + } + if (!dst && src && GetFormatType(src->info.format) != SurfaceType::ColorTexture) { + dst_info.format = src->info.format; + } + if (!src && dst && GetFormatType(dst->info.format) != SurfaceType::ColorTexture) { + src_info.format = src->info.format; + } +} + +u32 MapSizeBytes(const ImageBase& image) { + if (True(image.flags & ImageFlagBits::AcceleratedUpload)) { + return image.guest_size_bytes; + } else if (True(image.flags & ImageFlagBits::Converted)) { + return image.converted_size_bytes; + } else { + return image.unswizzled_size_bytes; + } +} + +static_assert(CalculateLevelSize(LevelInfo{{1920, 1080, 1}, {0, 2, 0}, {1, 1}, 2, 0}, 0) == + 0x7f8000); +static_assert(CalculateLevelSize(LevelInfo{{32, 32, 1}, {0, 0, 4}, {1, 1}, 4, 0}, 0) == 0x4000); + +static_assert(CalculateLevelOffset(PixelFormat::R8_SINT, {1920, 1080, 1}, {0, 2, 0}, 1, 0, 7) == + 0x2afc00); +static_assert(CalculateLevelOffset(PixelFormat::ASTC_2D_12X12_UNORM, {8192, 4096, 1}, {0, 2, 0}, 1, + 0, 12) == 0x50d200); + +static_assert(CalculateLevelOffset(PixelFormat::A8B8G8R8_UNORM, {1024, 1024, 1}, {0, 4, 0}, 1, 0, + 0) == 0); +static_assert(CalculateLevelOffset(PixelFormat::A8B8G8R8_UNORM, {1024, 1024, 1}, {0, 4, 0}, 1, 0, + 1) == 0x400000); +static_assert(CalculateLevelOffset(PixelFormat::A8B8G8R8_UNORM, {1024, 1024, 1}, {0, 4, 0}, 1, 0, + 2) == 0x500000); +static_assert(CalculateLevelOffset(PixelFormat::A8B8G8R8_UNORM, {1024, 1024, 1}, {0, 4, 0}, 1, 0, + 3) == 0x540000); +static_assert(CalculateLevelOffset(PixelFormat::A8B8G8R8_UNORM, {1024, 1024, 1}, {0, 4, 0}, 1, 0, + 4) == 0x550000); +static_assert(CalculateLevelOffset(PixelFormat::A8B8G8R8_UNORM, {1024, 1024, 1}, {0, 4, 0}, 1, 0, + 5) == 0x554000); +static_assert(CalculateLevelOffset(PixelFormat::A8B8G8R8_UNORM, {1024, 1024, 1}, {0, 4, 0}, 1, 0, + 6) == 0x555000); +static_assert(CalculateLevelOffset(PixelFormat::A8B8G8R8_UNORM, {1024, 1024, 1}, {0, 4, 0}, 1, 0, + 7) == 0x555400); +static_assert(CalculateLevelOffset(PixelFormat::A8B8G8R8_UNORM, {1024, 1024, 1}, {0, 4, 0}, 1, 0, + 8) == 0x555600); +static_assert(CalculateLevelOffset(PixelFormat::A8B8G8R8_UNORM, {1024, 1024, 1}, {0, 4, 0}, 1, 0, + 9) == 0x555800); + +constexpr u32 ValidateLayerSize(PixelFormat format, u32 width, u32 height, u32 block_height, + u32 tile_width_spacing, u32 level) { + const Extent3D size{width, height, 1}; + const Extent3D block{0, block_height, 0}; + const u32 offset = CalculateLevelOffset(format, size, block, 1, tile_width_spacing, level); + return AlignLayerSize(offset, size, block, DefaultBlockHeight(format), tile_width_spacing); +} + +static_assert(ValidateLayerSize(PixelFormat::ASTC_2D_12X12_UNORM, 8192, 4096, 2, 0, 12) == + 0x50d800); +static_assert(ValidateLayerSize(PixelFormat::A8B8G8R8_UNORM, 1024, 1024, 2, 0, 10) == 0x556000); +static_assert(ValidateLayerSize(PixelFormat::BC3_UNORM, 128, 128, 2, 0, 8) == 0x6000); + +static_assert(ValidateLayerSize(PixelFormat::A8B8G8R8_UNORM, 518, 572, 4, 3, 1) == 0x190000, + "Tile width spacing is not working"); +static_assert(ValidateLayerSize(PixelFormat::BC5_UNORM, 1024, 1024, 3, 4, 11) == 0x160000, + "Compressed tile width spacing is not working"); + +} // namespace VideoCommon diff --git a/src/video_core/texture_cache/util.h b/src/video_core/texture_cache/util.h new file mode 100644 index 000000000..52a9207d6 --- /dev/null +++ b/src/video_core/texture_cache/util.h @@ -0,0 +1,109 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <optional> +#include <span> + +#include "common/common_types.h" + +#include "video_core/engines/maxwell_3d.h" +#include "video_core/surface.h" +#include "video_core/texture_cache/image_base.h" +#include "video_core/texture_cache/image_view_base.h" +#include "video_core/texture_cache/types.h" +#include "video_core/textures/texture.h" + +namespace VideoCommon { + +using Tegra::Texture::TICEntry; + +struct OverlapResult { + GPUVAddr gpu_addr; + VAddr cpu_addr; + SubresourceExtent resources; +}; + +[[nodiscard]] u32 CalculateGuestSizeInBytes(const ImageInfo& info) noexcept; + +[[nodiscard]] u32 CalculateUnswizzledSizeBytes(const ImageInfo& info) noexcept; + +[[nodiscard]] u32 CalculateConvertedSizeBytes(const ImageInfo& info) noexcept; + +[[nodiscard]] u32 CalculateLayerStride(const ImageInfo& info) noexcept; + +[[nodiscard]] u32 CalculateLayerSize(const ImageInfo& info) noexcept; + +[[nodiscard]] std::array<u32, MAX_MIP_LEVELS> CalculateMipLevelOffsets( + const ImageInfo& info) noexcept; + +[[nodiscard]] std::vector<u32> CalculateSliceOffsets(const ImageInfo& info); + +[[nodiscard]] std::vector<SubresourceBase> CalculateSliceSubresources(const ImageInfo& info); + +[[nodiscard]] u32 CalculateLevelStrideAlignment(const ImageInfo& info, u32 level); + +[[nodiscard]] VideoCore::Surface::PixelFormat PixelFormatFromTIC( + const Tegra::Texture::TICEntry& config) noexcept; + +[[nodiscard]] ImageViewType RenderTargetImageViewType(const ImageInfo& info) noexcept; + +[[nodiscard]] std::vector<ImageCopy> MakeShrinkImageCopies(const ImageInfo& dst, + const ImageInfo& src, + SubresourceBase base); + +[[nodiscard]] bool IsValidAddress(const Tegra::MemoryManager& gpu_memory, const TICEntry& config); + +[[nodiscard]] std::vector<BufferImageCopy> UnswizzleImage(Tegra::MemoryManager& gpu_memory, + GPUVAddr gpu_addr, const ImageInfo& info, + std::span<u8> output); + +[[nodiscard]] BufferCopy UploadBufferCopy(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, + const ImageBase& image, std::span<u8> output); + +void ConvertImage(std::span<const u8> input, const ImageInfo& info, std::span<u8> output, + std::span<BufferImageCopy> copies); + +[[nodiscard]] std::vector<BufferImageCopy> FullDownloadCopies(const ImageInfo& info); + +[[nodiscard]] Extent3D MipSize(Extent3D size, u32 level); + +[[nodiscard]] Extent3D MipBlockSize(const ImageInfo& info, u32 level); + +[[nodiscard]] std::vector<SwizzleParameters> FullUploadSwizzles(const ImageInfo& info); + +void SwizzleImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, const ImageInfo& info, + std::span<const BufferImageCopy> copies, std::span<const u8> memory); + +[[nodiscard]] bool IsBlockLinearSizeCompatible(const ImageInfo& new_info, + const ImageInfo& overlap_info, u32 new_level, + u32 overlap_level, bool strict_size) noexcept; + +[[nodiscard]] bool IsPitchLinearSameSize(const ImageInfo& lhs, const ImageInfo& rhs, + bool strict_size) noexcept; + +[[nodiscard]] std::optional<OverlapResult> ResolveOverlap(const ImageInfo& new_info, + GPUVAddr gpu_addr, VAddr cpu_addr, + const ImageBase& overlap, + bool strict_size, bool broken_views); + +[[nodiscard]] bool IsLayerStrideCompatible(const ImageInfo& lhs, const ImageInfo& rhs); + +[[nodiscard]] std::optional<SubresourceBase> FindSubresource(const ImageInfo& candidate, + const ImageBase& image, + GPUVAddr candidate_addr, + RelaxedOptions options, + bool broken_views); + +[[nodiscard]] bool IsSubresource(const ImageInfo& candidate, const ImageBase& image, + GPUVAddr candidate_addr, RelaxedOptions options, + bool broken_views); + +void DeduceBlitImages(ImageInfo& dst_info, ImageInfo& src_info, const ImageBase* dst, + const ImageBase* src); + +[[nodiscard]] u32 MapSizeBytes(const ImageBase& image); + +} // namespace VideoCommon diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp index 365bde2f1..3625b666c 100644 --- a/src/video_core/textures/astc.cpp +++ b/src/video_core/textures/astc.cpp @@ -18,6 +18,7 @@ #include <algorithm> #include <cassert> #include <cstring> +#include <span> #include <vector> #include <boost/container/static_vector.hpp> @@ -41,21 +42,24 @@ constexpr u32 Popcnt(u32 n) { class InputBitStream { public: - constexpr explicit InputBitStream(const u8* ptr, std::size_t start_offset = 0) - : cur_byte{ptr}, next_bit{start_offset % 8} {} + constexpr explicit InputBitStream(std::span<const u8> data, size_t start_offset = 0) + : cur_byte{data.data()}, total_bits{data.size()}, next_bit{start_offset % 8} {} - constexpr std::size_t GetBitsRead() const { + constexpr size_t GetBitsRead() const { return bits_read; } constexpr bool ReadBit() { - const bool bit = (*cur_byte >> next_bit++) & 1; + if (bits_read >= total_bits * 8) { + return 0; + } + const bool bit = ((*cur_byte >> next_bit) & 1) != 0; + ++next_bit; while (next_bit >= 8) { next_bit -= 8; - cur_byte++; + ++cur_byte; } - - bits_read++; + ++bits_read; return bit; } @@ -78,8 +82,9 @@ public: private: const u8* cur_byte; - std::size_t next_bit = 0; - std::size_t bits_read = 0; + size_t total_bits = 0; + size_t next_bit = 0; + size_t bits_read = 0; }; class OutputBitStream { @@ -192,15 +197,15 @@ struct IntegerEncodedValue { }; }; using IntegerEncodedVector = boost::container::static_vector< - IntegerEncodedValue, 64, + IntegerEncodedValue, 256, boost::container::static_vector_options< boost::container::inplace_alignment<alignof(IntegerEncodedValue)>, boost::container::throw_on_overflow<false>>::type>; static void DecodeTritBlock(InputBitStream& bits, IntegerEncodedVector& result, u32 nBitsPerValue) { // Implement the algorithm in section C.2.12 - u32 m[5]; - u32 t[5]; + std::array<u32, 5> m; + std::array<u32, 5> t; u32 T; // Read the trit encoded block according to @@ -600,7 +605,7 @@ static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) { return params; } -static void FillVoidExtentLDR(InputBitStream& strm, u32* const outBuf, u32 blockWidth, +static void FillVoidExtentLDR(InputBitStream& strm, std::span<u32> outBuf, u32 blockWidth, u32 blockHeight) { // Don't actually care about the void extent, just read the bits... for (s32 i = 0; i < 4; ++i) { @@ -623,7 +628,7 @@ static void FillVoidExtentLDR(InputBitStream& strm, u32* const outBuf, u32 block } } -static void FillError(u32* outBuf, u32 blockWidth, u32 blockHeight) { +static void FillError(std::span<u32> outBuf, u32 blockWidth, u32 blockHeight) { for (u32 j = 0; j < blockHeight; j++) { for (u32 i = 0; i < blockWidth; i++) { outBuf[j * blockWidth + i] = 0xFFFF00FF; @@ -865,7 +870,7 @@ public: } }; -static void DecodeColorValues(u32* out, u8* data, const u32* modes, const u32 nPartitions, +static void DecodeColorValues(u32* out, std::span<u8> data, const u32* modes, const u32 nPartitions, const u32 nBitsForColorData) { // First figure out how many color values we have u32 nValues = 0; @@ -897,7 +902,7 @@ static void DecodeColorValues(u32* out, u8* data, const u32* modes, const u32 nP // We now have enough to decode our integer sequence. IntegerEncodedVector decodedColorValues; - InputBitStream colorStream(data); + InputBitStream colorStream(data, 0); DecodeIntegerSequence(decodedColorValues, colorStream, range, nValues); // Once we have the decoded values, we need to dequantize them to the 0-255 range @@ -1438,8 +1443,8 @@ static void ComputeEndpos32s(Pixel& ep1, Pixel& ep2, const u32*& colorValues, #undef READ_INT_VALUES } -static void DecompressBlock(const u8 inBuf[16], const u32 blockWidth, const u32 blockHeight, - u32* outBuf) { +static void DecompressBlock(std::span<const u8, 16> inBuf, const u32 blockWidth, + const u32 blockHeight, std::span<u32, 12 * 12> outBuf) { InputBitStream strm(inBuf); TexelWeightParams weightParams = DecodeBlockInfo(strm); @@ -1601,8 +1606,8 @@ static void DecompressBlock(const u8 inBuf[16], const u32 blockWidth, const u32 } // Read the texel weight data.. - u8 texelWeightData[16]; - memcpy(texelWeightData, inBuf, sizeof(texelWeightData)); + std::array<u8, 16> texelWeightData; + std::ranges::copy(inBuf, texelWeightData.begin()); // Reverse everything for (u32 i = 0; i < 8; i++) { @@ -1618,10 +1623,12 @@ static void DecompressBlock(const u8 inBuf[16], const u32 blockWidth, const u32 // Make sure that higher non-texel bits are set to zero const u32 clearByteStart = (weightParams.GetPackedBitSize() >> 3) + 1; - texelWeightData[clearByteStart - 1] = - texelWeightData[clearByteStart - 1] & - static_cast<u8>((1 << (weightParams.GetPackedBitSize() % 8)) - 1); - memset(texelWeightData + clearByteStart, 0, 16 - clearByteStart); + if (clearByteStart > 0 && clearByteStart <= texelWeightData.size()) { + texelWeightData[clearByteStart - 1] &= + static_cast<u8>((1 << (weightParams.GetPackedBitSize() % 8)) - 1); + std::memset(texelWeightData.data() + clearByteStart, 0, + std::min(16U - clearByteStart, 16U)); + } IntegerEncodedVector texelWeightValues; @@ -1672,36 +1679,32 @@ static void DecompressBlock(const u8 inBuf[16], const u32 blockWidth, const u32 namespace Tegra::Texture::ASTC { -std::vector<u8> Decompress(const u8* data, u32 width, u32 height, u32 depth, u32 block_width, - u32 block_height) { - u32 blockIdx = 0; +void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth, + uint32_t block_width, uint32_t block_height, std::span<uint8_t> output) { + u32 block_index = 0; std::size_t depth_offset = 0; - std::vector<u8> outData(height * width * depth * 4); - for (u32 k = 0; k < depth; k++) { - for (u32 j = 0; j < height; j += block_height) { - for (u32 i = 0; i < width; i += block_width) { - - const u8* blockPtr = data + blockIdx * 16; + for (u32 z = 0; z < depth; z++) { + for (u32 y = 0; y < height; y += block_height) { + for (u32 x = 0; x < width; x += block_width) { + const std::span<const u8, 16> blockPtr{data.subspan(block_index * 16, 16)}; // Blocks can be at most 12x12 - u32 uncompData[144]; + std::array<u32, 12 * 12> uncompData; ASTCC::DecompressBlock(blockPtr, block_width, block_height, uncompData); - u32 decompWidth = std::min(block_width, width - i); - u32 decompHeight = std::min(block_height, height - j); + u32 decompWidth = std::min(block_width, width - x); + u32 decompHeight = std::min(block_height, height - y); - u8* outRow = depth_offset + outData.data() + (j * width + i) * 4; + const std::span<u8> outRow = output.subspan(depth_offset + (y * width + x) * 4); for (u32 jj = 0; jj < decompHeight; jj++) { - memcpy(outRow + jj * width * 4, uncompData + jj * block_width, decompWidth * 4); + std::memcpy(outRow.data() + jj * width * 4, + uncompData.data() + jj * block_width, decompWidth * 4); } - - blockIdx++; + ++block_index; } } depth_offset += height * width * 4; } - - return outData; } } // namespace Tegra::Texture::ASTC diff --git a/src/video_core/textures/astc.h b/src/video_core/textures/astc.h index 991cdba72..9105119bc 100644 --- a/src/video_core/textures/astc.h +++ b/src/video_core/textures/astc.h @@ -5,11 +5,10 @@ #pragma once #include <cstdint> -#include <vector> namespace Tegra::Texture::ASTC { -std::vector<uint8_t> Decompress(const uint8_t* data, uint32_t width, uint32_t height, - uint32_t depth, uint32_t block_width, uint32_t block_height); +void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth, + uint32_t block_width, uint32_t block_height, std::span<uint8_t> output); } // namespace Tegra::Texture::ASTC diff --git a/src/video_core/textures/convert.cpp b/src/video_core/textures/convert.cpp deleted file mode 100644 index 962921483..000000000 --- a/src/video_core/textures/convert.cpp +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include <algorithm> -#include <cstring> -#include <tuple> -#include <vector> - -#include "common/assert.h" -#include "common/common_types.h" -#include "common/logging/log.h" -#include "video_core/surface.h" -#include "video_core/textures/astc.h" -#include "video_core/textures/convert.h" - -namespace Tegra::Texture { - -using VideoCore::Surface::PixelFormat; - -template <bool reverse> -void SwapS8Z24ToZ24S8(u8* data, u32 width, u32 height) { - union S8Z24 { - BitField<0, 24, u32> z24; - BitField<24, 8, u32> s8; - }; - static_assert(sizeof(S8Z24) == 4, "S8Z24 is incorrect size"); - - union Z24S8 { - BitField<0, 8, u32> s8; - BitField<8, 24, u32> z24; - }; - static_assert(sizeof(Z24S8) == 4, "Z24S8 is incorrect size"); - - S8Z24 s8z24_pixel{}; - Z24S8 z24s8_pixel{}; - constexpr auto bpp{ - VideoCore::Surface::GetBytesPerPixel(VideoCore::Surface::PixelFormat::S8_UINT_D24_UNORM)}; - for (std::size_t y = 0; y < height; ++y) { - for (std::size_t x = 0; x < width; ++x) { - const std::size_t offset{bpp * (y * width + x)}; - if constexpr (reverse) { - std::memcpy(&z24s8_pixel, &data[offset], sizeof(Z24S8)); - s8z24_pixel.s8.Assign(z24s8_pixel.s8); - s8z24_pixel.z24.Assign(z24s8_pixel.z24); - std::memcpy(&data[offset], &s8z24_pixel, sizeof(S8Z24)); - } else { - std::memcpy(&s8z24_pixel, &data[offset], sizeof(S8Z24)); - z24s8_pixel.s8.Assign(s8z24_pixel.s8); - z24s8_pixel.z24.Assign(s8z24_pixel.z24); - std::memcpy(&data[offset], &z24s8_pixel, sizeof(Z24S8)); - } - } - } -} - -static void ConvertS8Z24ToZ24S8(u8* data, u32 width, u32 height) { - SwapS8Z24ToZ24S8<false>(data, width, height); -} - -static void ConvertZ24S8ToS8Z24(u8* data, u32 width, u32 height) { - SwapS8Z24ToZ24S8<true>(data, width, height); -} - -void ConvertFromGuestToHost(u8* in_data, u8* out_data, PixelFormat pixel_format, u32 width, - u32 height, u32 depth, bool convert_astc, bool convert_s8z24) { - if (convert_astc && IsPixelFormatASTC(pixel_format)) { - // Convert ASTC pixel formats to RGBA8, as most desktop GPUs do not support ASTC. - u32 block_width{}; - u32 block_height{}; - std::tie(block_width, block_height) = GetASTCBlockSize(pixel_format); - const std::vector<u8> rgba8_data = Tegra::Texture::ASTC::Decompress( - in_data, width, height, depth, block_width, block_height); - std::copy(rgba8_data.begin(), rgba8_data.end(), out_data); - - } else if (convert_s8z24 && pixel_format == PixelFormat::S8_UINT_D24_UNORM) { - Tegra::Texture::ConvertS8Z24ToZ24S8(in_data, width, height); - } -} - -void ConvertFromHostToGuest(u8* data, PixelFormat pixel_format, u32 width, u32 height, u32 depth, - bool convert_astc, bool convert_s8z24) { - if (convert_astc && IsPixelFormatASTC(pixel_format)) { - LOG_CRITICAL(HW_GPU, "Conversion of format {} after texture flushing is not implemented", - static_cast<u32>(pixel_format)); - UNREACHABLE(); - - } else if (convert_s8z24 && pixel_format == PixelFormat::S8_UINT_D24_UNORM) { - Tegra::Texture::ConvertZ24S8ToS8Z24(data, width, height); - } -} - -} // namespace Tegra::Texture diff --git a/src/video_core/textures/convert.h b/src/video_core/textures/convert.h deleted file mode 100644 index d5d6c77bb..000000000 --- a/src/video_core/textures/convert.h +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include "common/common_types.h" - -namespace VideoCore::Surface { -enum class PixelFormat; -} - -namespace Tegra::Texture { - -void ConvertFromGuestToHost(u8* in_data, u8* out_data, VideoCore::Surface::PixelFormat pixel_format, - u32 width, u32 height, u32 depth, bool convert_astc, - bool convert_s8z24); - -void ConvertFromHostToGuest(u8* data, VideoCore::Surface::PixelFormat pixel_format, u32 width, - u32 height, u32 depth, bool convert_astc, bool convert_s8z24); - -} // namespace Tegra::Texture diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp index 16d46a018..62685a183 100644 --- a/src/video_core/textures/decoders.cpp +++ b/src/video_core/textures/decoders.cpp @@ -2,204 +2,111 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include <array> #include <cmath> #include <cstring> +#include <span> +#include <utility> + #include "common/alignment.h" #include "common/assert.h" #include "common/bit_util.h" +#include "common/div_ceil.h" #include "video_core/gpu.h" #include "video_core/textures/decoders.h" #include "video_core/textures/texture.h" namespace Tegra::Texture { -namespace { +namespace { /** - * This table represents the internal swizzle of a gob, - * in format 16 bytes x 2 sector packing. + * This table represents the internal swizzle of a gob, in format 16 bytes x 2 sector packing. * Calculates the offset of an (x, y) position within a swizzled texture. * Taken from the Tegra X1 Technical Reference Manual. pages 1187-1188 */ -template <std::size_t N, std::size_t M, u32 Align> -struct alignas(64) SwizzleTable { - static_assert(M * Align == 64, "Swizzle Table does not align to GOB"); - constexpr SwizzleTable() { - for (u32 y = 0; y < N; ++y) { - for (u32 x = 0; x < M; ++x) { - const u32 x2 = x * Align; - values[y][x] = static_cast<u16>(((x2 % 64) / 32) * 256 + ((y % 8) / 2) * 64 + - ((x2 % 32) / 16) * 32 + (y % 2) * 16 + (x2 % 16)); - } +constexpr SwizzleTable MakeSwizzleTableConst() { + SwizzleTable table{}; + for (u32 y = 0; y < table.size(); ++y) { + for (u32 x = 0; x < table[0].size(); ++x) { + table[y][x] = ((x % 64) / 32) * 256 + ((y % 8) / 2) * 64 + ((x % 32) / 16) * 32 + + (y % 2) * 16 + (x % 16); } } - const std::array<u16, M>& operator[](std::size_t index) const { - return values[index]; - } - std::array<std::array<u16, M>, N> values{}; -}; + return table; +} -constexpr u32 FAST_SWIZZLE_ALIGN = 16; +constexpr SwizzleTable SWIZZLE_TABLE = MakeSwizzleTableConst(); -constexpr auto LEGACY_SWIZZLE_TABLE = SwizzleTable<GOB_SIZE_X, GOB_SIZE_X, GOB_SIZE_Z>(); -constexpr auto FAST_SWIZZLE_TABLE = SwizzleTable<GOB_SIZE_Y, 4, FAST_SWIZZLE_ALIGN>(); +template <bool TO_LINEAR> +void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width, + u32 height, u32 depth, u32 block_height, u32 block_depth, u32 stride_alignment) { + // The origin of the transformation can be configured here, leave it as zero as the current API + // doesn't expose it. + static constexpr u32 origin_x = 0; + static constexpr u32 origin_y = 0; + static constexpr u32 origin_z = 0; -/** - * This function manages ALL the GOBs(Group of Bytes) Inside a single block. - * Instead of going gob by gob, we map the coordinates inside a block and manage from - * those. Block_Width is assumed to be 1. - */ -void PreciseProcessBlock(u8* const swizzled_data, u8* const unswizzled_data, const bool unswizzle, - const u32 x_start, const u32 y_start, const u32 z_start, const u32 x_end, - const u32 y_end, const u32 z_end, const u32 tile_offset, - const u32 xy_block_size, const u32 layer_z, const u32 stride_x, - const u32 bytes_per_pixel, const u32 out_bytes_per_pixel) { - std::array<u8*, 2> data_ptrs; - u32 z_address = tile_offset; - - for (u32 z = z_start; z < z_end; z++) { - u32 y_address = z_address; - u32 pixel_base = layer_z * z + y_start * stride_x; - for (u32 y = y_start; y < y_end; y++) { - const auto& table = LEGACY_SWIZZLE_TABLE[y % GOB_SIZE_Y]; - for (u32 x = x_start; x < x_end; x++) { - const u32 swizzle_offset{y_address + table[x * bytes_per_pixel % GOB_SIZE_X]}; - const u32 pixel_index{x * out_bytes_per_pixel + pixel_base}; - data_ptrs[unswizzle] = swizzled_data + swizzle_offset; - data_ptrs[!unswizzle] = unswizzled_data + pixel_index; - std::memcpy(data_ptrs[0], data_ptrs[1], bytes_per_pixel); - } - pixel_base += stride_x; - if ((y + 1) % GOB_SIZE_Y == 0) - y_address += GOB_SIZE; - } - z_address += xy_block_size; - } -} + // We can configure here a custom pitch + // As it's not exposed 'width * bpp' will be the expected pitch. + const u32 pitch = width * bytes_per_pixel; + const u32 stride = Common::AlignUpLog2(width, stride_alignment) * bytes_per_pixel; -/** - * This function manages ALL the GOBs(Group of Bytes) Inside a single block. - * Instead of going gob by gob, we map the coordinates inside a block and manage from - * those. Block_Width is assumed to be 1. - */ -void FastProcessBlock(u8* const swizzled_data, u8* const unswizzled_data, const bool unswizzle, - const u32 x_start, const u32 y_start, const u32 z_start, const u32 x_end, - const u32 y_end, const u32 z_end, const u32 tile_offset, - const u32 xy_block_size, const u32 layer_z, const u32 stride_x, - const u32 bytes_per_pixel, const u32 out_bytes_per_pixel) { - std::array<u8*, 2> data_ptrs; - u32 z_address = tile_offset; - const u32 x_startb = x_start * bytes_per_pixel; - const u32 x_endb = x_end * bytes_per_pixel; - - for (u32 z = z_start; z < z_end; z++) { - u32 y_address = z_address; - u32 pixel_base = layer_z * z + y_start * stride_x; - for (u32 y = y_start; y < y_end; y++) { - const auto& table = FAST_SWIZZLE_TABLE[y % GOB_SIZE_Y]; - for (u32 xb = x_startb; xb < x_endb; xb += FAST_SWIZZLE_ALIGN) { - const u32 swizzle_offset{y_address + table[(xb / FAST_SWIZZLE_ALIGN) % 4]}; - const u32 out_x = xb * out_bytes_per_pixel / bytes_per_pixel; - const u32 pixel_index{out_x + pixel_base}; - data_ptrs[unswizzle ? 1 : 0] = swizzled_data + swizzle_offset; - data_ptrs[unswizzle ? 0 : 1] = unswizzled_data + pixel_index; - std::memcpy(data_ptrs[0], data_ptrs[1], FAST_SWIZZLE_ALIGN); - } - pixel_base += stride_x; - if ((y + 1) % GOB_SIZE_Y == 0) - y_address += GOB_SIZE; - } - z_address += xy_block_size; - } -} + const u32 gobs_in_x = Common::DivCeilLog2(stride, GOB_SIZE_X_SHIFT); + const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height + block_depth); + const u32 slice_size = + Common::DivCeilLog2(height, block_height + GOB_SIZE_Y_SHIFT) * block_size; -/** - * This function unswizzles or swizzles a texture by mapping Linear to BlockLinear Textue. - * The body of this function takes care of splitting the swizzled texture into blocks, - * and managing the extents of it. Once all the parameters of a single block are obtained, - * the function calls 'ProcessBlock' to process that particular Block. - * - * Documentation for the memory layout and decoding can be found at: - * https://envytools.readthedocs.io/en/latest/hw/memory/g80-surface.html#blocklinear-surfaces - */ -template <bool fast> -void SwizzledData(u8* const swizzled_data, u8* const unswizzled_data, const bool unswizzle, - const u32 width, const u32 height, const u32 depth, const u32 bytes_per_pixel, - const u32 out_bytes_per_pixel, const u32 block_height, const u32 block_depth, - const u32 width_spacing) { - auto div_ceil = [](const u32 x, const u32 y) { return ((x + y - 1) / y); }; - const u32 stride_x = width * out_bytes_per_pixel; - const u32 layer_z = height * stride_x; - const u32 gob_elements_x = GOB_SIZE_X / bytes_per_pixel; - constexpr u32 gob_elements_y = GOB_SIZE_Y; - constexpr u32 gob_elements_z = GOB_SIZE_Z; - const u32 block_x_elements = gob_elements_x; - const u32 block_y_elements = gob_elements_y * block_height; - const u32 block_z_elements = gob_elements_z * block_depth; - const u32 aligned_width = Common::AlignUp(width, gob_elements_x * width_spacing); - const u32 blocks_on_x = div_ceil(aligned_width, block_x_elements); - const u32 blocks_on_y = div_ceil(height, block_y_elements); - const u32 blocks_on_z = div_ceil(depth, block_z_elements); - const u32 xy_block_size = GOB_SIZE * block_height; - const u32 block_size = xy_block_size * block_depth; - u32 tile_offset = 0; - for (u32 zb = 0; zb < blocks_on_z; zb++) { - const u32 z_start = zb * block_z_elements; - const u32 z_end = std::min(depth, z_start + block_z_elements); - for (u32 yb = 0; yb < blocks_on_y; yb++) { - const u32 y_start = yb * block_y_elements; - const u32 y_end = std::min(height, y_start + block_y_elements); - for (u32 xb = 0; xb < blocks_on_x; xb++) { - const u32 x_start = xb * block_x_elements; - const u32 x_end = std::min(width, x_start + block_x_elements); - if constexpr (fast) { - FastProcessBlock(swizzled_data, unswizzled_data, unswizzle, x_start, y_start, - z_start, x_end, y_end, z_end, tile_offset, xy_block_size, - layer_z, stride_x, bytes_per_pixel, out_bytes_per_pixel); - } else { - PreciseProcessBlock(swizzled_data, unswizzled_data, unswizzle, x_start, y_start, - z_start, x_end, y_end, z_end, tile_offset, xy_block_size, - layer_z, stride_x, bytes_per_pixel, out_bytes_per_pixel); - } - tile_offset += block_size; + const u32 block_height_mask = (1U << block_height) - 1; + const u32 block_depth_mask = (1U << block_depth) - 1; + const u32 x_shift = GOB_SIZE_SHIFT + block_height + block_depth; + + for (u32 slice = 0; slice < depth; ++slice) { + const u32 z = slice + origin_z; + const u32 offset_z = (z >> block_depth) * slice_size + + ((z & block_depth_mask) << (GOB_SIZE_SHIFT + block_height)); + for (u32 line = 0; line < height; ++line) { + const u32 y = line + origin_y; + const auto& table = SWIZZLE_TABLE[y % GOB_SIZE_Y]; + + const u32 block_y = y >> GOB_SIZE_Y_SHIFT; + const u32 offset_y = (block_y >> block_height) * block_size + + ((block_y & block_height_mask) << GOB_SIZE_SHIFT); + + for (u32 column = 0; column < width; ++column) { + const u32 x = (column + origin_x) * bytes_per_pixel; + const u32 offset_x = (x >> GOB_SIZE_X_SHIFT) << x_shift; + + const u32 base_swizzled_offset = offset_z + offset_y + offset_x; + const u32 swizzled_offset = base_swizzled_offset + table[x % GOB_SIZE_X]; + + const u32 unswizzled_offset = + slice * pitch * height + line * pitch + column * bytes_per_pixel; + + u8* const dst = &output[TO_LINEAR ? swizzled_offset : unswizzled_offset]; + const u8* const src = &input[TO_LINEAR ? unswizzled_offset : swizzled_offset]; + std::memcpy(dst, src, bytes_per_pixel); } } } } - } // Anonymous namespace -void CopySwizzledData(u32 width, u32 height, u32 depth, u32 bytes_per_pixel, - u32 out_bytes_per_pixel, u8* const swizzled_data, u8* const unswizzled_data, - bool unswizzle, u32 block_height, u32 block_depth, u32 width_spacing) { - const u32 block_height_size{1U << block_height}; - const u32 block_depth_size{1U << block_depth}; - if (bytes_per_pixel % 3 != 0 && (width * bytes_per_pixel) % FAST_SWIZZLE_ALIGN == 0) { - SwizzledData<true>(swizzled_data, unswizzled_data, unswizzle, width, height, depth, - bytes_per_pixel, out_bytes_per_pixel, block_height_size, - block_depth_size, width_spacing); - } else { - SwizzledData<false>(swizzled_data, unswizzled_data, unswizzle, width, height, depth, - bytes_per_pixel, out_bytes_per_pixel, block_height_size, - block_depth_size, width_spacing); - } +SwizzleTable MakeSwizzleTable() { + return SWIZZLE_TABLE; } -void UnswizzleTexture(u8* const unswizzled_data, u8* address, u32 tile_size_x, u32 tile_size_y, - u32 bytes_per_pixel, u32 width, u32 height, u32 depth, u32 block_height, - u32 block_depth, u32 width_spacing) { - CopySwizzledData((width + tile_size_x - 1) / tile_size_x, - (height + tile_size_y - 1) / tile_size_y, depth, bytes_per_pixel, - bytes_per_pixel, address, unswizzled_data, true, block_height, block_depth, - width_spacing); +void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, + u32 width, u32 height, u32 depth, u32 block_height, u32 block_depth, + u32 stride_alignment) { + Swizzle<false>(output, input, bytes_per_pixel, width, height, depth, block_height, block_depth, + stride_alignment); } -std::vector<u8> UnswizzleTexture(u8* address, u32 tile_size_x, u32 tile_size_y, u32 bytes_per_pixel, - u32 width, u32 height, u32 depth, u32 block_height, - u32 block_depth, u32 width_spacing) { - std::vector<u8> unswizzled_data(width * height * depth * bytes_per_pixel); - UnswizzleTexture(unswizzled_data.data(), address, tile_size_x, tile_size_y, bytes_per_pixel, - width, height, depth, block_height, block_depth, width_spacing); - return unswizzled_data; +void SwizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width, + u32 height, u32 depth, u32 block_height, u32 block_depth, + u32 stride_alignment) { + Swizzle<true>(output, input, bytes_per_pixel, width, height, depth, block_height, block_depth, + stride_alignment); } void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width, @@ -213,7 +120,7 @@ void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 const u32 gob_address_y = (dst_y / (GOB_SIZE_Y * block_height)) * GOB_SIZE * block_height * image_width_in_gobs + ((dst_y % (GOB_SIZE_Y * block_height)) / GOB_SIZE_Y) * GOB_SIZE; - const auto& table = LEGACY_SWIZZLE_TABLE[dst_y % GOB_SIZE_Y]; + const auto& table = SWIZZLE_TABLE[dst_y % GOB_SIZE_Y]; for (u32 x = 0; x < subrect_width; ++x) { const u32 dst_x = x + offset_x; const u32 gob_address = @@ -235,11 +142,11 @@ void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width, const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height); const u32 block_height_mask = (1U << block_height) - 1; - const u32 x_shift = static_cast<u32>(GOB_SIZE_SHIFT) + block_height; + const u32 x_shift = GOB_SIZE_SHIFT + block_height; for (u32 line = 0; line < line_count; ++line) { const u32 src_y = line + origin_y; - const auto& table = LEGACY_SWIZZLE_TABLE[src_y % GOB_SIZE_Y]; + const auto& table = SWIZZLE_TABLE[src_y % GOB_SIZE_Y]; const u32 block_y = src_y >> GOB_SIZE_Y_SHIFT; const u32 src_offset_y = (block_y >> block_height) * block_size + @@ -270,7 +177,7 @@ void SwizzleSliceToVoxel(u32 line_length_in, u32 line_count, u32 pitch, u32 widt const u32 x_shift = static_cast<u32>(GOB_SIZE_SHIFT) + block_height + block_depth; for (u32 line = 0; line < line_count; ++line) { - const auto& table = LEGACY_SWIZZLE_TABLE[line % GOB_SIZE_Y]; + const auto& table = SWIZZLE_TABLE[line % GOB_SIZE_Y]; const u32 block_y = line / GOB_SIZE_Y; const u32 dst_offset_y = (block_y >> block_height) * block_size + (block_y & block_height_mask) * GOB_SIZE; @@ -293,7 +200,7 @@ void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32 const std::size_t gob_address_y = (y / (GOB_SIZE_Y * block_height)) * GOB_SIZE * block_height * image_width_in_gobs + ((y % (GOB_SIZE_Y * block_height)) / GOB_SIZE_Y) * GOB_SIZE; - const auto& table = LEGACY_SWIZZLE_TABLE[y % GOB_SIZE_Y]; + const auto& table = SWIZZLE_TABLE[y % GOB_SIZE_Y]; for (std::size_t x = dst_x; x < width && count < copy_size; ++x) { const std::size_t gob_address = gob_address_y + (x / GOB_SIZE_X) * GOB_SIZE * block_height; @@ -310,9 +217,9 @@ void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32 std::size_t CalculateSize(bool tiled, u32 bytes_per_pixel, u32 width, u32 height, u32 depth, u32 block_height, u32 block_depth) { if (tiled) { - const u32 aligned_width = Common::AlignBits(width * bytes_per_pixel, GOB_SIZE_X_SHIFT); - const u32 aligned_height = Common::AlignBits(height, GOB_SIZE_Y_SHIFT + block_height); - const u32 aligned_depth = Common::AlignBits(depth, GOB_SIZE_Z_SHIFT + block_depth); + const u32 aligned_width = Common::AlignUpLog2(width * bytes_per_pixel, GOB_SIZE_X_SHIFT); + const u32 aligned_height = Common::AlignUpLog2(height, GOB_SIZE_Y_SHIFT + block_height); + const u32 aligned_depth = Common::AlignUpLog2(depth, GOB_SIZE_Z_SHIFT + block_depth); return aligned_width * aligned_height * aligned_depth; } else { return width * height * depth * bytes_per_pixel; diff --git a/src/video_core/textures/decoders.h b/src/video_core/textures/decoders.h index 01e156bc8..d7cdc81e8 100644 --- a/src/video_core/textures/decoders.h +++ b/src/video_core/textures/decoders.h @@ -4,7 +4,8 @@ #pragma once -#include <vector> +#include <span> + #include "common/common_types.h" #include "video_core/textures/texture.h" @@ -15,28 +16,25 @@ constexpr u32 GOB_SIZE_Y = 8; constexpr u32 GOB_SIZE_Z = 1; constexpr u32 GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z; -constexpr std::size_t GOB_SIZE_X_SHIFT = 6; -constexpr std::size_t GOB_SIZE_Y_SHIFT = 3; -constexpr std::size_t GOB_SIZE_Z_SHIFT = 0; -constexpr std::size_t GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT; - -/// Unswizzles a swizzled texture without changing its format. -void UnswizzleTexture(u8* unswizzled_data, u8* address, u32 tile_size_x, u32 tile_size_y, - u32 bytes_per_pixel, u32 width, u32 height, u32 depth, - u32 block_height = TICEntry::DefaultBlockHeight, - u32 block_depth = TICEntry::DefaultBlockHeight, u32 width_spacing = 0); - -/// Unswizzles a swizzled texture without changing its format. -std::vector<u8> UnswizzleTexture(u8* address, u32 tile_size_x, u32 tile_size_y, u32 bytes_per_pixel, - u32 width, u32 height, u32 depth, - u32 block_height = TICEntry::DefaultBlockHeight, - u32 block_depth = TICEntry::DefaultBlockHeight, - u32 width_spacing = 0); - -/// Copies texture data from a buffer and performs swizzling/unswizzling as necessary. -void CopySwizzledData(u32 width, u32 height, u32 depth, u32 bytes_per_pixel, - u32 out_bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, - bool unswizzle, u32 block_height, u32 block_depth, u32 width_spacing); +constexpr u32 GOB_SIZE_X_SHIFT = 6; +constexpr u32 GOB_SIZE_Y_SHIFT = 3; +constexpr u32 GOB_SIZE_Z_SHIFT = 0; +constexpr u32 GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT; + +using SwizzleTable = std::array<std::array<u32, GOB_SIZE_X>, GOB_SIZE_Y>; + +/// Returns a z-order swizzle table +SwizzleTable MakeSwizzleTable(); + +/// Unswizzles a block linear texture into linear memory. +void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, + u32 width, u32 height, u32 depth, u32 block_height, u32 block_depth, + u32 stride_alignment = 1); + +/// Swizzles linear memory into a block linear texture. +void SwizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width, + u32 height, u32 depth, u32 block_height, u32 block_depth, + u32 stride_alignment = 1); /// This function calculates the correct size of a texture depending if it's tiled or not. std::size_t CalculateSize(bool tiled, u32 bytes_per_pixel, u32 width, u32 height, u32 depth, diff --git a/src/video_core/textures/texture.cpp b/src/video_core/textures/texture.cpp index 4171e3ef2..ae5621a7d 100644 --- a/src/video_core/textures/texture.cpp +++ b/src/video_core/textures/texture.cpp @@ -5,9 +5,13 @@ #include <algorithm> #include <array> +#include "common/cityhash.h" #include "core/settings.h" #include "video_core/textures/texture.h" +using Tegra::Texture::TICEntry; +using Tegra::Texture::TSCEntry; + namespace Tegra::Texture { namespace { @@ -65,7 +69,7 @@ unsigned SettingsMinimumAnisotropy() noexcept { } // Anonymous namespace -std::array<float, 4> TSCEntry::GetBorderColor() const noexcept { +std::array<float, 4> TSCEntry::BorderColor() const noexcept { if (!srgb_conversion) { return border_color; } @@ -73,8 +77,16 @@ std::array<float, 4> TSCEntry::GetBorderColor() const noexcept { SRGB_CONVERSION_LUT[srgb_border_color_b], border_color[3]}; } -float TSCEntry::GetMaxAnisotropy() const noexcept { +float TSCEntry::MaxAnisotropy() const noexcept { return static_cast<float>(std::max(1U << max_anisotropy, SettingsMinimumAnisotropy())); } } // namespace Tegra::Texture + +size_t std::hash<TICEntry>::operator()(const TICEntry& tic) const noexcept { + return Common::CityHash64(reinterpret_cast<const char*>(&tic), sizeof tic); +} + +size_t std::hash<TSCEntry>::operator()(const TSCEntry& tsc) const noexcept { + return Common::CityHash64(reinterpret_cast<const char*>(&tsc), sizeof tsc); +} diff --git a/src/video_core/textures/texture.h b/src/video_core/textures/texture.h index 0574fef12..c1d14335e 100644 --- a/src/video_core/textures/texture.h +++ b/src/video_core/textures/texture.h @@ -53,27 +53,27 @@ enum class TextureFormat : u32 { BC4 = 0x27, BC5 = 0x28, S8D24 = 0x29, - X8Z24 = 0x2a, + X8D24 = 0x2a, D24S8 = 0x2b, - X4V4Z24__COV4R4V = 0x2c, - X4V4Z24__COV8R8V = 0x2d, - V8Z24__COV4R12V = 0x2e, + X4V4D24__COV4R4V = 0x2c, + X4V4D24__COV8R8V = 0x2d, + V8D24__COV4R12V = 0x2e, D32 = 0x2f, D32S8 = 0x30, - X8Z24_X20V4S8__COV4R4V = 0x31, - X8Z24_X20V4S8__COV8R8V = 0x32, - ZF32_X20V4X8__COV4R4V = 0x33, - ZF32_X20V4X8__COV8R8V = 0x34, - ZF32_X20V4S8__COV4R4V = 0x35, - ZF32_X20V4S8__COV8R8V = 0x36, - X8Z24_X16V8S8__COV4R12V = 0x37, - ZF32_X16V8X8__COV4R12V = 0x38, - ZF32_X16V8S8__COV4R12V = 0x39, + X8D24_X20V4S8__COV4R4V = 0x31, + X8D24_X20V4S8__COV8R8V = 0x32, + D32_X20V4X8__COV4R4V = 0x33, + D32_X20V4X8__COV8R8V = 0x34, + D32_X20V4S8__COV4R4V = 0x35, + D32_X20V4S8__COV8R8V = 0x36, + X8D24_X16V8S8__COV4R12V = 0x37, + D32_X16V8X8__COV4R12V = 0x38, + D32_X16V8S8__COV4R12V = 0x39, D16 = 0x3a, - V8Z24__COV8R24V = 0x3b, - X8Z24_X16V8S8__COV8R24V = 0x3c, - ZF32_X16V8X8__COV8R24V = 0x3d, - ZF32_X16V8S8__COV8R24V = 0x3e, + V8D24__COV8R24V = 0x3b, + X8D24_X16V8S8__COV8R24V = 0x3c, + D32_X16V8X8__COV8R24V = 0x3d, + D32_X16V8S8__COV8R24V = 0x3e, ASTC_2D_4X4 = 0x40, ASTC_2D_5X5 = 0x41, ASTC_2D_6X6 = 0x42, @@ -146,7 +146,7 @@ enum class MsaaMode : u32 { }; union TextureHandle { - TextureHandle(u32 raw) : raw{raw} {} + /* implicit */ constexpr TextureHandle(u32 raw_) : raw{raw_} {} u32 raw; BitField<0, 20, u32> tic_id; @@ -155,124 +155,124 @@ union TextureHandle { static_assert(sizeof(TextureHandle) == 4, "TextureHandle has wrong size"); struct TICEntry { - static constexpr u32 DefaultBlockHeight = 16; - static constexpr u32 DefaultBlockDepth = 1; - - union { - u32 raw; - BitField<0, 7, TextureFormat> format; - BitField<7, 3, ComponentType> r_type; - BitField<10, 3, ComponentType> g_type; - BitField<13, 3, ComponentType> b_type; - BitField<16, 3, ComponentType> a_type; - - BitField<19, 3, SwizzleSource> x_source; - BitField<22, 3, SwizzleSource> y_source; - BitField<25, 3, SwizzleSource> z_source; - BitField<28, 3, SwizzleSource> w_source; - }; - u32 address_low; union { - BitField<0, 16, u32> address_high; - BitField<21, 3, TICHeaderVersion> header_version; - }; - union { - BitField<0, 3, u32> block_width; - BitField<3, 3, u32> block_height; - BitField<6, 3, u32> block_depth; + struct { + union { + BitField<0, 7, TextureFormat> format; + BitField<7, 3, ComponentType> r_type; + BitField<10, 3, ComponentType> g_type; + BitField<13, 3, ComponentType> b_type; + BitField<16, 3, ComponentType> a_type; + + BitField<19, 3, SwizzleSource> x_source; + BitField<22, 3, SwizzleSource> y_source; + BitField<25, 3, SwizzleSource> z_source; + BitField<28, 3, SwizzleSource> w_source; + }; + u32 address_low; + union { + BitField<0, 16, u32> address_high; + BitField<16, 5, u32> layer_base_3_7; + BitField<21, 3, TICHeaderVersion> header_version; + BitField<24, 1, u32> load_store_hint; + BitField<25, 4, u32> view_coherency_hash; + BitField<29, 3, u32> layer_base_8_10; + }; + union { + BitField<0, 3, u32> block_width; + BitField<3, 3, u32> block_height; + BitField<6, 3, u32> block_depth; - BitField<10, 3, u32> tile_width_spacing; + BitField<10, 3, u32> tile_width_spacing; - // High 16 bits of the pitch value - BitField<0, 16, u32> pitch_high; - BitField<26, 1, u32> use_header_opt_control; - BitField<27, 1, u32> depth_texture; - BitField<28, 4, u32> max_mip_level; + // High 16 bits of the pitch value + BitField<0, 16, u32> pitch_high; + BitField<26, 1, u32> use_header_opt_control; + BitField<27, 1, u32> depth_texture; + BitField<28, 4, u32> max_mip_level; - BitField<0, 16, u32> buffer_high_width_minus_one; - }; - union { - BitField<0, 16, u32> width_minus_1; - BitField<22, 1, u32> srgb_conversion; - BitField<23, 4, TextureType> texture_type; - BitField<29, 3, u32> border_size; + BitField<0, 16, u32> buffer_high_width_minus_one; + }; + union { + BitField<0, 16, u32> width_minus_one; + BitField<16, 3, u32> layer_base_0_2; + BitField<22, 1, u32> srgb_conversion; + BitField<23, 4, TextureType> texture_type; + BitField<29, 3, u32> border_size; - BitField<0, 16, u32> buffer_low_width_minus_one; - }; - union { - BitField<0, 16, u32> height_minus_1; - BitField<16, 14, u32> depth_minus_1; - }; - union { - BitField<6, 13, u32> mip_lod_bias; - BitField<27, 3, u32> max_anisotropy; + BitField<0, 16, u32> buffer_low_width_minus_one; + }; + union { + BitField<0, 16, u32> height_minus_1; + BitField<16, 14, u32> depth_minus_1; + BitField<30, 1, u32> is_sparse; + BitField<31, 1, u32> normalized_coords; + }; + union { + BitField<6, 13, u32> mip_lod_bias; + BitField<27, 3, u32> max_anisotropy; + }; + union { + BitField<0, 4, u32> res_min_mip_level; + BitField<4, 4, u32> res_max_mip_level; + BitField<8, 4, MsaaMode> msaa_mode; + BitField<12, 12, u32> min_lod_clamp; + }; + }; + std::array<u64, 4> raw; }; - union { - BitField<0, 4, u32> res_min_mip_level; - BitField<4, 4, u32> res_max_mip_level; - BitField<8, 4, MsaaMode> msaa_mode; - BitField<12, 12, u32> min_lod_clamp; - }; + constexpr bool operator==(const TICEntry& rhs) const noexcept { + return raw == rhs.raw; + } - GPUVAddr Address() const { + constexpr bool operator!=(const TICEntry& rhs) const noexcept { + return raw != rhs.raw; + } + + constexpr GPUVAddr Address() const { return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | address_low); } - u32 Pitch() const { + constexpr u32 Pitch() const { ASSERT(header_version == TICHeaderVersion::Pitch || header_version == TICHeaderVersion::PitchColorKey); // The pitch value is 21 bits, and is 32B aligned. return pitch_high << 5; } - u32 Width() const { + constexpr u32 Width() const { if (header_version != TICHeaderVersion::OneDBuffer) { - return width_minus_1 + 1; + return width_minus_one + 1; } - return ((buffer_high_width_minus_one << 16) | buffer_low_width_minus_one) + 1; + return (buffer_high_width_minus_one << 16 | buffer_low_width_minus_one) + 1; } - u32 Height() const { + constexpr u32 Height() const { return height_minus_1 + 1; } - u32 Depth() const { + constexpr u32 Depth() const { return depth_minus_1 + 1; } - u32 BlockWidth() const { - ASSERT(IsTiled()); - return block_width; - } - - u32 BlockHeight() const { - ASSERT(IsTiled()); - return block_height; - } - - u32 BlockDepth() const { - ASSERT(IsTiled()); - return block_depth; + constexpr u32 BaseLayer() const { + return layer_base_0_2 | layer_base_3_7 << 3 | layer_base_8_10 << 8; } - bool IsTiled() const { + constexpr bool IsBlockLinear() const { return header_version == TICHeaderVersion::BlockLinear || header_version == TICHeaderVersion::BlockLinearColorKey; } - bool IsLineal() const { + constexpr bool IsPitchLinear() const { return header_version == TICHeaderVersion::Pitch || header_version == TICHeaderVersion::PitchColorKey; } - bool IsBuffer() const { + constexpr bool IsBuffer() const { return header_version == TICHeaderVersion::OneDBuffer; } - - bool IsSrgbConversionEnabled() const { - return srgb_conversion != 0; - } }; static_assert(sizeof(TICEntry) == 0x20, "TICEntry has wrong size"); @@ -309,6 +309,12 @@ enum class TextureMipmapFilter : u32 { Linear = 3, }; +enum class SamplerReduction : u32 { + WeightedAverage = 0, + Min = 1, + Max = 2, +}; + enum class Anisotropy { Default, Filter2x, @@ -333,8 +339,12 @@ struct TSCEntry { BitField<0, 2, TextureFilter> mag_filter; BitField<4, 2, TextureFilter> min_filter; BitField<6, 2, TextureMipmapFilter> mipmap_filter; + BitField<8, 1, u32> cubemap_anisotropy; BitField<9, 1, u32> cubemap_interface_filtering; + BitField<10, 2, SamplerReduction> reduction_filter; BitField<12, 13, u32> mip_lod_bias; + BitField<25, 1, u32> float_coord_normalization; + BitField<26, 5, u32> trilin_opt; }; union { BitField<0, 12, u32> min_lod_clamp; @@ -347,32 +357,45 @@ struct TSCEntry { }; std::array<f32, 4> border_color; }; - std::array<u8, 0x20> raw; + std::array<u64, 4> raw; }; - std::array<float, 4> GetBorderColor() const noexcept; + constexpr bool operator==(const TSCEntry& rhs) const noexcept { + return raw == rhs.raw; + } + + constexpr bool operator!=(const TSCEntry& rhs) const noexcept { + return raw != rhs.raw; + } + + std::array<float, 4> BorderColor() const noexcept; - float GetMaxAnisotropy() const noexcept; + float MaxAnisotropy() const noexcept; - float GetMinLod() const { + float MinLod() const { return static_cast<float>(min_lod_clamp) / 256.0f; } - float GetMaxLod() const { + float MaxLod() const { return static_cast<float>(max_lod_clamp) / 256.0f; } - float GetLodBias() const { + float LodBias() const { // Sign extend the 13-bit value. - constexpr u32 mask = 1U << (13 - 1); + static constexpr u32 mask = 1U << (13 - 1); return static_cast<float>(static_cast<s32>((mip_lod_bias ^ mask) - mask)) / 256.0f; } }; static_assert(sizeof(TSCEntry) == 0x20, "TSCEntry has wrong size"); -struct FullTextureInfo { - TICEntry tic; - TSCEntry tsc; +} // namespace Tegra::Texture + +template <> +struct std::hash<Tegra::Texture::TICEntry> { + size_t operator()(const Tegra::Texture::TICEntry& tic) const noexcept; }; -} // namespace Tegra::Texture +template <> +struct std::hash<Tegra::Texture::TSCEntry> { + size_t operator()(const Tegra::Texture::TSCEntry& tsc) const noexcept; +}; diff --git a/src/video_core/video_core.cpp b/src/video_core/video_core.cpp index 4e3a092c7..e1b38c6ac 100644 --- a/src/video_core/video_core.cpp +++ b/src/video_core/video_core.cpp @@ -7,13 +7,9 @@ #include "common/logging/log.h" #include "core/core.h" #include "core/settings.h" -#include "video_core/gpu_asynch.h" -#include "video_core/gpu_synch.h" #include "video_core/renderer_base.h" #include "video_core/renderer_opengl/renderer_opengl.h" -#ifdef HAS_VULKAN #include "video_core/renderer_vulkan/renderer_vulkan.h" -#endif #include "video_core/video_core.h" namespace { @@ -21,15 +17,16 @@ namespace { std::unique_ptr<VideoCore::RendererBase> CreateRenderer( Core::System& system, Core::Frontend::EmuWindow& emu_window, Tegra::GPU& gpu, std::unique_ptr<Core::Frontend::GraphicsContext> context) { + auto& telemetry_session = system.TelemetrySession(); + auto& cpu_memory = system.Memory(); + switch (Settings::values.renderer_backend.GetValue()) { case Settings::RendererBackend::OpenGL: - return std::make_unique<OpenGL::RendererOpenGL>(system, emu_window, gpu, - std::move(context)); -#ifdef HAS_VULKAN + return std::make_unique<OpenGL::RendererOpenGL>(telemetry_session, emu_window, cpu_memory, + gpu, std::move(context)); case Settings::RendererBackend::Vulkan: - return std::make_unique<Vulkan::RendererVulkan>(system, emu_window, gpu, - std::move(context)); -#endif + return std::make_unique<Vulkan::RendererVulkan>(telemetry_session, emu_window, cpu_memory, + gpu, std::move(context)); default: return nullptr; } @@ -40,23 +37,19 @@ std::unique_ptr<VideoCore::RendererBase> CreateRenderer( namespace VideoCore { std::unique_ptr<Tegra::GPU> CreateGPU(Core::Frontend::EmuWindow& emu_window, Core::System& system) { - std::unique_ptr<Tegra::GPU> gpu; - if (Settings::values.use_asynchronous_gpu_emulation.GetValue()) { - gpu = std::make_unique<VideoCommon::GPUAsynch>(system); - } else { - gpu = std::make_unique<VideoCommon::GPUSynch>(system); - } - + const bool use_nvdec = Settings::values.use_nvdec_emulation.GetValue(); + const bool use_async = Settings::values.use_asynchronous_gpu_emulation.GetValue(); + auto gpu = std::make_unique<Tegra::GPU>(system, use_async, use_nvdec); auto context = emu_window.CreateSharedContext(); - const auto scope = context->Acquire(); - - auto renderer = CreateRenderer(system, emu_window, *gpu, std::move(context)); - if (!renderer->Init()) { + auto scope = context->Acquire(); + try { + auto renderer = CreateRenderer(system, emu_window, *gpu, std::move(context)); + gpu->BindRenderer(std::move(renderer)); + return gpu; + } catch (const std::runtime_error& exception) { + LOG_ERROR(HW_GPU, "Failed to initialize GPU: {}", exception.what()); return nullptr; } - - gpu->BindRenderer(std::move(renderer)); - return gpu; } u16 GetResolutionScaleFactor(const RendererBase& renderer) { diff --git a/src/video_core/renderer_vulkan/nsight_aftermath_tracker.cpp b/src/video_core/vulkan_common/nsight_aftermath_tracker.cpp index 5b01020ec..7a9d00d4f 100644 --- a/src/video_core/renderer_vulkan/nsight_aftermath_tracker.cpp +++ b/src/video_core/vulkan_common/nsight_aftermath_tracker.cpp @@ -12,40 +12,22 @@ #include <fmt/format.h> -#define VK_NO_PROTOTYPES -#include <vulkan/vulkan.h> - -#include <GFSDK_Aftermath.h> -#include <GFSDK_Aftermath_Defines.h> -#include <GFSDK_Aftermath_GpuCrashDump.h> -#include <GFSDK_Aftermath_GpuCrashDumpDecoding.h> - #include "common/common_paths.h" #include "common/common_types.h" #include "common/file_util.h" #include "common/logging/log.h" #include "common/scope_exit.h" - -#include "video_core/renderer_vulkan/nsight_aftermath_tracker.h" +#include "video_core/vulkan_common/nsight_aftermath_tracker.h" namespace Vulkan { static constexpr char AFTERMATH_LIB_NAME[] = "GFSDK_Aftermath_Lib.x64.dll"; -NsightAftermathTracker::NsightAftermathTracker() = default; - -NsightAftermathTracker::~NsightAftermathTracker() { - if (initialized) { - (void)GFSDK_Aftermath_DisableGpuCrashDumps(); - } -} - -bool NsightAftermathTracker::Initialize() { +NsightAftermathTracker::NsightAftermathTracker() { if (!dl.Open(AFTERMATH_LIB_NAME)) { LOG_ERROR(Render_Vulkan, "Failed to load Nsight Aftermath DLL"); - return false; + return; } - if (!dl.GetSymbol("GFSDK_Aftermath_DisableGpuCrashDumps", &GFSDK_Aftermath_DisableGpuCrashDumps) || !dl.GetSymbol("GFSDK_Aftermath_EnableGpuCrashDumps", @@ -62,29 +44,30 @@ bool NsightAftermathTracker::Initialize() { !dl.GetSymbol("GFSDK_Aftermath_GpuCrashDump_GetJSON", &GFSDK_Aftermath_GpuCrashDump_GetJSON)) { LOG_ERROR(Render_Vulkan, "Failed to load Nsight Aftermath function pointers"); - return false; + return; } - dump_dir = Common::FS::GetUserPath(Common::FS::UserPath::LogDir) + "gpucrash"; - (void)Common::FS::DeleteDirRecursively(dump_dir); + void(Common::FS::DeleteDirRecursively(dump_dir)); if (!Common::FS::CreateDir(dump_dir)) { LOG_ERROR(Render_Vulkan, "Failed to create Nsight Aftermath dump directory"); - return false; + return; } - if (!GFSDK_Aftermath_SUCCEED(GFSDK_Aftermath_EnableGpuCrashDumps( GFSDK_Aftermath_Version_API, GFSDK_Aftermath_GpuCrashDumpWatchedApiFlags_Vulkan, GFSDK_Aftermath_GpuCrashDumpFeatureFlags_Default, GpuCrashDumpCallback, ShaderDebugInfoCallback, CrashDumpDescriptionCallback, this))) { LOG_ERROR(Render_Vulkan, "GFSDK_Aftermath_EnableGpuCrashDumps failed"); - return false; + return; } - LOG_INFO(Render_Vulkan, "Nsight Aftermath dump directory is \"{}\"", dump_dir); - initialized = true; - return true; +} + +NsightAftermathTracker::~NsightAftermathTracker() { + if (initialized) { + (void)GFSDK_Aftermath_DisableGpuCrashDumps(); + } } void NsightAftermathTracker::SaveShader(const std::vector<u32>& spirv) const { diff --git a/src/video_core/renderer_vulkan/nsight_aftermath_tracker.h b/src/video_core/vulkan_common/nsight_aftermath_tracker.h index afe7ae99e..1ce8d4e8e 100644 --- a/src/video_core/renderer_vulkan/nsight_aftermath_tracker.h +++ b/src/video_core/vulkan_common/nsight_aftermath_tracker.h @@ -8,8 +8,9 @@ #include <string> #include <vector> -#define VK_NO_PROTOTYPES -#include <vulkan/vulkan.h> +#include "common/common_types.h" +#include "common/dynamic_library.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" #ifdef HAS_NSIGHT_AFTERMATH #include <GFSDK_Aftermath_Defines.h> @@ -17,9 +18,6 @@ #include <GFSDK_Aftermath_GpuCrashDumpDecoding.h> #endif -#include "common/common_types.h" -#include "common/dynamic_library.h" - namespace Vulkan { class NsightAftermathTracker { @@ -34,8 +32,6 @@ public: NsightAftermathTracker(NsightAftermathTracker&&) = delete; NsightAftermathTracker& operator=(NsightAftermathTracker&&) = delete; - bool Initialize(); - void SaveShader(const std::vector<u32>& spirv) const; private: @@ -78,9 +74,6 @@ private: #ifndef HAS_NSIGHT_AFTERMATH inline NsightAftermathTracker::NsightAftermathTracker() = default; inline NsightAftermathTracker::~NsightAftermathTracker() = default; -inline bool NsightAftermathTracker::Initialize() { - return false; -} inline void NsightAftermathTracker::SaveShader(const std::vector<u32>&) const {} #endif diff --git a/src/video_core/vulkan_common/vulkan_debug_callback.cpp b/src/video_core/vulkan_common/vulkan_debug_callback.cpp new file mode 100644 index 000000000..5c64c9bf7 --- /dev/null +++ b/src/video_core/vulkan_common/vulkan_debug_callback.cpp @@ -0,0 +1,46 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <string_view> +#include "common/logging/log.h" +#include "video_core/vulkan_common/vulkan_debug_callback.h" + +namespace Vulkan { +namespace { +VkBool32 Callback(VkDebugUtilsMessageSeverityFlagBitsEXT severity, + VkDebugUtilsMessageTypeFlagsEXT type, + const VkDebugUtilsMessengerCallbackDataEXT* data, + [[maybe_unused]] void* user_data) { + const std::string_view message{data->pMessage}; + if (severity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT) { + LOG_CRITICAL(Render_Vulkan, "{}", message); + } else if (severity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT) { + LOG_WARNING(Render_Vulkan, "{}", message); + } else if (severity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT) { + LOG_INFO(Render_Vulkan, "{}", message); + } else if (severity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT) { + LOG_DEBUG(Render_Vulkan, "{}", message); + } + return VK_FALSE; +} +} // Anonymous namespace + +vk::DebugUtilsMessenger CreateDebugCallback(const vk::Instance& instance) { + return instance.CreateDebugUtilsMessenger(VkDebugUtilsMessengerCreateInfoEXT{ + .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT, + .pNext = nullptr, + .flags = 0, + .messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT, + .messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT, + .pfnUserCallback = Callback, + .pUserData = nullptr, + }); +} + +} // namespace Vulkan diff --git a/src/video_core/vulkan_common/vulkan_debug_callback.h b/src/video_core/vulkan_common/vulkan_debug_callback.h new file mode 100644 index 000000000..b0519f132 --- /dev/null +++ b/src/video_core/vulkan_common/vulkan_debug_callback.h @@ -0,0 +1,13 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include "video_core/vulkan_common/vulkan_wrapper.h" + +namespace Vulkan { + +vk::DebugUtilsMessenger CreateDebugCallback(const vk::Instance& instance); + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp index 030b4dbd3..34d396434 100644 --- a/src/video_core/renderer_vulkan/vk_device.cpp +++ b/src/video_core/vulkan_common/vulkan_device.cpp @@ -13,39 +13,47 @@ #include "common/assert.h" #include "core/settings.h" -#include "video_core/renderer_vulkan/vk_device.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/nsight_aftermath_tracker.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { - namespace { - namespace Alternatives { - -constexpr std::array Depth24UnormS8_UINT{ +constexpr std::array DEPTH24_UNORM_STENCIL8_UINT{ VK_FORMAT_D32_SFLOAT_S8_UINT, VK_FORMAT_D16_UNORM_S8_UINT, - VkFormat{}, + VK_FORMAT_UNDEFINED, }; -constexpr std::array Depth16UnormS8_UINT{ +constexpr std::array DEPTH16_UNORM_STENCIL8_UINT{ VK_FORMAT_D24_UNORM_S8_UINT, VK_FORMAT_D32_SFLOAT_S8_UINT, - VkFormat{}, + VK_FORMAT_UNDEFINED, }; - } // namespace Alternatives constexpr std::array REQUIRED_EXTENSIONS{ - VK_KHR_SWAPCHAIN_EXTENSION_NAME, + VK_KHR_MAINTENANCE1_EXTENSION_NAME, + VK_KHR_STORAGE_BUFFER_STORAGE_CLASS_EXTENSION_NAME, + VK_KHR_SHADER_DRAW_PARAMETERS_EXTENSION_NAME, VK_KHR_16BIT_STORAGE_EXTENSION_NAME, VK_KHR_8BIT_STORAGE_EXTENSION_NAME, VK_KHR_DRIVER_PROPERTIES_EXTENSION_NAME, VK_KHR_DESCRIPTOR_UPDATE_TEMPLATE_EXTENSION_NAME, + VK_KHR_TIMELINE_SEMAPHORE_EXTENSION_NAME, + VK_KHR_SAMPLER_MIRROR_CLAMP_TO_EDGE_EXTENSION_NAME, VK_EXT_VERTEX_ATTRIBUTE_DIVISOR_EXTENSION_NAME, VK_EXT_SHADER_SUBGROUP_BALLOT_EXTENSION_NAME, VK_EXT_SHADER_SUBGROUP_VOTE_EXTENSION_NAME, + VK_EXT_ROBUSTNESS_2_EXTENSION_NAME, VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME, +#ifdef _WIN32 + VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME, +#endif +#ifdef __linux__ + VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME, +#endif }; template <typename T> @@ -57,9 +65,9 @@ void SetNext(void**& next, T& data) { constexpr const VkFormat* GetFormatAlternatives(VkFormat format) { switch (format) { case VK_FORMAT_D24_UNORM_S8_UINT: - return Alternatives::Depth24UnormS8_UINT.data(); + return Alternatives::DEPTH24_UNORM_STENCIL8_UINT.data(); case VK_FORMAT_D16_UNORM_S8_UINT: - return Alternatives::Depth16UnormS8_UINT.data(); + return Alternatives::DEPTH16_UNORM_STENCIL8_UINT.data(); default: return nullptr; } @@ -78,8 +86,7 @@ VkFormatFeatureFlags GetFormatFeatures(VkFormatProperties properties, FormatType } } -std::unordered_map<VkFormat, VkFormatProperties> GetFormatProperties( - vk::PhysicalDevice physical, const vk::InstanceDispatch& dld) { +std::unordered_map<VkFormat, VkFormatProperties> GetFormatProperties(vk::PhysicalDevice physical) { static constexpr std::array formats{ VK_FORMAT_A8B8G8R8_UNORM_PACK32, VK_FORMAT_A8B8G8R8_UINT_PACK32, @@ -103,6 +110,7 @@ std::unordered_map<VkFormat, VkFormatProperties> GetFormatProperties( VK_FORMAT_R16G16_UNORM, VK_FORMAT_R16G16_SNORM, VK_FORMAT_R16G16_SFLOAT, + VK_FORMAT_R16G16_SINT, VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UINT, VK_FORMAT_R8G8B8A8_SRGB, @@ -142,18 +150,32 @@ std::unordered_map<VkFormat, VkFormatProperties> GetFormatProperties( VK_FORMAT_BC2_SRGB_BLOCK, VK_FORMAT_BC3_SRGB_BLOCK, VK_FORMAT_BC7_SRGB_BLOCK, + VK_FORMAT_ASTC_4x4_UNORM_BLOCK, VK_FORMAT_ASTC_4x4_SRGB_BLOCK, - VK_FORMAT_ASTC_8x8_SRGB_BLOCK, - VK_FORMAT_ASTC_8x5_SRGB_BLOCK, + VK_FORMAT_ASTC_5x4_UNORM_BLOCK, VK_FORMAT_ASTC_5x4_SRGB_BLOCK, VK_FORMAT_ASTC_5x5_UNORM_BLOCK, VK_FORMAT_ASTC_5x5_SRGB_BLOCK, - VK_FORMAT_ASTC_10x8_UNORM_BLOCK, - VK_FORMAT_ASTC_10x8_SRGB_BLOCK, + VK_FORMAT_ASTC_6x5_UNORM_BLOCK, + VK_FORMAT_ASTC_6x5_SRGB_BLOCK, VK_FORMAT_ASTC_6x6_UNORM_BLOCK, VK_FORMAT_ASTC_6x6_SRGB_BLOCK, + VK_FORMAT_ASTC_8x5_UNORM_BLOCK, + VK_FORMAT_ASTC_8x5_SRGB_BLOCK, + VK_FORMAT_ASTC_8x6_UNORM_BLOCK, + VK_FORMAT_ASTC_8x6_SRGB_BLOCK, + VK_FORMAT_ASTC_8x8_UNORM_BLOCK, + VK_FORMAT_ASTC_8x8_SRGB_BLOCK, + VK_FORMAT_ASTC_10x5_UNORM_BLOCK, + VK_FORMAT_ASTC_10x5_SRGB_BLOCK, + VK_FORMAT_ASTC_10x6_UNORM_BLOCK, + VK_FORMAT_ASTC_10x6_SRGB_BLOCK, + VK_FORMAT_ASTC_10x8_UNORM_BLOCK, + VK_FORMAT_ASTC_10x8_SRGB_BLOCK, VK_FORMAT_ASTC_10x10_UNORM_BLOCK, VK_FORMAT_ASTC_10x10_SRGB_BLOCK, + VK_FORMAT_ASTC_12x10_UNORM_BLOCK, + VK_FORMAT_ASTC_12x10_SRGB_BLOCK, VK_FORMAT_ASTC_12x12_UNORM_BLOCK, VK_FORMAT_ASTC_12x12_SRGB_BLOCK, VK_FORMAT_ASTC_8x6_UNORM_BLOCK, @@ -171,84 +193,87 @@ std::unordered_map<VkFormat, VkFormatProperties> GetFormatProperties( } // Anonymous namespace -VKDevice::VKDevice(VkInstance instance, vk::PhysicalDevice physical, VkSurfaceKHR surface, - const vk::InstanceDispatch& dld) - : dld{dld}, physical{physical}, properties{physical.GetProperties()}, - format_properties{GetFormatProperties(physical, dld)} { +Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR surface, + const vk::InstanceDispatch& dld_) + : instance{instance_}, dld{dld_}, physical{physical_}, properties{physical.GetProperties()}, + format_properties{GetFormatProperties(physical)} { + CheckSuitability(surface != nullptr); SetupFamilies(surface); SetupFeatures(); -} - -VKDevice::~VKDevice() = default; -bool VKDevice::Create() { const auto queue_cis = GetDeviceQueueCreateInfos(); - const std::vector extensions = LoadExtensions(); + const std::vector extensions = LoadExtensions(surface != nullptr); VkPhysicalDeviceFeatures2 features2{ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2, .pNext = nullptr, + .features{ + .robustBufferAccess = true, + .fullDrawIndexUint32 = false, + .imageCubeArray = true, + .independentBlend = true, + .geometryShader = true, + .tessellationShader = true, + .sampleRateShading = false, + .dualSrcBlend = false, + .logicOp = false, + .multiDrawIndirect = false, + .drawIndirectFirstInstance = false, + .depthClamp = true, + .depthBiasClamp = true, + .fillModeNonSolid = false, + .depthBounds = false, + .wideLines = false, + .largePoints = true, + .alphaToOne = false, + .multiViewport = true, + .samplerAnisotropy = true, + .textureCompressionETC2 = false, + .textureCompressionASTC_LDR = is_optimal_astc_supported, + .textureCompressionBC = false, + .occlusionQueryPrecise = true, + .pipelineStatisticsQuery = false, + .vertexPipelineStoresAndAtomics = true, + .fragmentStoresAndAtomics = true, + .shaderTessellationAndGeometryPointSize = false, + .shaderImageGatherExtended = true, + .shaderStorageImageExtendedFormats = false, + .shaderStorageImageMultisample = is_shader_storage_image_multisample, + .shaderStorageImageReadWithoutFormat = is_formatless_image_load_supported, + .shaderStorageImageWriteWithoutFormat = true, + .shaderUniformBufferArrayDynamicIndexing = false, + .shaderSampledImageArrayDynamicIndexing = false, + .shaderStorageBufferArrayDynamicIndexing = false, + .shaderStorageImageArrayDynamicIndexing = false, + .shaderClipDistance = false, + .shaderCullDistance = false, + .shaderFloat64 = false, + .shaderInt64 = false, + .shaderInt16 = false, + .shaderResourceResidency = false, + .shaderResourceMinLod = false, + .sparseBinding = false, + .sparseResidencyBuffer = false, + .sparseResidencyImage2D = false, + .sparseResidencyImage3D = false, + .sparseResidency2Samples = false, + .sparseResidency4Samples = false, + .sparseResidency8Samples = false, + .sparseResidency16Samples = false, + .sparseResidencyAliased = false, + .variableMultisampleRate = false, + .inheritedQueries = false, + }, }; const void* first_next = &features2; void** next = &features2.pNext; - features2.features = { - .robustBufferAccess = false, - .fullDrawIndexUint32 = false, - .imageCubeArray = false, - .independentBlend = true, - .geometryShader = true, - .tessellationShader = true, - .sampleRateShading = false, - .dualSrcBlend = false, - .logicOp = false, - .multiDrawIndirect = false, - .drawIndirectFirstInstance = false, - .depthClamp = true, - .depthBiasClamp = true, - .fillModeNonSolid = false, - .depthBounds = false, - .wideLines = false, - .largePoints = true, - .alphaToOne = false, - .multiViewport = true, - .samplerAnisotropy = true, - .textureCompressionETC2 = false, - .textureCompressionASTC_LDR = is_optimal_astc_supported, - .textureCompressionBC = false, - .occlusionQueryPrecise = true, - .pipelineStatisticsQuery = false, - .vertexPipelineStoresAndAtomics = true, - .fragmentStoresAndAtomics = true, - .shaderTessellationAndGeometryPointSize = false, - .shaderImageGatherExtended = true, - .shaderStorageImageExtendedFormats = false, - .shaderStorageImageMultisample = false, - .shaderStorageImageReadWithoutFormat = is_formatless_image_load_supported, - .shaderStorageImageWriteWithoutFormat = true, - .shaderUniformBufferArrayDynamicIndexing = false, - .shaderSampledImageArrayDynamicIndexing = false, - .shaderStorageBufferArrayDynamicIndexing = false, - .shaderStorageImageArrayDynamicIndexing = false, - .shaderClipDistance = false, - .shaderCullDistance = false, - .shaderFloat64 = false, - .shaderInt64 = false, - .shaderInt16 = false, - .shaderResourceResidency = false, - .shaderResourceMinLod = false, - .sparseBinding = false, - .sparseResidencyBuffer = false, - .sparseResidencyImage2D = false, - .sparseResidencyImage3D = false, - .sparseResidency2Samples = false, - .sparseResidency4Samples = false, - .sparseResidency8Samples = false, - .sparseResidency16Samples = false, - .sparseResidencyAliased = false, - .variableMultisampleRate = false, - .inheritedQueries = false, + VkPhysicalDeviceTimelineSemaphoreFeaturesKHR timeline_semaphore{ + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES_KHR, + .pNext = nullptr, + .timelineSemaphore = true, }; + SetNext(next, timeline_semaphore); VkPhysicalDevice16BitStorageFeaturesKHR bit16_storage{ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES_KHR, @@ -271,6 +296,7 @@ bool VKDevice::Create() { VkPhysicalDeviceHostQueryResetFeaturesEXT host_query_reset{ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_HOST_QUERY_RESET_FEATURES_EXT, + .pNext = nullptr, .hostQueryReset = true, }; SetNext(next, host_query_reset); @@ -360,7 +386,7 @@ bool VKDevice::Create() { VkDeviceDiagnosticsConfigCreateInfoNV diagnostics_nv; if (nv_device_diagnostics_config) { - nsight_aftermath_tracker.Initialize(); + nsight_aftermath_tracker = std::make_unique<NsightAftermathTracker>(); diagnostics_nv = { .sType = VK_STRUCTURE_TYPE_DEVICE_DIAGNOSTICS_CONFIG_CREATE_INFO_NV, @@ -371,24 +397,33 @@ bool VKDevice::Create() { }; first_next = &diagnostics_nv; } - logical = vk::Device::Create(physical, queue_cis, extensions, first_next, dld); - if (!logical) { - LOG_ERROR(Render_Vulkan, "Failed to create logical device"); - return false; - } CollectTelemetryParameters(); + CollectToolingInfo(); + + if (ext_extended_dynamic_state && driver_id == VK_DRIVER_ID_MESA_RADV) { + LOG_WARNING( + Render_Vulkan, + "Blacklisting RADV for VK_EXT_extended_dynamic state, likely due to a bug in yuzu"); + ext_extended_dynamic_state = false; + } + if (is_float16_supported && driver_id == VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS) { + // Intel's compiler crashes when using fp16 on Astral Chain, disable it for the time being. + LOG_WARNING(Render_Vulkan, "Blacklisting Intel proprietary from float16 math"); + is_float16_supported = false; + } graphics_queue = logical.GetQueue(graphics_family); present_queue = logical.GetQueue(present_family); use_asynchronous_shaders = Settings::values.use_asynchronous_shaders.GetValue(); - return true; } -VkFormat VKDevice::GetSupportedFormat(VkFormat wanted_format, VkFormatFeatureFlags wanted_usage, - FormatType format_type) const { +Device::~Device() = default; + +VkFormat Device::GetSupportedFormat(VkFormat wanted_format, VkFormatFeatureFlags wanted_usage, + FormatType format_type) const { if (IsFormatSupported(wanted_format, wanted_usage, format_type)) { return wanted_format; } @@ -419,18 +454,20 @@ VkFormat VKDevice::GetSupportedFormat(VkFormat wanted_format, VkFormatFeatureFla return wanted_format; } -void VKDevice::ReportLoss() const { +void Device::ReportLoss() const { LOG_CRITICAL(Render_Vulkan, "Device loss occured!"); // Wait for the log to flush and for Nsight Aftermath to dump the results - std::this_thread::sleep_for(std::chrono::seconds{3}); + std::this_thread::sleep_for(std::chrono::seconds{15}); } -void VKDevice::SaveShader(const std::vector<u32>& spirv) const { - nsight_aftermath_tracker.SaveShader(spirv); +void Device::SaveShader(const std::vector<u32>& spirv) const { + if (nsight_aftermath_tracker) { + nsight_aftermath_tracker->SaveShader(spirv); + } } -bool VKDevice::IsOptimalAstcSupported(const VkPhysicalDeviceFeatures& features) const { +bool Device::IsOptimalAstcSupported(const VkPhysicalDeviceFeatures& features) const { // Disable for now to avoid converting ASTC twice. static constexpr std::array astc_formats = { VK_FORMAT_ASTC_4x4_UNORM_BLOCK, VK_FORMAT_ASTC_4x4_SRGB_BLOCK, @@ -456,16 +493,26 @@ bool VKDevice::IsOptimalAstcSupported(const VkPhysicalDeviceFeatures& features) VK_FORMAT_FEATURE_BLIT_DST_BIT | VK_FORMAT_FEATURE_TRANSFER_SRC_BIT | VK_FORMAT_FEATURE_TRANSFER_DST_BIT}; for (const auto format : astc_formats) { - const auto format_properties{physical.GetFormatProperties(format)}; - if (!(format_properties.optimalTilingFeatures & format_feature_usage)) { + const auto physical_format_properties{physical.GetFormatProperties(format)}; + if ((physical_format_properties.optimalTilingFeatures & format_feature_usage) == 0) { return false; } } return true; } -bool VKDevice::IsFormatSupported(VkFormat wanted_format, VkFormatFeatureFlags wanted_usage, - FormatType format_type) const { +bool Device::TestDepthStencilBlits() const { + static constexpr VkFormatFeatureFlags required_features = + VK_FORMAT_FEATURE_BLIT_SRC_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT; + const auto test_features = [](VkFormatProperties props) { + return (props.optimalTilingFeatures & required_features) == required_features; + }; + return test_features(format_properties.at(VK_FORMAT_D32_SFLOAT_S8_UINT)) && + test_features(format_properties.at(VK_FORMAT_D24_UNORM_S8_UINT)); +} + +bool Device::IsFormatSupported(VkFormat wanted_format, VkFormatFeatureFlags wanted_usage, + FormatType format_type) const { const auto it = format_properties.find(wanted_format); if (it == format_properties.end()) { UNIMPLEMENTED_MSG("Unimplemented format query={}", wanted_format); @@ -475,65 +522,65 @@ bool VKDevice::IsFormatSupported(VkFormat wanted_format, VkFormatFeatureFlags wa return (supported_usage & wanted_usage) == wanted_usage; } -bool VKDevice::IsSuitable(vk::PhysicalDevice physical, VkSurfaceKHR surface) { - bool is_suitable = true; +void Device::CheckSuitability(bool requires_swapchain) const { std::bitset<REQUIRED_EXTENSIONS.size()> available_extensions; - - for (const auto& prop : physical.EnumerateDeviceExtensionProperties()) { - for (std::size_t i = 0; i < REQUIRED_EXTENSIONS.size(); ++i) { + bool has_swapchain = false; + for (const VkExtensionProperties& property : physical.EnumerateDeviceExtensionProperties()) { + const std::string_view name{property.extensionName}; + for (size_t i = 0; i < REQUIRED_EXTENSIONS.size(); ++i) { if (available_extensions[i]) { continue; } - const std::string_view name{prop.extensionName}; available_extensions[i] = name == REQUIRED_EXTENSIONS[i]; } + has_swapchain = has_swapchain || name == VK_KHR_SWAPCHAIN_EXTENSION_NAME; } - if (!available_extensions.all()) { - for (std::size_t i = 0; i < REQUIRED_EXTENSIONS.size(); ++i) { - if (available_extensions[i]) { - continue; - } - LOG_ERROR(Render_Vulkan, "Missing required extension: {}", REQUIRED_EXTENSIONS[i]); - is_suitable = false; - } - } - - bool has_graphics{}, has_present{}; - const std::vector queue_family_properties = physical.GetQueueFamilyProperties(); - for (u32 i = 0; i < static_cast<u32>(queue_family_properties.size()); ++i) { - const auto& family = queue_family_properties[i]; - if (family.queueCount == 0) { + for (size_t i = 0; i < REQUIRED_EXTENSIONS.size(); ++i) { + if (available_extensions[i]) { continue; } - has_graphics |= family.queueFlags & VK_QUEUE_GRAPHICS_BIT; - has_present |= physical.GetSurfaceSupportKHR(i, surface); + LOG_ERROR(Render_Vulkan, "Missing required extension: {}", REQUIRED_EXTENSIONS[i]); + throw vk::Exception(VK_ERROR_EXTENSION_NOT_PRESENT); } - if (!has_graphics || !has_present) { - LOG_ERROR(Render_Vulkan, "Device lacks a graphics and present queue"); - is_suitable = false; + if (requires_swapchain && !has_swapchain) { + LOG_ERROR(Render_Vulkan, "Missing required extension: VK_KHR_swapchain"); + throw vk::Exception(VK_ERROR_EXTENSION_NOT_PRESENT); } - // TODO(Rodrigo): Check if the device matches all requeriments. - const auto properties{physical.GetProperties()}; - const auto& limits{properties.limits}; - - constexpr u32 required_ubo_size = 65536; - if (limits.maxUniformBufferRange < required_ubo_size) { - LOG_ERROR(Render_Vulkan, "Device UBO size {} is too small, {} is required", - limits.maxUniformBufferRange, required_ubo_size); - is_suitable = false; + struct LimitTuple { + u32 minimum; + u32 value; + const char* name; + }; + const VkPhysicalDeviceLimits& limits{properties.limits}; + const std::array limits_report{ + LimitTuple{65536, limits.maxUniformBufferRange, "maxUniformBufferRange"}, + LimitTuple{16, limits.maxViewports, "maxViewports"}, + LimitTuple{8, limits.maxColorAttachments, "maxColorAttachments"}, + LimitTuple{8, limits.maxClipDistances, "maxClipDistances"}, + }; + for (const auto& tuple : limits_report) { + if (tuple.value < tuple.minimum) { + LOG_ERROR(Render_Vulkan, "{} has to be {} or greater but it is {}", tuple.name, + tuple.minimum, tuple.value); + throw vk::Exception(VK_ERROR_FEATURE_NOT_PRESENT); + } } + VkPhysicalDeviceRobustness2FeaturesEXT robustness2{}; + robustness2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ROBUSTNESS_2_FEATURES_EXT; - constexpr u32 required_num_viewports = 16; - if (limits.maxViewports < required_num_viewports) { - LOG_INFO(Render_Vulkan, "Device number of viewports {} is too small, {} is required", - limits.maxViewports, required_num_viewports); - is_suitable = false; - } + VkPhysicalDeviceFeatures2 features2{}; + features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2; + features2.pNext = &robustness2; + + physical.GetFeatures2KHR(features2); - const auto features{physical.GetFeatures()}; - const std::array feature_report = { + const VkPhysicalDeviceFeatures& features{features2.features}; + const std::array feature_report{ + std::make_pair(features.robustBufferAccess, "robustBufferAccess"), std::make_pair(features.vertexPipelineStoresAndAtomics, "vertexPipelineStoresAndAtomics"), + std::make_pair(features.robustBufferAccess, "robustBufferAccess"), + std::make_pair(features.imageCubeArray, "imageCubeArray"), std::make_pair(features.independentBlend, "independentBlend"), std::make_pair(features.depthClamp, "depthClamp"), std::make_pair(features.samplerAnisotropy, "samplerAnisotropy"), @@ -547,76 +594,70 @@ bool VKDevice::IsSuitable(vk::PhysicalDevice physical, VkSurfaceKHR surface) { std::make_pair(features.shaderImageGatherExtended, "shaderImageGatherExtended"), std::make_pair(features.shaderStorageImageWriteWithoutFormat, "shaderStorageImageWriteWithoutFormat"), + std::make_pair(robustness2.robustBufferAccess2, "robustBufferAccess2"), + std::make_pair(robustness2.robustImageAccess2, "robustImageAccess2"), + std::make_pair(robustness2.nullDescriptor, "nullDescriptor"), }; - for (const auto& [supported, name] : feature_report) { - if (supported) { + for (const auto& [is_supported, name] : feature_report) { + if (is_supported) { continue; } LOG_ERROR(Render_Vulkan, "Missing required feature: {}", name); - is_suitable = false; + throw vk::Exception(VK_ERROR_FEATURE_NOT_PRESENT); } - - if (!is_suitable) { - LOG_ERROR(Render_Vulkan, "{} is not suitable", properties.deviceName); - } - - return is_suitable; } -std::vector<const char*> VKDevice::LoadExtensions() { +std::vector<const char*> Device::LoadExtensions(bool requires_surface) { std::vector<const char*> extensions; - const auto Test = [&](const VkExtensionProperties& extension, - std::optional<std::reference_wrapper<bool>> status, const char* name, - bool push) { - if (extension.extensionName != std::string_view(name)) { - return; - } - if (push) { - extensions.push_back(name); - } - if (status) { - status->get() = true; - } - }; - - extensions.reserve(7 + REQUIRED_EXTENSIONS.size()); + extensions.reserve(8 + REQUIRED_EXTENSIONS.size()); extensions.insert(extensions.begin(), REQUIRED_EXTENSIONS.begin(), REQUIRED_EXTENSIONS.end()); + if (requires_surface) { + extensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME); + } bool has_khr_shader_float16_int8{}; bool has_ext_subgroup_size_control{}; bool has_ext_transform_feedback{}; bool has_ext_custom_border_color{}; bool has_ext_extended_dynamic_state{}; - for (const auto& extension : physical.EnumerateDeviceExtensionProperties()) { - Test(extension, nv_viewport_swizzle, VK_NV_VIEWPORT_SWIZZLE_EXTENSION_NAME, true); - Test(extension, khr_uniform_buffer_standard_layout, + for (const VkExtensionProperties& extension : physical.EnumerateDeviceExtensionProperties()) { + const auto test = [&](std::optional<std::reference_wrapper<bool>> status, const char* name, + bool push) { + if (extension.extensionName != std::string_view(name)) { + return; + } + if (push) { + extensions.push_back(name); + } + if (status) { + status->get() = true; + } + }; + test(nv_viewport_swizzle, VK_NV_VIEWPORT_SWIZZLE_EXTENSION_NAME, true); + test(khr_uniform_buffer_standard_layout, VK_KHR_UNIFORM_BUFFER_STANDARD_LAYOUT_EXTENSION_NAME, true); - Test(extension, has_khr_shader_float16_int8, VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME, - false); - Test(extension, ext_depth_range_unrestricted, - VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME, true); - Test(extension, ext_index_type_uint8, VK_EXT_INDEX_TYPE_UINT8_EXTENSION_NAME, true); - Test(extension, ext_shader_viewport_index_layer, - VK_EXT_SHADER_VIEWPORT_INDEX_LAYER_EXTENSION_NAME, true); - Test(extension, has_ext_subgroup_size_control, VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME, - false); - Test(extension, has_ext_transform_feedback, VK_EXT_TRANSFORM_FEEDBACK_EXTENSION_NAME, - false); - Test(extension, has_ext_custom_border_color, VK_EXT_CUSTOM_BORDER_COLOR_EXTENSION_NAME, - false); - Test(extension, has_ext_extended_dynamic_state, - VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME, false); + test(has_khr_shader_float16_int8, VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME, false); + test(ext_depth_range_unrestricted, VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME, true); + test(ext_index_type_uint8, VK_EXT_INDEX_TYPE_UINT8_EXTENSION_NAME, true); + test(ext_sampler_filter_minmax, VK_EXT_SAMPLER_FILTER_MINMAX_EXTENSION_NAME, true); + test(ext_shader_viewport_index_layer, VK_EXT_SHADER_VIEWPORT_INDEX_LAYER_EXTENSION_NAME, + true); + test(ext_tooling_info, VK_EXT_TOOLING_INFO_EXTENSION_NAME, true); + test(ext_shader_stencil_export, VK_EXT_SHADER_STENCIL_EXPORT_EXTENSION_NAME, true); + test(has_ext_transform_feedback, VK_EXT_TRANSFORM_FEEDBACK_EXTENSION_NAME, false); + test(has_ext_custom_border_color, VK_EXT_CUSTOM_BORDER_COLOR_EXTENSION_NAME, false); + test(has_ext_extended_dynamic_state, VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME, false); + test(has_ext_subgroup_size_control, VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME, false); if (Settings::values.renderer_debug) { - Test(extension, nv_device_diagnostics_config, - VK_NV_DEVICE_DIAGNOSTICS_CONFIG_EXTENSION_NAME, true); + test(nv_device_diagnostics_config, VK_NV_DEVICE_DIAGNOSTICS_CONFIG_EXTENSION_NAME, + true); } } - VkPhysicalDeviceFeatures2KHR features; features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2_KHR; - VkPhysicalDeviceProperties2KHR properties; - properties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR; + VkPhysicalDeviceProperties2KHR physical_properties; + physical_properties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR; if (has_khr_shader_float16_int8) { VkPhysicalDeviceFloat16Int8FeaturesKHR float16_int8_features; @@ -628,7 +669,6 @@ std::vector<const char*> VKDevice::LoadExtensions() { is_float16_supported = float16_int8_features.shaderFloat16; extensions.push_back(VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME); } - if (has_ext_subgroup_size_control) { VkPhysicalDeviceSubgroupSizeControlFeaturesEXT subgroup_features; subgroup_features.sType = @@ -641,8 +681,8 @@ std::vector<const char*> VKDevice::LoadExtensions() { subgroup_properties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_PROPERTIES_EXT; subgroup_properties.pNext = nullptr; - properties.pNext = &subgroup_properties; - physical.GetProperties2KHR(properties); + physical_properties.pNext = &subgroup_properties; + physical.GetProperties2KHR(physical_properties); is_warp_potentially_bigger = subgroup_properties.maxSubgroupSize > GuestWarpSize; @@ -655,7 +695,6 @@ std::vector<const char*> VKDevice::LoadExtensions() { } else { is_warp_potentially_bigger = true; } - if (has_ext_transform_feedback) { VkPhysicalDeviceTransformFeedbackFeaturesEXT tfb_features; tfb_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_FEATURES_EXT; @@ -666,8 +705,8 @@ std::vector<const char*> VKDevice::LoadExtensions() { VkPhysicalDeviceTransformFeedbackPropertiesEXT tfb_properties; tfb_properties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_PROPERTIES_EXT; tfb_properties.pNext = nullptr; - properties.pNext = &tfb_properties; - physical.GetProperties2KHR(properties); + physical_properties.pNext = &tfb_properties; + physical.GetProperties2KHR(physical_properties); if (tfb_features.transformFeedback && tfb_features.geometryStreams && tfb_properties.maxTransformFeedbackStreams >= 4 && @@ -677,7 +716,6 @@ std::vector<const char*> VKDevice::LoadExtensions() { ext_transform_feedback = true; } } - if (has_ext_custom_border_color) { VkPhysicalDeviceCustomBorderColorFeaturesEXT border_features; border_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CUSTOM_BORDER_COLOR_FEATURES_EXT; @@ -690,13 +728,7 @@ std::vector<const char*> VKDevice::LoadExtensions() { ext_custom_border_color = true; } } - - if (has_ext_extended_dynamic_state && driver_id == VK_DRIVER_ID_AMD_PROPRIETARY) { - // AMD's proprietary driver supports VK_EXT_extended_dynamic_state but the <stride> field - // seems to be bugged. Blacklisting it for now. - LOG_WARNING(Render_Vulkan, - "Blacklisting AMD proprietary from VK_EXT_extended_dynamic_state"); - } else if (has_ext_extended_dynamic_state) { + if (has_ext_extended_dynamic_state) { VkPhysicalDeviceExtendedDynamicStateFeaturesEXT dynamic_state; dynamic_state.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTENDED_DYNAMIC_STATE_FEATURES_EXT; dynamic_state.pNext = nullptr; @@ -708,52 +740,64 @@ std::vector<const char*> VKDevice::LoadExtensions() { ext_extended_dynamic_state = true; } } - return extensions; } -void VKDevice::SetupFamilies(VkSurfaceKHR surface) { - std::optional<u32> graphics_family_, present_family_; - +void Device::SetupFamilies(VkSurfaceKHR surface) { const std::vector queue_family_properties = physical.GetQueueFamilyProperties(); - for (u32 i = 0; i < static_cast<u32>(queue_family_properties.size()); ++i) { - if (graphics_family_ && present_family_) + std::optional<u32> graphics; + std::optional<u32> present; + for (u32 index = 0; index < static_cast<u32>(queue_family_properties.size()); ++index) { + if (graphics && (present || !surface)) { break; - - const auto& queue_family = queue_family_properties[i]; - if (queue_family.queueCount == 0) + } + const VkQueueFamilyProperties& queue_family = queue_family_properties[index]; + if (queue_family.queueCount == 0) { continue; - + } if (queue_family.queueFlags & VK_QUEUE_GRAPHICS_BIT) { - graphics_family_ = i; + graphics = index; } - if (physical.GetSurfaceSupportKHR(i, surface)) { - present_family_ = i; + if (surface && physical.GetSurfaceSupportKHR(index, surface)) { + present = index; } } - ASSERT(graphics_family_ && present_family_); - - graphics_family = *graphics_family_; - present_family = *present_family_; + if (!graphics) { + LOG_ERROR(Render_Vulkan, "Device lacks a graphics queue"); + throw vk::Exception(VK_ERROR_FEATURE_NOT_PRESENT); + } + if (surface && !present) { + LOG_ERROR(Render_Vulkan, "Device lacks a present queue"); + throw vk::Exception(VK_ERROR_FEATURE_NOT_PRESENT); + } + graphics_family = *graphics; + present_family = *present; } -void VKDevice::SetupFeatures() { +void Device::SetupFeatures() { const auto supported_features{physical.GetFeatures()}; is_formatless_image_load_supported = supported_features.shaderStorageImageReadWithoutFormat; + is_shader_storage_image_multisample = supported_features.shaderStorageImageMultisample; + is_blit_depth_stencil_supported = TestDepthStencilBlits(); is_optimal_astc_supported = IsOptimalAstcSupported(supported_features); } -void VKDevice::CollectTelemetryParameters() { +void Device::CollectTelemetryParameters() { VkPhysicalDeviceDriverPropertiesKHR driver{ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRIVER_PROPERTIES_KHR, .pNext = nullptr, + .driverID = {}, + .driverName = {}, + .driverInfo = {}, + .conformanceVersion = {}, }; - VkPhysicalDeviceProperties2KHR properties{ + VkPhysicalDeviceProperties2KHR device_properties{ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR, .pNext = &driver, + .properties = {}, }; - physical.GetProperties2KHR(properties); + physical.GetProperties2KHR(device_properties); driver_id = driver.driverID; vendor_name = driver.driverName; @@ -765,7 +809,33 @@ void VKDevice::CollectTelemetryParameters() { } } -std::vector<VkDeviceQueueCreateInfo> VKDevice::GetDeviceQueueCreateInfos() const { +void Device::CollectToolingInfo() { + if (!ext_tooling_info) { + return; + } + const auto vkGetPhysicalDeviceToolPropertiesEXT = + reinterpret_cast<PFN_vkGetPhysicalDeviceToolPropertiesEXT>( + dld.vkGetInstanceProcAddr(instance, "vkGetPhysicalDeviceToolPropertiesEXT")); + if (!vkGetPhysicalDeviceToolPropertiesEXT) { + return; + } + u32 tool_count = 0; + if (vkGetPhysicalDeviceToolPropertiesEXT(physical, &tool_count, nullptr) != VK_SUCCESS) { + return; + } + std::vector<VkPhysicalDeviceToolPropertiesEXT> tools(tool_count); + if (vkGetPhysicalDeviceToolPropertiesEXT(physical, &tool_count, tools.data()) != VK_SUCCESS) { + return; + } + for (const VkPhysicalDeviceToolPropertiesEXT& tool : tools) { + const std::string_view name = tool.name; + LOG_INFO(Render_Vulkan, "{}", name); + has_renderdoc = has_renderdoc || name == "RenderDoc"; + has_nsight_graphics = has_nsight_graphics || name == "NVIDIA Nsight Graphics"; + } +} + +std::vector<VkDeviceQueueCreateInfo> Device::GetDeviceQueueCreateInfos() const { static constexpr float QUEUE_PRIORITY = 1.0f; std::unordered_set<u32> unique_queue_families{graphics_family, present_family}; diff --git a/src/video_core/renderer_vulkan/vk_device.h b/src/video_core/vulkan_common/vulkan_device.h index 26a233db1..67d70cd22 100644 --- a/src/video_core/renderer_vulkan/vk_device.h +++ b/src/video_core/vulkan_common/vulkan_device.h @@ -10,11 +10,12 @@ #include <vector> #include "common/common_types.h" -#include "video_core/renderer_vulkan/nsight_aftermath_tracker.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { +class NsightAftermathTracker; + /// Format usage descriptor. enum class FormatType { Linear, Optimal, Buffer }; @@ -22,14 +23,11 @@ enum class FormatType { Linear, Optimal, Buffer }; const u32 GuestWarpSize = 32; /// Handles data specific to a physical device. -class VKDevice final { +class Device { public: - explicit VKDevice(VkInstance instance, vk::PhysicalDevice physical, VkSurfaceKHR surface, - const vk::InstanceDispatch& dld); - ~VKDevice(); - - /// Initializes the device. Returns true on success. - bool Create(); + explicit Device(VkInstance instance, vk::PhysicalDevice physical, VkSurfaceKHR surface, + const vk::InstanceDispatch& dld); + ~Device(); /** * Returns a format supported by the device for the passed requeriments. @@ -83,7 +81,7 @@ public: } /// Returns the current Vulkan API version provided in Vulkan-formatted version numbers. - u32 GetApiVersion() const { + u32 ApiVersion() const { return properties.apiVersion; } @@ -152,6 +150,11 @@ public: return is_formatless_image_load_supported; } + /// Returns true when blitting from and to depth stencil images is supported. + bool IsBlitDepthStencilSupported() const { + return is_blit_depth_stencil_supported; + } + /// Returns true if the device supports VK_NV_viewport_swizzle. bool IsNvViewportSwizzleSupported() const { return nv_viewport_swizzle; @@ -167,6 +170,11 @@ public: return ext_index_type_uint8; } + /// Returns true if the device supports VK_EXT_sampler_filter_minmax. + bool IsExtSamplerFilterMinmaxSupported() const { + return ext_sampler_filter_minmax; + } + /// Returns true if the device supports VK_EXT_depth_range_unrestricted. bool IsExtDepthRangeUnrestrictedSupported() const { return ext_depth_range_unrestricted; @@ -192,6 +200,16 @@ public: return ext_extended_dynamic_state; } + /// Returns true if the device supports VK_EXT_shader_stencil_export. + bool IsExtShaderStencilExportSupported() const { + return ext_shader_stencil_export; + } + + /// Returns true when a known debugging tool is attached. + bool HasDebuggingToolAttached() const { + return has_renderdoc || has_nsight_graphics; + } + /// Returns the vendor name reported from Vulkan. std::string_view GetVendorName() const { return vendor_name; @@ -207,12 +225,12 @@ public: return use_asynchronous_shaders; } +private: /// Checks if the physical device is suitable. - static bool IsSuitable(vk::PhysicalDevice physical, VkSurfaceKHR surface); + void CheckSuitability(bool requires_swapchain) const; -private: /// Loads extensions into a vector and stores available ones in this object. - std::vector<const char*> LoadExtensions(); + std::vector<const char*> LoadExtensions(bool requires_surface); /// Sets up queue families. void SetupFamilies(VkSurfaceKHR surface); @@ -223,22 +241,30 @@ private: /// Collects telemetry information from the device. void CollectTelemetryParameters(); + /// Collects information about attached tools. + void CollectToolingInfo(); + /// Returns a list of queue initialization descriptors. std::vector<VkDeviceQueueCreateInfo> GetDeviceQueueCreateInfos() const; /// Returns true if ASTC textures are natively supported. bool IsOptimalAstcSupported(const VkPhysicalDeviceFeatures& features) const; + /// Returns true if the device natively supports blitting depth stencil images. + bool TestDepthStencilBlits() const; + /// Returns true if a format is supported. bool IsFormatSupported(VkFormat wanted_format, VkFormatFeatureFlags wanted_usage, FormatType format_type) const; + VkInstance instance; ///< Vulkan instance. vk::DeviceDispatch dld; ///< Device function pointers. vk::PhysicalDevice physical; ///< Physical device. VkPhysicalDeviceProperties properties; ///< Device properties. vk::Device logical; ///< Logical device. vk::Queue graphics_queue; ///< Main graphics queue. vk::Queue present_queue; ///< Main present queue. + u32 instance_version{}; ///< Vulkan onstance version. u32 graphics_family{}; ///< Main graphics queue family index. u32 present_family{}; ///< Main present queue family index. VkDriverIdKHR driver_id{}; ///< Driver ID. @@ -246,16 +272,23 @@ private: bool is_optimal_astc_supported{}; ///< Support for native ASTC. bool is_float16_supported{}; ///< Support for float16 arithmetics. bool is_warp_potentially_bigger{}; ///< Host warp size can be bigger than guest. - bool is_formatless_image_load_supported{}; ///< Support for shader image read without format. - bool nv_viewport_swizzle{}; ///< Support for VK_NV_viewport_swizzle. - bool khr_uniform_buffer_standard_layout{}; ///< Support for std430 on UBOs. - bool ext_index_type_uint8{}; ///< Support for VK_EXT_index_type_uint8. - bool ext_depth_range_unrestricted{}; ///< Support for VK_EXT_depth_range_unrestricted. - bool ext_shader_viewport_index_layer{}; ///< Support for VK_EXT_shader_viewport_index_layer. - bool ext_transform_feedback{}; ///< Support for VK_EXT_transform_feedback. - bool ext_custom_border_color{}; ///< Support for VK_EXT_custom_border_color. - bool ext_extended_dynamic_state{}; ///< Support for VK_EXT_extended_dynamic_state. - bool nv_device_diagnostics_config{}; ///< Support for VK_NV_device_diagnostics_config. + bool is_formatless_image_load_supported{}; ///< Support for shader image read without format. + bool is_shader_storage_image_multisample{}; ///< Support for image operations on MSAA images. + bool is_blit_depth_stencil_supported{}; ///< Support for blitting from and to depth stencil. + bool nv_viewport_swizzle{}; ///< Support for VK_NV_viewport_swizzle. + bool khr_uniform_buffer_standard_layout{}; ///< Support for std430 on UBOs. + bool ext_index_type_uint8{}; ///< Support for VK_EXT_index_type_uint8. + bool ext_sampler_filter_minmax{}; ///< Support for VK_EXT_sampler_filter_minmax. + bool ext_depth_range_unrestricted{}; ///< Support for VK_EXT_depth_range_unrestricted. + bool ext_shader_viewport_index_layer{}; ///< Support for VK_EXT_shader_viewport_index_layer. + bool ext_tooling_info{}; ///< Support for VK_EXT_tooling_info. + bool ext_transform_feedback{}; ///< Support for VK_EXT_transform_feedback. + bool ext_custom_border_color{}; ///< Support for VK_EXT_custom_border_color. + bool ext_extended_dynamic_state{}; ///< Support for VK_EXT_extended_dynamic_state. + bool ext_shader_stencil_export{}; ///< Support for VK_EXT_shader_stencil_export. + bool nv_device_diagnostics_config{}; ///< Support for VK_NV_device_diagnostics_config. + bool has_renderdoc{}; ///< Has RenderDoc attached + bool has_nsight_graphics{}; ///< Has Nsight Graphics attached // Asynchronous Graphics Pipeline setting bool use_asynchronous_shaders{}; ///< Setting to use asynchronous shaders/graphics pipeline @@ -268,7 +301,7 @@ private: std::unordered_map<VkFormat, VkFormatProperties> format_properties; /// Nsight Aftermath GPU crash tracker - NsightAftermathTracker nsight_aftermath_tracker; + std::unique_ptr<NsightAftermathTracker> nsight_aftermath_tracker; }; } // namespace Vulkan diff --git a/src/video_core/vulkan_common/vulkan_instance.cpp b/src/video_core/vulkan_common/vulkan_instance.cpp new file mode 100644 index 000000000..bfd6e6add --- /dev/null +++ b/src/video_core/vulkan_common/vulkan_instance.cpp @@ -0,0 +1,155 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <algorithm> +#include <future> +#include <optional> +#include <span> +#include <utility> +#include <vector> + +#include "common/common_types.h" +#include "common/dynamic_library.h" +#include "common/logging/log.h" +#include "core/frontend/emu_window.h" +#include "video_core/vulkan_common/vulkan_instance.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" + +// Include these late to avoid polluting previous headers +#ifdef _WIN32 +#include <windows.h> +// ensure include order +#include <vulkan/vulkan_win32.h> +#endif + +#if !defined(_WIN32) && !defined(__APPLE__) +#include <X11/Xlib.h> +#include <vulkan/vulkan_wayland.h> +#include <vulkan/vulkan_xlib.h> +#endif + +namespace Vulkan { +namespace { +[[nodiscard]] std::vector<const char*> RequiredExtensions( + Core::Frontend::WindowSystemType window_type, bool enable_debug_utils) { + std::vector<const char*> extensions; + extensions.reserve(6); + switch (window_type) { + case Core::Frontend::WindowSystemType::Headless: + break; +#ifdef _WIN32 + case Core::Frontend::WindowSystemType::Windows: + extensions.push_back(VK_KHR_WIN32_SURFACE_EXTENSION_NAME); + break; +#endif +#if !defined(_WIN32) && !defined(__APPLE__) + case Core::Frontend::WindowSystemType::X11: + extensions.push_back(VK_KHR_XLIB_SURFACE_EXTENSION_NAME); + break; + case Core::Frontend::WindowSystemType::Wayland: + extensions.push_back(VK_KHR_WAYLAND_SURFACE_EXTENSION_NAME); + break; +#endif + default: + LOG_ERROR(Render_Vulkan, "Presentation not supported on this platform"); + break; + } + if (window_type != Core::Frontend::WindowSystemType::Headless) { + extensions.push_back(VK_KHR_SURFACE_EXTENSION_NAME); + } + if (enable_debug_utils) { + extensions.push_back(VK_EXT_DEBUG_UTILS_EXTENSION_NAME); + } + extensions.push_back(VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME); + return extensions; +} + +[[nodiscard]] bool AreExtensionsSupported(const vk::InstanceDispatch& dld, + std::span<const char* const> extensions) { + const std::optional properties = vk::EnumerateInstanceExtensionProperties(dld); + if (!properties) { + LOG_ERROR(Render_Vulkan, "Failed to query extension properties"); + return false; + } + for (const char* extension : extensions) { + const auto it = std::ranges::find_if(*properties, [extension](const auto& prop) { + return std::strcmp(extension, prop.extensionName) == 0; + }); + if (it == properties->end()) { + LOG_ERROR(Render_Vulkan, "Required instance extension {} is not available", extension); + return false; + } + } + return true; +} + +[[nodiscard]] std::vector<const char*> Layers(bool enable_layers) { + std::vector<const char*> layers; + if (enable_layers) { + layers.push_back("VK_LAYER_KHRONOS_validation"); + } + return layers; +} + +void RemoveUnavailableLayers(const vk::InstanceDispatch& dld, std::vector<const char*>& layers) { + const std::optional layer_properties = vk::EnumerateInstanceLayerProperties(dld); + if (!layer_properties) { + LOG_ERROR(Render_Vulkan, "Failed to query layer properties, disabling layers"); + layers.clear(); + } + std::erase_if(layers, [&layer_properties](const char* layer) { + const auto comp = [layer](const VkLayerProperties& layer_property) { + return std::strcmp(layer, layer_property.layerName) == 0; + }; + const auto it = std::ranges::find_if(*layer_properties, comp); + if (it == layer_properties->end()) { + LOG_ERROR(Render_Vulkan, "Layer {} not available, removing it", layer); + return true; + } + return false; + }); +} +} // Anonymous namespace + +vk::Instance CreateInstance(const Common::DynamicLibrary& library, vk::InstanceDispatch& dld, + u32 required_version, Core::Frontend::WindowSystemType window_type, + bool enable_debug_utils, bool enable_layers) { + if (!library.IsOpen()) { + LOG_ERROR(Render_Vulkan, "Vulkan library not available"); + throw vk::Exception(VK_ERROR_INITIALIZATION_FAILED); + } + if (!library.GetSymbol("vkGetInstanceProcAddr", &dld.vkGetInstanceProcAddr)) { + LOG_ERROR(Render_Vulkan, "vkGetInstanceProcAddr not present in Vulkan"); + throw vk::Exception(VK_ERROR_INITIALIZATION_FAILED); + } + if (!vk::Load(dld)) { + LOG_ERROR(Render_Vulkan, "Failed to load Vulkan function pointers"); + throw vk::Exception(VK_ERROR_INITIALIZATION_FAILED); + } + const std::vector<const char*> extensions = RequiredExtensions(window_type, enable_debug_utils); + if (!AreExtensionsSupported(dld, extensions)) { + throw vk::Exception(VK_ERROR_EXTENSION_NOT_PRESENT); + } + std::vector<const char*> layers = Layers(enable_layers); + RemoveUnavailableLayers(dld, layers); + + const u32 available_version = vk::AvailableVersion(dld); + if (available_version < required_version) { + LOG_ERROR(Render_Vulkan, "Vulkan {}.{} is not supported, {}.{} is required", + VK_VERSION_MAJOR(available_version), VK_VERSION_MINOR(available_version), + VK_VERSION_MAJOR(required_version), VK_VERSION_MINOR(required_version)); + throw vk::Exception(VK_ERROR_INCOMPATIBLE_DRIVER); + } + vk::Instance instance = + std::async([&] { + return vk::Instance::Create(required_version, layers, extensions, dld); + }).get(); + if (!vk::Load(*instance, dld)) { + LOG_ERROR(Render_Vulkan, "Failed to load Vulkan instance function pointers"); + throw vk::Exception(VK_ERROR_INITIALIZATION_FAILED); + } + return instance; +} + +} // namespace Vulkan diff --git a/src/video_core/vulkan_common/vulkan_instance.h b/src/video_core/vulkan_common/vulkan_instance.h new file mode 100644 index 000000000..e5e3a7144 --- /dev/null +++ b/src/video_core/vulkan_common/vulkan_instance.h @@ -0,0 +1,32 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include "common/common_types.h" +#include "common/dynamic_library.h" +#include "core/frontend/emu_window.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" + +namespace Vulkan { + +/** + * Create a Vulkan instance + * + * @param library Dynamic library to load the Vulkan instance from + * @param dld Dispatch table to load function pointers into + * @param required_version Required Vulkan version (for example, VK_API_VERSION_1_1) + * @param window_type Window system type's enabled extension + * @param enable_debug_utils Whether to enable VK_EXT_debug_utils_extension_name or not + * @param enable_layers Whether to enable Vulkan validation layers or not + * + * @return A new Vulkan instance + * @throw vk::Exception on failure + */ +[[nodiscard]] vk::Instance CreateInstance( + const Common::DynamicLibrary& library, vk::InstanceDispatch& dld, u32 required_version, + Core::Frontend::WindowSystemType window_type = Core::Frontend::WindowSystemType::Headless, + bool enable_debug_utils = false, bool enable_layers = false); + +} // namespace Vulkan diff --git a/src/video_core/vulkan_common/vulkan_library.cpp b/src/video_core/vulkan_common/vulkan_library.cpp new file mode 100644 index 000000000..557871d81 --- /dev/null +++ b/src/video_core/vulkan_common/vulkan_library.cpp @@ -0,0 +1,36 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <cstdlib> +#include <string> + +#include "common/dynamic_library.h" +#include "common/file_util.h" +#include "video_core/vulkan_common/vulkan_library.h" + +namespace Vulkan { + +Common::DynamicLibrary OpenLibrary() { + Common::DynamicLibrary library; +#ifdef __APPLE__ + // Check if a path to a specific Vulkan library has been specified. + char* const libvulkan_env = std::getenv("LIBVULKAN_PATH"); + if (!libvulkan_env || !library.Open(libvulkan_env)) { + // Use the libvulkan.dylib from the application bundle. + const std::string filename = + Common::FS::GetBundleDirectory() + "/Contents/Frameworks/libvulkan.dylib"; + void(library.Open(filename.c_str())); + } +#else + std::string filename = Common::DynamicLibrary::GetVersionedFilename("vulkan", 1); + if (!library.Open(filename.c_str())) { + // Android devices may not have libvulkan.so.1, only libvulkan.so. + filename = Common::DynamicLibrary::GetVersionedFilename("vulkan"); + void(library.Open(filename.c_str())); + } +#endif + return library; +} + +} // namespace Vulkan diff --git a/src/video_core/vulkan_common/vulkan_library.h b/src/video_core/vulkan_common/vulkan_library.h new file mode 100644 index 000000000..8b28b0e17 --- /dev/null +++ b/src/video_core/vulkan_common/vulkan_library.h @@ -0,0 +1,13 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include "common/dynamic_library.h" + +namespace Vulkan { + +Common::DynamicLibrary OpenLibrary(); + +} // namespace Vulkan diff --git a/src/video_core/vulkan_common/vulkan_memory_allocator.cpp b/src/video_core/vulkan_common/vulkan_memory_allocator.cpp new file mode 100644 index 000000000..2a8b7a907 --- /dev/null +++ b/src/video_core/vulkan_common/vulkan_memory_allocator.cpp @@ -0,0 +1,326 @@ +// Copyright 2018 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <algorithm> +#include <bit> +#include <optional> +#include <vector> + +#include <glad/glad.h> + +#include "common/alignment.h" +#include "common/assert.h" +#include "common/common_types.h" +#include "common/logging/log.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_memory_allocator.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" + +namespace Vulkan { +namespace { +struct Range { + u64 begin; + u64 end; + + [[nodiscard]] bool Contains(u64 iterator, u64 size) const noexcept { + return iterator < end && begin < iterator + size; + } +}; + +[[nodiscard]] u64 AllocationChunkSize(u64 required_size) { + static constexpr std::array sizes{ + 0x1000ULL << 10, 0x1400ULL << 10, 0x1800ULL << 10, 0x1c00ULL << 10, 0x2000ULL << 10, + 0x3200ULL << 10, 0x4000ULL << 10, 0x6000ULL << 10, 0x8000ULL << 10, 0xA000ULL << 10, + 0x10000ULL << 10, 0x18000ULL << 10, 0x20000ULL << 10, + }; + static_assert(std::is_sorted(sizes.begin(), sizes.end())); + + const auto it = std::ranges::lower_bound(sizes, required_size); + return it != sizes.end() ? *it : Common::AlignUp(required_size, 4ULL << 20); +} + +[[nodiscard]] VkMemoryPropertyFlags MemoryUsagePropertyFlags(MemoryUsage usage) { + switch (usage) { + case MemoryUsage::DeviceLocal: + return VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + case MemoryUsage::Upload: + return VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; + case MemoryUsage::Download: + return VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | + VK_MEMORY_PROPERTY_HOST_CACHED_BIT; + } + UNREACHABLE_MSG("Invalid memory usage={}", usage); + return VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; +} +} // Anonymous namespace + +class MemoryAllocation { +public: + explicit MemoryAllocation(vk::DeviceMemory memory_, VkMemoryPropertyFlags properties, + u64 allocation_size_, u32 type) + : memory{std::move(memory_)}, allocation_size{allocation_size_}, property_flags{properties}, + shifted_memory_type{1U << type} {} + +#if defined(_WIN32) || defined(__linux__) + ~MemoryAllocation() { + if (owning_opengl_handle != 0) { + glDeleteMemoryObjectsEXT(1, &owning_opengl_handle); + } + } +#endif + + MemoryAllocation& operator=(const MemoryAllocation&) = delete; + MemoryAllocation(const MemoryAllocation&) = delete; + + MemoryAllocation& operator=(MemoryAllocation&&) = delete; + MemoryAllocation(MemoryAllocation&&) = delete; + + [[nodiscard]] std::optional<MemoryCommit> Commit(VkDeviceSize size, VkDeviceSize alignment) { + const std::optional<u64> alloc = FindFreeRegion(size, alignment); + if (!alloc) { + // Signal out of memory, it'll try to do more allocations. + return std::nullopt; + } + const Range range{ + .begin = *alloc, + .end = *alloc + size, + }; + commits.insert(std::ranges::upper_bound(commits, *alloc, {}, &Range::begin), range); + return std::make_optional<MemoryCommit>(this, *memory, *alloc, *alloc + size); + } + + void Free(u64 begin) { + const auto it = std::ranges::find(commits, begin, &Range::begin); + ASSERT_MSG(it != commits.end(), "Invalid commit"); + commits.erase(it); + } + + [[nodiscard]] std::span<u8> Map() { + if (memory_mapped_span.empty()) { + u8* const raw_pointer = memory.Map(0, allocation_size); + memory_mapped_span = std::span<u8>(raw_pointer, allocation_size); + } + return memory_mapped_span; + } + +#ifdef _WIN32 + [[nodiscard]] u32 ExportOpenGLHandle() { + if (!owning_opengl_handle) { + glCreateMemoryObjectsEXT(1, &owning_opengl_handle); + glImportMemoryWin32HandleEXT(owning_opengl_handle, allocation_size, + GL_HANDLE_TYPE_OPAQUE_WIN32_EXT, + memory.GetMemoryWin32HandleKHR()); + } + return owning_opengl_handle; + } +#elif __linux__ + [[nodiscard]] u32 ExportOpenGLHandle() { + if (!owning_opengl_handle) { + glCreateMemoryObjectsEXT(1, &owning_opengl_handle); + glImportMemoryFdEXT(owning_opengl_handle, allocation_size, GL_HANDLE_TYPE_OPAQUE_FD_EXT, + memory.GetMemoryFdKHR()); + } + return owning_opengl_handle; + } +#else + [[nodiscard]] u32 ExportOpenGLHandle() { + return 0; + } +#endif + + /// Returns whether this allocation is compatible with the arguments. + [[nodiscard]] bool IsCompatible(VkMemoryPropertyFlags flags, u32 type_mask) const { + return (flags & property_flags) && (type_mask & shifted_memory_type) != 0; + } + +private: + [[nodiscard]] static constexpr u32 ShiftType(u32 type) { + return 1U << type; + } + + [[nodiscard]] std::optional<u64> FindFreeRegion(u64 size, u64 alignment) noexcept { + ASSERT(std::has_single_bit(alignment)); + const u64 alignment_log2 = std::countr_zero(alignment); + std::optional<u64> candidate; + u64 iterator = 0; + auto commit = commits.begin(); + while (iterator + size <= allocation_size) { + candidate = candidate.value_or(iterator); + if (commit == commits.end()) { + break; + } + if (commit->Contains(*candidate, size)) { + candidate = std::nullopt; + } + iterator = Common::AlignUpLog2(commit->end, alignment_log2); + ++commit; + } + return candidate; + } + + const vk::DeviceMemory memory; ///< Vulkan memory allocation handler. + const u64 allocation_size; ///< Size of this allocation. + const VkMemoryPropertyFlags property_flags; ///< Vulkan memory property flags. + const u32 shifted_memory_type; ///< Shifted Vulkan memory type. + std::vector<Range> commits; ///< All commit ranges done from this allocation. + std::span<u8> memory_mapped_span; ///< Memory mapped span. Empty if not queried before. +#if defined(_WIN32) || defined(__linux__) + u32 owning_opengl_handle{}; ///< Owning OpenGL memory object handle. +#endif +}; + +MemoryCommit::MemoryCommit(MemoryAllocation* allocation_, VkDeviceMemory memory_, u64 begin_, + u64 end_) noexcept + : allocation{allocation_}, memory{memory_}, begin{begin_}, end{end_} {} + +MemoryCommit::~MemoryCommit() { + Release(); +} + +MemoryCommit& MemoryCommit::operator=(MemoryCommit&& rhs) noexcept { + Release(); + allocation = std::exchange(rhs.allocation, nullptr); + memory = rhs.memory; + begin = rhs.begin; + end = rhs.end; + span = std::exchange(rhs.span, std::span<u8>{}); + return *this; +} + +MemoryCommit::MemoryCommit(MemoryCommit&& rhs) noexcept + : allocation{std::exchange(rhs.allocation, nullptr)}, memory{rhs.memory}, begin{rhs.begin}, + end{rhs.end}, span{std::exchange(rhs.span, std::span<u8>{})} {} + +std::span<u8> MemoryCommit::Map() { + if (span.empty()) { + span = allocation->Map().subspan(begin, end - begin); + } + return span; +} + +u32 MemoryCommit::ExportOpenGLHandle() const { + return allocation->ExportOpenGLHandle(); +} + +void MemoryCommit::Release() { + if (allocation) { + allocation->Free(begin); + } +} + +MemoryAllocator::MemoryAllocator(const Device& device_, bool export_allocations_) + : device{device_}, properties{device_.GetPhysical().GetMemoryProperties()}, + export_allocations{export_allocations_} {} + +MemoryAllocator::~MemoryAllocator() = default; + +MemoryCommit MemoryAllocator::Commit(const VkMemoryRequirements& requirements, MemoryUsage usage) { + // Find the fastest memory flags we can afford with the current requirements + const VkMemoryPropertyFlags flags = MemoryPropertyFlags(requirements.memoryTypeBits, usage); + if (std::optional<MemoryCommit> commit = TryCommit(requirements, flags)) { + return std::move(*commit); + } + // Commit has failed, allocate more memory. + // TODO(Rodrigo): Handle out of memory situations in some way like flushing to guest memory. + AllocMemory(flags, requirements.memoryTypeBits, AllocationChunkSize(requirements.size)); + + // Commit again, this time it won't fail since there's a fresh allocation above. + // If it does, there's a bug. + return TryCommit(requirements, flags).value(); +} + +MemoryCommit MemoryAllocator::Commit(const vk::Buffer& buffer, MemoryUsage usage) { + auto commit = Commit(device.GetLogical().GetBufferMemoryRequirements(*buffer), usage); + buffer.BindMemory(commit.Memory(), commit.Offset()); + return commit; +} + +MemoryCommit MemoryAllocator::Commit(const vk::Image& image, MemoryUsage usage) { + auto commit = Commit(device.GetLogical().GetImageMemoryRequirements(*image), usage); + image.BindMemory(commit.Memory(), commit.Offset()); + return commit; +} + +void MemoryAllocator::AllocMemory(VkMemoryPropertyFlags flags, u32 type_mask, u64 size) { + const u32 type = FindType(flags, type_mask).value(); + const VkExportMemoryAllocateInfo export_allocate_info{ + .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO, + .pNext = nullptr, +#ifdef _WIN32 + .handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT, +#elif __linux__ + .handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT, +#else + .handleTypes = 0, +#endif + }; + vk::DeviceMemory memory = device.GetLogical().AllocateMemory({ + .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, + .pNext = export_allocations ? &export_allocate_info : nullptr, + .allocationSize = size, + .memoryTypeIndex = type, + }); + allocations.push_back(std::make_unique<MemoryAllocation>(std::move(memory), flags, size, type)); +} + +std::optional<MemoryCommit> MemoryAllocator::TryCommit(const VkMemoryRequirements& requirements, + VkMemoryPropertyFlags flags) { + for (auto& allocation : allocations) { + if (!allocation->IsCompatible(flags, requirements.memoryTypeBits)) { + continue; + } + if (auto commit = allocation->Commit(requirements.size, requirements.alignment)) { + return commit; + } + } + return std::nullopt; +} + +VkMemoryPropertyFlags MemoryAllocator::MemoryPropertyFlags(u32 type_mask, MemoryUsage usage) const { + return MemoryPropertyFlags(type_mask, MemoryUsagePropertyFlags(usage)); +} + +VkMemoryPropertyFlags MemoryAllocator::MemoryPropertyFlags(u32 type_mask, + VkMemoryPropertyFlags flags) const { + if (FindType(flags, type_mask)) { + // Found a memory type with those requirements + return flags; + } + if (flags & VK_MEMORY_PROPERTY_HOST_CACHED_BIT) { + // Remove host cached bit in case it's not supported + return MemoryPropertyFlags(type_mask, flags & ~VK_MEMORY_PROPERTY_HOST_CACHED_BIT); + } + if (flags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) { + // Remove device local, if it's not supported by the requested resource + return MemoryPropertyFlags(type_mask, flags & ~VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); + } + UNREACHABLE_MSG("No compatible memory types found"); + return 0; +} + +std::optional<u32> MemoryAllocator::FindType(VkMemoryPropertyFlags flags, u32 type_mask) const { + for (u32 type_index = 0; type_index < properties.memoryTypeCount; ++type_index) { + const VkMemoryPropertyFlags type_flags = properties.memoryTypes[type_index].propertyFlags; + if ((type_mask & (1U << type_index)) && (type_flags & flags)) { + // The type matches in type and in the wanted properties. + return type_index; + } + } + // Failed to find index + return std::nullopt; +} + +bool IsHostVisible(MemoryUsage usage) noexcept { + switch (usage) { + case MemoryUsage::DeviceLocal: + return false; + case MemoryUsage::Upload: + case MemoryUsage::Download: + return true; + } + UNREACHABLE_MSG("Invalid memory usage={}", usage); + return false; +} + +} // namespace Vulkan diff --git a/src/video_core/vulkan_common/vulkan_memory_allocator.h b/src/video_core/vulkan_common/vulkan_memory_allocator.h new file mode 100644 index 000000000..d1ce29450 --- /dev/null +++ b/src/video_core/vulkan_common/vulkan_memory_allocator.h @@ -0,0 +1,129 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <memory> +#include <span> +#include <utility> +#include <vector> +#include "common/common_types.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" + +namespace Vulkan { + +class Device; +class MemoryMap; +class MemoryAllocation; + +/// Hints and requirements for the backing memory type of a commit +enum class MemoryUsage { + DeviceLocal, ///< Hints device local usages, fastest memory type to read and write from the GPU + Upload, ///< Requires a host visible memory type optimized for CPU to GPU uploads + Download, ///< Requires a host visible memory type optimized for GPU to CPU readbacks +}; + +/// Ownership handle of a memory commitment. +/// Points to a subregion of a memory allocation. +class MemoryCommit { +public: + explicit MemoryCommit() noexcept = default; + explicit MemoryCommit(MemoryAllocation* allocation_, VkDeviceMemory memory_, u64 begin_, + u64 end_) noexcept; + ~MemoryCommit(); + + MemoryCommit& operator=(MemoryCommit&&) noexcept; + MemoryCommit(MemoryCommit&&) noexcept; + + MemoryCommit& operator=(const MemoryCommit&) = delete; + MemoryCommit(const MemoryCommit&) = delete; + + /// Returns a host visible memory map. + /// It will map the backing allocation if it hasn't been mapped before. + std::span<u8> Map(); + + /// Returns an non-owning OpenGL handle, creating one if it doesn't exist. + u32 ExportOpenGLHandle() const; + + /// Returns the Vulkan memory handler. + VkDeviceMemory Memory() const { + return memory; + } + + /// Returns the start position of the commit relative to the allocation. + VkDeviceSize Offset() const { + return static_cast<VkDeviceSize>(begin); + } + +private: + void Release(); + + MemoryAllocation* allocation{}; ///< Pointer to the large memory allocation. + VkDeviceMemory memory{}; ///< Vulkan device memory handler. + u64 begin{}; ///< Beginning offset in bytes to where the commit exists. + u64 end{}; ///< Offset in bytes where the commit ends. + std::span<u8> span; ///< Host visible memory span. Empty if not queried before. +}; + +/// Memory allocator container. +/// Allocates and releases memory allocations on demand. +class MemoryAllocator { +public: + /** + * Construct memory allocator + * + * @param device_ Device to allocate from + * @param export_allocations_ True when allocations have to be exported + * + * @throw vk::Exception on failure + */ + explicit MemoryAllocator(const Device& device_, bool export_allocations_); + ~MemoryAllocator(); + + MemoryAllocator& operator=(const MemoryAllocator&) = delete; + MemoryAllocator(const MemoryAllocator&) = delete; + + /** + * Commits a memory with the specified requirements. + * + * @param requirements Requirements returned from a Vulkan call. + * @param usage Indicates how the memory will be used. + * + * @returns A memory commit. + */ + MemoryCommit Commit(const VkMemoryRequirements& requirements, MemoryUsage usage); + + /// Commits memory required by the buffer and binds it. + MemoryCommit Commit(const vk::Buffer& buffer, MemoryUsage usage); + + /// Commits memory required by the image and binds it. + MemoryCommit Commit(const vk::Image& image, MemoryUsage usage); + +private: + /// Allocates a chunk of memory. + void AllocMemory(VkMemoryPropertyFlags flags, u32 type_mask, u64 size); + + /// Tries to allocate a memory commit. + std::optional<MemoryCommit> TryCommit(const VkMemoryRequirements& requirements, + VkMemoryPropertyFlags flags); + + /// Returns the fastest compatible memory property flags from a wanted usage. + VkMemoryPropertyFlags MemoryPropertyFlags(u32 type_mask, MemoryUsage usage) const; + + /// Returns the fastest compatible memory property flags from the wanted flags. + VkMemoryPropertyFlags MemoryPropertyFlags(u32 type_mask, VkMemoryPropertyFlags flags) const; + + /// Returns index to the fastest memory type compatible with the passed requirements. + std::optional<u32> FindType(VkMemoryPropertyFlags flags, u32 type_mask) const; + + const Device& device; ///< Device handle. + const VkPhysicalDeviceMemoryProperties properties; ///< Physical device properties. + const bool export_allocations; ///< True when memory allocations have to be exported. + std::vector<std::unique_ptr<MemoryAllocation>> allocations; ///< Current allocations. +}; + +/// Returns true when a memory usage is guaranteed to be host visible. +bool IsHostVisible(MemoryUsage usage) noexcept; + +} // namespace Vulkan diff --git a/src/video_core/vulkan_common/vulkan_surface.cpp b/src/video_core/vulkan_common/vulkan_surface.cpp new file mode 100644 index 000000000..3c3238f96 --- /dev/null +++ b/src/video_core/vulkan_common/vulkan_surface.cpp @@ -0,0 +1,81 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "common/logging/log.h" +#include "core/frontend/emu_window.h" +#include "video_core/vulkan_common/vulkan_surface.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" + +// Include these late to avoid polluting previous headers +#ifdef _WIN32 +#include <windows.h> +// ensure include order +#include <vulkan/vulkan_win32.h> +#endif + +#if !defined(_WIN32) && !defined(__APPLE__) +#include <X11/Xlib.h> +#include <vulkan/vulkan_wayland.h> +#include <vulkan/vulkan_xlib.h> +#endif + +namespace Vulkan { + +vk::SurfaceKHR CreateSurface(const vk::Instance& instance, + const Core::Frontend::EmuWindow& emu_window) { + [[maybe_unused]] const vk::InstanceDispatch& dld = instance.Dispatch(); + [[maybe_unused]] const auto& window_info = emu_window.GetWindowInfo(); + VkSurfaceKHR unsafe_surface = nullptr; + +#ifdef _WIN32 + if (window_info.type == Core::Frontend::WindowSystemType::Windows) { + const HWND hWnd = static_cast<HWND>(window_info.render_surface); + const VkWin32SurfaceCreateInfoKHR win32_ci{VK_STRUCTURE_TYPE_WIN32_SURFACE_CREATE_INFO_KHR, + nullptr, 0, nullptr, hWnd}; + const auto vkCreateWin32SurfaceKHR = reinterpret_cast<PFN_vkCreateWin32SurfaceKHR>( + dld.vkGetInstanceProcAddr(*instance, "vkCreateWin32SurfaceKHR")); + if (!vkCreateWin32SurfaceKHR || + vkCreateWin32SurfaceKHR(*instance, &win32_ci, nullptr, &unsafe_surface) != VK_SUCCESS) { + LOG_ERROR(Render_Vulkan, "Failed to initialize Win32 surface"); + throw vk::Exception(VK_ERROR_INITIALIZATION_FAILED); + } + } +#endif +#if !defined(_WIN32) && !defined(__APPLE__) + if (window_info.type == Core::Frontend::WindowSystemType::X11) { + const VkXlibSurfaceCreateInfoKHR xlib_ci{ + VK_STRUCTURE_TYPE_XLIB_SURFACE_CREATE_INFO_KHR, nullptr, 0, + static_cast<Display*>(window_info.display_connection), + reinterpret_cast<Window>(window_info.render_surface)}; + const auto vkCreateXlibSurfaceKHR = reinterpret_cast<PFN_vkCreateXlibSurfaceKHR>( + dld.vkGetInstanceProcAddr(*instance, "vkCreateXlibSurfaceKHR")); + if (!vkCreateXlibSurfaceKHR || + vkCreateXlibSurfaceKHR(*instance, &xlib_ci, nullptr, &unsafe_surface) != VK_SUCCESS) { + LOG_ERROR(Render_Vulkan, "Failed to initialize Xlib surface"); + throw vk::Exception(VK_ERROR_INITIALIZATION_FAILED); + } + } + if (window_info.type == Core::Frontend::WindowSystemType::Wayland) { + const VkWaylandSurfaceCreateInfoKHR wayland_ci{ + VK_STRUCTURE_TYPE_WAYLAND_SURFACE_CREATE_INFO_KHR, nullptr, 0, + static_cast<wl_display*>(window_info.display_connection), + static_cast<wl_surface*>(window_info.render_surface)}; + const auto vkCreateWaylandSurfaceKHR = reinterpret_cast<PFN_vkCreateWaylandSurfaceKHR>( + dld.vkGetInstanceProcAddr(*instance, "vkCreateWaylandSurfaceKHR")); + if (!vkCreateWaylandSurfaceKHR || + vkCreateWaylandSurfaceKHR(*instance, &wayland_ci, nullptr, &unsafe_surface) != + VK_SUCCESS) { + LOG_ERROR(Render_Vulkan, "Failed to initialize Wayland surface"); + throw vk::Exception(VK_ERROR_INITIALIZATION_FAILED); + } + } +#endif + if (!unsafe_surface) { + LOG_ERROR(Render_Vulkan, "Presentation not supported on this platform"); + throw vk::Exception(VK_ERROR_INITIALIZATION_FAILED); + } + return vk::SurfaceKHR(unsafe_surface, *instance, dld); +} + +} // namespace Vulkan diff --git a/src/video_core/vulkan_common/vulkan_surface.h b/src/video_core/vulkan_common/vulkan_surface.h new file mode 100644 index 000000000..05a169e32 --- /dev/null +++ b/src/video_core/vulkan_common/vulkan_surface.h @@ -0,0 +1,18 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include "video_core/vulkan_common/vulkan_wrapper.h" + +namespace Core::Frontend { +class EmuWindow; +} + +namespace Vulkan { + +[[nodiscard]] vk::SurfaceKHR CreateSurface(const vk::Instance& instance, + const Core::Frontend::EmuWindow& emu_window); + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/wrapper.cpp b/src/video_core/vulkan_common/vulkan_wrapper.cpp index 013865aa4..2aa0ffbe6 100644 --- a/src/video_core/renderer_vulkan/wrapper.cpp +++ b/src/video_core/vulkan_common/vulkan_wrapper.cpp @@ -6,32 +6,55 @@ #include <exception> #include <memory> #include <optional> +#include <string_view> #include <utility> #include <vector> #include "common/common_types.h" +#include "common/logging/log.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan::vk { namespace { +template <typename Func> +void SortPhysicalDevices(std::vector<VkPhysicalDevice>& devices, const InstanceDispatch& dld, + Func&& func) { + // Calling GetProperties calls Vulkan more than needed. But they are supposed to be cheap + // functions. + std::stable_sort(devices.begin(), devices.end(), + [&dld, &func](VkPhysicalDevice lhs, VkPhysicalDevice rhs) { + return func(vk::PhysicalDevice(lhs, dld).GetProperties(), + vk::PhysicalDevice(rhs, dld).GetProperties()); + }); +} + +void SortPhysicalDevicesPerVendor(std::vector<VkPhysicalDevice>& devices, + const InstanceDispatch& dld, + std::initializer_list<u32> vendor_ids) { + for (auto it = vendor_ids.end(); it != vendor_ids.begin();) { + --it; + SortPhysicalDevices(devices, dld, [id = *it](const auto& lhs, const auto& rhs) { + return lhs.vendorID == id && rhs.vendorID != id; + }); + } +} + void SortPhysicalDevices(std::vector<VkPhysicalDevice>& devices, const InstanceDispatch& dld) { - std::stable_sort(devices.begin(), devices.end(), [&](auto lhs, auto rhs) { - // This will call Vulkan more than needed, but these calls are cheap. - const auto lhs_properties = vk::PhysicalDevice(lhs, dld).GetProperties(); - const auto rhs_properties = vk::PhysicalDevice(rhs, dld).GetProperties(); - - // Prefer discrete GPUs, Nvidia over AMD, AMD over Intel, Intel over the rest. - const bool preferred = - (lhs_properties.deviceType == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU && - rhs_properties.deviceType != VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU) || - (lhs_properties.vendorID == 0x10DE && rhs_properties.vendorID != 0x10DE) || - (lhs_properties.vendorID == 0x1002 && rhs_properties.vendorID != 0x1002) || - (lhs_properties.vendorID == 0x8086 && rhs_properties.vendorID != 0x8086); - return !preferred; + // Sort by name, this will set a base and make GPUs with higher numbers appear first + // (e.g. GTX 1650 will intentionally be listed before a GTX 1080). + SortPhysicalDevices(devices, dld, [](const auto& lhs, const auto& rhs) { + return std::string_view{lhs.deviceName} > std::string_view{rhs.deviceName}; }); + // Prefer discrete over non-discrete + SortPhysicalDevices(devices, dld, [](const auto& lhs, const auto& rhs) { + return lhs.deviceType == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU && + rhs.deviceType != VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU; + }); + // Prefer Nvidia over AMD, AMD over Intel, Intel over the rest. + SortPhysicalDevicesPerVendor(devices, dld, {0x10DE, 0x1002, 0x8086}); } template <typename T> @@ -58,6 +81,7 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { X(vkCmdBeginQuery); X(vkCmdBeginRenderPass); X(vkCmdBeginTransformFeedbackEXT); + X(vkCmdBeginDebugUtilsLabelEXT); X(vkCmdBindDescriptorSets); X(vkCmdBindIndexBuffer); X(vkCmdBindPipeline); @@ -75,6 +99,7 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { X(vkCmdEndQuery); X(vkCmdEndRenderPass); X(vkCmdEndTransformFeedbackEXT); + X(vkCmdEndDebugUtilsLabelEXT); X(vkCmdFillBuffer); X(vkCmdPipelineBarrier); X(vkCmdPushConstants); @@ -98,6 +123,7 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { X(vkCmdSetPrimitiveTopologyEXT); X(vkCmdSetStencilOpEXT); X(vkCmdSetStencilTestEnableEXT); + X(vkCmdResolveImage); X(vkCreateBuffer); X(vkCreateBufferView); X(vkCreateCommandPool); @@ -142,23 +168,44 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { X(vkFreeCommandBuffers); X(vkFreeDescriptorSets); X(vkFreeMemory); - X(vkGetBufferMemoryRequirements); + X(vkGetBufferMemoryRequirements2); X(vkGetDeviceQueue); X(vkGetEventStatus); X(vkGetFenceStatus); X(vkGetImageMemoryRequirements); + X(vkGetMemoryFdKHR); +#ifdef _WIN32 + X(vkGetMemoryWin32HandleKHR); +#endif X(vkGetQueryPoolResults); + X(vkGetSemaphoreCounterValueKHR); X(vkMapMemory); X(vkQueueSubmit); X(vkResetFences); X(vkResetQueryPoolEXT); + X(vkSetDebugUtilsObjectNameEXT); + X(vkSetDebugUtilsObjectTagEXT); X(vkUnmapMemory); X(vkUpdateDescriptorSetWithTemplateKHR); X(vkUpdateDescriptorSets); X(vkWaitForFences); + X(vkWaitSemaphoresKHR); #undef X } +template <typename T> +void SetObjectName(const DeviceDispatch* dld, VkDevice device, T handle, VkObjectType type, + const char* name) { + const VkDebugUtilsObjectNameInfoEXT name_info{ + .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_OBJECT_NAME_INFO_EXT, + .pNext = nullptr, + .objectType = VK_OBJECT_TYPE_IMAGE, + .objectHandle = reinterpret_cast<u64>(handle), + .pObjectName = name, + }; + Check(dld->vkSetDebugUtilsObjectNameEXT(device, &name_info)); +} + } // Anonymous namespace bool Load(InstanceDispatch& dld) noexcept { @@ -262,6 +309,22 @@ const char* ToString(VkResult result) noexcept { return "VK_ERROR_INVALID_DEVICE_ADDRESS_EXT"; case VkResult::VK_ERROR_FULL_SCREEN_EXCLUSIVE_MODE_LOST_EXT: return "VK_ERROR_FULL_SCREEN_EXCLUSIVE_MODE_LOST_EXT"; + case VkResult::VK_ERROR_UNKNOWN: + return "VK_ERROR_UNKNOWN"; + case VkResult::VK_ERROR_INCOMPATIBLE_VERSION_KHR: + return "VK_ERROR_INCOMPATIBLE_VERSION_KHR"; + case VkResult::VK_THREAD_IDLE_KHR: + return "VK_THREAD_IDLE_KHR"; + case VkResult::VK_THREAD_DONE_KHR: + return "VK_THREAD_DONE_KHR"; + case VkResult::VK_OPERATION_DEFERRED_KHR: + return "VK_OPERATION_DEFERRED_KHR"; + case VkResult::VK_OPERATION_NOT_DEFERRED_KHR: + return "VK_OPERATION_NOT_DEFERRED_KHR"; + case VkResult::VK_PIPELINE_COMPILE_REQUIRED_EXT: + return "VK_PIPELINE_COMPILE_REQUIRED_EXT"; + case VkResult::VK_RESULT_MAX_ENUM: + return "VK_RESULT_MAX_ENUM"; } return "Unknown"; } @@ -375,18 +438,17 @@ VkResult Free(VkDevice device, VkCommandPool handle, Span<VkCommandBuffer> buffe return VK_SUCCESS; } -Instance Instance::Create(Span<const char*> layers, Span<const char*> extensions, - InstanceDispatch& dld) noexcept { - static constexpr VkApplicationInfo application_info{ +Instance Instance::Create(u32 version, Span<const char*> layers, Span<const char*> extensions, + InstanceDispatch& dispatch) { + const VkApplicationInfo application_info{ .sType = VK_STRUCTURE_TYPE_APPLICATION_INFO, .pNext = nullptr, .pApplicationName = "yuzu Emulator", .applicationVersion = VK_MAKE_VERSION(0, 1, 0), .pEngineName = "yuzu Emulator", .engineVersion = VK_MAKE_VERSION(0, 1, 0), - .apiVersion = VK_API_VERSION_1_1, + .apiVersion = version, }; - const VkInstanceCreateInfo ci{ .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, .pNext = nullptr, @@ -397,66 +459,94 @@ Instance Instance::Create(Span<const char*> layers, Span<const char*> extensions .enabledExtensionCount = extensions.size(), .ppEnabledExtensionNames = extensions.data(), }; - VkInstance instance; - if (dld.vkCreateInstance(&ci, nullptr, &instance) != VK_SUCCESS) { - // Failed to create the instance. - return {}; - } - if (!Proc(dld.vkDestroyInstance, dld, "vkDestroyInstance", instance)) { + Check(dispatch.vkCreateInstance(&ci, nullptr, &instance)); + if (!Proc(dispatch.vkDestroyInstance, dispatch, "vkDestroyInstance", instance)) { // We successfully created an instance but the destroy function couldn't be loaded. // This is a good moment to panic. - return {}; + throw vk::Exception(VK_ERROR_INITIALIZATION_FAILED); } - - return Instance(instance, dld); + return Instance(instance, dispatch); } -std::optional<std::vector<VkPhysicalDevice>> Instance::EnumeratePhysicalDevices() { +std::vector<VkPhysicalDevice> Instance::EnumeratePhysicalDevices() const { u32 num; - if (dld->vkEnumeratePhysicalDevices(handle, &num, nullptr) != VK_SUCCESS) { - return std::nullopt; - } + Check(dld->vkEnumeratePhysicalDevices(handle, &num, nullptr)); std::vector<VkPhysicalDevice> physical_devices(num); - if (dld->vkEnumeratePhysicalDevices(handle, &num, physical_devices.data()) != VK_SUCCESS) { - return std::nullopt; - } + Check(dld->vkEnumeratePhysicalDevices(handle, &num, physical_devices.data())); SortPhysicalDevices(physical_devices, *dld); - return std::make_optional(std::move(physical_devices)); + return physical_devices; } -DebugCallback Instance::TryCreateDebugCallback( - PFN_vkDebugUtilsMessengerCallbackEXT callback) noexcept { - const VkDebugUtilsMessengerCreateInfoEXT ci{ - .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT, - .pNext = nullptr, - .flags = 0, - .messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT | - VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | - VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT | - VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT, - .messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | - VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | - VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT, - .pfnUserCallback = callback, - .pUserData = nullptr, - }; - - VkDebugUtilsMessengerEXT messenger; - if (dld->vkCreateDebugUtilsMessengerEXT(handle, &ci, nullptr, &messenger) != VK_SUCCESS) { - return {}; - } - return DebugCallback(messenger, handle, *dld); +DebugUtilsMessenger Instance::CreateDebugUtilsMessenger( + const VkDebugUtilsMessengerCreateInfoEXT& create_info) const { + VkDebugUtilsMessengerEXT object; + Check(dld->vkCreateDebugUtilsMessengerEXT(handle, &create_info, nullptr, &object)); + return DebugUtilsMessenger(object, handle, *dld); } void Buffer::BindMemory(VkDeviceMemory memory, VkDeviceSize offset) const { Check(dld->vkBindBufferMemory(owner, handle, memory, offset)); } +void Buffer::SetObjectNameEXT(const char* name) const { + SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_BUFFER, name); +} + +void BufferView::SetObjectNameEXT(const char* name) const { + SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_BUFFER_VIEW, name); +} + void Image::BindMemory(VkDeviceMemory memory, VkDeviceSize offset) const { Check(dld->vkBindImageMemory(owner, handle, memory, offset)); } +void Image::SetObjectNameEXT(const char* name) const { + SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_IMAGE, name); +} + +void ImageView::SetObjectNameEXT(const char* name) const { + SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_IMAGE_VIEW, name); +} + +int DeviceMemory::GetMemoryFdKHR() const { + const VkMemoryGetFdInfoKHR get_fd_info{ + .sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR, + .pNext = nullptr, + .memory = handle, + .handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR, + }; + int fd; + Check(dld->vkGetMemoryFdKHR(owner, &get_fd_info, &fd)); + return fd; +} + +#ifdef _WIN32 +HANDLE DeviceMemory::GetMemoryWin32HandleKHR() const { + const VkMemoryGetWin32HandleInfoKHR get_win32_handle_info{ + .sType = VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR, + .pNext = nullptr, + .memory = handle, + .handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR, + }; + HANDLE win32_handle; + Check(dld->vkGetMemoryWin32HandleKHR(owner, &get_win32_handle_info, &win32_handle)); + return win32_handle; +} +#endif + +void DeviceMemory::SetObjectNameEXT(const char* name) const { + SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_DEVICE_MEMORY, name); +} + +void Fence::SetObjectNameEXT(const char* name) const { + SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_FENCE, name); +} + +void Framebuffer::SetObjectNameEXT(const char* name) const { + SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_FRAMEBUFFER, name); +} + DescriptorSets DescriptorPool::Allocate(const VkDescriptorSetAllocateInfo& ai) const { const std::size_t num = ai.descriptorSetCount; std::unique_ptr sets = std::make_unique<VkDescriptorSet[]>(num); @@ -470,6 +560,10 @@ DescriptorSets DescriptorPool::Allocate(const VkDescriptorSetAllocateInfo& ai) c } } +void DescriptorPool::SetObjectNameEXT(const char* name) const { + SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_DESCRIPTOR_POOL, name); +} + CommandBuffers CommandPool::Allocate(std::size_t num_buffers, VkCommandBufferLevel level) const { const VkCommandBufferAllocateInfo ai{ .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, @@ -490,6 +584,10 @@ CommandBuffers CommandPool::Allocate(std::size_t num_buffers, VkCommandBufferLev } } +void CommandPool::SetObjectNameEXT(const char* name) const { + SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_COMMAND_POOL, name); +} + std::vector<VkImage> SwapchainKHR::GetImages() const { u32 num; Check(dld->vkGetSwapchainImagesKHR(owner, handle, &num, nullptr)); @@ -498,9 +596,21 @@ std::vector<VkImage> SwapchainKHR::GetImages() const { return images; } +void Event::SetObjectNameEXT(const char* name) const { + SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_EVENT, name); +} + +void ShaderModule::SetObjectNameEXT(const char* name) const { + SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_SHADER_MODULE, name); +} + +void Semaphore::SetObjectNameEXT(const char* name) const { + SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_SEMAPHORE, name); +} + Device Device::Create(VkPhysicalDevice physical_device, Span<VkDeviceQueueCreateInfo> queues_ci, Span<const char*> enabled_extensions, const void* next, - DeviceDispatch& dld) noexcept { + DeviceDispatch& dispatch) { const VkDeviceCreateInfo ci{ .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, .pNext = next, @@ -513,13 +623,10 @@ Device Device::Create(VkPhysicalDevice physical_device, Span<VkDeviceQueueCreate .ppEnabledExtensionNames = enabled_extensions.data(), .pEnabledFeatures = nullptr, }; - VkDevice device; - if (dld.vkCreateDevice(physical_device, &ci, nullptr, &device) != VK_SUCCESS) { - return {}; - } - Load(device, dld); - return Device(device, dld); + Check(dispatch.vkCreateDevice(physical_device, &ci, nullptr, &device)); + Load(device, dispatch); + return Device(device, dispatch); } Queue Device::GetQueue(u32 family_index) const noexcept { @@ -558,7 +665,10 @@ Semaphore Device::CreateSemaphore() const { .pNext = nullptr, .flags = 0, }; + return CreateSemaphore(ci); +} +Semaphore Device::CreateSemaphore(const VkSemaphoreCreateInfo& ci) const { VkSemaphore object; Check(dld->vkCreateSemaphore(handle, &ci, nullptr, &object)); return Semaphore(object, handle, *dld); @@ -644,7 +754,7 @@ ShaderModule Device::CreateShaderModule(const VkShaderModuleCreateInfo& ci) cons return ShaderModule(object, handle, *dld); } -Event Device::CreateNewEvent() const { +Event Device::CreateEvent() const { static constexpr VkEventCreateInfo ci{ .sType = VK_STRUCTURE_TYPE_EVENT_CREATE_INFO, .pNext = nullptr, @@ -676,10 +786,20 @@ DeviceMemory Device::AllocateMemory(const VkMemoryAllocateInfo& ai) const { return DeviceMemory(memory, handle, *dld); } -VkMemoryRequirements Device::GetBufferMemoryRequirements(VkBuffer buffer) const noexcept { - VkMemoryRequirements requirements; - dld->vkGetBufferMemoryRequirements(handle, buffer, &requirements); - return requirements; +VkMemoryRequirements Device::GetBufferMemoryRequirements(VkBuffer buffer, + void* pnext) const noexcept { + const VkBufferMemoryRequirementsInfo2 info{ + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2, + .pNext = nullptr, + .buffer = buffer, + }; + VkMemoryRequirements2 requirements{ + .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2, + .pNext = pnext, + .memoryRequirements{}, + }; + dld->vkGetBufferMemoryRequirements2(handle, &info, &requirements); + return requirements.memoryRequirements; } VkMemoryRequirements Device::GetImageMemoryRequirements(VkImage image) const noexcept { @@ -775,6 +895,21 @@ VkPhysicalDeviceMemoryProperties PhysicalDevice::GetMemoryProperties() const noe return properties; } +u32 AvailableVersion(const InstanceDispatch& dld) noexcept { + PFN_vkEnumerateInstanceVersion vkEnumerateInstanceVersion; + if (!Proc(vkEnumerateInstanceVersion, dld, "vkEnumerateInstanceVersion")) { + // If the procedure is not found, Vulkan 1.0 is assumed + return VK_API_VERSION_1_0; + } + u32 version; + if (const VkResult result = vkEnumerateInstanceVersion(&version); result != VK_SUCCESS) { + LOG_ERROR(Render_Vulkan, "vkEnumerateInstanceVersion returned {}, assuming Vulkan 1.1", + ToString(result)); + return VK_API_VERSION_1_1; + } + return version; +} + std::optional<std::vector<VkExtensionProperties>> EnumerateInstanceExtensionProperties( const InstanceDispatch& dld) { u32 num; @@ -786,7 +921,7 @@ std::optional<std::vector<VkExtensionProperties>> EnumerateInstanceExtensionProp VK_SUCCESS) { return std::nullopt; } - return std::move(properties); + return properties; } std::optional<std::vector<VkLayerProperties>> EnumerateInstanceLayerProperties( diff --git a/src/video_core/renderer_vulkan/wrapper.h b/src/video_core/vulkan_common/vulkan_wrapper.h index b9d3fedc1..3e36d356a 100644 --- a/src/video_core/renderer_vulkan/wrapper.h +++ b/src/video_core/vulkan_common/vulkan_wrapper.h @@ -9,15 +9,31 @@ #include <limits> #include <memory> #include <optional> +#include <span> #include <type_traits> #include <utility> #include <vector> #define VK_NO_PROTOTYPES +#ifdef _WIN32 +#define VK_USE_PLATFORM_WIN32_KHR +#endif #include <vulkan/vulkan.h> +// Sanitize macros +#ifdef CreateEvent +#undef CreateEvent +#endif +#ifdef CreateSemaphore +#undef CreateSemaphore +#endif + #include "common/common_types.h" +#ifdef _MSC_VER +#pragma warning(disable : 26812) // Disable prefer enum class over enum +#endif + namespace Vulkan::vk { /** @@ -41,6 +57,9 @@ public: /// Construct an empty span. constexpr Span() noexcept = default; + /// Construct an empty span + constexpr Span(std::nullptr_t) noexcept {} + /// Construct a span from a single element. constexpr Span(const T& value) noexcept : ptr{&value}, num{1} {} @@ -52,7 +71,7 @@ public: /// Construct a span from a pointer and a size. /// This is inteded for subranges. - constexpr Span(const T* ptr, std::size_t num) noexcept : ptr{ptr}, num{num} {} + constexpr Span(const T* ptr_, std::size_t num_) noexcept : ptr{ptr_}, num{num_} {} /// Returns the data pointer by the span. constexpr const T* data() const noexcept { @@ -136,145 +155,156 @@ inline VkResult Filter(VkResult result) { /// Table holding Vulkan instance function pointers. struct InstanceDispatch { - PFN_vkGetInstanceProcAddr vkGetInstanceProcAddr; - - PFN_vkCreateInstance vkCreateInstance; - PFN_vkDestroyInstance vkDestroyInstance; - PFN_vkEnumerateInstanceExtensionProperties vkEnumerateInstanceExtensionProperties; - PFN_vkEnumerateInstanceLayerProperties vkEnumerateInstanceLayerProperties; - - PFN_vkCreateDebugUtilsMessengerEXT vkCreateDebugUtilsMessengerEXT; - PFN_vkCreateDevice vkCreateDevice; - PFN_vkDestroyDebugUtilsMessengerEXT vkDestroyDebugUtilsMessengerEXT; - PFN_vkDestroyDevice vkDestroyDevice; - PFN_vkDestroySurfaceKHR vkDestroySurfaceKHR; - PFN_vkEnumerateDeviceExtensionProperties vkEnumerateDeviceExtensionProperties; - PFN_vkEnumeratePhysicalDevices vkEnumeratePhysicalDevices; - PFN_vkGetDeviceProcAddr vkGetDeviceProcAddr; - PFN_vkGetPhysicalDeviceFeatures2KHR vkGetPhysicalDeviceFeatures2KHR; - PFN_vkGetPhysicalDeviceFormatProperties vkGetPhysicalDeviceFormatProperties; - PFN_vkGetPhysicalDeviceMemoryProperties vkGetPhysicalDeviceMemoryProperties; - PFN_vkGetPhysicalDeviceProperties vkGetPhysicalDeviceProperties; - PFN_vkGetPhysicalDeviceProperties2KHR vkGetPhysicalDeviceProperties2KHR; - PFN_vkGetPhysicalDeviceQueueFamilyProperties vkGetPhysicalDeviceQueueFamilyProperties; - PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR vkGetPhysicalDeviceSurfaceCapabilitiesKHR; - PFN_vkGetPhysicalDeviceSurfaceFormatsKHR vkGetPhysicalDeviceSurfaceFormatsKHR; - PFN_vkGetPhysicalDeviceSurfacePresentModesKHR vkGetPhysicalDeviceSurfacePresentModesKHR; - PFN_vkGetPhysicalDeviceSurfaceSupportKHR vkGetPhysicalDeviceSurfaceSupportKHR; - PFN_vkGetSwapchainImagesKHR vkGetSwapchainImagesKHR; - PFN_vkQueuePresentKHR vkQueuePresentKHR; + PFN_vkGetInstanceProcAddr vkGetInstanceProcAddr{}; + + PFN_vkCreateInstance vkCreateInstance{}; + PFN_vkDestroyInstance vkDestroyInstance{}; + PFN_vkEnumerateInstanceExtensionProperties vkEnumerateInstanceExtensionProperties{}; + PFN_vkEnumerateInstanceLayerProperties vkEnumerateInstanceLayerProperties{}; + + PFN_vkCreateDebugUtilsMessengerEXT vkCreateDebugUtilsMessengerEXT{}; + PFN_vkCreateDevice vkCreateDevice{}; + PFN_vkDestroyDebugUtilsMessengerEXT vkDestroyDebugUtilsMessengerEXT{}; + PFN_vkDestroyDevice vkDestroyDevice{}; + PFN_vkDestroySurfaceKHR vkDestroySurfaceKHR{}; + PFN_vkEnumerateDeviceExtensionProperties vkEnumerateDeviceExtensionProperties{}; + PFN_vkEnumeratePhysicalDevices vkEnumeratePhysicalDevices{}; + PFN_vkGetDeviceProcAddr vkGetDeviceProcAddr{}; + PFN_vkGetPhysicalDeviceFeatures2KHR vkGetPhysicalDeviceFeatures2KHR{}; + PFN_vkGetPhysicalDeviceFormatProperties vkGetPhysicalDeviceFormatProperties{}; + PFN_vkGetPhysicalDeviceMemoryProperties vkGetPhysicalDeviceMemoryProperties{}; + PFN_vkGetPhysicalDeviceProperties vkGetPhysicalDeviceProperties{}; + PFN_vkGetPhysicalDeviceProperties2KHR vkGetPhysicalDeviceProperties2KHR{}; + PFN_vkGetPhysicalDeviceQueueFamilyProperties vkGetPhysicalDeviceQueueFamilyProperties{}; + PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR vkGetPhysicalDeviceSurfaceCapabilitiesKHR{}; + PFN_vkGetPhysicalDeviceSurfaceFormatsKHR vkGetPhysicalDeviceSurfaceFormatsKHR{}; + PFN_vkGetPhysicalDeviceSurfacePresentModesKHR vkGetPhysicalDeviceSurfacePresentModesKHR{}; + PFN_vkGetPhysicalDeviceSurfaceSupportKHR vkGetPhysicalDeviceSurfaceSupportKHR{}; + PFN_vkGetSwapchainImagesKHR vkGetSwapchainImagesKHR{}; + PFN_vkQueuePresentKHR vkQueuePresentKHR{}; }; /// Table holding Vulkan device function pointers. -struct DeviceDispatch : public InstanceDispatch { - PFN_vkAcquireNextImageKHR vkAcquireNextImageKHR; - PFN_vkAllocateCommandBuffers vkAllocateCommandBuffers; - PFN_vkAllocateDescriptorSets vkAllocateDescriptorSets; - PFN_vkAllocateMemory vkAllocateMemory; - PFN_vkBeginCommandBuffer vkBeginCommandBuffer; - PFN_vkBindBufferMemory vkBindBufferMemory; - PFN_vkBindImageMemory vkBindImageMemory; - PFN_vkCmdBeginQuery vkCmdBeginQuery; - PFN_vkCmdBeginRenderPass vkCmdBeginRenderPass; - PFN_vkCmdBeginTransformFeedbackEXT vkCmdBeginTransformFeedbackEXT; - PFN_vkCmdBindDescriptorSets vkCmdBindDescriptorSets; - PFN_vkCmdBindIndexBuffer vkCmdBindIndexBuffer; - PFN_vkCmdBindPipeline vkCmdBindPipeline; - PFN_vkCmdBindTransformFeedbackBuffersEXT vkCmdBindTransformFeedbackBuffersEXT; - PFN_vkCmdBindVertexBuffers vkCmdBindVertexBuffers; - PFN_vkCmdBlitImage vkCmdBlitImage; - PFN_vkCmdClearAttachments vkCmdClearAttachments; - PFN_vkCmdCopyBuffer vkCmdCopyBuffer; - PFN_vkCmdCopyBufferToImage vkCmdCopyBufferToImage; - PFN_vkCmdCopyImage vkCmdCopyImage; - PFN_vkCmdCopyImageToBuffer vkCmdCopyImageToBuffer; - PFN_vkCmdDispatch vkCmdDispatch; - PFN_vkCmdDraw vkCmdDraw; - PFN_vkCmdDrawIndexed vkCmdDrawIndexed; - PFN_vkCmdEndQuery vkCmdEndQuery; - PFN_vkCmdEndRenderPass vkCmdEndRenderPass; - PFN_vkCmdEndTransformFeedbackEXT vkCmdEndTransformFeedbackEXT; - PFN_vkCmdFillBuffer vkCmdFillBuffer; - PFN_vkCmdPipelineBarrier vkCmdPipelineBarrier; - PFN_vkCmdPushConstants vkCmdPushConstants; - PFN_vkCmdSetBlendConstants vkCmdSetBlendConstants; - PFN_vkCmdSetDepthBias vkCmdSetDepthBias; - PFN_vkCmdSetDepthBounds vkCmdSetDepthBounds; - PFN_vkCmdSetEvent vkCmdSetEvent; - PFN_vkCmdSetScissor vkCmdSetScissor; - PFN_vkCmdSetStencilCompareMask vkCmdSetStencilCompareMask; - PFN_vkCmdSetStencilReference vkCmdSetStencilReference; - PFN_vkCmdSetStencilWriteMask vkCmdSetStencilWriteMask; - PFN_vkCmdSetViewport vkCmdSetViewport; - PFN_vkCmdWaitEvents vkCmdWaitEvents; - PFN_vkCmdBindVertexBuffers2EXT vkCmdBindVertexBuffers2EXT; - PFN_vkCmdSetCullModeEXT vkCmdSetCullModeEXT; - PFN_vkCmdSetDepthBoundsTestEnableEXT vkCmdSetDepthBoundsTestEnableEXT; - PFN_vkCmdSetDepthCompareOpEXT vkCmdSetDepthCompareOpEXT; - PFN_vkCmdSetDepthTestEnableEXT vkCmdSetDepthTestEnableEXT; - PFN_vkCmdSetDepthWriteEnableEXT vkCmdSetDepthWriteEnableEXT; - PFN_vkCmdSetFrontFaceEXT vkCmdSetFrontFaceEXT; - PFN_vkCmdSetPrimitiveTopologyEXT vkCmdSetPrimitiveTopologyEXT; - PFN_vkCmdSetStencilOpEXT vkCmdSetStencilOpEXT; - PFN_vkCmdSetStencilTestEnableEXT vkCmdSetStencilTestEnableEXT; - PFN_vkCreateBuffer vkCreateBuffer; - PFN_vkCreateBufferView vkCreateBufferView; - PFN_vkCreateCommandPool vkCreateCommandPool; - PFN_vkCreateComputePipelines vkCreateComputePipelines; - PFN_vkCreateDescriptorPool vkCreateDescriptorPool; - PFN_vkCreateDescriptorSetLayout vkCreateDescriptorSetLayout; - PFN_vkCreateDescriptorUpdateTemplateKHR vkCreateDescriptorUpdateTemplateKHR; - PFN_vkCreateEvent vkCreateEvent; - PFN_vkCreateFence vkCreateFence; - PFN_vkCreateFramebuffer vkCreateFramebuffer; - PFN_vkCreateGraphicsPipelines vkCreateGraphicsPipelines; - PFN_vkCreateImage vkCreateImage; - PFN_vkCreateImageView vkCreateImageView; - PFN_vkCreatePipelineLayout vkCreatePipelineLayout; - PFN_vkCreateQueryPool vkCreateQueryPool; - PFN_vkCreateRenderPass vkCreateRenderPass; - PFN_vkCreateSampler vkCreateSampler; - PFN_vkCreateSemaphore vkCreateSemaphore; - PFN_vkCreateShaderModule vkCreateShaderModule; - PFN_vkCreateSwapchainKHR vkCreateSwapchainKHR; - PFN_vkDestroyBuffer vkDestroyBuffer; - PFN_vkDestroyBufferView vkDestroyBufferView; - PFN_vkDestroyCommandPool vkDestroyCommandPool; - PFN_vkDestroyDescriptorPool vkDestroyDescriptorPool; - PFN_vkDestroyDescriptorSetLayout vkDestroyDescriptorSetLayout; - PFN_vkDestroyDescriptorUpdateTemplateKHR vkDestroyDescriptorUpdateTemplateKHR; - PFN_vkDestroyEvent vkDestroyEvent; - PFN_vkDestroyFence vkDestroyFence; - PFN_vkDestroyFramebuffer vkDestroyFramebuffer; - PFN_vkDestroyImage vkDestroyImage; - PFN_vkDestroyImageView vkDestroyImageView; - PFN_vkDestroyPipeline vkDestroyPipeline; - PFN_vkDestroyPipelineLayout vkDestroyPipelineLayout; - PFN_vkDestroyQueryPool vkDestroyQueryPool; - PFN_vkDestroyRenderPass vkDestroyRenderPass; - PFN_vkDestroySampler vkDestroySampler; - PFN_vkDestroySemaphore vkDestroySemaphore; - PFN_vkDestroyShaderModule vkDestroyShaderModule; - PFN_vkDestroySwapchainKHR vkDestroySwapchainKHR; - PFN_vkDeviceWaitIdle vkDeviceWaitIdle; - PFN_vkEndCommandBuffer vkEndCommandBuffer; - PFN_vkFreeCommandBuffers vkFreeCommandBuffers; - PFN_vkFreeDescriptorSets vkFreeDescriptorSets; - PFN_vkFreeMemory vkFreeMemory; - PFN_vkGetBufferMemoryRequirements vkGetBufferMemoryRequirements; - PFN_vkGetDeviceQueue vkGetDeviceQueue; - PFN_vkGetEventStatus vkGetEventStatus; - PFN_vkGetFenceStatus vkGetFenceStatus; - PFN_vkGetImageMemoryRequirements vkGetImageMemoryRequirements; - PFN_vkGetQueryPoolResults vkGetQueryPoolResults; - PFN_vkMapMemory vkMapMemory; - PFN_vkQueueSubmit vkQueueSubmit; - PFN_vkResetFences vkResetFences; - PFN_vkResetQueryPoolEXT vkResetQueryPoolEXT; - PFN_vkUnmapMemory vkUnmapMemory; - PFN_vkUpdateDescriptorSetWithTemplateKHR vkUpdateDescriptorSetWithTemplateKHR; - PFN_vkUpdateDescriptorSets vkUpdateDescriptorSets; - PFN_vkWaitForFences vkWaitForFences; +struct DeviceDispatch : InstanceDispatch { + PFN_vkAcquireNextImageKHR vkAcquireNextImageKHR{}; + PFN_vkAllocateCommandBuffers vkAllocateCommandBuffers{}; + PFN_vkAllocateDescriptorSets vkAllocateDescriptorSets{}; + PFN_vkAllocateMemory vkAllocateMemory{}; + PFN_vkBeginCommandBuffer vkBeginCommandBuffer{}; + PFN_vkBindBufferMemory vkBindBufferMemory{}; + PFN_vkBindImageMemory vkBindImageMemory{}; + PFN_vkCmdBeginQuery vkCmdBeginQuery{}; + PFN_vkCmdBeginRenderPass vkCmdBeginRenderPass{}; + PFN_vkCmdBeginTransformFeedbackEXT vkCmdBeginTransformFeedbackEXT{}; + PFN_vkCmdBeginDebugUtilsLabelEXT vkCmdBeginDebugUtilsLabelEXT{}; + PFN_vkCmdBindDescriptorSets vkCmdBindDescriptorSets{}; + PFN_vkCmdBindIndexBuffer vkCmdBindIndexBuffer{}; + PFN_vkCmdBindPipeline vkCmdBindPipeline{}; + PFN_vkCmdBindTransformFeedbackBuffersEXT vkCmdBindTransformFeedbackBuffersEXT{}; + PFN_vkCmdBindVertexBuffers vkCmdBindVertexBuffers{}; + PFN_vkCmdBlitImage vkCmdBlitImage{}; + PFN_vkCmdClearAttachments vkCmdClearAttachments{}; + PFN_vkCmdCopyBuffer vkCmdCopyBuffer{}; + PFN_vkCmdCopyBufferToImage vkCmdCopyBufferToImage{}; + PFN_vkCmdCopyImage vkCmdCopyImage{}; + PFN_vkCmdCopyImageToBuffer vkCmdCopyImageToBuffer{}; + PFN_vkCmdDispatch vkCmdDispatch{}; + PFN_vkCmdDraw vkCmdDraw{}; + PFN_vkCmdDrawIndexed vkCmdDrawIndexed{}; + PFN_vkCmdEndQuery vkCmdEndQuery{}; + PFN_vkCmdEndRenderPass vkCmdEndRenderPass{}; + PFN_vkCmdEndTransformFeedbackEXT vkCmdEndTransformFeedbackEXT{}; + PFN_vkCmdEndDebugUtilsLabelEXT vkCmdEndDebugUtilsLabelEXT{}; + PFN_vkCmdFillBuffer vkCmdFillBuffer{}; + PFN_vkCmdPipelineBarrier vkCmdPipelineBarrier{}; + PFN_vkCmdPushConstants vkCmdPushConstants{}; + PFN_vkCmdSetBlendConstants vkCmdSetBlendConstants{}; + PFN_vkCmdSetDepthBias vkCmdSetDepthBias{}; + PFN_vkCmdSetDepthBounds vkCmdSetDepthBounds{}; + PFN_vkCmdSetEvent vkCmdSetEvent{}; + PFN_vkCmdSetScissor vkCmdSetScissor{}; + PFN_vkCmdSetStencilCompareMask vkCmdSetStencilCompareMask{}; + PFN_vkCmdSetStencilReference vkCmdSetStencilReference{}; + PFN_vkCmdSetStencilWriteMask vkCmdSetStencilWriteMask{}; + PFN_vkCmdSetViewport vkCmdSetViewport{}; + PFN_vkCmdWaitEvents vkCmdWaitEvents{}; + PFN_vkCmdBindVertexBuffers2EXT vkCmdBindVertexBuffers2EXT{}; + PFN_vkCmdSetCullModeEXT vkCmdSetCullModeEXT{}; + PFN_vkCmdSetDepthBoundsTestEnableEXT vkCmdSetDepthBoundsTestEnableEXT{}; + PFN_vkCmdSetDepthCompareOpEXT vkCmdSetDepthCompareOpEXT{}; + PFN_vkCmdSetDepthTestEnableEXT vkCmdSetDepthTestEnableEXT{}; + PFN_vkCmdSetDepthWriteEnableEXT vkCmdSetDepthWriteEnableEXT{}; + PFN_vkCmdSetFrontFaceEXT vkCmdSetFrontFaceEXT{}; + PFN_vkCmdSetPrimitiveTopologyEXT vkCmdSetPrimitiveTopologyEXT{}; + PFN_vkCmdSetStencilOpEXT vkCmdSetStencilOpEXT{}; + PFN_vkCmdSetStencilTestEnableEXT vkCmdSetStencilTestEnableEXT{}; + PFN_vkCmdResolveImage vkCmdResolveImage{}; + PFN_vkCreateBuffer vkCreateBuffer{}; + PFN_vkCreateBufferView vkCreateBufferView{}; + PFN_vkCreateCommandPool vkCreateCommandPool{}; + PFN_vkCreateComputePipelines vkCreateComputePipelines{}; + PFN_vkCreateDescriptorPool vkCreateDescriptorPool{}; + PFN_vkCreateDescriptorSetLayout vkCreateDescriptorSetLayout{}; + PFN_vkCreateDescriptorUpdateTemplateKHR vkCreateDescriptorUpdateTemplateKHR{}; + PFN_vkCreateEvent vkCreateEvent{}; + PFN_vkCreateFence vkCreateFence{}; + PFN_vkCreateFramebuffer vkCreateFramebuffer{}; + PFN_vkCreateGraphicsPipelines vkCreateGraphicsPipelines{}; + PFN_vkCreateImage vkCreateImage{}; + PFN_vkCreateImageView vkCreateImageView{}; + PFN_vkCreatePipelineLayout vkCreatePipelineLayout{}; + PFN_vkCreateQueryPool vkCreateQueryPool{}; + PFN_vkCreateRenderPass vkCreateRenderPass{}; + PFN_vkCreateSampler vkCreateSampler{}; + PFN_vkCreateSemaphore vkCreateSemaphore{}; + PFN_vkCreateShaderModule vkCreateShaderModule{}; + PFN_vkCreateSwapchainKHR vkCreateSwapchainKHR{}; + PFN_vkDestroyBuffer vkDestroyBuffer{}; + PFN_vkDestroyBufferView vkDestroyBufferView{}; + PFN_vkDestroyCommandPool vkDestroyCommandPool{}; + PFN_vkDestroyDescriptorPool vkDestroyDescriptorPool{}; + PFN_vkDestroyDescriptorSetLayout vkDestroyDescriptorSetLayout{}; + PFN_vkDestroyDescriptorUpdateTemplateKHR vkDestroyDescriptorUpdateTemplateKHR{}; + PFN_vkDestroyEvent vkDestroyEvent{}; + PFN_vkDestroyFence vkDestroyFence{}; + PFN_vkDestroyFramebuffer vkDestroyFramebuffer{}; + PFN_vkDestroyImage vkDestroyImage{}; + PFN_vkDestroyImageView vkDestroyImageView{}; + PFN_vkDestroyPipeline vkDestroyPipeline{}; + PFN_vkDestroyPipelineLayout vkDestroyPipelineLayout{}; + PFN_vkDestroyQueryPool vkDestroyQueryPool{}; + PFN_vkDestroyRenderPass vkDestroyRenderPass{}; + PFN_vkDestroySampler vkDestroySampler{}; + PFN_vkDestroySemaphore vkDestroySemaphore{}; + PFN_vkDestroyShaderModule vkDestroyShaderModule{}; + PFN_vkDestroySwapchainKHR vkDestroySwapchainKHR{}; + PFN_vkDeviceWaitIdle vkDeviceWaitIdle{}; + PFN_vkEndCommandBuffer vkEndCommandBuffer{}; + PFN_vkFreeCommandBuffers vkFreeCommandBuffers{}; + PFN_vkFreeDescriptorSets vkFreeDescriptorSets{}; + PFN_vkFreeMemory vkFreeMemory{}; + PFN_vkGetBufferMemoryRequirements2 vkGetBufferMemoryRequirements2{}; + PFN_vkGetDeviceQueue vkGetDeviceQueue{}; + PFN_vkGetEventStatus vkGetEventStatus{}; + PFN_vkGetFenceStatus vkGetFenceStatus{}; + PFN_vkGetImageMemoryRequirements vkGetImageMemoryRequirements{}; + PFN_vkGetMemoryFdKHR vkGetMemoryFdKHR{}; +#ifdef _WIN32 + PFN_vkGetMemoryWin32HandleKHR vkGetMemoryWin32HandleKHR{}; +#endif + PFN_vkGetQueryPoolResults vkGetQueryPoolResults{}; + PFN_vkGetSemaphoreCounterValueKHR vkGetSemaphoreCounterValueKHR{}; + PFN_vkMapMemory vkMapMemory{}; + PFN_vkQueueSubmit vkQueueSubmit{}; + PFN_vkResetFences vkResetFences{}; + PFN_vkResetQueryPoolEXT vkResetQueryPoolEXT{}; + PFN_vkSetDebugUtilsObjectNameEXT vkSetDebugUtilsObjectNameEXT{}; + PFN_vkSetDebugUtilsObjectTagEXT vkSetDebugUtilsObjectTagEXT{}; + PFN_vkUnmapMemory vkUnmapMemory{}; + PFN_vkUpdateDescriptorSetWithTemplateKHR vkUpdateDescriptorSetWithTemplateKHR{}; + PFN_vkUpdateDescriptorSets vkUpdateDescriptorSets{}; + PFN_vkWaitForFences vkWaitForFences{}; + PFN_vkWaitSemaphoresKHR vkWaitSemaphoresKHR{}; }; /// Loads instance agnostic function pointers. @@ -329,6 +359,9 @@ public: /// Construct an empty handle. Handle() = default; + /// Construct an empty handle. + Handle(std::nullptr_t) {} + /// Copying Vulkan objects is not supported and will never be. Handle(const Handle&) = delete; Handle& operator=(const Handle&) = delete; @@ -467,9 +500,10 @@ public: PoolAllocations() = default; /// Construct an allocation. Errors are reported through IsOutOfPoolMemory(). - explicit PoolAllocations(std::unique_ptr<AllocationType[]> allocations, std::size_t num, - VkDevice device, PoolType pool, const DeviceDispatch& dld) noexcept - : allocations{std::move(allocations)}, num{num}, device{device}, pool{pool}, dld{&dld} {} + explicit PoolAllocations(std::unique_ptr<AllocationType[]> allocations_, std::size_t num_, + VkDevice device_, PoolType pool_, const DeviceDispatch& dld_) noexcept + : allocations{std::move(allocations_)}, num{num_}, device{device_}, pool{pool_}, + dld{&dld_} {} /// Copying Vulkan allocations is not supported and will never be. PoolAllocations(const PoolAllocations&) = delete; @@ -539,19 +573,14 @@ private: const DeviceDispatch* dld = nullptr; }; -using BufferView = Handle<VkBufferView, VkDevice, DeviceDispatch>; -using DebugCallback = Handle<VkDebugUtilsMessengerEXT, VkInstance, InstanceDispatch>; +using DebugUtilsMessenger = Handle<VkDebugUtilsMessengerEXT, VkInstance, InstanceDispatch>; using DescriptorSetLayout = Handle<VkDescriptorSetLayout, VkDevice, DeviceDispatch>; using DescriptorUpdateTemplateKHR = Handle<VkDescriptorUpdateTemplateKHR, VkDevice, DeviceDispatch>; -using Framebuffer = Handle<VkFramebuffer, VkDevice, DeviceDispatch>; -using ImageView = Handle<VkImageView, VkDevice, DeviceDispatch>; using Pipeline = Handle<VkPipeline, VkDevice, DeviceDispatch>; using PipelineLayout = Handle<VkPipelineLayout, VkDevice, DeviceDispatch>; using QueryPool = Handle<VkQueryPool, VkDevice, DeviceDispatch>; using RenderPass = Handle<VkRenderPass, VkDevice, DeviceDispatch>; using Sampler = Handle<VkSampler, VkDevice, DeviceDispatch>; -using Semaphore = Handle<VkSemaphore, VkDevice, DeviceDispatch>; -using ShaderModule = Handle<VkShaderModule, VkDevice, DeviceDispatch>; using SurfaceKHR = Handle<VkSurfaceKHR, VkInstance, InstanceDispatch>; using DescriptorSets = PoolAllocations<VkDescriptorSet, VkDescriptorPool>; @@ -562,16 +591,25 @@ class Instance : public Handle<VkInstance, NoOwner, InstanceDispatch> { using Handle<VkInstance, NoOwner, InstanceDispatch>::Handle; public: - /// Creates a Vulkan instance. Use "operator bool" for error handling. - static Instance Create(Span<const char*> layers, Span<const char*> extensions, - InstanceDispatch& dld) noexcept; + /// Creates a Vulkan instance. + /// @throw Exception on initialization error. + static Instance Create(u32 version, Span<const char*> layers, Span<const char*> extensions, + InstanceDispatch& dispatch); /// Enumerates physical devices. /// @return Physical devices and an empty handle on failure. - std::optional<std::vector<VkPhysicalDevice>> EnumeratePhysicalDevices(); + /// @throw Exception on Vulkan error. + std::vector<VkPhysicalDevice> EnumeratePhysicalDevices() const; + + /// Creates a debug callback messenger. + /// @throw Exception on creation failure. + DebugUtilsMessenger CreateDebugUtilsMessenger( + const VkDebugUtilsMessengerCreateInfoEXT& create_info) const; - /// Tries to create a debug callback messenger. Returns an empty handle on failure. - DebugCallback TryCreateDebugCallback(PFN_vkDebugUtilsMessengerCallbackEXT callback) noexcept; + /// Returns dispatch table. + const InstanceDispatch& Dispatch() const noexcept { + return *dld; + } }; class Queue { @@ -580,9 +618,11 @@ public: constexpr Queue() noexcept = default; /// Construct a queue handle. - constexpr Queue(VkQueue queue, const DeviceDispatch& dld) noexcept : queue{queue}, dld{&dld} {} + constexpr Queue(VkQueue queue_, const DeviceDispatch& dld_) noexcept + : queue{queue_}, dld{&dld_} {} - VkResult Submit(Span<VkSubmitInfo> submit_infos, VkFence fence) const noexcept { + VkResult Submit(Span<VkSubmitInfo> submit_infos, + VkFence fence = VK_NULL_HANDLE) const noexcept { return dld->vkQueueSubmit(queue, submit_infos.size(), submit_infos.data(), fence); } @@ -601,6 +641,17 @@ class Buffer : public Handle<VkBuffer, VkDevice, DeviceDispatch> { public: /// Attaches a memory allocation. void BindMemory(VkDeviceMemory memory, VkDeviceSize offset) const; + + /// Set object name. + void SetObjectNameEXT(const char* name) const; +}; + +class BufferView : public Handle<VkBufferView, VkDevice, DeviceDispatch> { + using Handle<VkBufferView, VkDevice, DeviceDispatch>::Handle; + +public: + /// Set object name. + void SetObjectNameEXT(const char* name) const; }; class Image : public Handle<VkImage, VkDevice, DeviceDispatch> { @@ -609,12 +660,32 @@ class Image : public Handle<VkImage, VkDevice, DeviceDispatch> { public: /// Attaches a memory allocation. void BindMemory(VkDeviceMemory memory, VkDeviceSize offset) const; + + /// Set object name. + void SetObjectNameEXT(const char* name) const; +}; + +class ImageView : public Handle<VkImageView, VkDevice, DeviceDispatch> { + using Handle<VkImageView, VkDevice, DeviceDispatch>::Handle; + +public: + /// Set object name. + void SetObjectNameEXT(const char* name) const; }; class DeviceMemory : public Handle<VkDeviceMemory, VkDevice, DeviceDispatch> { using Handle<VkDeviceMemory, VkDevice, DeviceDispatch>::Handle; public: + int GetMemoryFdKHR() const; + +#ifdef _WIN32 + HANDLE GetMemoryWin32HandleKHR() const; +#endif + + /// Set object name. + void SetObjectNameEXT(const char* name) const; + u8* Map(VkDeviceSize offset, VkDeviceSize size) const { void* data; Check(dld->vkMapMemory(owner, handle, offset, size, 0, &data)); @@ -630,6 +701,9 @@ class Fence : public Handle<VkFence, VkDevice, DeviceDispatch> { using Handle<VkFence, VkDevice, DeviceDispatch>::Handle; public: + /// Set object name. + void SetObjectNameEXT(const char* name) const; + VkResult Wait(u64 timeout = std::numeric_limits<u64>::max()) const noexcept { return dld->vkWaitForFences(owner, 1, &handle, true, timeout); } @@ -643,11 +717,22 @@ public: } }; +class Framebuffer : public Handle<VkFramebuffer, VkDevice, DeviceDispatch> { + using Handle<VkFramebuffer, VkDevice, DeviceDispatch>::Handle; + +public: + /// Set object name. + void SetObjectNameEXT(const char* name) const; +}; + class DescriptorPool : public Handle<VkDescriptorPool, VkDevice, DeviceDispatch> { using Handle<VkDescriptorPool, VkDevice, DeviceDispatch>::Handle; public: DescriptorSets Allocate(const VkDescriptorSetAllocateInfo& ai) const; + + /// Set object name. + void SetObjectNameEXT(const char* name) const; }; class CommandPool : public Handle<VkCommandPool, VkDevice, DeviceDispatch> { @@ -656,6 +741,9 @@ class CommandPool : public Handle<VkCommandPool, VkDevice, DeviceDispatch> { public: CommandBuffers Allocate(std::size_t num_buffers, VkCommandBufferLevel level = VK_COMMAND_BUFFER_LEVEL_PRIMARY) const; + + /// Set object name. + void SetObjectNameEXT(const char* name) const; }; class SwapchainKHR : public Handle<VkSwapchainKHR, VkDevice, DeviceDispatch> { @@ -669,18 +757,70 @@ class Event : public Handle<VkEvent, VkDevice, DeviceDispatch> { using Handle<VkEvent, VkDevice, DeviceDispatch>::Handle; public: + /// Set object name. + void SetObjectNameEXT(const char* name) const; + VkResult GetStatus() const noexcept { return dld->vkGetEventStatus(owner, handle); } }; +class ShaderModule : public Handle<VkShaderModule, VkDevice, DeviceDispatch> { + using Handle<VkShaderModule, VkDevice, DeviceDispatch>::Handle; + +public: + /// Set object name. + void SetObjectNameEXT(const char* name) const; +}; + +class Semaphore : public Handle<VkSemaphore, VkDevice, DeviceDispatch> { + using Handle<VkSemaphore, VkDevice, DeviceDispatch>::Handle; + +public: + /// Set object name. + void SetObjectNameEXT(const char* name) const; + + [[nodiscard]] u64 GetCounter() const { + u64 value; + Check(dld->vkGetSemaphoreCounterValueKHR(owner, handle, &value)); + return value; + } + + /** + * Waits for a timeline semaphore on the host. + * + * @param value Value to wait + * @param timeout Time in nanoseconds to timeout + * @return True on successful wait, false on timeout + */ + bool Wait(u64 value, u64 timeout = std::numeric_limits<u64>::max()) const { + const VkSemaphoreWaitInfoKHR wait_info{ + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO_KHR, + .pNext = nullptr, + .flags = 0, + .semaphoreCount = 1, + .pSemaphores = &handle, + .pValues = &value, + }; + const VkResult result = dld->vkWaitSemaphoresKHR(owner, &wait_info, timeout); + switch (result) { + case VK_SUCCESS: + return true; + case VK_TIMEOUT: + return false; + default: + throw Exception(result); + } + } +}; + class Device : public Handle<VkDevice, NoOwner, DeviceDispatch> { using Handle<VkDevice, NoOwner, DeviceDispatch>::Handle; public: static Device Create(VkPhysicalDevice physical_device, Span<VkDeviceQueueCreateInfo> queues_ci, Span<const char*> enabled_extensions, const void* next, - DeviceDispatch& dld) noexcept; + DeviceDispatch& dispatch); Queue GetQueue(u32 family_index) const noexcept; @@ -694,6 +834,8 @@ public: Semaphore CreateSemaphore() const; + Semaphore CreateSemaphore(const VkSemaphoreCreateInfo& ci) const; + Fence CreateFence(const VkFenceCreateInfo& ci) const; DescriptorPool CreateDescriptorPool(const VkDescriptorPoolCreateInfo& ci) const; @@ -721,7 +863,7 @@ public: ShaderModule CreateShaderModule(const VkShaderModuleCreateInfo& ci) const; - Event CreateNewEvent() const; + Event CreateEvent() const; SwapchainKHR CreateSwapchainKHR(const VkSwapchainCreateInfoKHR& ci) const; @@ -729,7 +871,8 @@ public: DeviceMemory AllocateMemory(const VkMemoryAllocateInfo& ai) const; - VkMemoryRequirements GetBufferMemoryRequirements(VkBuffer buffer) const noexcept; + VkMemoryRequirements GetBufferMemoryRequirements(VkBuffer buffer, + void* pnext = nullptr) const noexcept; VkMemoryRequirements GetImageMemoryRequirements(VkImage image) const noexcept; @@ -767,8 +910,9 @@ class PhysicalDevice { public: constexpr PhysicalDevice() noexcept = default; - constexpr PhysicalDevice(VkPhysicalDevice physical_device, const InstanceDispatch& dld) noexcept - : physical_device{physical_device}, dld{&dld} {} + constexpr PhysicalDevice(VkPhysicalDevice physical_device_, + const InstanceDispatch& dld_) noexcept + : physical_device{physical_device_}, dld{&dld_} {} constexpr operator VkPhysicalDevice() const noexcept { return physical_device; @@ -807,8 +951,8 @@ class CommandBuffer { public: CommandBuffer() noexcept = default; - explicit CommandBuffer(VkCommandBuffer handle, const DeviceDispatch& dld) noexcept - : handle{handle}, dld{&dld} {} + explicit CommandBuffer(VkCommandBuffer handle_, const DeviceDispatch& dld_) noexcept + : handle{handle_}, dld{&dld_} {} const VkCommandBuffer* address() const noexcept { return &handle; @@ -887,6 +1031,12 @@ public: regions.data(), filter); } + void ResolveImage(VkImage src_image, VkImageLayout src_layout, VkImage dst_image, + VkImageLayout dst_layout, Span<VkImageResolve> regions) { + dld->vkCmdResolveImage(handle, src_image, src_layout, dst_image, dst_layout, regions.size(), + regions.data()); + } + void Dispatch(u32 x, u32 y, u32 z) const noexcept { dld->vkCmdDispatch(handle, x, y, z); } @@ -901,6 +1051,29 @@ public: image_barriers.size(), image_barriers.data()); } + void PipelineBarrier(VkPipelineStageFlags src_stage_mask, VkPipelineStageFlags dst_stage_mask, + VkDependencyFlags dependency_flags = 0) const noexcept { + PipelineBarrier(src_stage_mask, dst_stage_mask, dependency_flags, {}, {}, {}); + } + + void PipelineBarrier(VkPipelineStageFlags src_stage_mask, VkPipelineStageFlags dst_stage_mask, + VkDependencyFlags dependency_flags, + const VkMemoryBarrier& memory_barrier) const noexcept { + PipelineBarrier(src_stage_mask, dst_stage_mask, dependency_flags, memory_barrier, {}, {}); + } + + void PipelineBarrier(VkPipelineStageFlags src_stage_mask, VkPipelineStageFlags dst_stage_mask, + VkDependencyFlags dependency_flags, + const VkBufferMemoryBarrier& buffer_barrier) const noexcept { + PipelineBarrier(src_stage_mask, dst_stage_mask, dependency_flags, {}, buffer_barrier, {}); + } + + void PipelineBarrier(VkPipelineStageFlags src_stage_mask, VkPipelineStageFlags dst_stage_mask, + VkDependencyFlags dependency_flags, + const VkImageMemoryBarrier& image_barrier) const noexcept { + PipelineBarrier(src_stage_mask, dst_stage_mask, dependency_flags, {}, {}, image_barrier); + } + void CopyBufferToImage(VkBuffer src_buffer, VkImage dst_image, VkImageLayout dst_image_layout, Span<VkBufferImageCopy> regions) const noexcept { dld->vkCmdCopyBufferToImage(handle, src_buffer, dst_image, dst_image_layout, regions.size(), @@ -934,6 +1107,13 @@ public: dld->vkCmdPushConstants(handle, layout, flags, offset, size, values); } + template <typename T> + void PushConstants(VkPipelineLayout layout, VkShaderStageFlags flags, + const T& data) const noexcept { + static_assert(std::is_trivially_copyable_v<T>, "<data> is not trivially copyable"); + dld->vkCmdPushConstants(handle, layout, flags, 0, static_cast<u32>(sizeof(T)), &data); + } + void SetViewport(u32 first, Span<VkViewport> viewports) const noexcept { dld->vkCmdSetViewport(handle, first, viewports.size(), viewports.data()); } @@ -1043,11 +1223,27 @@ public: counter_buffers, counter_buffer_offsets); } + void BeginDebugUtilsLabelEXT(const char* label, std::span<float, 4> color) const noexcept { + const VkDebugUtilsLabelEXT label_info{ + .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT, + .pNext = nullptr, + .pLabelName = label, + .color{color[0], color[1], color[2], color[3]}, + }; + dld->vkCmdBeginDebugUtilsLabelEXT(handle, &label_info); + } + + void EndDebugUtilsLabelEXT() const noexcept { + dld->vkCmdEndDebugUtilsLabelEXT(handle); + } + private: VkCommandBuffer handle; const DeviceDispatch* dld; }; +u32 AvailableVersion(const InstanceDispatch& dld) noexcept; + std::optional<std::vector<VkExtensionProperties>> EnumerateInstanceExtensionProperties( const InstanceDispatch& dld); |