diff options
Diffstat (limited to 'src/video_core')
93 files changed, 4443 insertions, 4371 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 4b0c6346f..91df062d7 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -2,6 +2,8 @@ add_library(video_core STATIC buffer_cache/buffer_block.h buffer_cache/buffer_cache.h buffer_cache/map_interval.h + dirty_flags.cpp + dirty_flags.h dma_pusher.cpp dma_pusher.h engines/const_buffer_engine_interface.h @@ -63,14 +65,12 @@ add_library(video_core STATIC renderer_opengl/gl_shader_decompiler.h renderer_opengl/gl_shader_disk_cache.cpp renderer_opengl/gl_shader_disk_cache.h - renderer_opengl/gl_shader_gen.cpp - renderer_opengl/gl_shader_gen.h renderer_opengl/gl_shader_manager.cpp renderer_opengl/gl_shader_manager.h renderer_opengl/gl_shader_util.cpp renderer_opengl/gl_shader_util.h - renderer_opengl/gl_state.cpp - renderer_opengl/gl_state.h + renderer_opengl/gl_state_tracker.cpp + renderer_opengl/gl_state_tracker.h renderer_opengl/gl_stream_buffer.cpp renderer_opengl/gl_stream_buffer.h renderer_opengl/gl_texture_cache.cpp @@ -116,8 +116,6 @@ add_library(video_core STATIC shader/ast.h shader/compiler_settings.cpp shader/compiler_settings.h - shader/const_buffer_locker.cpp - shader/const_buffer_locker.h shader/control_flow.cpp shader/control_flow.h shader/decode.cpp @@ -126,9 +124,13 @@ add_library(video_core STATIC shader/node_helper.cpp shader/node_helper.h shader/node.h + shader/registry.cpp + shader/registry.h shader/shader_ir.cpp shader/shader_ir.h shader/track.cpp + shader/transform_feedback.cpp + shader/transform_feedback.h surface.cpp surface.h texture_cache/format_lookup_table.cpp @@ -198,6 +200,8 @@ if (ENABLE_VULKAN) renderer_vulkan/vk_shader_util.h renderer_vulkan/vk_staging_buffer_pool.cpp renderer_vulkan/vk_staging_buffer_pool.h + renderer_vulkan/vk_state_tracker.cpp + renderer_vulkan/vk_state_tracker.h renderer_vulkan/vk_stream_buffer.cpp renderer_vulkan/vk_stream_buffer.h renderer_vulkan/vk_swapchain.cpp diff --git a/src/video_core/dirty_flags.cpp 
b/src/video_core/dirty_flags.cpp new file mode 100644 index 000000000..e16075993 --- /dev/null +++ b/src/video_core/dirty_flags.cpp @@ -0,0 +1,38 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <array> +#include <cstddef> + +#include "common/common_types.h" +#include "video_core/dirty_flags.h" + +#define OFF(field_name) MAXWELL3D_REG_INDEX(field_name) +#define NUM(field_name) (sizeof(::Tegra::Engines::Maxwell3D::Regs::field_name) / sizeof(u32)) + +namespace VideoCommon::Dirty { + +using Tegra::Engines::Maxwell3D; + +void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables) { + static constexpr std::size_t num_per_rt = NUM(rt[0]); + static constexpr std::size_t begin = OFF(rt); + static constexpr std::size_t num = num_per_rt * Maxwell3D::Regs::NumRenderTargets; + for (std::size_t rt = 0; rt < Maxwell3D::Regs::NumRenderTargets; ++rt) { + FillBlock(tables[0], begin + rt * num_per_rt, num_per_rt, ColorBuffer0 + rt); + } + FillBlock(tables[1], begin, num, RenderTargets); + + static constexpr std::array zeta_flags{ZetaBuffer, RenderTargets}; + for (std::size_t i = 0; i < std::size(zeta_flags); ++i) { + const u8 flag = zeta_flags[i]; + auto& table = tables[i]; + table[OFF(zeta_enable)] = flag; + table[OFF(zeta_width)] = flag; + table[OFF(zeta_height)] = flag; + FillBlock(table, OFF(zeta), NUM(zeta), flag); + } +} + +} // namespace VideoCommon::Dirty diff --git a/src/video_core/dirty_flags.h b/src/video_core/dirty_flags.h new file mode 100644 index 000000000..3f6c1d83a --- /dev/null +++ b/src/video_core/dirty_flags.h @@ -0,0 +1,49 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#pragma once + +#include <algorithm> +#include <cstddef> +#include <iterator> + +#include "common/common_types.h" +#include "video_core/engines/maxwell_3d.h" + +namespace VideoCommon::Dirty { + +enum : u8 { + NullEntry = 0, + + RenderTargets, + ColorBuffer0, + ColorBuffer1, + ColorBuffer2, + ColorBuffer3, + ColorBuffer4, + ColorBuffer5, + ColorBuffer6, + ColorBuffer7, + ZetaBuffer, + + LastCommonEntry, +}; + +template <typename Integer> +void FillBlock(Tegra::Engines::Maxwell3D::DirtyState::Table& table, std::size_t begin, + std::size_t num, Integer dirty_index) { + const auto it = std::begin(table) + begin; + std::fill(it, it + num, static_cast<u8>(dirty_index)); +} + +template <typename Integer1, typename Integer2> +void FillBlock(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables, std::size_t begin, + std::size_t num, Integer1 index_a, Integer2 index_b) { + FillBlock(tables[0], begin, num, index_a); + FillBlock(tables[1], begin, num, index_b); +} + +void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables); + +} // namespace VideoCommon::Dirty diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp index 0094fd715..713c14182 100644 --- a/src/video_core/dma_pusher.cpp +++ b/src/video_core/dma_pusher.cpp @@ -22,7 +22,7 @@ void DmaPusher::DispatchCalls() { MICROPROFILE_SCOPE(DispatchCalls); // On entering GPU code, assume all memory may be touched by the ARM core. 
- gpu.Maxwell3D().dirty.OnMemoryWrite(); + gpu.Maxwell3D().OnMemoryWrite(); dma_pushbuffer_subindex = 0; diff --git a/src/video_core/engines/const_buffer_engine_interface.h b/src/video_core/engines/const_buffer_engine_interface.h index d56a47710..724ee0fd6 100644 --- a/src/video_core/engines/const_buffer_engine_interface.h +++ b/src/video_core/engines/const_buffer_engine_interface.h @@ -16,11 +16,12 @@ namespace Tegra::Engines { struct SamplerDescriptor { union { - BitField<0, 20, Tegra::Shader::TextureType> texture_type; - BitField<20, 1, u32> is_array; - BitField<21, 1, u32> is_buffer; - BitField<22, 1, u32> is_shadow; - u32 raw{}; + u32 raw = 0; + BitField<0, 2, Tegra::Shader::TextureType> texture_type; + BitField<2, 3, Tegra::Texture::ComponentType> component_type; + BitField<5, 1, u32> is_array; + BitField<6, 1, u32> is_buffer; + BitField<7, 1, u32> is_shadow; }; bool operator==(const SamplerDescriptor& rhs) const noexcept { @@ -31,68 +32,48 @@ struct SamplerDescriptor { return !operator==(rhs); } - static SamplerDescriptor FromTicTexture(Tegra::Texture::TextureType tic_texture_type) { + static SamplerDescriptor FromTIC(const Tegra::Texture::TICEntry& tic) { + using Tegra::Shader::TextureType; SamplerDescriptor result; - switch (tic_texture_type) { + + // This is going to be used to determine the shading language type. + // Because of that we don't care about all component types on color textures. 
+ result.component_type.Assign(tic.r_type.Value()); + + switch (tic.texture_type.Value()) { case Tegra::Texture::TextureType::Texture1D: - result.texture_type.Assign(Tegra::Shader::TextureType::Texture1D); - result.is_array.Assign(0); - result.is_buffer.Assign(0); - result.is_shadow.Assign(0); + result.texture_type.Assign(TextureType::Texture1D); return result; case Tegra::Texture::TextureType::Texture2D: - result.texture_type.Assign(Tegra::Shader::TextureType::Texture2D); - result.is_array.Assign(0); - result.is_buffer.Assign(0); - result.is_shadow.Assign(0); + result.texture_type.Assign(TextureType::Texture2D); return result; case Tegra::Texture::TextureType::Texture3D: - result.texture_type.Assign(Tegra::Shader::TextureType::Texture3D); - result.is_array.Assign(0); - result.is_buffer.Assign(0); - result.is_shadow.Assign(0); + result.texture_type.Assign(TextureType::Texture3D); return result; case Tegra::Texture::TextureType::TextureCubemap: - result.texture_type.Assign(Tegra::Shader::TextureType::TextureCube); - result.is_array.Assign(0); - result.is_buffer.Assign(0); - result.is_shadow.Assign(0); + result.texture_type.Assign(TextureType::TextureCube); return result; case Tegra::Texture::TextureType::Texture1DArray: - result.texture_type.Assign(Tegra::Shader::TextureType::Texture1D); + result.texture_type.Assign(TextureType::Texture1D); result.is_array.Assign(1); - result.is_buffer.Assign(0); - result.is_shadow.Assign(0); return result; case Tegra::Texture::TextureType::Texture2DArray: - result.texture_type.Assign(Tegra::Shader::TextureType::Texture2D); + result.texture_type.Assign(TextureType::Texture2D); result.is_array.Assign(1); - result.is_buffer.Assign(0); - result.is_shadow.Assign(0); return result; case Tegra::Texture::TextureType::Texture1DBuffer: - result.texture_type.Assign(Tegra::Shader::TextureType::Texture1D); - result.is_array.Assign(0); + result.texture_type.Assign(TextureType::Texture1D); result.is_buffer.Assign(1); - result.is_shadow.Assign(0); 
return result; case Tegra::Texture::TextureType::Texture2DNoMipmap: - result.texture_type.Assign(Tegra::Shader::TextureType::Texture2D); - result.is_array.Assign(0); - result.is_buffer.Assign(0); - result.is_shadow.Assign(0); + result.texture_type.Assign(TextureType::Texture2D); return result; case Tegra::Texture::TextureType::TextureCubeArray: - result.texture_type.Assign(Tegra::Shader::TextureType::TextureCube); + result.texture_type.Assign(TextureType::TextureCube); result.is_array.Assign(1); - result.is_buffer.Assign(0); - result.is_shadow.Assign(0); return result; default: - result.texture_type.Assign(Tegra::Shader::TextureType::Texture2D); - result.is_array.Assign(0); - result.is_buffer.Assign(0); - result.is_shadow.Assign(0); + result.texture_type.Assign(TextureType::Texture2D); return result; } } diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp index 4b824aa4e..368c75a66 100644 --- a/src/video_core/engines/kepler_compute.cpp +++ b/src/video_core/engines/kepler_compute.cpp @@ -39,7 +39,7 @@ void KeplerCompute::CallMethod(const GPU::MethodCall& method_call) { const bool is_last_call = method_call.IsLastCall(); upload_state.ProcessData(method_call.argument, is_last_call); if (is_last_call) { - system.GPU().Maxwell3D().dirty.OnMemoryWrite(); + system.GPU().Maxwell3D().OnMemoryWrite(); } break; } @@ -89,7 +89,7 @@ SamplerDescriptor KeplerCompute::AccessBindlessSampler(ShaderType stage, u64 con const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)}; const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle); - SamplerDescriptor result = SamplerDescriptor::FromTicTexture(tex_info.tic.texture_type.Value()); + SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic); result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value()); return result; } @@ -119,14 +119,6 @@ Texture::TICEntry KeplerCompute::GetTICEntry(u32 tic_index) const { Texture::TICEntry tic_entry; 
memory_manager.ReadBlockUnsafe(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry)); - const auto r_type{tic_entry.r_type.Value()}; - const auto g_type{tic_entry.g_type.Value()}; - const auto b_type{tic_entry.b_type.Value()}; - const auto a_type{tic_entry.a_type.Value()}; - - // TODO(Subv): Different data types for separate components are not supported - DEBUG_ASSERT(r_type == g_type && r_type == b_type && r_type == a_type); - return tic_entry; } diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp index fa4a7c5c1..597872e43 100644 --- a/src/video_core/engines/kepler_memory.cpp +++ b/src/video_core/engines/kepler_memory.cpp @@ -34,7 +34,7 @@ void KeplerMemory::CallMethod(const GPU::MethodCall& method_call) { const bool is_last_call = method_call.IsLastCall(); upload_state.ProcessData(method_call.argument, is_last_call); if (is_last_call) { - system.GPU().Maxwell3D().dirty.OnMemoryWrite(); + system.GPU().Maxwell3D().OnMemoryWrite(); } break; } diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index b28de1092..ce536e29b 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -26,7 +26,8 @@ Maxwell3D::Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& raste MemoryManager& memory_manager) : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager}, macro_interpreter{*this}, upload_state{memory_manager, regs.upload} { - InitDirtySettings(); + dirty.flags.flip(); + InitializeRegisterDefaults(); } @@ -75,8 +76,8 @@ void Maxwell3D::InitializeRegisterDefaults() { regs.stencil_back_mask = 0xFFFFFFFF; regs.depth_test_func = Regs::ComparisonOp::Always; - regs.cull.front_face = Regs::Cull::FrontFace::CounterClockWise; - regs.cull.cull_face = Regs::Cull::CullFace::Back; + regs.front_face = Regs::FrontFace::CounterClockWise; + regs.cull_face = Regs::CullFace::Back; // TODO(Rodrigo): Most games do not set a point size. 
I think this is a case of a // register carrying a default value. Assume it's OpenGL's default (1). @@ -95,7 +96,7 @@ void Maxwell3D::InitializeRegisterDefaults() { regs.rasterize_enable = 1; regs.rt_separate_frag_data = 1; regs.framebuffer_srgb = 1; - regs.cull.front_face = Maxwell3D::Regs::Cull::FrontFace::ClockWise; + regs.front_face = Maxwell3D::Regs::FrontFace::ClockWise; mme_inline[MAXWELL3D_REG_INDEX(draw.vertex_end_gl)] = true; mme_inline[MAXWELL3D_REG_INDEX(draw.vertex_begin_gl)] = true; @@ -103,164 +104,6 @@ void Maxwell3D::InitializeRegisterDefaults() { mme_inline[MAXWELL3D_REG_INDEX(index_array.count)] = true; } -#define DIRTY_REGS_POS(field_name) static_cast<u8>(offsetof(Maxwell3D::DirtyRegs, field_name)) - -void Maxwell3D::InitDirtySettings() { - const auto set_block = [this](std::size_t start, std::size_t range, u8 position) { - const auto start_itr = dirty_pointers.begin() + start; - const auto end_itr = start_itr + range; - std::fill(start_itr, end_itr, position); - }; - dirty.regs.fill(true); - - // Init Render Targets - constexpr u32 registers_per_rt = sizeof(regs.rt[0]) / sizeof(u32); - constexpr u32 rt_start_reg = MAXWELL3D_REG_INDEX(rt); - constexpr u32 rt_end_reg = rt_start_reg + registers_per_rt * 8; - u8 rt_dirty_reg = DIRTY_REGS_POS(render_target); - for (u32 rt_reg = rt_start_reg; rt_reg < rt_end_reg; rt_reg += registers_per_rt) { - set_block(rt_reg, registers_per_rt, rt_dirty_reg); - ++rt_dirty_reg; - } - constexpr u32 depth_buffer_flag = DIRTY_REGS_POS(depth_buffer); - dirty_pointers[MAXWELL3D_REG_INDEX(zeta_enable)] = depth_buffer_flag; - dirty_pointers[MAXWELL3D_REG_INDEX(zeta_width)] = depth_buffer_flag; - dirty_pointers[MAXWELL3D_REG_INDEX(zeta_height)] = depth_buffer_flag; - constexpr u32 registers_in_zeta = sizeof(regs.zeta) / sizeof(u32); - constexpr u32 zeta_reg = MAXWELL3D_REG_INDEX(zeta); - set_block(zeta_reg, registers_in_zeta, depth_buffer_flag); - - // Init Vertex Arrays - constexpr u32 vertex_array_start = 
MAXWELL3D_REG_INDEX(vertex_array); - constexpr u32 vertex_array_size = sizeof(regs.vertex_array[0]) / sizeof(u32); - constexpr u32 vertex_array_end = vertex_array_start + vertex_array_size * Regs::NumVertexArrays; - u8 va_dirty_reg = DIRTY_REGS_POS(vertex_array); - u8 vi_dirty_reg = DIRTY_REGS_POS(vertex_instance); - for (u32 vertex_reg = vertex_array_start; vertex_reg < vertex_array_end; - vertex_reg += vertex_array_size) { - set_block(vertex_reg, 3, va_dirty_reg); - // The divisor concerns vertex array instances - dirty_pointers[static_cast<std::size_t>(vertex_reg) + 3] = vi_dirty_reg; - ++va_dirty_reg; - ++vi_dirty_reg; - } - constexpr u32 vertex_limit_start = MAXWELL3D_REG_INDEX(vertex_array_limit); - constexpr u32 vertex_limit_size = sizeof(regs.vertex_array_limit[0]) / sizeof(u32); - constexpr u32 vertex_limit_end = vertex_limit_start + vertex_limit_size * Regs::NumVertexArrays; - va_dirty_reg = DIRTY_REGS_POS(vertex_array); - for (u32 vertex_reg = vertex_limit_start; vertex_reg < vertex_limit_end; - vertex_reg += vertex_limit_size) { - set_block(vertex_reg, vertex_limit_size, va_dirty_reg); - va_dirty_reg++; - } - constexpr u32 vertex_instance_start = MAXWELL3D_REG_INDEX(instanced_arrays); - constexpr u32 vertex_instance_size = - sizeof(regs.instanced_arrays.is_instanced[0]) / sizeof(u32); - constexpr u32 vertex_instance_end = - vertex_instance_start + vertex_instance_size * Regs::NumVertexArrays; - vi_dirty_reg = DIRTY_REGS_POS(vertex_instance); - for (u32 vertex_reg = vertex_instance_start; vertex_reg < vertex_instance_end; - vertex_reg += vertex_instance_size) { - set_block(vertex_reg, vertex_instance_size, vi_dirty_reg); - vi_dirty_reg++; - } - set_block(MAXWELL3D_REG_INDEX(vertex_attrib_format), regs.vertex_attrib_format.size(), - DIRTY_REGS_POS(vertex_attrib_format)); - - // Init Shaders - constexpr u32 shader_registers_count = - sizeof(regs.shader_config[0]) * Regs::MaxShaderProgram / sizeof(u32); - set_block(MAXWELL3D_REG_INDEX(shader_config[0]), 
shader_registers_count, - DIRTY_REGS_POS(shaders)); - - // State - - // Viewport - constexpr u8 viewport_dirty_reg = DIRTY_REGS_POS(viewport); - constexpr u32 viewport_start = MAXWELL3D_REG_INDEX(viewports); - constexpr u32 viewport_size = sizeof(regs.viewports) / sizeof(u32); - set_block(viewport_start, viewport_size, viewport_dirty_reg); - constexpr u32 view_volume_start = MAXWELL3D_REG_INDEX(view_volume_clip_control); - constexpr u32 view_volume_size = sizeof(regs.view_volume_clip_control) / sizeof(u32); - set_block(view_volume_start, view_volume_size, viewport_dirty_reg); - - // Viewport transformation - constexpr u32 viewport_trans_start = MAXWELL3D_REG_INDEX(viewport_transform); - constexpr u32 viewport_trans_size = sizeof(regs.viewport_transform) / sizeof(u32); - set_block(viewport_trans_start, viewport_trans_size, DIRTY_REGS_POS(viewport_transform)); - - // Cullmode - constexpr u32 cull_mode_start = MAXWELL3D_REG_INDEX(cull); - constexpr u32 cull_mode_size = sizeof(regs.cull) / sizeof(u32); - set_block(cull_mode_start, cull_mode_size, DIRTY_REGS_POS(cull_mode)); - - // Screen y control - dirty_pointers[MAXWELL3D_REG_INDEX(screen_y_control)] = DIRTY_REGS_POS(screen_y_control); - - // Primitive Restart - constexpr u32 primitive_restart_start = MAXWELL3D_REG_INDEX(primitive_restart); - constexpr u32 primitive_restart_size = sizeof(regs.primitive_restart) / sizeof(u32); - set_block(primitive_restart_start, primitive_restart_size, DIRTY_REGS_POS(primitive_restart)); - - // Depth Test - constexpr u8 depth_test_dirty_reg = DIRTY_REGS_POS(depth_test); - dirty_pointers[MAXWELL3D_REG_INDEX(depth_test_enable)] = depth_test_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(depth_write_enabled)] = depth_test_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(depth_test_func)] = depth_test_dirty_reg; - - // Stencil Test - constexpr u32 stencil_test_dirty_reg = DIRTY_REGS_POS(stencil_test); - dirty_pointers[MAXWELL3D_REG_INDEX(stencil_enable)] = stencil_test_dirty_reg; - 
dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_func_func)] = stencil_test_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_func_ref)] = stencil_test_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_func_mask)] = stencil_test_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_op_fail)] = stencil_test_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_op_zfail)] = stencil_test_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_op_zpass)] = stencil_test_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_mask)] = stencil_test_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(stencil_two_side_enable)] = stencil_test_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_func_func)] = stencil_test_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_func_ref)] = stencil_test_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_func_mask)] = stencil_test_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_op_fail)] = stencil_test_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_op_zfail)] = stencil_test_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_op_zpass)] = stencil_test_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_mask)] = stencil_test_dirty_reg; - - // Color Mask - constexpr u8 color_mask_dirty_reg = DIRTY_REGS_POS(color_mask); - dirty_pointers[MAXWELL3D_REG_INDEX(color_mask_common)] = color_mask_dirty_reg; - set_block(MAXWELL3D_REG_INDEX(color_mask), sizeof(regs.color_mask) / sizeof(u32), - color_mask_dirty_reg); - // Blend State - constexpr u8 blend_state_dirty_reg = DIRTY_REGS_POS(blend_state); - set_block(MAXWELL3D_REG_INDEX(blend_color), sizeof(regs.blend_color) / sizeof(u32), - blend_state_dirty_reg); - dirty_pointers[MAXWELL3D_REG_INDEX(independent_blend_enable)] = blend_state_dirty_reg; - set_block(MAXWELL3D_REG_INDEX(blend), sizeof(regs.blend) / sizeof(u32), blend_state_dirty_reg); - 
set_block(MAXWELL3D_REG_INDEX(independent_blend), sizeof(regs.independent_blend) / sizeof(u32), - blend_state_dirty_reg); - - // Scissor State - constexpr u8 scissor_test_dirty_reg = DIRTY_REGS_POS(scissor_test); - set_block(MAXWELL3D_REG_INDEX(scissor_test), sizeof(regs.scissor_test) / sizeof(u32), - scissor_test_dirty_reg); - - // Polygon Offset - constexpr u8 polygon_offset_dirty_reg = DIRTY_REGS_POS(polygon_offset); - dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_fill_enable)] = polygon_offset_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_line_enable)] = polygon_offset_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_point_enable)] = polygon_offset_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_units)] = polygon_offset_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_factor)] = polygon_offset_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_clamp)] = polygon_offset_dirty_reg; - - // Depth bounds - constexpr u8 depth_bounds_values_dirty_reg = DIRTY_REGS_POS(depth_bounds_values); - dirty_pointers[MAXWELL3D_REG_INDEX(depth_bounds[0])] = depth_bounds_values_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(depth_bounds[1])] = depth_bounds_values_dirty_reg; -} - void Maxwell3D::CallMacroMethod(u32 method, std::size_t num_parameters, const u32* parameters) { // Reset the current macro. 
executing_macro = 0; @@ -319,19 +162,9 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) { if (regs.reg_array[method] != method_call.argument) { regs.reg_array[method] = method_call.argument; - const std::size_t dirty_reg = dirty_pointers[method]; - if (dirty_reg) { - dirty.regs[dirty_reg] = true; - if (dirty_reg >= DIRTY_REGS_POS(vertex_array) && - dirty_reg < DIRTY_REGS_POS(vertex_array_buffers)) { - dirty.vertex_array_buffers = true; - } else if (dirty_reg >= DIRTY_REGS_POS(vertex_instance) && - dirty_reg < DIRTY_REGS_POS(vertex_instances)) { - dirty.vertex_instances = true; - } else if (dirty_reg >= DIRTY_REGS_POS(render_target) && - dirty_reg < DIRTY_REGS_POS(render_settings)) { - dirty.render_settings = true; - } + + for (const auto& table : dirty.tables) { + dirty.flags[table[method]] = true; } } @@ -419,7 +252,7 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) { const bool is_last_call = method_call.IsLastCall(); upload_state.ProcessData(method_call.argument, is_last_call); if (is_last_call) { - dirty.OnMemoryWrite(); + OnMemoryWrite(); } break; } @@ -727,7 +560,7 @@ void Maxwell3D::FinishCBData() { const u32 id = cb_data_state.id; memory_manager.WriteBlock(address, cb_data_state.buffer[id].data(), size); - dirty.OnMemoryWrite(); + OnMemoryWrite(); cb_data_state.id = null_cb_data; cb_data_state.current = null_cb_data; @@ -805,7 +638,7 @@ SamplerDescriptor Maxwell3D::AccessBindlessSampler(ShaderType stage, u64 const_b const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)}; const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle); - SamplerDescriptor result = SamplerDescriptor::FromTicTexture(tex_info.tic.texture_type.Value()); + SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic); result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value()); return result; } diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 
6ea7cc6a5..8a9e9992e 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -6,6 +6,7 @@ #include <array> #include <bitset> +#include <limits> #include <optional> #include <type_traits> #include <unordered_map> @@ -66,6 +67,7 @@ public: static constexpr std::size_t NumVaryings = 31; static constexpr std::size_t NumImages = 8; // TODO(Rodrigo): Investigate this number static constexpr std::size_t NumClipDistances = 8; + static constexpr std::size_t NumTransformFeedbackBuffers = 4; static constexpr std::size_t MaxShaderProgram = 6; static constexpr std::size_t MaxShaderStage = 5; // Maximum number of const buffers per shader stage. @@ -431,21 +433,15 @@ public: GeneratedPrimitives = 0x1F, }; - struct Cull { - enum class FrontFace : u32 { - ClockWise = 0x0900, - CounterClockWise = 0x0901, - }; - - enum class CullFace : u32 { - Front = 0x0404, - Back = 0x0405, - FrontAndBack = 0x0408, - }; + enum class FrontFace : u32 { + ClockWise = 0x0900, + CounterClockWise = 0x0901, + }; - u32 enabled; - FrontFace front_face; - CullFace cull_face; + enum class CullFace : u32 { + Front = 0x0404, + Back = 0x0405, + FrontAndBack = 0x0408, }; struct Blend { @@ -529,6 +525,12 @@ public: FractionalEven = 2, }; + enum class PolygonMode : u32 { + Point = 0x1b00, + Line = 0x1b01, + Fill = 0x1b02, + }; + struct RenderTargetConfig { u32 address_high; u32 address_low; @@ -574,7 +576,7 @@ public: f32 translate_z; INSERT_UNION_PADDING_WORDS(2); - Common::Rectangle<s32> GetRect() const { + Common::Rectangle<f32> GetRect() const { return { GetX(), // left GetY() + GetHeight(), // top @@ -583,20 +585,20 @@ public: }; }; - s32 GetX() const { - return static_cast<s32>(std::max(0.0f, translate_x - std::fabs(scale_x))); + f32 GetX() const { + return std::max(0.0f, translate_x - std::fabs(scale_x)); } - s32 GetY() const { - return static_cast<s32>(std::max(0.0f, translate_y - std::fabs(scale_y))); + f32 GetY() const { + return std::max(0.0f, translate_y - 
std::fabs(scale_y)); } - s32 GetWidth() const { - return static_cast<s32>(translate_x + std::fabs(scale_x)) - GetX(); + f32 GetWidth() const { + return translate_x + std::fabs(scale_x) - GetX(); } - s32 GetHeight() const { - return static_cast<s32>(translate_y + std::fabs(scale_y)) - GetY(); + f32 GetHeight() const { + return translate_y + std::fabs(scale_y) - GetY(); } }; @@ -626,6 +628,29 @@ public: float depth_range_far; }; + struct TransformFeedbackBinding { + u32 buffer_enable; + u32 address_high; + u32 address_low; + s32 buffer_size; + s32 buffer_offset; + INSERT_UNION_PADDING_WORDS(3); + + GPUVAddr Address() const { + return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | + address_low); + } + }; + static_assert(sizeof(TransformFeedbackBinding) == 32); + + struct TransformFeedbackLayout { + u32 stream; + u32 varying_count; + u32 stride; + INSERT_UNION_PADDING_WORDS(1); + }; + static_assert(sizeof(TransformFeedbackLayout) == 16); + bool IsShaderConfigEnabled(std::size_t index) const { // The VertexB is always enabled. 
if (index == static_cast<std::size_t>(Regs::ShaderProgram::VertexB)) { @@ -634,6 +659,10 @@ public: return shader_config[index].enable != 0; } + bool IsShaderConfigEnabled(Regs::ShaderProgram type) const { + return IsShaderConfigEnabled(static_cast<std::size_t>(type)); + } + union { struct { INSERT_UNION_PADDING_WORDS(0x45); @@ -682,7 +711,13 @@ public: u32 rasterize_enable; - INSERT_UNION_PADDING_WORDS(0xF1); + std::array<TransformFeedbackBinding, NumTransformFeedbackBuffers> tfb_bindings; + + INSERT_UNION_PADDING_WORDS(0xC0); + + std::array<TransformFeedbackLayout, NumTransformFeedbackBuffers> tfb_layouts; + + INSERT_UNION_PADDING_WORDS(0x1); u32 tfb_enabled; @@ -710,7 +745,12 @@ public: s32 clear_stencil; - INSERT_UNION_PADDING_WORDS(0x7); + INSERT_UNION_PADDING_WORDS(0x2); + + PolygonMode polygon_mode_front; + PolygonMode polygon_mode_back; + + INSERT_UNION_PADDING_WORDS(0x3); u32 polygon_offset_point_enable; u32 polygon_offset_line_enable; @@ -769,7 +809,11 @@ public: BitField<12, 4, u32> viewport; } clear_flags; - INSERT_UNION_PADDING_WORDS(0x19); + INSERT_UNION_PADDING_WORDS(0x10); + + u32 fill_rectangle; + + INSERT_UNION_PADDING_WORDS(0x8); std::array<VertexAttribute, NumVertexAttributes> vertex_attrib_format; @@ -872,16 +916,7 @@ public: INSERT_UNION_PADDING_WORDS(0x35); - union { - BitField<0, 1, u32> c0; - BitField<1, 1, u32> c1; - BitField<2, 1, u32> c2; - BitField<3, 1, u32> c3; - BitField<4, 1, u32> c4; - BitField<5, 1, u32> c5; - BitField<6, 1, u32> c6; - BitField<7, 1, u32> c7; - } clip_distance_enabled; + u32 clip_distance_enabled; u32 samplecnt_enable; @@ -1060,7 +1095,9 @@ public: INSERT_UNION_PADDING_WORDS(1); - Cull cull; + u32 cull_test_enabled; + FrontFace front_face; + CullFace cull_face; u32 pixel_center_integer; @@ -1199,7 +1236,11 @@ public: u32 tex_cb_index; - INSERT_UNION_PADDING_WORDS(0x395); + INSERT_UNION_PADDING_WORDS(0x7D); + + std::array<std::array<u8, 128>, NumTransformFeedbackBuffers> tfb_varying_locs; + + 
INSERT_UNION_PADDING_WORDS(0x298); struct { /// Compressed address of a buffer that holds information about bound SSBOs. @@ -1238,79 +1279,6 @@ public: State state{}; - struct DirtyRegs { - static constexpr std::size_t NUM_REGS = 256; - static_assert(NUM_REGS - 1 <= std::numeric_limits<u8>::max()); - - union { - struct { - bool null_dirty; - - // Vertex Attributes - bool vertex_attrib_format; - - // Vertex Arrays - std::array<bool, 32> vertex_array; - - bool vertex_array_buffers; - - // Vertex Instances - std::array<bool, 32> vertex_instance; - - bool vertex_instances; - - // Render Targets - std::array<bool, 8> render_target; - bool depth_buffer; - - bool render_settings; - - // Shaders - bool shaders; - - // Rasterizer State - bool viewport; - bool clip_coefficient; - bool cull_mode; - bool primitive_restart; - bool depth_test; - bool stencil_test; - bool blend_state; - bool scissor_test; - bool transform_feedback; - bool color_mask; - bool polygon_offset; - bool depth_bounds_values; - - // Complementary - bool viewport_transform; - bool screen_y_control; - - bool memory_general; - }; - std::array<bool, NUM_REGS> regs; - }; - - void ResetVertexArrays() { - vertex_array.fill(true); - vertex_array_buffers = true; - } - - void ResetRenderTargets() { - depth_buffer = true; - render_target.fill(true); - render_settings = true; - } - - void OnMemoryWrite() { - shaders = true; - memory_general = true; - ResetRenderTargets(); - ResetVertexArrays(); - } - - } dirty{}; - /// Reads a register value located at the input method address u32 GetRegisterValue(u32 method) const; @@ -1356,6 +1324,11 @@ public: return execute_on; } + /// Notify a memory write has happened. 
+ void OnMemoryWrite() { + dirty.flags |= dirty.on_write_stores; + } + enum class MMEDrawMode : u32 { Undefined, Array, @@ -1371,6 +1344,16 @@ public: u32 gl_end_count{}; } mme_draw; + struct DirtyState { + using Flags = std::bitset<std::numeric_limits<u8>::max()>; + using Table = std::array<u8, Regs::NUM_REGS>; + using Tables = std::array<Table, 2>; + + Flags flags; + Flags on_write_stores; + Tables tables{}; + } dirty; + private: void InitializeRegisterDefaults(); @@ -1417,8 +1400,6 @@ private: /// Retrieves information about a specific TSC entry from the TSC buffer. Texture::TSCEntry GetTSCEntry(u32 tsc_index) const; - void InitDirtySettings(); - /** * Call a macro on this engine. * @param method Method to call @@ -1485,6 +1466,8 @@ ASSERT_REG_POSITION(tess_mode, 0xC8); ASSERT_REG_POSITION(tess_level_outer, 0xC9); ASSERT_REG_POSITION(tess_level_inner, 0xCD); ASSERT_REG_POSITION(rasterize_enable, 0xDF); +ASSERT_REG_POSITION(tfb_bindings, 0xE0); +ASSERT_REG_POSITION(tfb_layouts, 0x1C0); ASSERT_REG_POSITION(tfb_enabled, 0x1D1); ASSERT_REG_POSITION(rt, 0x200); ASSERT_REG_POSITION(viewport_transform, 0x280); @@ -1494,6 +1477,8 @@ ASSERT_REG_POSITION(depth_mode, 0x35F); ASSERT_REG_POSITION(clear_color[0], 0x360); ASSERT_REG_POSITION(clear_depth, 0x364); ASSERT_REG_POSITION(clear_stencil, 0x368); +ASSERT_REG_POSITION(polygon_mode_front, 0x36B); +ASSERT_REG_POSITION(polygon_mode_back, 0x36C); ASSERT_REG_POSITION(polygon_offset_point_enable, 0x370); ASSERT_REG_POSITION(polygon_offset_line_enable, 0x371); ASSERT_REG_POSITION(polygon_offset_fill_enable, 0x372); @@ -1507,6 +1492,7 @@ ASSERT_REG_POSITION(rt_separate_frag_data, 0x3EB); ASSERT_REG_POSITION(depth_bounds, 0x3E7); ASSERT_REG_POSITION(zeta, 0x3F8); ASSERT_REG_POSITION(clear_flags, 0x43E); +ASSERT_REG_POSITION(fill_rectangle, 0x44F); ASSERT_REG_POSITION(vertex_attrib_format, 0x458); ASSERT_REG_POSITION(rt_control, 0x487); ASSERT_REG_POSITION(zeta_width, 0x48a); @@ -1561,7 +1547,9 @@ ASSERT_REG_POSITION(index_array, 
0x5F2); ASSERT_REG_POSITION(polygon_offset_clamp, 0x61F); ASSERT_REG_POSITION(instanced_arrays, 0x620); ASSERT_REG_POSITION(vp_point_size, 0x644); -ASSERT_REG_POSITION(cull, 0x646); +ASSERT_REG_POSITION(cull_test_enabled, 0x646); +ASSERT_REG_POSITION(front_face, 0x647); +ASSERT_REG_POSITION(cull_face, 0x648); ASSERT_REG_POSITION(pixel_center_integer, 0x649); ASSERT_REG_POSITION(viewport_transform_enabled, 0x64B); ASSERT_REG_POSITION(view_volume_clip_control, 0x64F); @@ -1578,6 +1566,7 @@ ASSERT_REG_POSITION(firmware, 0x8C0); ASSERT_REG_POSITION(const_buffer, 0x8E0); ASSERT_REG_POSITION(cb_bind[0], 0x904); ASSERT_REG_POSITION(tex_cb_index, 0x982); +ASSERT_REG_POSITION(tfb_varying_locs, 0xA00); ASSERT_REG_POSITION(ssbo_info, 0xD18); ASSERT_REG_POSITION(tex_info_buffers.address[0], 0xD2A); ASSERT_REG_POSITION(tex_info_buffers.size[0], 0xD2F); diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index ad8453c5f..c2610f992 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -57,7 +57,7 @@ void MaxwellDMA::HandleCopy() { } // All copies here update the main memory, so mark all rasterizer states as invalid. 
- system.GPU().Maxwell3D().dirty.OnMemoryWrite(); + system.GPU().Maxwell3D().OnMemoryWrite(); if (regs.exec.is_dst_linear && regs.exec.is_src_linear) { // When the enable_2d bit is disabled, the copy is performed as if we were copying a 1D diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index c9bc83cd7..eba42deb4 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h @@ -911,14 +911,9 @@ union Instruction { } fadd32i; union { - BitField<20, 8, u64> shift_position; - BitField<28, 8, u64> shift_length; - BitField<48, 1, u64> negate_b; - BitField<49, 1, u64> negate_a; - - u64 GetLeftShiftValue() const { - return 32 - (shift_position + shift_length); - } + BitField<40, 1, u64> brev; + BitField<47, 1, u64> rd_cc; + BitField<48, 1, u64> is_signed; } bfe; union { diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index ba8c9d665..64acb17df 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -39,6 +39,7 @@ enum class RenderTargetFormat : u32 { RGBA32_FLOAT = 0xC0, RGBA32_UINT = 0xC2, RGBA16_UNORM = 0xC6, + RGBA16_SNORM = 0xC7, RGBA16_UINT = 0xC9, RGBA16_FLOAT = 0xCA, RG32_FLOAT = 0xCB, diff --git a/src/video_core/guest_driver.cpp b/src/video_core/guest_driver.cpp index 6adef459e..f058f2744 100644 --- a/src/video_core/guest_driver.cpp +++ b/src/video_core/guest_driver.cpp @@ -4,13 +4,15 @@ #include <algorithm> #include <limits> +#include <vector> +#include "common/common_types.h" #include "video_core/guest_driver.h" namespace VideoCore { -void GuestDriverProfile::DeduceTextureHandlerSize(std::vector<u32>&& bound_offsets) { - if (texture_handler_size_deduced) { +void GuestDriverProfile::DeduceTextureHandlerSize(std::vector<u32> bound_offsets) { + if (texture_handler_size) { return; } const std::size_t size = bound_offsets.size(); @@ -29,7 +31,6 @@ void GuestDriverProfile::DeduceTextureHandlerSize(std::vector<u32>&& bound_offse if (min_val > 2) { return; } - 
texture_handler_size_deduced = true; texture_handler_size = min_texture_handler_size * min_val; } diff --git a/src/video_core/guest_driver.h b/src/video_core/guest_driver.h index fc1917347..99450777e 100644 --- a/src/video_core/guest_driver.h +++ b/src/video_core/guest_driver.h @@ -4,6 +4,7 @@ #pragma once +#include <optional> #include <vector> #include "common/common_types.h" @@ -17,25 +18,29 @@ namespace VideoCore { */ class GuestDriverProfile { public: - void DeduceTextureHandlerSize(std::vector<u32>&& bound_offsets); + explicit GuestDriverProfile() = default; + explicit GuestDriverProfile(std::optional<u32> texture_handler_size) + : texture_handler_size{texture_handler_size} {} + + void DeduceTextureHandlerSize(std::vector<u32> bound_offsets); u32 GetTextureHandlerSize() const { - return texture_handler_size; + return texture_handler_size.value_or(default_texture_handler_size); } - bool TextureHandlerSizeKnown() const { - return texture_handler_size_deduced; + bool IsTextureHandlerSizeKnown() const { + return texture_handler_size.has_value(); } private: // Minimum size of texture handler any driver can use. static constexpr u32 min_texture_handler_size = 4; - // This goes with Vulkan and OpenGL standards but Nvidia GPUs can easily - // use 4 bytes instead. Thus, certain drivers may squish the size. + + // This goes with Vulkan and OpenGL standards but Nvidia GPUs can easily use 4 bytes instead. + // Thus, certain drivers may squish the size. 
static constexpr u32 default_texture_handler_size = 8; - u32 texture_handler_size = default_texture_handler_size; - bool texture_handler_size_deduced = false; + std::optional<u32> texture_handler_size = default_texture_handler_size; }; } // namespace VideoCore diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index aea010087..073bdb491 100644 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -174,7 +174,7 @@ private: /// End of address space, based on address space in bits. static constexpr GPUVAddr address_space_end{1ULL << address_space_width}; - Common::PageTable page_table{page_bits}; + Common::BackingPageTable page_table{page_bits}; VMAMap vma_map; VideoCore::RasterizerInterface& rasterizer; diff --git a/src/video_core/morton.cpp b/src/video_core/morton.cpp index f2c83266e..6d522c318 100644 --- a/src/video_core/morton.cpp +++ b/src/video_core/morton.cpp @@ -51,6 +51,7 @@ static constexpr ConversionArray morton_to_linear_fns = { MortonCopy<true, PixelFormat::R8UI>, MortonCopy<true, PixelFormat::RGBA16F>, MortonCopy<true, PixelFormat::RGBA16U>, + MortonCopy<true, PixelFormat::RGBA16S>, MortonCopy<true, PixelFormat::RGBA16UI>, MortonCopy<true, PixelFormat::R11FG11FB10F>, MortonCopy<true, PixelFormat::RGBA32UI>, @@ -131,6 +132,7 @@ static constexpr ConversionArray linear_to_morton_fns = { MortonCopy<false, PixelFormat::R8U>, MortonCopy<false, PixelFormat::R8UI>, MortonCopy<false, PixelFormat::RGBA16F>, + MortonCopy<false, PixelFormat::RGBA16S>, MortonCopy<false, PixelFormat::RGBA16U>, MortonCopy<false, PixelFormat::RGBA16UI>, MortonCopy<false, PixelFormat::R11FG11FB10F>, diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index f18eaf4bc..1a68e3caa 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -25,7 +25,6 @@ constexpr std::size_t NumQueryTypes = 1; enum class LoadCallbackStage { Prepare, - Decompile, Build, Complete, 
}; @@ -89,6 +88,9 @@ public: virtual void LoadDiskResources(const std::atomic_bool& stop_loading = false, const DiskResourceLoadCallback& callback = {}) {} + /// Initializes renderer dirty flags + virtual void SetupDirtyFlags() {} + /// Grant access to the Guest Driver Profile for recording/obtaining info on the guest driver. GuestDriverProfile& AccessGuestDriverProfile() { return guest_driver_profile; diff --git a/src/video_core/renderer_opengl/gl_framebuffer_cache.cpp b/src/video_core/renderer_opengl/gl_framebuffer_cache.cpp index 874ed3c6e..b8a512cb6 100644 --- a/src/video_core/renderer_opengl/gl_framebuffer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_framebuffer_cache.cpp @@ -11,7 +11,6 @@ #include "common/common_types.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_opengl/gl_framebuffer_cache.h" -#include "video_core/renderer_opengl/gl_state.h" namespace OpenGL { @@ -36,8 +35,7 @@ OGLFramebuffer FramebufferCacheOpenGL::CreateFramebuffer(const FramebufferCacheK framebuffer.Create(); // TODO(Rodrigo): Use DSA here after Nvidia fixes their framebuffer DSA bugs. 
- local_state.draw.draw_framebuffer = framebuffer.handle; - local_state.ApplyFramebufferState(); + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer.handle); if (key.zeta) { const bool stencil = key.zeta->GetSurfaceParams().type == SurfaceType::DepthStencil; diff --git a/src/video_core/renderer_opengl/gl_framebuffer_cache.h b/src/video_core/renderer_opengl/gl_framebuffer_cache.h index 02ec80ae9..8f698fee0 100644 --- a/src/video_core/renderer_opengl/gl_framebuffer_cache.h +++ b/src/video_core/renderer_opengl/gl_framebuffer_cache.h @@ -13,7 +13,6 @@ #include "common/common_types.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_opengl/gl_resource_manager.h" -#include "video_core/renderer_opengl/gl_state.h" #include "video_core/renderer_opengl/gl_texture_cache.h" namespace OpenGL { @@ -63,7 +62,6 @@ public: private: OGLFramebuffer CreateFramebuffer(const FramebufferCacheKey& key); - OpenGLState local_state; std::unordered_map<FramebufferCacheKey, OGLFramebuffer> cache; }; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index e1965fb21..063f41327 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -28,7 +28,6 @@ #include "video_core/renderer_opengl/gl_query_cache.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_shader_cache.h" -#include "video_core/renderer_opengl/gl_shader_gen.h" #include "video_core/renderer_opengl/maxwell_to_gl.h" #include "video_core/renderer_opengl/renderer_opengl.h" @@ -36,6 +35,7 @@ namespace OpenGL { using Maxwell = Tegra::Engines::Maxwell3D::Regs; +using Tegra::Engines::ShaderType; using VideoCore::Surface::PixelFormat; using VideoCore::Surface::SurfaceTarget; using VideoCore::Surface::SurfaceType; @@ -54,10 +54,11 @@ MICROPROFILE_DEFINE(OpenGL_PrimitiveAssembly, "OpenGL", "Prim Asmbl", MP_RGB(255 namespace { +constexpr std::size_t 
NumSupportedVertexAttributes = 16; + template <typename Engine, typename Entry> Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry, - Tegra::Engines::ShaderType shader_type, - std::size_t index = 0) { + ShaderType shader_type, std::size_t index = 0) { if (entry.IsBindless()) { const Tegra::Texture::TextureHandle tex_handle = engine.AccessConstBuffer32(shader_type, entry.GetBuffer(), entry.GetOffset()); @@ -74,7 +75,7 @@ Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry } std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer, - const GLShader::ConstBufferEntry& entry) { + const ConstBufferEntry& entry) { if (!entry.IsIndirect()) { return entry.GetSize(); } @@ -88,18 +89,19 @@ std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer, return buffer.size; } +void oglEnable(GLenum cap, bool state) { + (state ? glEnable : glDisable)(cap); +} + } // Anonymous namespace RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window, - ScreenInfo& info) - : RasterizerAccelerated{system.Memory()}, texture_cache{system, *this, device}, + ScreenInfo& info, GLShader::ProgramManager& program_manager, + StateTracker& state_tracker) + : RasterizerAccelerated{system.Memory()}, texture_cache{system, *this, device, state_tracker}, shader_cache{*this, system, emu_window, device}, query_cache{system, *this}, system{system}, - screen_info{info}, buffer_cache{*this, system, device, STREAM_BUFFER_SIZE} { - shader_program_manager = std::make_unique<GLShader::ProgramManager>(); - state.draw.shader_program = 0; - state.Apply(); - - LOG_DEBUG(Render_OpenGL, "Sync fixed function OpenGL state here"); + screen_info{info}, program_manager{program_manager}, state_tracker{state_tracker}, + buffer_cache{*this, system, device, STREAM_BUFFER_SIZE} { CheckExtensions(); } @@ -113,93 +115,72 @@ void RasterizerOpenGL::CheckExtensions() { } } -GLuint 
RasterizerOpenGL::SetupVertexFormat() { +void RasterizerOpenGL::SetupVertexFormat() { auto& gpu = system.GPU().Maxwell3D(); - const auto& regs = gpu.regs; - - if (!gpu.dirty.vertex_attrib_format) { - return state.draw.vertex_array; + auto& flags = gpu.dirty.flags; + if (!flags[Dirty::VertexFormats]) { + return; } - gpu.dirty.vertex_attrib_format = false; + flags[Dirty::VertexFormats] = false; MICROPROFILE_SCOPE(OpenGL_VAO); - auto [iter, is_cache_miss] = vertex_array_cache.try_emplace(regs.vertex_attrib_format); - auto& vao_entry = iter->second; - - if (is_cache_miss) { - vao_entry.Create(); - const GLuint vao = vao_entry.handle; - - // Eventhough we are using DSA to create this vertex array, there is a bug on Intel's blob - // that fails to properly create the vertex array if it's not bound even after creating it - // with glCreateVertexArrays - state.draw.vertex_array = vao; - state.ApplyVertexArrayState(); - - // Use the vertex array as-is, assumes that the data is formatted correctly for OpenGL. - // Enables the first 16 vertex attributes always, as we don't know which ones are actually - // used until shader time. Note, Tegra technically supports 32, but we're capping this to 16 - // for now to avoid OpenGL errors. - // TODO(Subv): Analyze the shader to identify which attributes are actually used and don't - // assume every shader uses them all. - for (u32 index = 0; index < 16; ++index) { - const auto& attrib = regs.vertex_attrib_format[index]; - - // Ignore invalid attributes. 
- if (!attrib.IsValid()) - continue; - - const auto& buffer = regs.vertex_array[attrib.buffer]; - LOG_TRACE(Render_OpenGL, - "vertex attrib {}, count={}, size={}, type={}, offset={}, normalize={}", - index, attrib.ComponentCount(), attrib.SizeString(), attrib.TypeString(), - attrib.offset.Value(), attrib.IsNormalized()); - - ASSERT(buffer.IsEnabled()); - - glEnableVertexArrayAttrib(vao, index); - if (attrib.type == Tegra::Engines::Maxwell3D::Regs::VertexAttribute::Type::SignedInt || - attrib.type == - Tegra::Engines::Maxwell3D::Regs::VertexAttribute::Type::UnsignedInt) { - glVertexArrayAttribIFormat(vao, index, attrib.ComponentCount(), - MaxwellToGL::VertexType(attrib), attrib.offset); - } else { - glVertexArrayAttribFormat( - vao, index, attrib.ComponentCount(), MaxwellToGL::VertexType(attrib), - attrib.IsNormalized() ? GL_TRUE : GL_FALSE, attrib.offset); - } - glVertexArrayAttribBinding(vao, index, attrib.buffer); + // Use the vertex array as-is, assumes that the data is formatted correctly for OpenGL. Enables + // the first 16 vertex attributes always, as we don't know which ones are actually used until + // shader time. Note, Tegra technically supports 32, but we're capping this to 16 for now to + // avoid OpenGL errors. + // TODO(Subv): Analyze the shader to identify which attributes are actually used and don't + // assume every shader uses them all. + for (std::size_t index = 0; index < NumSupportedVertexAttributes; ++index) { + if (!flags[Dirty::VertexFormat0 + index]) { + continue; } - } + flags[Dirty::VertexFormat0 + index] = false; + + const auto attrib = gpu.regs.vertex_attrib_format[index]; + const auto gl_index = static_cast<GLuint>(index); - // Rebinding the VAO invalidates the vertex buffer bindings. - gpu.dirty.ResetVertexArrays(); + // Ignore invalid attributes. 
+ if (!attrib.IsValid()) { + glDisableVertexAttribArray(gl_index); + continue; + } + glEnableVertexAttribArray(gl_index); - state.draw.vertex_array = vao_entry.handle; - return vao_entry.handle; + if (attrib.type == Maxwell::VertexAttribute::Type::SignedInt || + attrib.type == Maxwell::VertexAttribute::Type::UnsignedInt) { + glVertexAttribIFormat(gl_index, attrib.ComponentCount(), + MaxwellToGL::VertexType(attrib), attrib.offset); + } else { + glVertexAttribFormat(gl_index, attrib.ComponentCount(), MaxwellToGL::VertexType(attrib), + attrib.IsNormalized() ? GL_TRUE : GL_FALSE, attrib.offset); + } + glVertexAttribBinding(gl_index, attrib.buffer); + } } -void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) { +void RasterizerOpenGL::SetupVertexBuffer() { auto& gpu = system.GPU().Maxwell3D(); - if (!gpu.dirty.vertex_array_buffers) + auto& flags = gpu.dirty.flags; + if (!flags[Dirty::VertexBuffers]) { return; - gpu.dirty.vertex_array_buffers = false; - - const auto& regs = gpu.regs; + } + flags[Dirty::VertexBuffers] = false; MICROPROFILE_SCOPE(OpenGL_VB); // Upload all guest vertex arrays sequentially to our buffer - for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) { - if (!gpu.dirty.vertex_array[index]) + const auto& regs = gpu.regs; + for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) { + if (!flags[Dirty::VertexBuffer0 + index]) { continue; - gpu.dirty.vertex_array[index] = false; - gpu.dirty.vertex_instance[index] = false; + } + flags[Dirty::VertexBuffer0 + index] = false; const auto& vertex_array = regs.vertex_array[index]; - if (!vertex_array.IsEnabled()) + if (!vertex_array.IsEnabled()) { continue; + } const GPUVAddr start = vertex_array.StartAddress(); const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress(); @@ -209,42 +190,30 @@ void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) { const auto [vertex_buffer, vertex_buffer_offset] = buffer_cache.UploadMemory(start, size); // Bind the vertex array to the buffer at the 
current offset. - vertex_array_pushbuffer.SetVertexBuffer(index, vertex_buffer, vertex_buffer_offset, - vertex_array.stride); - - if (regs.instanced_arrays.IsInstancingEnabled(index) && vertex_array.divisor != 0) { - // Enable vertex buffer instancing with the specified divisor. - glVertexArrayBindingDivisor(vao, index, vertex_array.divisor); - } else { - // Disable the vertex buffer instancing. - glVertexArrayBindingDivisor(vao, index, 0); - } + vertex_array_pushbuffer.SetVertexBuffer(static_cast<GLuint>(index), vertex_buffer, + vertex_buffer_offset, vertex_array.stride); } } -void RasterizerOpenGL::SetupVertexInstances(GLuint vao) { +void RasterizerOpenGL::SetupVertexInstances() { auto& gpu = system.GPU().Maxwell3D(); - - if (!gpu.dirty.vertex_instances) + auto& flags = gpu.dirty.flags; + if (!flags[Dirty::VertexInstances]) { return; - gpu.dirty.vertex_instances = false; + } + flags[Dirty::VertexInstances] = false; const auto& regs = gpu.regs; - // Upload all guest vertex arrays sequentially to our buffer - for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) { - if (!gpu.dirty.vertex_instance[index]) + for (std::size_t index = 0; index < NumSupportedVertexAttributes; ++index) { + if (!flags[Dirty::VertexInstance0 + index]) { continue; - - gpu.dirty.vertex_instance[index] = false; - - if (regs.instanced_arrays.IsInstancingEnabled(index) && - regs.vertex_array[index].divisor != 0) { - // Enable vertex buffer instancing with the specified divisor. - glVertexArrayBindingDivisor(vao, index, regs.vertex_array[index].divisor); - } else { - // Disable the vertex buffer instancing. - glVertexArrayBindingDivisor(vao, index, 0); } + flags[Dirty::VertexInstance0 + index] = false; + + const auto gl_index = static_cast<GLuint>(index); + const bool instancing_enabled = regs.instanced_arrays.IsInstancingEnabled(gl_index); + const GLuint divisor = instancing_enabled ? 
regs.vertex_array[index].divisor : 0; + glVertexBindingDivisor(gl_index, divisor); } } @@ -260,8 +229,7 @@ GLintptr RasterizerOpenGL::SetupIndexBuffer() { void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { MICROPROFILE_SCOPE(OpenGL_Shader); auto& gpu = system.GPU().Maxwell3D(); - - std::array<bool, Maxwell::NumClipDistances> clip_distances{}; + u32 clip_distances = 0; for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) { const auto& shader_config = gpu.regs.shader_config[index]; @@ -271,10 +239,10 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { if (!gpu.regs.IsShaderConfigEnabled(index)) { switch (program) { case Maxwell::ShaderProgram::Geometry: - shader_program_manager->UseTrivialGeometryShader(); + program_manager.UseGeometryShader(0); break; case Maxwell::ShaderProgram::Fragment: - shader_program_manager->UseTrivialFragmentShader(); + program_manager.UseFragmentShader(0); break; default: break; @@ -299,19 +267,17 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { SetupDrawTextures(stage, shader); SetupDrawImages(stage, shader); - const ProgramVariant variant(primitive_mode); - const auto program_handle = shader->GetHandle(variant); - + const GLuint program_handle = shader->GetHandle(); switch (program) { case Maxwell::ShaderProgram::VertexA: case Maxwell::ShaderProgram::VertexB: - shader_program_manager->UseProgrammableVertexShader(program_handle); + program_manager.UseVertexShader(program_handle); break; case Maxwell::ShaderProgram::Geometry: - shader_program_manager->UseProgrammableGeometryShader(program_handle); + program_manager.UseGeometryShader(program_handle); break; case Maxwell::ShaderProgram::Fragment: - shader_program_manager->UseProgrammableFragmentShader(program_handle); + program_manager.UseFragmentShader(program_handle); break; default: UNIMPLEMENTED_MSG("Unimplemented shader index={}, enable={}, offset=0x{:08X}", index, @@ -322,9 +288,7 @@ void RasterizerOpenGL::SetupShaders(GLenum 
primitive_mode) { // When a clip distance is enabled but not set in the shader it crops parts of the screen // (sometimes it's half the screen, sometimes three quarters). To avoid this, enable the // clip distances only when it's written by a shader stage. - for (std::size_t i = 0; i < Maxwell::NumClipDistances; ++i) { - clip_distances[i] = clip_distances[i] || shader->GetShaderEntries().clip_distances[i]; - } + clip_distances |= shader->GetEntries().clip_distances; // When VertexA is enabled, we have dual vertex shaders if (program == Maxwell::ShaderProgram::VertexA) { @@ -334,8 +298,7 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { } SyncClipEnabled(clip_distances); - - gpu.dirty.shaders = false; + gpu.dirty.flags[Dirty::Shaders] = false; } std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const { @@ -368,20 +331,23 @@ void RasterizerOpenGL::LoadDiskResources(const std::atomic_bool& stop_loading, shader_cache.LoadDiskCache(stop_loading, callback); } +void RasterizerOpenGL::SetupDirtyFlags() { + state_tracker.Initialize(); +} + void RasterizerOpenGL::ConfigureFramebuffers() { MICROPROFILE_SCOPE(OpenGL_Framebuffer); auto& gpu = system.GPU().Maxwell3D(); - if (!gpu.dirty.render_settings) { + if (!gpu.dirty.flags[VideoCommon::Dirty::RenderTargets]) { return; } - gpu.dirty.render_settings = false; + gpu.dirty.flags[VideoCommon::Dirty::RenderTargets] = false; texture_cache.GuardRenderTargets(true); View depth_surface = texture_cache.GetDepthBufferSurface(true); const auto& regs = gpu.regs; - state.framebuffer_srgb.enabled = regs.framebuffer_srgb != 0; UNIMPLEMENTED_IF(regs.rt_separate_frag_data == 0); // Bind the framebuffer surfaces @@ -409,14 +375,11 @@ void RasterizerOpenGL::ConfigureFramebuffers() { texture_cache.GuardRenderTargets(false); - state.draw.draw_framebuffer = framebuffer_cache.GetFramebuffer(key); - SyncViewport(state); + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer_cache.GetFramebuffer(key)); } -void 
RasterizerOpenGL::ConfigureClearFramebuffer(OpenGLState& current_state, bool using_color_fb, - bool using_depth_fb, bool using_stencil_fb) { - using VideoCore::Surface::SurfaceType; - +void RasterizerOpenGL::ConfigureClearFramebuffer(bool using_color_fb, bool using_depth_fb, + bool using_stencil_fb) { auto& gpu = system.GPU().Maxwell3D(); const auto& regs = gpu.regs; @@ -435,80 +398,44 @@ void RasterizerOpenGL::ConfigureClearFramebuffer(OpenGLState& current_state, boo key.colors[0] = color_surface; key.zeta = depth_surface; - current_state.draw.draw_framebuffer = framebuffer_cache.GetFramebuffer(key); - current_state.ApplyFramebufferState(); + state_tracker.NotifyFramebuffer(); + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer_cache.GetFramebuffer(key)); } void RasterizerOpenGL::Clear() { - const auto& maxwell3d = system.GPU().Maxwell3D(); - - if (!maxwell3d.ShouldExecute()) { + const auto& gpu = system.GPU().Maxwell3D(); + if (!gpu.ShouldExecute()) { return; } - const auto& regs = maxwell3d.regs; + const auto& regs = gpu.regs; bool use_color{}; bool use_depth{}; bool use_stencil{}; - OpenGLState prev_state{OpenGLState::GetCurState()}; - SCOPE_EXIT({ - prev_state.AllDirty(); - prev_state.Apply(); - }); - - OpenGLState clear_state{OpenGLState::GetCurState()}; - clear_state.SetDefaultViewports(); if (regs.clear_buffers.R || regs.clear_buffers.G || regs.clear_buffers.B || regs.clear_buffers.A) { use_color = true; } if (use_color) { - clear_state.color_mask[0].red_enabled = regs.clear_buffers.R ? GL_TRUE : GL_FALSE; - clear_state.color_mask[0].green_enabled = regs.clear_buffers.G ? GL_TRUE : GL_FALSE; - clear_state.color_mask[0].blue_enabled = regs.clear_buffers.B ? GL_TRUE : GL_FALSE; - clear_state.color_mask[0].alpha_enabled = regs.clear_buffers.A ? 
GL_TRUE : GL_FALSE; + state_tracker.NotifyColorMask0(); + glColorMaski(0, regs.clear_buffers.R != 0, regs.clear_buffers.G != 0, + regs.clear_buffers.B != 0, regs.clear_buffers.A != 0); + + // TODO(Rodrigo): Determine if clamping is used on clears + SyncFragmentColorClampState(); + SyncFramebufferSRGB(); } if (regs.clear_buffers.Z) { ASSERT_MSG(regs.zeta_enable != 0, "Tried to clear Z but buffer is not enabled!"); use_depth = true; - // Always enable the depth write when clearing the depth buffer. The depth write mask is - // ignored when clearing the buffer in the Switch, but OpenGL obeys it so we set it to - // true. - clear_state.depth.test_enabled = true; - clear_state.depth.test_func = GL_ALWAYS; - clear_state.depth.write_mask = GL_TRUE; + state_tracker.NotifyDepthMask(); + glDepthMask(GL_TRUE); } if (regs.clear_buffers.S) { - ASSERT_MSG(regs.zeta_enable != 0, "Tried to clear stencil but buffer is not enabled!"); + ASSERT_MSG(regs.zeta_enable, "Tried to clear stencil but buffer is not enabled!"); use_stencil = true; - clear_state.stencil.test_enabled = true; - - if (regs.clear_flags.stencil) { - // Stencil affects the clear so fill it with the used masks - clear_state.stencil.front.test_func = GL_ALWAYS; - clear_state.stencil.front.test_mask = regs.stencil_front_func_mask; - clear_state.stencil.front.action_stencil_fail = GL_KEEP; - clear_state.stencil.front.action_depth_fail = GL_KEEP; - clear_state.stencil.front.action_depth_pass = GL_KEEP; - clear_state.stencil.front.write_mask = regs.stencil_front_mask; - if (regs.stencil_two_side_enable) { - clear_state.stencil.back.test_func = GL_ALWAYS; - clear_state.stencil.back.test_mask = regs.stencil_back_func_mask; - clear_state.stencil.back.action_stencil_fail = GL_KEEP; - clear_state.stencil.back.action_depth_fail = GL_KEEP; - clear_state.stencil.back.action_depth_pass = GL_KEEP; - clear_state.stencil.back.write_mask = regs.stencil_back_mask; - } else { - clear_state.stencil.back.test_func = GL_ALWAYS; - 
clear_state.stencil.back.test_mask = 0xFFFFFFFF; - clear_state.stencil.back.write_mask = 0xFFFFFFFF; - clear_state.stencil.back.action_stencil_fail = GL_KEEP; - clear_state.stencil.back.action_depth_fail = GL_KEEP; - clear_state.stencil.back.action_depth_pass = GL_KEEP; - } - } } if (!use_color && !use_depth && !use_stencil) { @@ -516,20 +443,18 @@ void RasterizerOpenGL::Clear() { return; } - ConfigureClearFramebuffer(clear_state, use_color, use_depth, use_stencil); + SyncRasterizeEnable(); - SyncViewport(clear_state); - SyncRasterizeEnable(clear_state); if (regs.clear_flags.scissor) { - SyncScissorTest(clear_state); + SyncScissorTest(); + } else { + state_tracker.NotifyScissor0(); + glDisablei(GL_SCISSOR_TEST, 0); } - if (regs.clear_flags.viewport) { - clear_state.EmulateViewportWithScissor(); - } + UNIMPLEMENTED_IF(regs.clear_flags.viewport); - clear_state.AllDirty(); - clear_state.Apply(); + ConfigureClearFramebuffer(use_color, use_depth, use_stencil); if (use_color) { glClearBufferfv(GL_COLOR, 0, regs.clear_color); @@ -549,25 +474,27 @@ void RasterizerOpenGL::Clear() { void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { MICROPROFILE_SCOPE(OpenGL_Drawing); auto& gpu = system.GPU().Maxwell3D(); - const auto& regs = gpu.regs; query_cache.UpdateCounters(); - SyncRasterizeEnable(state); + SyncViewport(); + SyncRasterizeEnable(); + SyncPolygonModes(); SyncColorMask(); SyncFragmentColorClampState(); SyncMultiSampleState(); SyncDepthTestState(); + SyncDepthClamp(); SyncStencilTestState(); SyncBlendState(); SyncLogicOpState(); SyncCullMode(); SyncPrimitiveRestart(); - SyncScissorTest(state); - SyncTransformFeedback(); + SyncScissorTest(); SyncPointState(); SyncPolygonOffset(); SyncAlphaTest(); + SyncFramebufferSRGB(); buffer_cache.Acquire(); @@ -591,14 +518,13 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { buffer_cache.Map(buffer_size); // Prepare vertex array format. 
- const GLuint vao = SetupVertexFormat(); - vertex_array_pushbuffer.Setup(vao); + SetupVertexFormat(); + vertex_array_pushbuffer.Setup(); // Upload vertex and index data. - SetupVertexBuffer(vao); - SetupVertexInstances(vao); - - GLintptr index_buffer_offset; + SetupVertexBuffer(); + SetupVertexInstances(); + GLintptr index_buffer_offset = 0; if (is_indexed) { index_buffer_offset = SetupIndexBuffer(); } @@ -624,27 +550,20 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { ConfigureFramebuffers(); // Signal the buffer cache that we are not going to upload more things. - const bool invalidate = buffer_cache.Unmap(); + buffer_cache.Unmap(); // Now that we are no longer uploading data, we can safely bind the buffers to OpenGL. vertex_array_pushbuffer.Bind(); bind_ubo_pushbuffer.Bind(); bind_ssbo_pushbuffer.Bind(); - if (invalidate) { - // As all cached buffers are invalidated, we need to recheck their state. - gpu.dirty.ResetVertexArrays(); - } - gpu.dirty.memory_general = false; - - shader_program_manager->ApplyTo(state); - state.Apply(); + program_manager.BindGraphicsPipeline(); if (texture_cache.TextureBarrier()) { glTextureBarrier(); } - ++num_queued_commands; + BeginTransformFeedback(primitive_mode); const GLuint base_instance = static_cast<GLuint>(gpu.regs.vb_base_instance); const GLsizei num_instances = @@ -683,6 +602,10 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { num_instances, base_instance); } } + + EndTransformFeedback(); + + ++num_queued_commands; } void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { @@ -695,13 +618,7 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { auto kernel = shader_cache.GetComputeKernel(code_addr); SetupComputeTextures(kernel); SetupComputeImages(kernel); - - const auto& launch_desc = system.GPU().KeplerCompute().launch_description; - const ProgramVariant variant(launch_desc.block_dim_x, launch_desc.block_dim_y, - launch_desc.block_dim_z, launch_desc.shared_alloc, - 
launch_desc.local_pos_alloc); - state.draw.shader_program = kernel->GetHandle(variant); - state.draw.program_pipeline = 0; + program_manager.BindComputeShader(kernel->GetHandle()); const std::size_t buffer_size = Tegra::Engines::KeplerCompute::NumConstBuffers * @@ -719,11 +636,7 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { bind_ubo_pushbuffer.Bind(); bind_ssbo_pushbuffer.Bind(); - state.ApplyTextures(); - state.ApplyImages(); - state.ApplyShaderProgram(); - state.ApplyProgramPipeline(); - + const auto& launch_desc = system.GPU().KeplerCompute().launch_description; glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z); ++num_queued_commands; } @@ -828,7 +741,7 @@ void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shad const auto& shader_stage = stages[stage_index]; u32 binding = device.GetBaseBindings(stage_index).uniform_buffer; - for (const auto& entry : shader->GetShaderEntries().const_buffers) { + for (const auto& entry : shader->GetEntries().const_buffers) { const auto& buffer = shader_stage.const_buffers[entry.GetIndex()]; SetupConstBuffer(binding++, buffer, entry); } @@ -839,7 +752,7 @@ void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) { const auto& launch_desc = system.GPU().KeplerCompute().launch_description; u32 binding = 0; - for (const auto& entry : kernel->GetShaderEntries().const_buffers) { + for (const auto& entry : kernel->GetEntries().const_buffers) { const auto& config = launch_desc.const_buffer_config[entry.GetIndex()]; const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value(); Tegra::Engines::ConstBufferInfo buffer; @@ -851,7 +764,7 @@ void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) { } void RasterizerOpenGL::SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer, - const GLShader::ConstBufferEntry& entry) { + const ConstBufferEntry& entry) { if (!buffer.enabled) { // Set values to zero to 
unbind buffers bind_ubo_pushbuffer.Push(binding, buffer_cache.GetEmptyBuffer(sizeof(float)), 0, @@ -875,7 +788,7 @@ void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shad const auto cbufs{gpu.Maxwell3D().state.shader_stages[stage_index]}; u32 binding = device.GetBaseBindings(stage_index).shader_storage_buffer; - for (const auto& entry : shader->GetShaderEntries().global_memory_entries) { + for (const auto& entry : shader->GetEntries().global_memory_entries) { const auto addr{cbufs.const_buffers[entry.GetCbufIndex()].address + entry.GetCbufOffset()}; const auto gpu_addr{memory_manager.Read<u64>(addr)}; const auto size{memory_manager.Read<u32>(addr + 8)}; @@ -889,7 +802,7 @@ void RasterizerOpenGL::SetupComputeGlobalMemory(const Shader& kernel) { const auto cbufs{gpu.KeplerCompute().launch_description.const_buffer_config}; u32 binding = 0; - for (const auto& entry : kernel->GetShaderEntries().global_memory_entries) { + for (const auto& entry : kernel->GetEntries().global_memory_entries) { const auto addr{cbufs[entry.GetCbufIndex()].Address() + entry.GetCbufOffset()}; const auto gpu_addr{memory_manager.Read<u64>(addr)}; const auto size{memory_manager.Read<u32>(addr + 8)}; @@ -897,7 +810,7 @@ void RasterizerOpenGL::SetupComputeGlobalMemory(const Shader& kernel) { } } -void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GLShader::GlobalMemoryEntry& entry, +void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, GPUVAddr gpu_addr, std::size_t size) { const auto alignment{device.GetShaderStorageBufferAlignment()}; const auto [ssbo, buffer_offset] = @@ -909,16 +822,11 @@ void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, const Shader& MICROPROFILE_SCOPE(OpenGL_Texture); const auto& maxwell3d = system.GPU().Maxwell3D(); u32 binding = device.GetBaseBindings(stage_index).sampler; - for (const auto& entry : shader->GetShaderEntries().samplers) { - const auto shader_type = 
static_cast<Tegra::Engines::ShaderType>(stage_index); - if (!entry.IsIndexed()) { - const auto texture = GetTextureInfo(maxwell3d, entry, shader_type); + for (const auto& entry : shader->GetEntries().samplers) { + const auto shader_type = static_cast<ShaderType>(stage_index); + for (std::size_t i = 0; i < entry.Size(); ++i) { + const auto texture = GetTextureInfo(maxwell3d, entry, shader_type, i); SetupTexture(binding++, texture, entry); - } else { - for (std::size_t i = 0; i < entry.Size(); ++i) { - const auto texture = GetTextureInfo(maxwell3d, entry, shader_type, i); - SetupTexture(binding++, texture, entry); - } } } } @@ -927,46 +835,39 @@ void RasterizerOpenGL::SetupComputeTextures(const Shader& kernel) { MICROPROFILE_SCOPE(OpenGL_Texture); const auto& compute = system.GPU().KeplerCompute(); u32 binding = 0; - for (const auto& entry : kernel->GetShaderEntries().samplers) { - if (!entry.IsIndexed()) { - const auto texture = - GetTextureInfo(compute, entry, Tegra::Engines::ShaderType::Compute); + for (const auto& entry : kernel->GetEntries().samplers) { + for (std::size_t i = 0; i < entry.Size(); ++i) { + const auto texture = GetTextureInfo(compute, entry, ShaderType::Compute, i); SetupTexture(binding++, texture, entry); - } else { - for (std::size_t i = 0; i < entry.Size(); ++i) { - const auto texture = - GetTextureInfo(compute, entry, Tegra::Engines::ShaderType::Compute, i); - SetupTexture(binding++, texture, entry); - } } } } void RasterizerOpenGL::SetupTexture(u32 binding, const Tegra::Texture::FullTextureInfo& texture, - const GLShader::SamplerEntry& entry) { + const SamplerEntry& entry) { const auto view = texture_cache.GetTextureSurface(texture.tic, entry); if (!view) { // Can occur when texture addr is null or its memory is unmapped/invalid - state.samplers[binding] = 0; - state.textures[binding] = 0; + glBindSampler(binding, 0); + glBindTextureUnit(binding, 0); return; } - state.textures[binding] = view->GetTexture(); + glBindTextureUnit(binding, 
view->GetTexture()); if (view->GetSurfaceParams().IsBuffer()) { return; } - state.samplers[binding] = sampler_cache.GetSampler(texture.tsc); - // Apply swizzle to textures that are not buffers. view->ApplySwizzle(texture.tic.x_source, texture.tic.y_source, texture.tic.z_source, texture.tic.w_source); + + glBindSampler(binding, sampler_cache.GetSampler(texture.tsc)); } void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& shader) { const auto& maxwell3d = system.GPU().Maxwell3D(); u32 binding = device.GetBaseBindings(stage_index).image; - for (const auto& entry : shader->GetShaderEntries().images) { + for (const auto& entry : shader->GetEntries().images) { const auto shader_type = static_cast<Tegra::Engines::ShaderType>(stage_index); const auto tic = GetTextureInfo(maxwell3d, entry, shader_type).tic; SetupImage(binding++, tic, entry); @@ -976,17 +877,17 @@ void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& sh void RasterizerOpenGL::SetupComputeImages(const Shader& shader) { const auto& compute = system.GPU().KeplerCompute(); u32 binding = 0; - for (const auto& entry : shader->GetShaderEntries().images) { + for (const auto& entry : shader->GetEntries().images) { const auto tic = GetTextureInfo(compute, entry, Tegra::Engines::ShaderType::Compute).tic; SetupImage(binding++, tic, entry); } } void RasterizerOpenGL::SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic, - const GLShader::ImageEntry& entry) { + const ImageEntry& entry) { const auto view = texture_cache.GetImageSurface(tic, entry); if (!view) { - state.images[binding] = 0; + glBindImageTexture(binding, 0, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8); return; } if (!tic.IsBuffer()) { @@ -995,55 +896,85 @@ void RasterizerOpenGL::SetupImage(u32 binding, const Tegra::Texture::TICEntry& t if (entry.IsWritten()) { view->MarkAsModified(texture_cache.Tick()); } - state.images[binding] = view->GetTexture(); + glBindImageTexture(binding, view->GetTexture(), 0, GL_TRUE, 
0, GL_READ_WRITE, + view->GetFormat()); } -void RasterizerOpenGL::SyncViewport(OpenGLState& current_state) { - const auto& regs = system.GPU().Maxwell3D().regs; - const bool geometry_shaders_enabled = - regs.IsShaderConfigEnabled(static_cast<size_t>(Maxwell::ShaderProgram::Geometry)); - const std::size_t viewport_count = - geometry_shaders_enabled ? Tegra::Engines::Maxwell3D::Regs::NumViewports : 1; - for (std::size_t i = 0; i < viewport_count; i++) { - auto& viewport = current_state.viewports[i]; - const auto& src = regs.viewports[i]; - const Common::Rectangle<s32> viewport_rect{regs.viewport_transform[i].GetRect()}; - viewport.x = viewport_rect.left; - viewport.y = viewport_rect.bottom; - viewport.width = viewport_rect.GetWidth(); - viewport.height = viewport_rect.GetHeight(); - viewport.depth_range_far = src.depth_range_far; - viewport.depth_range_near = src.depth_range_near; - } - state.depth_clamp.far_plane = regs.view_volume_clip_control.depth_clamp_far != 0; - state.depth_clamp.near_plane = regs.view_volume_clip_control.depth_clamp_near != 0; - - bool flip_y = false; - if (regs.viewport_transform[0].scale_y < 0.0) { - flip_y = !flip_y; - } - if (regs.screen_y_control.y_negate != 0) { - flip_y = !flip_y; - } - state.clip_control.origin = flip_y ? GL_UPPER_LEFT : GL_LOWER_LEFT; - state.clip_control.depth_mode = - regs.depth_mode == Tegra::Engines::Maxwell3D::Regs::DepthMode::ZeroToOne - ? GL_ZERO_TO_ONE - : GL_NEGATIVE_ONE_TO_ONE; +void RasterizerOpenGL::SyncViewport() { + auto& gpu = system.GPU().Maxwell3D(); + auto& flags = gpu.dirty.flags; + const auto& regs = gpu.regs; + + const bool dirty_viewport = flags[Dirty::Viewports]; + if (dirty_viewport || flags[Dirty::ClipControl]) { + flags[Dirty::ClipControl] = false; + + bool flip_y = false; + if (regs.viewport_transform[0].scale_y < 0.0) { + flip_y = !flip_y; + } + if (regs.screen_y_control.y_negate != 0) { + flip_y = !flip_y; + } + glClipControl(flip_y ? 
GL_UPPER_LEFT : GL_LOWER_LEFT, + regs.depth_mode == Maxwell::DepthMode::ZeroToOne ? GL_ZERO_TO_ONE + : GL_NEGATIVE_ONE_TO_ONE); + } + + if (dirty_viewport) { + flags[Dirty::Viewports] = false; + + const bool force = flags[Dirty::ViewportTransform]; + flags[Dirty::ViewportTransform] = false; + + for (std::size_t i = 0; i < Maxwell::NumViewports; ++i) { + if (!force && !flags[Dirty::Viewport0 + i]) { + continue; + } + flags[Dirty::Viewport0 + i] = false; + + const Common::Rectangle<f32> rect{regs.viewport_transform[i].GetRect()}; + glViewportIndexedf(static_cast<GLuint>(i), rect.left, rect.bottom, rect.GetWidth(), + rect.GetHeight()); + + const auto& src = regs.viewports[i]; + glDepthRangeIndexed(static_cast<GLuint>(i), static_cast<GLdouble>(src.depth_range_near), + static_cast<GLdouble>(src.depth_range_far)); + } + } } -void RasterizerOpenGL::SyncClipEnabled( - const std::array<bool, Maxwell::Regs::NumClipDistances>& clip_mask) { +void RasterizerOpenGL::SyncDepthClamp() { + auto& gpu = system.GPU().Maxwell3D(); + auto& flags = gpu.dirty.flags; + if (!flags[Dirty::DepthClampEnabled]) { + return; + } + flags[Dirty::DepthClampEnabled] = false; - const auto& regs = system.GPU().Maxwell3D().regs; - const std::array<bool, Maxwell::Regs::NumClipDistances> reg_state{ - regs.clip_distance_enabled.c0 != 0, regs.clip_distance_enabled.c1 != 0, - regs.clip_distance_enabled.c2 != 0, regs.clip_distance_enabled.c3 != 0, - regs.clip_distance_enabled.c4 != 0, regs.clip_distance_enabled.c5 != 0, - regs.clip_distance_enabled.c6 != 0, regs.clip_distance_enabled.c7 != 0}; + const auto& state = gpu.regs.view_volume_clip_control; + UNIMPLEMENTED_IF_MSG(state.depth_clamp_far != state.depth_clamp_near, + "Unimplemented depth clamp separation!"); + + oglEnable(GL_DEPTH_CLAMP, state.depth_clamp_far || state.depth_clamp_near); +} + +void RasterizerOpenGL::SyncClipEnabled(u32 clip_mask) { + auto& gpu = system.GPU().Maxwell3D(); + auto& flags = gpu.dirty.flags; + if (!flags[Dirty::ClipDistances] 
&& !flags[Dirty::Shaders]) { + return; + } + flags[Dirty::ClipDistances] = false; + + clip_mask &= gpu.regs.clip_distance_enabled; + if (clip_mask == last_clip_distance_mask) { + return; + } + last_clip_distance_mask = clip_mask; for (std::size_t i = 0; i < Maxwell::Regs::NumClipDistances; ++i) { - state.clip_distance[i] = reg_state[i] && clip_mask[i]; + oglEnable(static_cast<GLenum>(GL_CLIP_DISTANCE0 + i), (clip_mask >> i) & 1); } } @@ -1052,247 +983,442 @@ void RasterizerOpenGL::SyncClipCoef() { } void RasterizerOpenGL::SyncCullMode() { - const auto& regs = system.GPU().Maxwell3D().regs; + auto& gpu = system.GPU().Maxwell3D(); + auto& flags = gpu.dirty.flags; + const auto& regs = gpu.regs; - state.cull.enabled = regs.cull.enabled != 0; - if (state.cull.enabled) { - state.cull.mode = MaxwellToGL::CullFace(regs.cull.cull_face); + if (flags[Dirty::CullTest]) { + flags[Dirty::CullTest] = false; + + if (regs.cull_test_enabled) { + glEnable(GL_CULL_FACE); + glCullFace(MaxwellToGL::CullFace(regs.cull_face)); + } else { + glDisable(GL_CULL_FACE); + } } - state.cull.front_face = MaxwellToGL::FrontFace(regs.cull.front_face); + if (flags[Dirty::FrontFace]) { + flags[Dirty::FrontFace] = false; + glFrontFace(MaxwellToGL::FrontFace(regs.front_face)); + } } void RasterizerOpenGL::SyncPrimitiveRestart() { - const auto& regs = system.GPU().Maxwell3D().regs; + auto& gpu = system.GPU().Maxwell3D(); + auto& flags = gpu.dirty.flags; + if (!flags[Dirty::PrimitiveRestart]) { + return; + } + flags[Dirty::PrimitiveRestart] = false; - state.primitive_restart.enabled = regs.primitive_restart.enabled; - state.primitive_restart.index = regs.primitive_restart.index; + if (gpu.regs.primitive_restart.enabled) { + glEnable(GL_PRIMITIVE_RESTART); + glPrimitiveRestartIndex(gpu.regs.primitive_restart.index); + } else { + glDisable(GL_PRIMITIVE_RESTART); + } } void RasterizerOpenGL::SyncDepthTestState() { - const auto& regs = system.GPU().Maxwell3D().regs; - - state.depth.test_enabled = 
regs.depth_test_enable != 0; - state.depth.write_mask = regs.depth_write_enabled ? GL_TRUE : GL_FALSE; + auto& gpu = system.GPU().Maxwell3D(); + auto& flags = gpu.dirty.flags; - if (!state.depth.test_enabled) { - return; + const auto& regs = gpu.regs; + if (flags[Dirty::DepthMask]) { + flags[Dirty::DepthMask] = false; + glDepthMask(regs.depth_write_enabled ? GL_TRUE : GL_FALSE); } - state.depth.test_func = MaxwellToGL::ComparisonOp(regs.depth_test_func); + if (flags[Dirty::DepthTest]) { + flags[Dirty::DepthTest] = false; + if (regs.depth_test_enable) { + glEnable(GL_DEPTH_TEST); + glDepthFunc(MaxwellToGL::ComparisonOp(regs.depth_test_func)); + } else { + glDisable(GL_DEPTH_TEST); + } + } } void RasterizerOpenGL::SyncStencilTestState() { - auto& maxwell3d = system.GPU().Maxwell3D(); - if (!maxwell3d.dirty.stencil_test) { + auto& gpu = system.GPU().Maxwell3D(); + auto& flags = gpu.dirty.flags; + if (!flags[Dirty::StencilTest]) { return; } - maxwell3d.dirty.stencil_test = false; - - const auto& regs = maxwell3d.regs; - state.stencil.test_enabled = regs.stencil_enable != 0; - state.MarkDirtyStencilState(); + flags[Dirty::StencilTest] = false; + const auto& regs = gpu.regs; if (!regs.stencil_enable) { + glDisable(GL_STENCIL_TEST); return; } - state.stencil.front.test_func = MaxwellToGL::ComparisonOp(regs.stencil_front_func_func); - state.stencil.front.test_ref = regs.stencil_front_func_ref; - state.stencil.front.test_mask = regs.stencil_front_func_mask; - state.stencil.front.action_stencil_fail = MaxwellToGL::StencilOp(regs.stencil_front_op_fail); - state.stencil.front.action_depth_fail = MaxwellToGL::StencilOp(regs.stencil_front_op_zfail); - state.stencil.front.action_depth_pass = MaxwellToGL::StencilOp(regs.stencil_front_op_zpass); - state.stencil.front.write_mask = regs.stencil_front_mask; + glEnable(GL_STENCIL_TEST); + glStencilFuncSeparate(GL_FRONT, MaxwellToGL::ComparisonOp(regs.stencil_front_func_func), + regs.stencil_front_func_ref, 
regs.stencil_front_func_mask); + glStencilOpSeparate(GL_FRONT, MaxwellToGL::StencilOp(regs.stencil_front_op_fail), + MaxwellToGL::StencilOp(regs.stencil_front_op_zfail), + MaxwellToGL::StencilOp(regs.stencil_front_op_zpass)); + glStencilMaskSeparate(GL_FRONT, regs.stencil_front_mask); + if (regs.stencil_two_side_enable) { - state.stencil.back.test_func = MaxwellToGL::ComparisonOp(regs.stencil_back_func_func); - state.stencil.back.test_ref = regs.stencil_back_func_ref; - state.stencil.back.test_mask = regs.stencil_back_func_mask; - state.stencil.back.action_stencil_fail = MaxwellToGL::StencilOp(regs.stencil_back_op_fail); - state.stencil.back.action_depth_fail = MaxwellToGL::StencilOp(regs.stencil_back_op_zfail); - state.stencil.back.action_depth_pass = MaxwellToGL::StencilOp(regs.stencil_back_op_zpass); - state.stencil.back.write_mask = regs.stencil_back_mask; + glStencilFuncSeparate(GL_BACK, MaxwellToGL::ComparisonOp(regs.stencil_back_func_func), + regs.stencil_back_func_ref, regs.stencil_back_func_mask); + glStencilOpSeparate(GL_BACK, MaxwellToGL::StencilOp(regs.stencil_back_op_fail), + MaxwellToGL::StencilOp(regs.stencil_back_op_zfail), + MaxwellToGL::StencilOp(regs.stencil_back_op_zpass)); + glStencilMaskSeparate(GL_BACK, regs.stencil_back_mask); } else { - state.stencil.back.test_func = GL_ALWAYS; - state.stencil.back.test_ref = 0; - state.stencil.back.test_mask = 0xFFFFFFFF; - state.stencil.back.write_mask = 0xFFFFFFFF; - state.stencil.back.action_stencil_fail = GL_KEEP; - state.stencil.back.action_depth_fail = GL_KEEP; - state.stencil.back.action_depth_pass = GL_KEEP; + glStencilFuncSeparate(GL_BACK, GL_ALWAYS, 0, 0xFFFFFFFF); + glStencilOpSeparate(GL_BACK, GL_KEEP, GL_KEEP, GL_KEEP); + glStencilMaskSeparate(GL_BACK, 0xFFFFFFFF); } } -void RasterizerOpenGL::SyncRasterizeEnable(OpenGLState& current_state) { - const auto& regs = system.GPU().Maxwell3D().regs; - current_state.rasterizer_discard = regs.rasterize_enable == 0; +void 
RasterizerOpenGL::SyncRasterizeEnable() { + auto& gpu = system.GPU().Maxwell3D(); + auto& flags = gpu.dirty.flags; + if (!flags[Dirty::RasterizeEnable]) { + return; + } + flags[Dirty::RasterizeEnable] = false; + + oglEnable(GL_RASTERIZER_DISCARD, gpu.regs.rasterize_enable == 0); +} + +void RasterizerOpenGL::SyncPolygonModes() { + auto& gpu = system.GPU().Maxwell3D(); + auto& flags = gpu.dirty.flags; + if (!flags[Dirty::PolygonModes]) { + return; + } + flags[Dirty::PolygonModes] = false; + + if (gpu.regs.fill_rectangle) { + if (!GLAD_GL_NV_fill_rectangle) { + LOG_ERROR(Render_OpenGL, "GL_NV_fill_rectangle used and not supported"); + glPolygonMode(GL_FRONT_AND_BACK, GL_FILL); + return; + } + + flags[Dirty::PolygonModeFront] = true; + flags[Dirty::PolygonModeBack] = true; + glPolygonMode(GL_FRONT_AND_BACK, GL_FILL_RECTANGLE_NV); + return; + } + + if (gpu.regs.polygon_mode_front == gpu.regs.polygon_mode_back) { + flags[Dirty::PolygonModeFront] = false; + flags[Dirty::PolygonModeBack] = false; + glPolygonMode(GL_FRONT_AND_BACK, MaxwellToGL::PolygonMode(gpu.regs.polygon_mode_front)); + return; + } + + if (flags[Dirty::PolygonModeFront]) { + flags[Dirty::PolygonModeFront] = false; + glPolygonMode(GL_FRONT, MaxwellToGL::PolygonMode(gpu.regs.polygon_mode_front)); + } + + if (flags[Dirty::PolygonModeBack]) { + flags[Dirty::PolygonModeBack] = false; + glPolygonMode(GL_BACK, MaxwellToGL::PolygonMode(gpu.regs.polygon_mode_back)); + } } void RasterizerOpenGL::SyncColorMask() { - auto& maxwell3d = system.GPU().Maxwell3D(); - if (!maxwell3d.dirty.color_mask) { + auto& gpu = system.GPU().Maxwell3D(); + auto& flags = gpu.dirty.flags; + if (!flags[Dirty::ColorMasks]) { return; } - const auto& regs = maxwell3d.regs; + flags[Dirty::ColorMasks] = false; + + const bool force = flags[Dirty::ColorMaskCommon]; + flags[Dirty::ColorMaskCommon] = false; + + const auto& regs = gpu.regs; + if (regs.color_mask_common) { + if (!force && !flags[Dirty::ColorMask0]) { + return; + } + 
flags[Dirty::ColorMask0] = false; - const std::size_t count = - regs.independent_blend_enable ? Tegra::Engines::Maxwell3D::Regs::NumRenderTargets : 1; - for (std::size_t i = 0; i < count; i++) { - const auto& source = regs.color_mask[regs.color_mask_common ? 0 : i]; - auto& dest = state.color_mask[i]; - dest.red_enabled = (source.R == 0) ? GL_FALSE : GL_TRUE; - dest.green_enabled = (source.G == 0) ? GL_FALSE : GL_TRUE; - dest.blue_enabled = (source.B == 0) ? GL_FALSE : GL_TRUE; - dest.alpha_enabled = (source.A == 0) ? GL_FALSE : GL_TRUE; + auto& mask = regs.color_mask[0]; + glColorMask(mask.R != 0, mask.B != 0, mask.G != 0, mask.A != 0); + return; } - state.MarkDirtyColorMask(); - maxwell3d.dirty.color_mask = false; + // Path without color_mask_common set + for (std::size_t i = 0; i < Maxwell::NumRenderTargets; ++i) { + if (!force && !flags[Dirty::ColorMask0 + i]) { + continue; + } + flags[Dirty::ColorMask0 + i] = false; + + const auto& mask = regs.color_mask[i]; + glColorMaski(static_cast<GLuint>(i), mask.R != 0, mask.G != 0, mask.B != 0, mask.A != 0); + } } void RasterizerOpenGL::SyncMultiSampleState() { + auto& gpu = system.GPU().Maxwell3D(); + auto& flags = gpu.dirty.flags; + if (!flags[Dirty::MultisampleControl]) { + return; + } + flags[Dirty::MultisampleControl] = false; + const auto& regs = system.GPU().Maxwell3D().regs; - state.multisample_control.alpha_to_coverage = regs.multisample_control.alpha_to_coverage != 0; - state.multisample_control.alpha_to_one = regs.multisample_control.alpha_to_one != 0; + oglEnable(GL_SAMPLE_ALPHA_TO_COVERAGE, regs.multisample_control.alpha_to_coverage); + oglEnable(GL_SAMPLE_ALPHA_TO_ONE, regs.multisample_control.alpha_to_one); } void RasterizerOpenGL::SyncFragmentColorClampState() { - const auto& regs = system.GPU().Maxwell3D().regs; - state.fragment_color_clamp.enabled = regs.frag_color_clamp != 0; + auto& gpu = system.GPU().Maxwell3D(); + auto& flags = gpu.dirty.flags; + if (!flags[Dirty::FragmentClampColor]) { + return; + 
} + flags[Dirty::FragmentClampColor] = false; + + glClampColor(GL_CLAMP_FRAGMENT_COLOR, gpu.regs.frag_color_clamp ? GL_TRUE : GL_FALSE); } void RasterizerOpenGL::SyncBlendState() { - auto& maxwell3d = system.GPU().Maxwell3D(); - if (!maxwell3d.dirty.blend_state) { + auto& gpu = system.GPU().Maxwell3D(); + auto& flags = gpu.dirty.flags; + const auto& regs = gpu.regs; + + if (flags[Dirty::BlendColor]) { + flags[Dirty::BlendColor] = false; + glBlendColor(regs.blend_color.r, regs.blend_color.g, regs.blend_color.b, + regs.blend_color.a); + } + + // TODO(Rodrigo): Revisit blending, there are several registers we are not reading + + if (!flags[Dirty::BlendStates]) { return; } - const auto& regs = maxwell3d.regs; - - state.blend_color.red = regs.blend_color.r; - state.blend_color.green = regs.blend_color.g; - state.blend_color.blue = regs.blend_color.b; - state.blend_color.alpha = regs.blend_color.a; - - state.independant_blend.enabled = regs.independent_blend_enable; - if (!state.independant_blend.enabled) { - auto& blend = state.blend[0]; - const auto& src = regs.blend; - blend.enabled = src.enable[0] != 0; - if (blend.enabled) { - blend.rgb_equation = MaxwellToGL::BlendEquation(src.equation_rgb); - blend.src_rgb_func = MaxwellToGL::BlendFunc(src.factor_source_rgb); - blend.dst_rgb_func = MaxwellToGL::BlendFunc(src.factor_dest_rgb); - blend.a_equation = MaxwellToGL::BlendEquation(src.equation_a); - blend.src_a_func = MaxwellToGL::BlendFunc(src.factor_source_a); - blend.dst_a_func = MaxwellToGL::BlendFunc(src.factor_dest_a); - } - for (std::size_t i = 1; i < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; i++) { - state.blend[i].enabled = false; + flags[Dirty::BlendStates] = false; + + if (!regs.independent_blend_enable) { + if (!regs.blend.enable[0]) { + glDisable(GL_BLEND); + return; } - maxwell3d.dirty.blend_state = false; - state.MarkDirtyBlendState(); + glEnable(GL_BLEND); + glBlendFuncSeparate(MaxwellToGL::BlendFunc(regs.blend.factor_source_rgb), + 
MaxwellToGL::BlendFunc(regs.blend.factor_dest_rgb), + MaxwellToGL::BlendFunc(regs.blend.factor_source_a), + MaxwellToGL::BlendFunc(regs.blend.factor_dest_a)); + glBlendEquationSeparate(MaxwellToGL::BlendEquation(regs.blend.equation_rgb), + MaxwellToGL::BlendEquation(regs.blend.equation_a)); return; } - for (std::size_t i = 0; i < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; i++) { - auto& blend = state.blend[i]; - const auto& src = regs.independent_blend[i]; - blend.enabled = regs.blend.enable[i] != 0; - if (!blend.enabled) + const bool force = flags[Dirty::BlendIndependentEnabled]; + flags[Dirty::BlendIndependentEnabled] = false; + + for (std::size_t i = 0; i < Maxwell::NumRenderTargets; ++i) { + if (!force && !flags[Dirty::BlendState0 + i]) { continue; - blend.rgb_equation = MaxwellToGL::BlendEquation(src.equation_rgb); - blend.src_rgb_func = MaxwellToGL::BlendFunc(src.factor_source_rgb); - blend.dst_rgb_func = MaxwellToGL::BlendFunc(src.factor_dest_rgb); - blend.a_equation = MaxwellToGL::BlendEquation(src.equation_a); - blend.src_a_func = MaxwellToGL::BlendFunc(src.factor_source_a); - blend.dst_a_func = MaxwellToGL::BlendFunc(src.factor_dest_a); - } + } + flags[Dirty::BlendState0 + i] = false; + + if (!regs.blend.enable[i]) { + glDisablei(GL_BLEND, static_cast<GLuint>(i)); + continue; + } + glEnablei(GL_BLEND, static_cast<GLuint>(i)); - state.MarkDirtyBlendState(); - maxwell3d.dirty.blend_state = false; + const auto& src = regs.independent_blend[i]; + glBlendFuncSeparatei(static_cast<GLuint>(i), MaxwellToGL::BlendFunc(src.factor_source_rgb), + MaxwellToGL::BlendFunc(src.factor_dest_rgb), + MaxwellToGL::BlendFunc(src.factor_source_a), + MaxwellToGL::BlendFunc(src.factor_dest_a)); + glBlendEquationSeparatei(static_cast<GLuint>(i), + MaxwellToGL::BlendEquation(src.equation_rgb), + MaxwellToGL::BlendEquation(src.equation_a)); + } } void RasterizerOpenGL::SyncLogicOpState() { - const auto& regs = system.GPU().Maxwell3D().regs; + auto& gpu = 
system.GPU().Maxwell3D(); + auto& flags = gpu.dirty.flags; + if (!flags[Dirty::LogicOp]) { + return; + } + flags[Dirty::LogicOp] = false; - state.logic_op.enabled = regs.logic_op.enable != 0; + const auto& regs = gpu.regs; + if (regs.logic_op.enable) { + glEnable(GL_COLOR_LOGIC_OP); + glLogicOp(MaxwellToGL::LogicOp(regs.logic_op.operation)); + } else { + glDisable(GL_COLOR_LOGIC_OP); + } +} - if (!state.logic_op.enabled) +void RasterizerOpenGL::SyncScissorTest() { + auto& gpu = system.GPU().Maxwell3D(); + auto& flags = gpu.dirty.flags; + if (!flags[Dirty::Scissors]) { return; + } + flags[Dirty::Scissors] = false; - ASSERT_MSG(regs.blend.enable[0] == 0, - "Blending and logic op can't be enabled at the same time."); - - state.logic_op.operation = MaxwellToGL::LogicOp(regs.logic_op.operation); -} + const auto& regs = gpu.regs; + for (std::size_t index = 0; index < Maxwell::NumViewports; ++index) { + if (!flags[Dirty::Scissor0 + index]) { + continue; + } + flags[Dirty::Scissor0 + index] = false; -void RasterizerOpenGL::SyncScissorTest(OpenGLState& current_state) { - const auto& regs = system.GPU().Maxwell3D().regs; - const bool geometry_shaders_enabled = - regs.IsShaderConfigEnabled(static_cast<size_t>(Maxwell::ShaderProgram::Geometry)); - const std::size_t viewport_count = - geometry_shaders_enabled ? 
Tegra::Engines::Maxwell3D::Regs::NumViewports : 1; - for (std::size_t i = 0; i < viewport_count; i++) { - const auto& src = regs.scissor_test[i]; - auto& dst = current_state.viewports[i].scissor; - dst.enabled = (src.enable != 0); - if (dst.enabled == 0) { - return; + const auto& src = regs.scissor_test[index]; + if (src.enable) { + glEnablei(GL_SCISSOR_TEST, static_cast<GLuint>(index)); + glScissorIndexed(static_cast<GLuint>(index), src.min_x, src.min_y, + src.max_x - src.min_x, src.max_y - src.min_y); + } else { + glDisablei(GL_SCISSOR_TEST, static_cast<GLuint>(index)); } - const u32 width = src.max_x - src.min_x; - const u32 height = src.max_y - src.min_y; - dst.x = src.min_x; - dst.y = src.min_y; - dst.width = width; - dst.height = height; } } -void RasterizerOpenGL::SyncTransformFeedback() { - const auto& regs = system.GPU().Maxwell3D().regs; - UNIMPLEMENTED_IF_MSG(regs.tfb_enabled != 0, "Transform feedbacks are not implemented"); -} - void RasterizerOpenGL::SyncPointState() { - const auto& regs = system.GPU().Maxwell3D().regs; + auto& gpu = system.GPU().Maxwell3D(); + auto& flags = gpu.dirty.flags; + if (!flags[Dirty::PointSize]) { + return; + } + flags[Dirty::PointSize] = false; + + oglEnable(GL_POINT_SPRITE, gpu.regs.point_sprite_enable); + + if (gpu.regs.vp_point_size.enable) { + // By definition of GL_POINT_SIZE, it only matters if GL_PROGRAM_POINT_SIZE is disabled. + glEnable(GL_PROGRAM_POINT_SIZE); + return; + } + // Limit the point size to 1 since nouveau sometimes sets a point size of 0 (and that's invalid // in OpenGL). 
- state.point.program_control = regs.vp_point_size.enable != 0; - state.point.sprite = regs.point_sprite_enable != 0; - state.point.size = std::max(1.0f, regs.point_size); + glPointSize(std::max(1.0f, gpu.regs.point_size)); + glDisable(GL_PROGRAM_POINT_SIZE); } void RasterizerOpenGL::SyncPolygonOffset() { - auto& maxwell3d = system.GPU().Maxwell3D(); - if (!maxwell3d.dirty.polygon_offset) { + auto& gpu = system.GPU().Maxwell3D(); + auto& flags = gpu.dirty.flags; + if (!flags[Dirty::PolygonOffset]) { return; } - const auto& regs = maxwell3d.regs; - - state.polygon_offset.fill_enable = regs.polygon_offset_fill_enable != 0; - state.polygon_offset.line_enable = regs.polygon_offset_line_enable != 0; - state.polygon_offset.point_enable = regs.polygon_offset_point_enable != 0; + flags[Dirty::PolygonOffset] = false; - // Hardware divides polygon offset units by two - state.polygon_offset.units = regs.polygon_offset_units / 2.0f; - state.polygon_offset.factor = regs.polygon_offset_factor; - state.polygon_offset.clamp = regs.polygon_offset_clamp; + const auto& regs = gpu.regs; + oglEnable(GL_POLYGON_OFFSET_FILL, regs.polygon_offset_fill_enable); + oglEnable(GL_POLYGON_OFFSET_LINE, regs.polygon_offset_line_enable); + oglEnable(GL_POLYGON_OFFSET_POINT, regs.polygon_offset_point_enable); - state.MarkDirtyPolygonOffset(); - maxwell3d.dirty.polygon_offset = false; + if (regs.polygon_offset_fill_enable || regs.polygon_offset_line_enable || + regs.polygon_offset_point_enable) { + // Hardware divides polygon offset units by two + glPolygonOffsetClamp(regs.polygon_offset_factor, regs.polygon_offset_units / 2.0f, + regs.polygon_offset_clamp); + } } void RasterizerOpenGL::SyncAlphaTest() { + auto& gpu = system.GPU().Maxwell3D(); + auto& flags = gpu.dirty.flags; + if (!flags[Dirty::AlphaTest]) { + return; + } + flags[Dirty::AlphaTest] = false; + + const auto& regs = gpu.regs; + if (regs.alpha_test_enabled && regs.rt_control.count > 1) { + LOG_WARNING(Render_OpenGL, "Alpha testing with 
more than one render target is not tested"); + } + + if (regs.alpha_test_enabled) { + glEnable(GL_ALPHA_TEST); + glAlphaFunc(MaxwellToGL::ComparisonOp(regs.alpha_test_func), regs.alpha_test_ref); + } else { + glDisable(GL_ALPHA_TEST); + } +} + +void RasterizerOpenGL::SyncFramebufferSRGB() { + auto& gpu = system.GPU().Maxwell3D(); + auto& flags = gpu.dirty.flags; + if (!flags[Dirty::FramebufferSRGB]) { + return; + } + flags[Dirty::FramebufferSRGB] = false; + + oglEnable(GL_FRAMEBUFFER_SRGB, gpu.regs.framebuffer_srgb); +} + +void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) { const auto& regs = system.GPU().Maxwell3D().regs; - UNIMPLEMENTED_IF_MSG(regs.alpha_test_enabled != 0 && regs.rt_control.count > 1, - "Alpha Testing is enabled with more than one rendertarget"); + if (regs.tfb_enabled == 0) { + return; + } + + UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) || + regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) || + regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry)); - state.alpha_test.enabled = regs.alpha_test_enabled; - if (!state.alpha_test.enabled) { + for (std::size_t index = 0; index < Maxwell::NumTransformFeedbackBuffers; ++index) { + const auto& binding = regs.tfb_bindings[index]; + if (!binding.buffer_enable) { + if (enabled_transform_feedback_buffers[index]) { + glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, static_cast<GLuint>(index), 0, 0, + 0); + } + enabled_transform_feedback_buffers[index] = false; + continue; + } + enabled_transform_feedback_buffers[index] = true; + + auto& tfb_buffer = transform_feedback_buffers[index]; + tfb_buffer.Create(); + + const GLuint handle = tfb_buffer.handle; + const std::size_t size = binding.buffer_size; + glNamedBufferData(handle, static_cast<GLsizeiptr>(size), nullptr, GL_STREAM_COPY); + glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, static_cast<GLuint>(index), handle, 0, + static_cast<GLsizeiptr>(size)); + } + + 
glBeginTransformFeedback(GL_POINTS); +} + +void RasterizerOpenGL::EndTransformFeedback() { + const auto& regs = system.GPU().Maxwell3D().regs; + if (regs.tfb_enabled == 0) { return; } - state.alpha_test.func = MaxwellToGL::ComparisonOp(regs.alpha_test_func); - state.alpha_test.ref = regs.alpha_test_ref; + + glEndTransformFeedback(); + + for (std::size_t index = 0; index < Maxwell::NumTransformFeedbackBuffers; ++index) { + const auto& binding = regs.tfb_bindings[index]; + if (!binding.buffer_enable) { + continue; + } + UNIMPLEMENTED_IF(binding.buffer_offset != 0); + + const GLuint handle = transform_feedback_buffers[index].handle; + const GPUVAddr gpu_addr = binding.Address(); + const std::size_t size = binding.buffer_size; + const auto [dest_buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true); + glCopyNamedBufferSubData(handle, *dest_buffer, 0, offset, static_cast<GLsizeiptr>(size)); + } } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 68abe9a21..2d3be2437 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -30,7 +30,7 @@ #include "video_core/renderer_opengl/gl_shader_cache.h" #include "video_core/renderer_opengl/gl_shader_decompiler.h" #include "video_core/renderer_opengl/gl_shader_manager.h" -#include "video_core/renderer_opengl/gl_state.h" +#include "video_core/renderer_opengl/gl_state_tracker.h" #include "video_core/renderer_opengl/gl_texture_cache.h" #include "video_core/renderer_opengl/utils.h" #include "video_core/textures/texture.h" @@ -55,7 +55,8 @@ struct DrawParameters; class RasterizerOpenGL : public VideoCore::RasterizerAccelerated { public: explicit RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window, - ScreenInfo& info); + ScreenInfo& info, GLShader::ProgramManager& program_manager, + StateTracker& state_tracker); ~RasterizerOpenGL() override; void Draw(bool 
is_indexed, bool is_instanced) override; @@ -76,6 +77,7 @@ public: u32 pixel_stride) override; void LoadDiskResources(const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback) override; + void SetupDirtyFlags() override; /// Returns true when there are commands queued to the OpenGL server. bool AnyCommandQueued() const { @@ -86,8 +88,7 @@ private: /// Configures the color and depth framebuffer states. void ConfigureFramebuffers(); - void ConfigureClearFramebuffer(OpenGLState& current_state, bool using_color_fb, - bool using_depth_fb, bool using_stencil_fb); + void ConfigureClearFramebuffer(bool using_color_fb, bool using_depth_fb, bool using_stencil_fb); /// Configures the current constbuffers to use for the draw command. void SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader); @@ -97,7 +98,7 @@ private: /// Configures a constant buffer. void SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer, - const GLShader::ConstBufferEntry& entry); + const ConstBufferEntry& entry); /// Configures the current global memory entries to use for the draw command. void SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader); @@ -106,7 +107,7 @@ private: void SetupComputeGlobalMemory(const Shader& kernel); /// Configures a constant buffer. - void SetupGlobalMemory(u32 binding, const GLShader::GlobalMemoryEntry& entry, GPUVAddr gpu_addr, + void SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, GPUVAddr gpu_addr, std::size_t size); /// Configures the current textures to use for the draw command. @@ -117,7 +118,7 @@ private: /// Configures a texture. void SetupTexture(u32 binding, const Tegra::Texture::FullTextureInfo& texture, - const GLShader::SamplerEntry& entry); + const SamplerEntry& entry); /// Configures images in a graphics shader. 
void SetupDrawImages(std::size_t stage_index, const Shader& shader); @@ -126,15 +127,16 @@ private: void SetupComputeImages(const Shader& shader); /// Configures an image. - void SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic, - const GLShader::ImageEntry& entry); + void SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic, const ImageEntry& entry); /// Syncs the viewport and depth range to match the guest state - void SyncViewport(OpenGLState& current_state); + void SyncViewport(); + + /// Syncs the depth clamp state + void SyncDepthClamp(); /// Syncs the clip enabled status to match the guest state - void SyncClipEnabled( - const std::array<bool, Tegra::Engines::Maxwell3D::Regs::NumClipDistances>& clip_mask); + void SyncClipEnabled(u32 clip_mask); /// Syncs the clip coefficients to match the guest state void SyncClipCoef(); @@ -164,16 +166,16 @@ private: void SyncMultiSampleState(); /// Syncs the scissor test state to match the guest state - void SyncScissorTest(OpenGLState& current_state); - - /// Syncs the transform feedback state to match the guest state - void SyncTransformFeedback(); + void SyncScissorTest(); /// Syncs the point state to match the guest state void SyncPointState(); /// Syncs the rasterizer enable state to match the guest state - void SyncRasterizeEnable(OpenGLState& current_state); + void SyncRasterizeEnable(); + + /// Syncs polygon modes to match the guest state + void SyncPolygonModes(); /// Syncs Color Mask void SyncColorMask(); @@ -184,6 +186,15 @@ private: /// Syncs the alpha test state to match the guest state void SyncAlphaTest(); + /// Syncs the framebuffer sRGB state to match the guest state + void SyncFramebufferSRGB(); + + /// Begin a transform feedback + void BeginTransformFeedback(GLenum primitive_mode); + + /// End a transform feedback + void EndTransformFeedback(); + /// Check for extension that are not strictly required but are needed for correct emulation void CheckExtensions(); @@ -191,18 +202,17 @@ 
private: std::size_t CalculateIndexBufferSize() const; - /// Updates and returns a vertex array object representing current vertex format - GLuint SetupVertexFormat(); + /// Updates the current vertex format + void SetupVertexFormat(); - void SetupVertexBuffer(GLuint vao); - void SetupVertexInstances(GLuint vao); + void SetupVertexBuffer(); + void SetupVertexInstances(); GLintptr SetupIndexBuffer(); void SetupShaders(GLenum primitive_mode); const Device device; - OpenGLState state; TextureCacheOpenGL texture_cache; ShaderCacheOpenGL shader_cache; @@ -212,22 +222,25 @@ private: Core::System& system; ScreenInfo& screen_info; - - std::unique_ptr<GLShader::ProgramManager> shader_program_manager; - std::map<std::array<Tegra::Engines::Maxwell3D::Regs::VertexAttribute, - Tegra::Engines::Maxwell3D::Regs::NumVertexAttributes>, - OGLVertexArray> - vertex_array_cache; + GLShader::ProgramManager& program_manager; + StateTracker& state_tracker; static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024; OGLBufferCache buffer_cache; - VertexArrayPushBuffer vertex_array_pushbuffer; + VertexArrayPushBuffer vertex_array_pushbuffer{state_tracker}; BindBuffersRangePushBuffer bind_ubo_pushbuffer{GL_UNIFORM_BUFFER}; BindBuffersRangePushBuffer bind_ssbo_pushbuffer{GL_SHADER_STORAGE_BUFFER}; + std::array<OGLBuffer, Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers> + transform_feedback_buffers; + std::bitset<Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers> + enabled_transform_feedback_buffers; + /// Number of commands queued to the OpenGL driver. Reseted on flush. 
std::size_t num_queued_commands = 0; + + u32 last_clip_distance_mask = 0; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_resource_manager.cpp b/src/video_core/renderer_opengl/gl_resource_manager.cpp index c0aee770f..97803d480 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.cpp +++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp @@ -8,7 +8,6 @@ #include "common/microprofile.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_shader_util.h" -#include "video_core/renderer_opengl/gl_state.h" MICROPROFILE_DEFINE(OpenGL_ResourceCreation, "OpenGL", "Resource Creation", MP_RGB(128, 128, 192)); MICROPROFILE_DEFINE(OpenGL_ResourceDeletion, "OpenGL", "Resource Deletion", MP_RGB(128, 128, 192)); @@ -20,7 +19,7 @@ void OGLRenderbuffer::Create() { return; MICROPROFILE_SCOPE(OpenGL_ResourceCreation); - glGenRenderbuffers(1, &handle); + glCreateRenderbuffers(1, &handle); } void OGLRenderbuffer::Release() { @@ -29,7 +28,6 @@ void OGLRenderbuffer::Release() { MICROPROFILE_SCOPE(OpenGL_ResourceDeletion); glDeleteRenderbuffers(1, &handle); - OpenGLState::GetCurState().ResetRenderbuffer(handle).Apply(); handle = 0; } @@ -47,7 +45,6 @@ void OGLTexture::Release() { MICROPROFILE_SCOPE(OpenGL_ResourceDeletion); glDeleteTextures(1, &handle); - OpenGLState::GetCurState().UnbindTexture(handle).Apply(); handle = 0; } @@ -65,7 +62,6 @@ void OGLTextureView::Release() { MICROPROFILE_SCOPE(OpenGL_ResourceDeletion); glDeleteTextures(1, &handle); - OpenGLState::GetCurState().UnbindTexture(handle).Apply(); handle = 0; } @@ -83,7 +79,6 @@ void OGLSampler::Release() { MICROPROFILE_SCOPE(OpenGL_ResourceDeletion); glDeleteSamplers(1, &handle); - OpenGLState::GetCurState().ResetSampler(handle).Apply(); handle = 0; } @@ -127,7 +122,6 @@ void OGLProgram::Release() { MICROPROFILE_SCOPE(OpenGL_ResourceDeletion); glDeleteProgram(handle); - OpenGLState::GetCurState().ResetProgram(handle).Apply(); handle = 0; } 
@@ -145,7 +139,6 @@ void OGLPipeline::Release() { MICROPROFILE_SCOPE(OpenGL_ResourceDeletion); glDeleteProgramPipelines(1, &handle); - OpenGLState::GetCurState().ResetPipeline(handle).Apply(); handle = 0; } @@ -189,24 +182,6 @@ void OGLSync::Release() { handle = 0; } -void OGLVertexArray::Create() { - if (handle != 0) - return; - - MICROPROFILE_SCOPE(OpenGL_ResourceCreation); - glCreateVertexArrays(1, &handle); -} - -void OGLVertexArray::Release() { - if (handle == 0) - return; - - MICROPROFILE_SCOPE(OpenGL_ResourceDeletion); - glDeleteVertexArrays(1, &handle); - OpenGLState::GetCurState().ResetVertexArray(handle).Apply(); - handle = 0; -} - void OGLFramebuffer::Create() { if (handle != 0) return; @@ -221,7 +196,6 @@ void OGLFramebuffer::Release() { MICROPROFILE_SCOPE(OpenGL_ResourceDeletion); glDeleteFramebuffers(1, &handle); - OpenGLState::GetCurState().ResetFramebuffer(handle).Apply(); handle = 0; } diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h index 995a4e45e..de93f4212 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.h +++ b/src/video_core/renderer_opengl/gl_resource_manager.h @@ -241,31 +241,6 @@ public: GLsync handle = 0; }; -class OGLVertexArray : private NonCopyable { -public: - OGLVertexArray() = default; - - OGLVertexArray(OGLVertexArray&& o) noexcept : handle(std::exchange(o.handle, 0)) {} - - ~OGLVertexArray() { - Release(); - } - - OGLVertexArray& operator=(OGLVertexArray&& o) noexcept { - Release(); - handle = std::exchange(o.handle, 0); - return *this; - } - - /// Creates a new internal OpenGL resource and stores the handle - void Create(); - - /// Deletes the internal OpenGL resource - void Release(); - - GLuint handle = 0; -}; - class OGLFramebuffer : private NonCopyable { public: OGLFramebuffer() = default; diff --git a/src/video_core/renderer_opengl/gl_sampler_cache.cpp b/src/video_core/renderer_opengl/gl_sampler_cache.cpp index 3ded5ecea..5c174879a 100644 
--- a/src/video_core/renderer_opengl/gl_sampler_cache.cpp +++ b/src/video_core/renderer_opengl/gl_sampler_cache.cpp @@ -38,7 +38,7 @@ OGLSampler SamplerCacheOpenGL::CreateSampler(const Tegra::Texture::TSCEntry& tsc glSamplerParameterf(sampler_id, GL_TEXTURE_MAX_ANISOTROPY, tsc.GetMaxAnisotropy()); } else if (GLAD_GL_EXT_texture_filter_anisotropic) { glSamplerParameterf(sampler_id, GL_TEXTURE_MAX_ANISOTROPY_EXT, tsc.GetMaxAnisotropy()); - } else if (tsc.GetMaxAnisotropy() != 1) { + } else { LOG_WARNING(Render_OpenGL, "Anisotropy not supported by host GPU driver"); } diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 489eb143c..e3d31c3eb 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -2,12 +2,16 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include <atomic> +#include <functional> #include <mutex> #include <optional> #include <string> #include <thread> #include <unordered_set> + #include <boost/functional/hash.hpp> + #include "common/alignment.h" #include "common/assert.h" #include "common/logging/log.h" @@ -22,14 +26,16 @@ #include "video_core/renderer_opengl/gl_shader_cache.h" #include "video_core/renderer_opengl/gl_shader_decompiler.h" #include "video_core/renderer_opengl/gl_shader_disk_cache.h" +#include "video_core/renderer_opengl/gl_state_tracker.h" #include "video_core/renderer_opengl/utils.h" +#include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" namespace OpenGL { using Tegra::Engines::ShaderType; -using VideoCommon::Shader::ConstBufferLocker; using VideoCommon::Shader::ProgramCode; +using VideoCommon::Shader::Registry; using VideoCommon::Shader::ShaderIR; namespace { @@ -55,7 +61,7 @@ constexpr bool IsSchedInstruction(std::size_t offset, std::size_t main_offset) { } /// Calculates the size of a program stream -std::size_t 
CalculateProgramSize(const GLShader::ProgramCode& program) { +std::size_t CalculateProgramSize(const ProgramCode& program) { constexpr std::size_t start_offset = 10; // This is the encoded version of BRA that jumps to itself. All Nvidia // shaders end with one. @@ -108,32 +114,9 @@ constexpr GLenum GetGLShaderType(ShaderType shader_type) { } } -/// Describes primitive behavior on geometry shaders -constexpr std::pair<const char*, u32> GetPrimitiveDescription(GLenum primitive_mode) { - switch (primitive_mode) { - case GL_POINTS: - return {"points", 1}; - case GL_LINES: - case GL_LINE_STRIP: - return {"lines", 2}; - case GL_LINES_ADJACENCY: - case GL_LINE_STRIP_ADJACENCY: - return {"lines_adjacency", 4}; - case GL_TRIANGLES: - case GL_TRIANGLE_STRIP: - case GL_TRIANGLE_FAN: - return {"triangles", 3}; - case GL_TRIANGLES_ADJACENCY: - case GL_TRIANGLE_STRIP_ADJACENCY: - return {"triangles_adjacency", 6}; - default: - return {"points", 1}; - } -} - /// Hashes one (or two) program streams u64 GetUniqueIdentifier(ShaderType shader_type, bool is_a, const ProgramCode& code, - const ProgramCode& code_b) { + const ProgramCode& code_b = {}) { u64 unique_identifier = boost::hash_value(code); if (is_a) { // VertexA programs include two programs @@ -142,24 +125,6 @@ u64 GetUniqueIdentifier(ShaderType shader_type, bool is_a, const ProgramCode& co return unique_identifier; } -/// Creates an unspecialized program from code streams -std::string GenerateGLSL(const Device& device, ShaderType shader_type, const ShaderIR& ir, - const std::optional<ShaderIR>& ir_b) { - switch (shader_type) { - case ShaderType::Vertex: - return GLShader::GenerateVertexShader(device, ir, ir_b ? 
&*ir_b : nullptr); - case ShaderType::Geometry: - return GLShader::GenerateGeometryShader(device, ir); - case ShaderType::Fragment: - return GLShader::GenerateFragmentShader(device, ir); - case ShaderType::Compute: - return GLShader::GenerateComputeShader(device, ir); - default: - UNIMPLEMENTED_MSG("Unimplemented shader_type={}", static_cast<u32>(shader_type)); - return {}; - } -} - constexpr const char* GetShaderTypeName(ShaderType shader_type) { switch (shader_type) { case ShaderType::Vertex: @@ -195,102 +160,38 @@ constexpr ShaderType GetShaderType(Maxwell::ShaderProgram program_type) { return {}; } -std::string GetShaderId(u64 unique_identifier, ShaderType shader_type) { +std::string MakeShaderID(u64 unique_identifier, ShaderType shader_type) { return fmt::format("{}{:016X}", GetShaderTypeName(shader_type), unique_identifier); } -Tegra::Engines::ConstBufferEngineInterface& GetConstBufferEngineInterface(Core::System& system, - ShaderType shader_type) { - if (shader_type == ShaderType::Compute) { - return system.GPU().KeplerCompute(); - } else { - return system.GPU().Maxwell3D(); - } -} - -std::unique_ptr<ConstBufferLocker> MakeLocker(Core::System& system, ShaderType shader_type) { - return std::make_unique<ConstBufferLocker>(shader_type, - GetConstBufferEngineInterface(system, shader_type)); -} - -void FillLocker(ConstBufferLocker& locker, const ShaderDiskCacheUsage& usage) { - locker.SetBoundBuffer(usage.bound_buffer); - for (const auto& key : usage.keys) { - const auto [buffer, offset] = key.first; - locker.InsertKey(buffer, offset, key.second); +std::shared_ptr<Registry> MakeRegistry(const ShaderDiskCacheEntry& entry) { + const VideoCore::GuestDriverProfile guest_profile{entry.texture_handler_size}; + const VideoCommon::Shader::SerializedRegistryInfo info{guest_profile, entry.bound_buffer, + entry.graphics_info, entry.compute_info}; + const auto registry = std::make_shared<Registry>(entry.type, info); + for (const auto& [address, value] : entry.keys) { + 
const auto [buffer, offset] = address; + registry->InsertKey(buffer, offset, value); } - for (const auto& [offset, sampler] : usage.bound_samplers) { - locker.InsertBoundSampler(offset, sampler); + for (const auto& [offset, sampler] : entry.bound_samplers) { + registry->InsertBoundSampler(offset, sampler); } - for (const auto& [key, sampler] : usage.bindless_samplers) { + for (const auto& [key, sampler] : entry.bindless_samplers) { const auto [buffer, offset] = key; - locker.InsertBindlessSampler(buffer, offset, sampler); + registry->InsertBindlessSampler(buffer, offset, sampler); } + return registry; } -CachedProgram BuildShader(const Device& device, u64 unique_identifier, ShaderType shader_type, - const ProgramCode& code, const ProgramCode& code_b, - ConstBufferLocker& locker, const ProgramVariant& variant, - bool hint_retrievable = false) { - LOG_INFO(Render_OpenGL, "called. {}", GetShaderId(unique_identifier, shader_type)); - - const bool is_compute = shader_type == ShaderType::Compute; - const u32 main_offset = is_compute ? 
KERNEL_MAIN_OFFSET : STAGE_MAIN_OFFSET; - const ShaderIR ir(code, main_offset, COMPILER_SETTINGS, locker); - std::optional<ShaderIR> ir_b; - if (!code_b.empty()) { - ir_b.emplace(code_b, main_offset, COMPILER_SETTINGS, locker); - } - - std::string source = fmt::format(R"(// {} -#version 430 core -#extension GL_ARB_separate_shader_objects : enable -)", - GetShaderId(unique_identifier, shader_type)); - if (device.HasShaderBallot()) { - source += "#extension GL_ARB_shader_ballot : require\n"; - } - if (device.HasVertexViewportLayer()) { - source += "#extension GL_ARB_shader_viewport_layer_array : require\n"; - } - if (device.HasImageLoadFormatted()) { - source += "#extension GL_EXT_shader_image_load_formatted : require\n"; - } - if (device.HasWarpIntrinsics()) { - source += "#extension GL_NV_gpu_shader5 : require\n" - "#extension GL_NV_shader_thread_group : require\n" - "#extension GL_NV_shader_thread_shuffle : require\n"; - } - // This pragma stops Nvidia's driver from over optimizing math (probably using fp16 operations) - // on places where we don't want to. - // Thanks to Ryujinx for finding this workaround. 
- source += "#pragma optionNV(fastmath off)\n"; - - if (shader_type == ShaderType::Geometry) { - const auto [glsl_topology, max_vertices] = GetPrimitiveDescription(variant.primitive_mode); - source += fmt::format("#define MAX_VERTEX_INPUT {}\n", max_vertices); - source += fmt::format("layout ({}) in;\n", glsl_topology); - } - if (shader_type == ShaderType::Compute) { - if (variant.local_memory_size > 0) { - source += fmt::format("#define LOCAL_MEMORY_SIZE {}\n", - Common::AlignUp(variant.local_memory_size, 4) / 4); - } - source += - fmt::format("layout (local_size_x = {}, local_size_y = {}, local_size_z = {}) in;\n", - variant.block_x, variant.block_y, variant.block_z); - - if (variant.shared_memory_size > 0) { - // shared_memory_size is described in number of words - source += fmt::format("shared uint smem[{}];\n", variant.shared_memory_size); - } - } - - source += '\n'; - source += GenerateGLSL(device, shader_type, ir, ir_b); +std::shared_ptr<OGLProgram> BuildShader(const Device& device, ShaderType shader_type, + u64 unique_identifier, const ShaderIR& ir, + const Registry& registry, bool hint_retrievable = false) { + const std::string shader_id = MakeShaderID(unique_identifier, shader_type); + LOG_INFO(Render_OpenGL, "{}", shader_id); + const std::string glsl = DecompileShader(device, ir, registry, shader_type, shader_id); OGLShader shader; - shader.Create(source.c_str(), GetGLShaderType(shader_type)); + shader.Create(glsl.c_str(), GetGLShaderType(shader_type)); auto program = std::make_shared<OGLProgram>(); program->Create(true, hint_retrievable, shader.handle); @@ -298,7 +199,7 @@ CachedProgram BuildShader(const Device& device, u64 unique_identifier, ShaderTyp } std::unordered_set<GLenum> GetSupportedFormats() { - GLint num_formats{}; + GLint num_formats; glGetIntegerv(GL_NUM_PROGRAM_BINARY_FORMATS, &num_formats); std::vector<GLint> formats(num_formats); @@ -313,115 +214,82 @@ std::unordered_set<GLenum> GetSupportedFormats() { } // Anonymous namespace 
-CachedShader::CachedShader(const ShaderParameters& params, ShaderType shader_type, - GLShader::ShaderEntries entries, ProgramCode code, ProgramCode code_b) - : RasterizerCacheObject{params.host_ptr}, system{params.system}, - disk_cache{params.disk_cache}, device{params.device}, cpu_addr{params.cpu_addr}, - unique_identifier{params.unique_identifier}, shader_type{shader_type}, - entries{std::move(entries)}, code{std::move(code)}, code_b{std::move(code_b)} { - if (!params.precompiled_variants) { - return; - } - for (const auto& pair : *params.precompiled_variants) { - auto locker = MakeLocker(system, shader_type); - const auto& usage = pair->first; - FillLocker(*locker, usage); - - std::unique_ptr<LockerVariant>* locker_variant = nullptr; - const auto it = - std::find_if(locker_variants.begin(), locker_variants.end(), [&](const auto& variant) { - return variant->locker->HasEqualKeys(*locker); - }); - if (it == locker_variants.end()) { - locker_variant = &locker_variants.emplace_back(); - *locker_variant = std::make_unique<LockerVariant>(); - locker_variant->get()->locker = std::move(locker); - } else { - locker_variant = &*it; - } - locker_variant->get()->programs.emplace(usage.variant, pair->second); - } +CachedShader::CachedShader(const u8* host_ptr, VAddr cpu_addr, std::size_t size_in_bytes, + std::shared_ptr<VideoCommon::Shader::Registry> registry, + ShaderEntries entries, std::shared_ptr<OGLProgram> program) + : RasterizerCacheObject{host_ptr}, registry{std::move(registry)}, entries{std::move(entries)}, + cpu_addr{cpu_addr}, size_in_bytes{size_in_bytes}, program{std::move(program)} {} + +CachedShader::~CachedShader() = default; + +GLuint CachedShader::GetHandle() const { + DEBUG_ASSERT(registry->IsConsistent()); + return program->handle; } Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params, Maxwell::ShaderProgram program_type, ProgramCode code, ProgramCode code_b) { const auto shader_type = GetShaderType(program_type); - 
params.disk_cache.SaveRaw( - ShaderDiskCacheRaw(params.unique_identifier, shader_type, code, code_b)); + const std::size_t size_in_bytes = code.size() * sizeof(u64); - ConstBufferLocker locker(shader_type, params.system.GPU().Maxwell3D()); - const ShaderIR ir(code, STAGE_MAIN_OFFSET, COMPILER_SETTINGS, locker); + auto registry = std::make_shared<Registry>(shader_type, params.system.GPU().Maxwell3D()); + const ShaderIR ir(code, STAGE_MAIN_OFFSET, COMPILER_SETTINGS, *registry); // TODO(Rodrigo): Handle VertexA shaders // std::optional<ShaderIR> ir_b; // if (!code_b.empty()) { // ir_b.emplace(code_b, STAGE_MAIN_OFFSET); // } - return std::shared_ptr<CachedShader>(new CachedShader( - params, shader_type, GLShader::GetEntries(ir), std::move(code), std::move(code_b))); + auto program = BuildShader(params.device, shader_type, params.unique_identifier, ir, *registry); + + ShaderDiskCacheEntry entry; + entry.type = shader_type; + entry.code = std::move(code); + entry.code_b = std::move(code_b); + entry.unique_identifier = params.unique_identifier; + entry.bound_buffer = registry->GetBoundBuffer(); + entry.graphics_info = registry->GetGraphicsInfo(); + entry.keys = registry->GetKeys(); + entry.bound_samplers = registry->GetBoundSamplers(); + entry.bindless_samplers = registry->GetBindlessSamplers(); + params.disk_cache.SaveEntry(std::move(entry)); + + return std::shared_ptr<CachedShader>(new CachedShader(params.host_ptr, params.cpu_addr, + size_in_bytes, std::move(registry), + MakeEntries(ir), std::move(program))); } Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code) { - params.disk_cache.SaveRaw( - ShaderDiskCacheRaw(params.unique_identifier, ShaderType::Compute, code)); - - ConstBufferLocker locker(Tegra::Engines::ShaderType::Compute, - params.system.GPU().KeplerCompute()); - const ShaderIR ir(code, KERNEL_MAIN_OFFSET, COMPILER_SETTINGS, locker); - return std::shared_ptr<CachedShader>(new CachedShader( - params, 
ShaderType::Compute, GLShader::GetEntries(ir), std::move(code), {})); + const std::size_t size_in_bytes = code.size() * sizeof(u64); + + auto& engine = params.system.GPU().KeplerCompute(); + auto registry = std::make_shared<Registry>(ShaderType::Compute, engine); + const ShaderIR ir(code, KERNEL_MAIN_OFFSET, COMPILER_SETTINGS, *registry); + const u64 uid = params.unique_identifier; + auto program = BuildShader(params.device, ShaderType::Compute, uid, ir, *registry); + + ShaderDiskCacheEntry entry; + entry.type = ShaderType::Compute; + entry.code = std::move(code); + entry.unique_identifier = uid; + entry.bound_buffer = registry->GetBoundBuffer(); + entry.compute_info = registry->GetComputeInfo(); + entry.keys = registry->GetKeys(); + entry.bound_samplers = registry->GetBoundSamplers(); + entry.bindless_samplers = registry->GetBindlessSamplers(); + params.disk_cache.SaveEntry(std::move(entry)); + + return std::shared_ptr<CachedShader>(new CachedShader(params.host_ptr, params.cpu_addr, + size_in_bytes, std::move(registry), + MakeEntries(ir), std::move(program))); } Shader CachedShader::CreateFromCache(const ShaderParameters& params, - const UnspecializedShader& unspecialized) { - return std::shared_ptr<CachedShader>(new CachedShader(params, unspecialized.type, - unspecialized.entries, unspecialized.code, - unspecialized.code_b)); -} - -GLuint CachedShader::GetHandle(const ProgramVariant& variant) { - EnsureValidLockerVariant(); - - const auto [entry, is_cache_miss] = curr_locker_variant->programs.try_emplace(variant); - auto& program = entry->second; - if (!is_cache_miss) { - return program->handle; - } - - program = BuildShader(device, unique_identifier, shader_type, code, code_b, - *curr_locker_variant->locker, variant); - disk_cache.SaveUsage(GetUsage(variant, *curr_locker_variant->locker)); - - LabelGLObject(GL_PROGRAM, program->handle, cpu_addr); - return program->handle; -} - -bool CachedShader::EnsureValidLockerVariant() { - const auto previous_variant = 
curr_locker_variant; - if (curr_locker_variant && !curr_locker_variant->locker->IsConsistent()) { - curr_locker_variant = nullptr; - } - if (!curr_locker_variant) { - for (auto& variant : locker_variants) { - if (variant->locker->IsConsistent()) { - curr_locker_variant = variant.get(); - } - } - } - if (!curr_locker_variant) { - auto& new_variant = locker_variants.emplace_back(); - new_variant = std::make_unique<LockerVariant>(); - new_variant->locker = MakeLocker(system, shader_type); - curr_locker_variant = new_variant.get(); - } - return previous_variant == curr_locker_variant; -} - -ShaderDiskCacheUsage CachedShader::GetUsage(const ProgramVariant& variant, - const ConstBufferLocker& locker) const { - return ShaderDiskCacheUsage{unique_identifier, variant, - locker.GetBoundBuffer(), locker.GetKeys(), - locker.GetBoundSamplers(), locker.GetBindlessSamplers()}; + const PrecompiledShader& precompiled_shader, + std::size_t size_in_bytes) { + return std::shared_ptr<CachedShader>(new CachedShader( + params.host_ptr, params.cpu_addr, size_in_bytes, precompiled_shader.registry, + precompiled_shader.entries, precompiled_shader.program)); } ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system, @@ -431,16 +299,12 @@ ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback) { - const auto transferable = disk_cache.LoadTransferable(); + const std::optional transferable = disk_cache.LoadTransferable(); if (!transferable) { return; } - const auto [raws, shader_usages] = *transferable; - if (!GenerateUnspecializedShaders(stop_loading, callback, raws) || stop_loading) { - return; - } - const auto dumps = disk_cache.LoadPrecompiled(); + const std::vector gl_cache = disk_cache.LoadPrecompiled(); const auto supported_formats = GetSupportedFormats(); // Track if precompiled cache was altered during 
loading to know if we have to @@ -449,77 +313,82 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, // Inform the frontend about shader build initialization if (callback) { - callback(VideoCore::LoadCallbackStage::Build, 0, shader_usages.size()); + callback(VideoCore::LoadCallbackStage::Build, 0, transferable->size()); } std::mutex mutex; std::size_t built_shaders = 0; // It doesn't have be atomic since it's used behind a mutex - std::atomic_bool compilation_failed = false; + std::atomic_bool gl_cache_failed = false; - const auto Worker = [&](Core::Frontend::GraphicsContext* context, std::size_t begin, - std::size_t end, const std::vector<ShaderDiskCacheUsage>& shader_usages, - const ShaderDumpsMap& dumps) { + const auto find_precompiled = [&gl_cache](u64 id) { + return std::find_if(gl_cache.begin(), gl_cache.end(), + [id](const auto& entry) { return entry.unique_identifier == id; }); + }; + + const auto worker = [&](Core::Frontend::GraphicsContext* context, std::size_t begin, + std::size_t end) { context->MakeCurrent(); SCOPE_EXIT({ return context->DoneCurrent(); }); for (std::size_t i = begin; i < end; ++i) { - if (stop_loading || compilation_failed) { + if (stop_loading) { return; } - const auto& usage{shader_usages[i]}; - const auto& unspecialized{unspecialized_shaders.at(usage.unique_identifier)}; - const auto dump{dumps.find(usage)}; - - CachedProgram shader; - if (dump != dumps.end()) { - // If the shader is dumped, attempt to load it with - shader = GeneratePrecompiledProgram(dump->second, supported_formats); - if (!shader) { - compilation_failed = true; - return; + const auto& entry = (*transferable)[i]; + const u64 uid = entry.unique_identifier; + const auto it = find_precompiled(uid); + const auto precompiled_entry = it != gl_cache.end() ? &*it : nullptr; + + const bool is_compute = entry.type == ShaderType::Compute; + const u32 main_offset = is_compute ? 
KERNEL_MAIN_OFFSET : STAGE_MAIN_OFFSET; + auto registry = MakeRegistry(entry); + const ShaderIR ir(entry.code, main_offset, COMPILER_SETTINGS, *registry); + + std::shared_ptr<OGLProgram> program; + if (precompiled_entry) { + // If the shader is precompiled, attempt to load it with + program = GeneratePrecompiledProgram(entry, *precompiled_entry, supported_formats); + if (!program) { + gl_cache_failed = true; } } - if (!shader) { - auto locker{MakeLocker(system, unspecialized.type)}; - FillLocker(*locker, usage); - - shader = BuildShader(device, usage.unique_identifier, unspecialized.type, - unspecialized.code, unspecialized.code_b, *locker, - usage.variant, true); + if (!program) { + // Otherwise compile it from GLSL + program = BuildShader(device, entry.type, uid, ir, *registry, true); } + PrecompiledShader shader; + shader.program = std::move(program); + shader.registry = std::move(registry); + shader.entries = MakeEntries(ir); + std::scoped_lock lock{mutex}; if (callback) { callback(VideoCore::LoadCallbackStage::Build, ++built_shaders, - shader_usages.size()); + transferable->size()); } - - precompiled_programs.emplace(usage, std::move(shader)); - - // TODO(Rodrigo): Is there a better way to do this? - precompiled_variants[usage.unique_identifier].push_back( - precompiled_programs.find(usage)); + runtime_cache.emplace(entry.unique_identifier, std::move(shader)); } }; const auto num_workers{static_cast<std::size_t>(std::thread::hardware_concurrency() + 1ULL)}; - const std::size_t bucket_size{shader_usages.size() / num_workers}; + const std::size_t bucket_size{transferable->size() / num_workers}; std::vector<std::unique_ptr<Core::Frontend::GraphicsContext>> contexts(num_workers); std::vector<std::thread> threads(num_workers); for (std::size_t i = 0; i < num_workers; ++i) { const bool is_last_worker = i + 1 == num_workers; const std::size_t start{bucket_size * i}; - const std::size_t end{is_last_worker ? 
shader_usages.size() : start + bucket_size}; + const std::size_t end{is_last_worker ? transferable->size() : start + bucket_size}; // On some platforms the shared context has to be created from the GUI thread contexts[i] = emu_window.CreateSharedContext(); - threads[i] = std::thread(Worker, contexts[i].get(), start, end, shader_usages, dumps); + threads[i] = std::thread(worker, contexts[i].get(), start, end); } for (auto& thread : threads) { thread.join(); } - if (compilation_failed) { + if (gl_cache_failed) { // Invalidate the precompiled cache if a shader dumped shader was rejected disk_cache.InvalidatePrecompiled(); precompiled_cache_altered = true; @@ -532,11 +401,12 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, // TODO(Rodrigo): Do state tracking for transferable shaders and do a dummy draw // before precompiling them - for (std::size_t i = 0; i < shader_usages.size(); ++i) { - const auto& usage{shader_usages[i]}; - if (dumps.find(usage) == dumps.end()) { - const auto& program{precompiled_programs.at(usage)}; - disk_cache.SaveDump(usage, program->handle); + for (std::size_t i = 0; i < transferable->size(); ++i) { + const u64 id = (*transferable)[i].unique_identifier; + const auto it = find_precompiled(id); + if (it == gl_cache.end()) { + const GLuint program = runtime_cache.at(id).program->handle; + disk_cache.SavePrecompiled(id, program); precompiled_cache_altered = true; } } @@ -546,84 +416,33 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, } } -const PrecompiledVariants* ShaderCacheOpenGL::GetPrecompiledVariants(u64 unique_identifier) const { - const auto it = precompiled_variants.find(unique_identifier); - return it == precompiled_variants.end() ? 
nullptr : &it->second; -} - -CachedProgram ShaderCacheOpenGL::GeneratePrecompiledProgram( - const ShaderDiskCacheDump& dump, const std::unordered_set<GLenum>& supported_formats) { - if (supported_formats.find(dump.binary_format) == supported_formats.end()) { - LOG_INFO(Render_OpenGL, "Precompiled cache entry with unsupported format - removing"); +std::shared_ptr<OGLProgram> ShaderCacheOpenGL::GeneratePrecompiledProgram( + const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry, + const std::unordered_set<GLenum>& supported_formats) { + if (supported_formats.find(precompiled_entry.binary_format) == supported_formats.end()) { + LOG_INFO(Render_OpenGL, "Precompiled cache entry with unsupported format, removing"); return {}; } - CachedProgram shader = std::make_shared<OGLProgram>(); - shader->handle = glCreateProgram(); - glProgramParameteri(shader->handle, GL_PROGRAM_SEPARABLE, GL_TRUE); - glProgramBinary(shader->handle, dump.binary_format, dump.binary.data(), - static_cast<GLsizei>(dump.binary.size())); - - GLint link_status{}; - glGetProgramiv(shader->handle, GL_LINK_STATUS, &link_status); + auto program = std::make_shared<OGLProgram>(); + program->handle = glCreateProgram(); + glProgramParameteri(program->handle, GL_PROGRAM_SEPARABLE, GL_TRUE); + glProgramBinary(program->handle, precompiled_entry.binary_format, + precompiled_entry.binary.data(), + static_cast<GLsizei>(precompiled_entry.binary.size())); + + GLint link_status; + glGetProgramiv(program->handle, GL_LINK_STATUS, &link_status); if (link_status == GL_FALSE) { - LOG_INFO(Render_OpenGL, "Precompiled cache rejected by the driver - removing"); + LOG_INFO(Render_OpenGL, "Precompiled cache rejected by the driver, removing"); return {}; } - return shader; -} - -bool ShaderCacheOpenGL::GenerateUnspecializedShaders( - const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback, - const std::vector<ShaderDiskCacheRaw>& raws) { - if (callback) { - 
callback(VideoCore::LoadCallbackStage::Decompile, 0, raws.size()); - } - - for (std::size_t i = 0; i < raws.size(); ++i) { - if (stop_loading) { - return false; - } - const auto& raw{raws[i]}; - const u64 unique_identifier{raw.GetUniqueIdentifier()}; - const u64 calculated_hash{ - GetUniqueIdentifier(raw.GetType(), raw.HasProgramA(), raw.GetCode(), raw.GetCodeB())}; - if (unique_identifier != calculated_hash) { - LOG_ERROR(Render_OpenGL, - "Invalid hash in entry={:016x} (obtained hash={:016x}) - " - "removing shader cache", - raw.GetUniqueIdentifier(), calculated_hash); - disk_cache.InvalidateTransferable(); - return false; - } - - const u32 main_offset = - raw.GetType() == ShaderType::Compute ? KERNEL_MAIN_OFFSET : STAGE_MAIN_OFFSET; - ConstBufferLocker locker(raw.GetType()); - const ShaderIR ir(raw.GetCode(), main_offset, COMPILER_SETTINGS, locker); - // TODO(Rodrigo): Handle VertexA shaders - // std::optional<ShaderIR> ir_b; - // if (raw.HasProgramA()) { - // ir_b.emplace(raw.GetProgramCodeB(), main_offset); - // } - - UnspecializedShader unspecialized; - unspecialized.entries = GLShader::GetEntries(ir); - unspecialized.type = raw.GetType(); - unspecialized.code = raw.GetCode(); - unspecialized.code_b = raw.GetCodeB(); - unspecialized_shaders.emplace(raw.GetUniqueIdentifier(), unspecialized); - - if (callback) { - callback(VideoCore::LoadCallbackStage::Decompile, i, raws.size()); - } - } - return true; + return program; } Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { - if (!system.GPU().Maxwell3D().dirty.shaders) { + if (!system.GPU().Maxwell3D().dirty.flags[Dirty::Shaders]) { return last_shaders[static_cast<std::size_t>(program)]; } @@ -647,17 +466,17 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { const auto unique_identifier = GetUniqueIdentifier( GetShaderType(program), program == Maxwell::ShaderProgram::VertexA, code, code_b); - const auto precompiled_variants = 
GetPrecompiledVariants(unique_identifier); const auto cpu_addr{*memory_manager.GpuToCpuAddress(address)}; - const ShaderParameters params{system, disk_cache, precompiled_variants, device, + const ShaderParameters params{system, disk_cache, device, cpu_addr, host_ptr, unique_identifier}; - const auto found = unspecialized_shaders.find(unique_identifier); - if (found == unspecialized_shaders.end()) { + const auto found = runtime_cache.find(unique_identifier); + if (found == runtime_cache.end()) { shader = CachedShader::CreateStageFromMemory(params, program, std::move(code), std::move(code_b)); } else { - shader = CachedShader::CreateFromCache(params, found->second); + const std::size_t size_in_bytes = code.size() * sizeof(u64); + shader = CachedShader::CreateFromCache(params, found->second, size_in_bytes); } Register(shader); @@ -672,19 +491,19 @@ Shader ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) { return kernel; } - // No kernel found - create a new one + // No kernel found, create a new one auto code{GetShaderCode(memory_manager, code_addr, host_ptr)}; - const auto unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code, {})}; - const auto precompiled_variants = GetPrecompiledVariants(unique_identifier); + const auto unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code)}; const auto cpu_addr{*memory_manager.GpuToCpuAddress(code_addr)}; - const ShaderParameters params{system, disk_cache, precompiled_variants, device, + const ShaderParameters params{system, disk_cache, device, cpu_addr, host_ptr, unique_identifier}; - const auto found = unspecialized_shaders.find(unique_identifier); - if (found == unspecialized_shaders.end()) { + const auto found = runtime_cache.find(unique_identifier); + if (found == runtime_cache.end()) { kernel = CachedShader::CreateKernelFromMemory(params, std::move(code)); } else { - kernel = CachedShader::CreateFromCache(params, found->second); + const std::size_t size_in_bytes = code.size() * 
sizeof(u64); + kernel = CachedShader::CreateFromCache(params, found->second, size_in_bytes); } Register(kernel); diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h index 7b1470db3..4935019fc 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_cache.h @@ -22,7 +22,7 @@ #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_shader_decompiler.h" #include "video_core/renderer_opengl/gl_shader_disk_cache.h" -#include "video_core/shader/const_buffer_locker.h" +#include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" namespace Core { @@ -41,22 +41,17 @@ class RasterizerOpenGL; struct UnspecializedShader; using Shader = std::shared_ptr<CachedShader>; -using CachedProgram = std::shared_ptr<OGLProgram>; using Maxwell = Tegra::Engines::Maxwell3D::Regs; -using PrecompiledPrograms = std::unordered_map<ShaderDiskCacheUsage, CachedProgram>; -using PrecompiledVariants = std::vector<PrecompiledPrograms::iterator>; - -struct UnspecializedShader { - GLShader::ShaderEntries entries; - Tegra::Engines::ShaderType type; - ProgramCode code; - ProgramCode code_b; + +struct PrecompiledShader { + std::shared_ptr<OGLProgram> program; + std::shared_ptr<VideoCommon::Shader::Registry> registry; + ShaderEntries entries; }; struct ShaderParameters { Core::System& system; ShaderDiskCacheOpenGL& disk_cache; - const PrecompiledVariants* precompiled_variants; const Device& device; VAddr cpu_addr; u8* host_ptr; @@ -65,61 +60,45 @@ struct ShaderParameters { class CachedShader final : public RasterizerCacheObject { public: - static Shader CreateStageFromMemory(const ShaderParameters& params, - Maxwell::ShaderProgram program_type, - ProgramCode program_code, ProgramCode program_code_b); - static Shader CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code); + ~CachedShader(); - static Shader 
CreateFromCache(const ShaderParameters& params, - const UnspecializedShader& unspecialized); + /// Gets the GL program handle for the shader + GLuint GetHandle() const; + /// Returns the guest CPU address of the shader VAddr GetCpuAddr() const override { return cpu_addr; } + /// Returns the size in bytes of the shader std::size_t GetSizeInBytes() const override { - return code.size() * sizeof(u64); + return size_in_bytes; } /// Gets the shader entries for the shader - const GLShader::ShaderEntries& GetShaderEntries() const { + const ShaderEntries& GetEntries() const { return entries; } - /// Gets the GL program handle for the shader - GLuint GetHandle(const ProgramVariant& variant); - -private: - struct LockerVariant { - std::unique_ptr<VideoCommon::Shader::ConstBufferLocker> locker; - std::unordered_map<ProgramVariant, CachedProgram> programs; - }; - - explicit CachedShader(const ShaderParameters& params, Tegra::Engines::ShaderType shader_type, - GLShader::ShaderEntries entries, ProgramCode program_code, - ProgramCode program_code_b); - - bool EnsureValidLockerVariant(); - - ShaderDiskCacheUsage GetUsage(const ProgramVariant& variant, - const VideoCommon::Shader::ConstBufferLocker& locker) const; - - Core::System& system; - ShaderDiskCacheOpenGL& disk_cache; - const Device& device; - - VAddr cpu_addr{}; - - u64 unique_identifier{}; - Tegra::Engines::ShaderType shader_type{}; - - GLShader::ShaderEntries entries; + static Shader CreateStageFromMemory(const ShaderParameters& params, + Maxwell::ShaderProgram program_type, + ProgramCode program_code, ProgramCode program_code_b); + static Shader CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code); - ProgramCode code; - ProgramCode code_b; + static Shader CreateFromCache(const ShaderParameters& params, + const PrecompiledShader& precompiled_shader, + std::size_t size_in_bytes); - LockerVariant* curr_locker_variant = nullptr; - std::vector<std::unique_ptr<LockerVariant>> locker_variants; +private: + 
explicit CachedShader(const u8* host_ptr, VAddr cpu_addr, std::size_t size_in_bytes, + std::shared_ptr<VideoCommon::Shader::Registry> registry, + ShaderEntries entries, std::shared_ptr<OGLProgram> program); + + std::shared_ptr<VideoCommon::Shader::Registry> registry; + ShaderEntries entries; + VAddr cpu_addr = 0; + std::size_t size_in_bytes = 0; + std::shared_ptr<OGLProgram> program; }; class ShaderCacheOpenGL final : public RasterizerCache<Shader> { @@ -142,25 +121,15 @@ protected: void FlushObjectInner(const Shader& object) override {} private: - bool GenerateUnspecializedShaders(const std::atomic_bool& stop_loading, - const VideoCore::DiskResourceLoadCallback& callback, - const std::vector<ShaderDiskCacheRaw>& raws); - - CachedProgram GeneratePrecompiledProgram(const ShaderDiskCacheDump& dump, - const std::unordered_set<GLenum>& supported_formats); - - const PrecompiledVariants* GetPrecompiledVariants(u64 unique_identifier) const; + std::shared_ptr<OGLProgram> GeneratePrecompiledProgram( + const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry, + const std::unordered_set<GLenum>& supported_formats); Core::System& system; Core::Frontend::EmuWindow& emu_window; const Device& device; - ShaderDiskCacheOpenGL disk_cache; - - PrecompiledPrograms precompiled_programs; - std::unordered_map<u64, PrecompiledVariants> precompiled_variants; - - std::unordered_map<u64, UnspecializedShader> unspecialized_shaders; + std::unordered_map<u64, PrecompiledShader> runtime_cache; std::array<Shader, Maxwell::MaxShaderProgram> last_shaders; }; diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index 4735000b5..2c38f57fd 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -23,8 +23,9 @@ #include "video_core/shader/ast.h" #include "video_core/shader/node.h" #include "video_core/shader/shader_ir.h" 
+#include "video_core/shader/transform_feedback.h" -namespace OpenGL::GLShader { +namespace OpenGL { namespace { @@ -36,6 +37,8 @@ using Tegra::Shader::IpaInterpMode; using Tegra::Shader::IpaMode; using Tegra::Shader::IpaSampleMode; using Tegra::Shader::Register; +using VideoCommon::Shader::BuildTransformFeedback; +using VideoCommon::Shader::Registry; using namespace std::string_literals; using namespace VideoCommon::Shader; @@ -48,6 +51,11 @@ class ExprDecompiler; enum class Type { Void, Bool, Bool2, Float, Int, Uint, HalfFloat }; +constexpr std::array FLOAT_TYPES{"float", "vec2", "vec3", "vec4"}; + +constexpr std::string_view INPUT_ATTRIBUTE_NAME = "in_attr"; +constexpr std::string_view OUTPUT_ATTRIBUTE_NAME = "out_attr"; + struct TextureOffset {}; struct TextureDerivates {}; using TextureArgument = std::pair<Type, Node>; @@ -56,6 +64,25 @@ using TextureIR = std::variant<TextureOffset, TextureDerivates, TextureArgument> constexpr u32 MAX_CONSTBUFFER_ELEMENTS = static_cast<u32>(Maxwell::MaxConstBufferSize) / (4 * sizeof(float)); +constexpr std::string_view CommonDeclarations = R"(#define ftoi floatBitsToInt +#define ftou floatBitsToUint +#define itof intBitsToFloat +#define utof uintBitsToFloat + +bvec2 HalfFloatNanComparison(bvec2 comparison, vec2 pair1, vec2 pair2) {{ + bvec2 is_nan1 = isnan(pair1); + bvec2 is_nan2 = isnan(pair2); + return bvec2(comparison.x || is_nan1.x || is_nan2.x, comparison.y || is_nan1.y || is_nan2.y); +}} + +const float fswzadd_modifiers_a[] = float[4](-1.0f, 1.0f, -1.0f, 0.0f ); +const float fswzadd_modifiers_b[] = float[4](-1.0f, -1.0f, 1.0f, -1.0f ); + +layout (std140, binding = {}) uniform vs_config {{ + float y_direction; +}}; +)"; + class ShaderWriter final { public: void AddExpression(std::string_view text) { @@ -269,12 +296,41 @@ const char* GetImageTypeDeclaration(Tegra::Shader::ImageType image_type) { } } +/// Describes primitive behavior on geometry shaders +std::pair<const char*, u32> 
GetPrimitiveDescription(Maxwell::PrimitiveTopology topology) { + switch (topology) { + case Maxwell::PrimitiveTopology::Points: + return {"points", 1}; + case Maxwell::PrimitiveTopology::Lines: + case Maxwell::PrimitiveTopology::LineStrip: + return {"lines", 2}; + case Maxwell::PrimitiveTopology::LinesAdjacency: + case Maxwell::PrimitiveTopology::LineStripAdjacency: + return {"lines_adjacency", 4}; + case Maxwell::PrimitiveTopology::Triangles: + case Maxwell::PrimitiveTopology::TriangleStrip: + case Maxwell::PrimitiveTopology::TriangleFan: + return {"triangles", 3}; + case Maxwell::PrimitiveTopology::TrianglesAdjacency: + case Maxwell::PrimitiveTopology::TriangleStripAdjacency: + return {"triangles_adjacency", 6}; + default: + UNIMPLEMENTED_MSG("topology={}", static_cast<int>(topology)); + return {"points", 1}; + } +} + /// Generates code to use for a swizzle operation. -constexpr const char* GetSwizzle(u32 element) { +constexpr const char* GetSwizzle(std::size_t element) { constexpr std::array swizzle = {".x", ".y", ".z", ".w"}; return swizzle.at(element); } +constexpr const char* GetColorSwizzle(std::size_t element) { + constexpr std::array swizzle = {".r", ".g", ".b", ".a"}; + return swizzle.at(element); +} + /// Translate topology std::string GetTopologyName(Tegra::Shader::OutputTopology topology) { switch (topology) { @@ -337,15 +393,66 @@ std::string FlowStackTopName(MetaStackClass stack) { return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack)); } -[[deprecated]] constexpr bool IsVertexShader(ShaderType stage) { - return stage == ShaderType::Vertex; -} +struct GenericVaryingDescription { + std::string name; + u8 first_element = 0; + bool is_scalar = false; +}; class GLSLDecompiler final { public: - explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, ShaderType stage, - std::string suffix) - : device{device}, ir{ir}, stage{stage}, suffix{suffix}, header{ir.GetHeader()} {} + explicit GLSLDecompiler(const Device& device, const ShaderIR& 
ir, const Registry& registry, + ShaderType stage, std::string_view identifier, std::string_view suffix) + : device{device}, ir{ir}, registry{registry}, stage{stage}, + identifier{identifier}, suffix{suffix}, header{ir.GetHeader()} { + if (stage != ShaderType::Compute) { + transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo()); + } + } + + void Decompile() { + DeclareHeader(); + DeclareVertex(); + DeclareGeometry(); + DeclareFragment(); + DeclareCompute(); + DeclareInputAttributes(); + DeclareOutputAttributes(); + DeclareImages(); + DeclareSamplers(); + DeclareGlobalMemory(); + DeclareConstantBuffers(); + DeclareLocalMemory(); + DeclareRegisters(); + DeclarePredicates(); + DeclareInternalFlags(); + DeclareCustomVariables(); + DeclarePhysicalAttributeReader(); + + code.AddLine("void main() {{"); + ++code.scope; + + if (stage == ShaderType::Vertex) { + code.AddLine("gl_Position = vec4(0.0f, 0.0f, 0.0f, 1.0f);"); + } + + if (ir.IsDecompiled()) { + DecompileAST(); + } else { + DecompileBranchMode(); + } + + --code.scope; + code.AddLine("}}"); + } + + std::string GetResult() { + return code.GetResult(); + } + +private: + friend class ASTDecompiler; + friend class ExprDecompiler; void DecompileBranchMode() { // VM's program counter @@ -387,46 +494,40 @@ public: void DecompileAST(); - void Decompile() { - DeclareVertex(); - DeclareGeometry(); - DeclareRegisters(); - DeclareCustomVariables(); - DeclarePredicates(); - DeclareLocalMemory(); - DeclareInternalFlags(); - DeclareInputAttributes(); - DeclareOutputAttributes(); - DeclareConstantBuffers(); - DeclareGlobalMemory(); - DeclareSamplers(); - DeclareImages(); - DeclarePhysicalAttributeReader(); - - code.AddLine("void execute_{}() {{", suffix); - ++code.scope; - - if (ir.IsDecompiled()) { - DecompileAST(); - } else { - DecompileBranchMode(); + void DeclareHeader() { + if (!identifier.empty()) { + code.AddLine("// {}", identifier); + } + code.AddLine("#version 440 core"); + code.AddLine("#extension 
GL_ARB_separate_shader_objects : enable"); + if (device.HasShaderBallot()) { + code.AddLine("#extension GL_ARB_shader_ballot : require"); + } + if (device.HasVertexViewportLayer()) { + code.AddLine("#extension GL_ARB_shader_viewport_layer_array : require"); } + if (device.HasImageLoadFormatted()) { + code.AddLine("#extension GL_EXT_shader_image_load_formatted : require"); + } + if (device.HasWarpIntrinsics()) { + code.AddLine("#extension GL_NV_gpu_shader5 : require"); + code.AddLine("#extension GL_NV_shader_thread_group : require"); + code.AddLine("#extension GL_NV_shader_thread_shuffle : require"); + } + // This pragma stops Nvidia's driver from over optimizing math (probably using fp16 + // operations) on places where we don't want to. + // Thanks to Ryujinx for finding this workaround. + code.AddLine("#pragma optionNV(fastmath off)"); - --code.scope; - code.AddLine("}}"); - } + code.AddNewLine(); - std::string GetResult() { - return code.GetResult(); + code.AddLine(CommonDeclarations, EmulationUniformBlockBinding); } -private: - friend class ASTDecompiler; - friend class ExprDecompiler; - void DeclareVertex() { - if (!IsVertexShader(stage)) + if (stage != ShaderType::Vertex) { return; + } DeclareVertexRedeclarations(); } @@ -436,9 +537,15 @@ private: return; } + const auto& info = registry.GetGraphicsInfo(); + const auto input_topology = info.primitive_topology; + const auto [glsl_topology, max_vertices] = GetPrimitiveDescription(input_topology); + max_input_vertices = max_vertices; + code.AddLine("layout ({}) in;", glsl_topology); + const auto topology = GetTopologyName(header.common3.output_topology); - const auto max_vertices = header.common4.max_output_vertices.Value(); - code.AddLine("layout ({}, max_vertices = {}) out;", topology, max_vertices); + const auto max_output_vertices = header.common4.max_output_vertices.Value(); + code.AddLine("layout ({}, max_vertices = {}) out;", topology, max_output_vertices); code.AddNewLine(); code.AddLine("in gl_PerVertex 
{{"); @@ -450,11 +557,40 @@ private: DeclareVertexRedeclarations(); } + void DeclareFragment() { + if (stage != ShaderType::Fragment) { + return; + } + for (u32 rt = 0; rt < Maxwell::NumRenderTargets; ++rt) { + code.AddLine("layout (location = {}) out vec4 frag_color{};", rt, rt); + } + } + + void DeclareCompute() { + if (stage != ShaderType::Compute) { + return; + } + const auto& info = registry.GetComputeInfo(); + if (const u32 size = info.shared_memory_size_in_words; size > 0) { + code.AddLine("shared uint smem[{}];", size); + code.AddNewLine(); + } + code.AddLine("layout (local_size_x = {}, local_size_y = {}, local_size_z = {}) in;", + info.workgroup_size[0], info.workgroup_size[1], info.workgroup_size[2]); + code.AddNewLine(); + } + void DeclareVertexRedeclarations() { code.AddLine("out gl_PerVertex {{"); ++code.scope; - code.AddLine("vec4 gl_Position;"); + auto pos_xfb = GetTransformFeedbackDecoration(Attribute::Index::Position); + if (!pos_xfb.empty()) { + pos_xfb = fmt::format("layout ({}) ", pos_xfb); + } + const char* pos_type = + FLOAT_TYPES.at(GetNumComponents(Attribute::Index::Position).value_or(4) - 1); + code.AddLine("{}{} gl_Position;", pos_xfb, pos_type); for (const auto attribute : ir.GetOutputAttributes()) { if (attribute == Attribute::Index::ClipDistances0123 || @@ -463,14 +599,14 @@ private: break; } } - if (!IsVertexShader(stage) || device.HasVertexViewportLayer()) { + if (stage != ShaderType::Vertex || device.HasVertexViewportLayer()) { if (ir.UsesLayer()) { code.AddLine("int gl_Layer;"); } if (ir.UsesViewportIndex()) { code.AddLine("int gl_ViewportIndex;"); } - } else if ((ir.UsesLayer() || ir.UsesViewportIndex()) && IsVertexShader(stage) && + } else if ((ir.UsesLayer() || ir.UsesViewportIndex()) && stage == ShaderType::Vertex && !device.HasVertexViewportLayer()) { LOG_ERROR( Render_OpenGL, @@ -525,18 +661,16 @@ private: } void DeclareLocalMemory() { + u64 local_memory_size = 0; if (stage == ShaderType::Compute) { - code.AddLine("#ifdef 
LOCAL_MEMORY_SIZE"); - code.AddLine("uint {}[LOCAL_MEMORY_SIZE];", GetLocalMemory()); - code.AddLine("#endif"); - return; + local_memory_size = registry.GetComputeInfo().local_memory_size_in_words * 4ULL; + } else { + local_memory_size = header.GetLocalMemorySize(); } - - const u64 local_memory_size = header.GetLocalMemorySize(); if (local_memory_size == 0) { return; } - const auto element_count = Common::AlignUp(local_memory_size, 4) / 4; + const u64 element_count = Common::AlignUp(local_memory_size, 4) / 4; code.AddLine("uint {}[{}];", GetLocalMemory(), element_count); code.AddNewLine(); } @@ -589,7 +723,7 @@ private: void DeclareInputAttribute(Attribute::Index index, bool skip_unused) { const u32 location{GetGenericAttributeIndex(index)}; - std::string name{GetInputAttribute(index)}; + std::string name{GetGenericInputAttribute(index)}; if (stage == ShaderType::Geometry) { name = "gs_" + name + "[]"; } @@ -626,9 +760,59 @@ private: } } + std::optional<std::size_t> GetNumComponents(Attribute::Index index, u8 element = 0) const { + const u8 location = static_cast<u8>(static_cast<u32>(index) * 4 + element); + const auto it = transform_feedback.find(location); + if (it == transform_feedback.end()) { + return {}; + } + return it->second.components; + } + + std::string GetTransformFeedbackDecoration(Attribute::Index index, u8 element = 0) const { + const u8 location = static_cast<u8>(static_cast<u32>(index) * 4 + element); + const auto it = transform_feedback.find(location); + if (it == transform_feedback.end()) { + return {}; + } + + const VaryingTFB& tfb = it->second; + return fmt::format("xfb_buffer = {}, xfb_offset = {}, xfb_stride = {}", tfb.buffer, + tfb.offset, tfb.stride); + } + void DeclareOutputAttribute(Attribute::Index index) { - const u32 location{GetGenericAttributeIndex(index)}; - code.AddLine("layout (location = {}) out vec4 {};", location, GetOutputAttribute(index)); + static constexpr std::string_view swizzle = "xyzw"; + u8 element = 0; + while 
(element < 4) { + auto xfb = GetTransformFeedbackDecoration(index, element); + if (!xfb.empty()) { + xfb = fmt::format(", {}", xfb); + } + const std::size_t remainder = 4 - element; + const std::size_t num_components = GetNumComponents(index, element).value_or(remainder); + const char* const type = FLOAT_TYPES.at(num_components - 1); + + const u32 location = GetGenericAttributeIndex(index); + + GenericVaryingDescription description; + description.first_element = static_cast<u8>(element); + description.is_scalar = num_components == 1; + description.name = AppendSuffix(location, OUTPUT_ATTRIBUTE_NAME); + if (element != 0 || num_components != 4) { + const std::string_view name_swizzle = swizzle.substr(element, num_components); + description.name = fmt::format("{}_{}", description.name, name_swizzle); + } + for (std::size_t i = 0; i < num_components; ++i) { + const u8 offset = static_cast<u8>(location * 4 + element + i); + varying_description.insert({offset, description}); + } + + code.AddLine("layout (location = {}, component = {}{}) out {} {};", location, element, + xfb, type, description.name); + + element = static_cast<u8>(static_cast<std::size_t>(element) + num_components); + } } void DeclareConstantBuffers() { @@ -925,7 +1109,8 @@ private: // TODO(Rodrigo): Guard geometry inputs against out of bound reads. Some games // set an 0x80000000 index for those and the shader fails to build. Find out why // this happens and what's its intent. - return fmt::format("gs_{}[{} % MAX_VERTEX_INPUT]", name, Visit(buffer).AsUint()); + return fmt::format("gs_{}[{} % {}]", name, Visit(buffer).AsUint(), + max_input_vertices.value()); } return std::string(name); }; @@ -959,7 +1144,7 @@ private: // TODO(Subv): Find out what the values are for the first two elements when inside a // vertex shader, and what's the value of the fourth element when inside a Tess Eval // shader. 
- ASSERT(IsVertexShader(stage)); + ASSERT(stage == ShaderType::Vertex); switch (element) { case 2: // Config pack's first value is instance_id. @@ -980,7 +1165,7 @@ private: return {"0", Type::Int}; default: if (IsGenericAttribute(attribute)) { - return {GeometryPass(GetInputAttribute(attribute)) + GetSwizzle(element), + return {GeometryPass(GetGenericInputAttribute(attribute)) + GetSwizzle(element), Type::Float}; } break; @@ -1030,12 +1215,12 @@ private: UNIMPLEMENTED(); return {}; case 1: - if (IsVertexShader(stage) && !device.HasVertexViewportLayer()) { + if (stage == ShaderType::Vertex && !device.HasVertexViewportLayer()) { return {}; } return {{"gl_Layer", Type::Int}}; case 2: - if (IsVertexShader(stage) && !device.HasVertexViewportLayer()) { + if (stage == ShaderType::Vertex && !device.HasVertexViewportLayer()) { return {}; } return {{"gl_ViewportIndex", Type::Int}}; @@ -1049,8 +1234,7 @@ private: return {{fmt::format("gl_ClipDistance[{}]", abuf->GetElement() + 4), Type::Float}}; default: if (IsGenericAttribute(attribute)) { - return { - {GetOutputAttribute(attribute) + GetSwizzle(abuf->GetElement()), Type::Float}}; + return {{GetGenericOutputAttribute(attribute, abuf->GetElement()), Type::Float}}; } UNIMPLEMENTED_MSG("Unhandled output attribute: {}", static_cast<u32>(attribute)); return {}; @@ -1822,16 +2006,19 @@ private: expr += GetSampler(meta->sampler); expr += ", "; - expr += constructors.at(operation.GetOperandsCount() - 1); + expr += constructors.at(operation.GetOperandsCount() + (meta->array ? 
1 : 0) - 1); expr += '('; for (std::size_t i = 0; i < count; ++i) { - expr += VisitOperand(operation, i).AsInt(); - const std::size_t next = i + 1; - if (next == count) - expr += ')'; - else if (next < count) + if (i > 0) { expr += ", "; + } + expr += VisitOperand(operation, i).AsInt(); + } + if (meta->array) { + expr += ", "; + expr += Visit(meta->array).AsInt(); } + expr += ')'; if (meta->lod && !meta->sampler.IsBuffer()) { expr += ", "; @@ -1945,7 +2132,7 @@ private: // TODO(Subv): Figure out how dual-source blending is configured in the Switch. for (u32 component = 0; component < 4; ++component) { if (header.ps.IsColorComponentOutputEnabled(render_target, component)) { - code.AddLine("FragColor{}[{}] = {};", render_target, component, + code.AddLine("frag_color{}{} = {};", render_target, GetColorSwizzle(component), SafeGetRegister(current_reg).AsFloat()); ++current_reg; } @@ -2261,27 +2448,34 @@ private: static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount)); std::string GetRegister(u32 index) const { - return GetDeclarationWithSuffix(index, "gpr"); + return AppendSuffix(index, "gpr"); } std::string GetCustomVariable(u32 index) const { - return GetDeclarationWithSuffix(index, "custom_var"); + return AppendSuffix(index, "custom_var"); } std::string GetPredicate(Tegra::Shader::Pred pred) const { - return GetDeclarationWithSuffix(static_cast<u32>(pred), "pred"); + return AppendSuffix(static_cast<u32>(pred), "pred"); } - std::string GetInputAttribute(Attribute::Index attribute) const { - return GetDeclarationWithSuffix(GetGenericAttributeIndex(attribute), "input_attr"); + std::string GetGenericInputAttribute(Attribute::Index attribute) const { + return AppendSuffix(GetGenericAttributeIndex(attribute), INPUT_ATTRIBUTE_NAME); } - std::string GetOutputAttribute(Attribute::Index attribute) const { - return GetDeclarationWithSuffix(GetGenericAttributeIndex(attribute), "output_attr"); + std::unordered_map<u8, 
GenericVaryingDescription> varying_description; + + std::string GetGenericOutputAttribute(Attribute::Index attribute, std::size_t element) const { + const u8 offset = static_cast<u8>(GetGenericAttributeIndex(attribute) * 4 + element); + const auto& description = varying_description.at(offset); + if (description.is_scalar) { + return description.name; + } + return fmt::format("{}[{}]", description.name, element - description.first_element); } std::string GetConstBuffer(u32 index) const { - return GetDeclarationWithSuffix(index, "cbuf"); + return AppendSuffix(index, "cbuf"); } std::string GetGlobalMemory(const GlobalMemoryBase& descriptor) const { @@ -2294,11 +2488,15 @@ private: } std::string GetConstBufferBlock(u32 index) const { - return GetDeclarationWithSuffix(index, "cbuf_block"); + return AppendSuffix(index, "cbuf_block"); } std::string GetLocalMemory() const { - return "lmem_" + suffix; + if (suffix.empty()) { + return "lmem"; + } else { + return "lmem_" + std::string{suffix}; + } } std::string GetInternalFlag(InternalFlag flag) const { @@ -2307,23 +2505,31 @@ private: const auto index = static_cast<u32>(flag); ASSERT(index < static_cast<u32>(InternalFlag::Amount)); - return fmt::format("{}_{}", InternalFlagNames[index], suffix); + if (suffix.empty()) { + return InternalFlagNames[index]; + } else { + return fmt::format("{}_{}", InternalFlagNames[index], suffix); + } } std::string GetSampler(const Sampler& sampler) const { - return GetDeclarationWithSuffix(static_cast<u32>(sampler.GetIndex()), "sampler"); + return AppendSuffix(static_cast<u32>(sampler.GetIndex()), "sampler"); } std::string GetImage(const Image& image) const { - return GetDeclarationWithSuffix(static_cast<u32>(image.GetIndex()), "image"); + return AppendSuffix(static_cast<u32>(image.GetIndex()), "image"); } - std::string GetDeclarationWithSuffix(u32 index, std::string_view name) const { - return fmt::format("{}_{}_{}", name, index, suffix); + std::string AppendSuffix(u32 index, std::string_view 
name) const { + if (suffix.empty()) { + return fmt::format("{}{}", name, index); + } else { + return fmt::format("{}{}_{}", name, index, suffix); + } } u32 GetNumPhysicalInputAttributes() const { - return IsVertexShader(stage) ? GetNumPhysicalAttributes() : GetNumPhysicalVaryings(); + return stage == ShaderType::Vertex ? GetNumPhysicalAttributes() : GetNumPhysicalVaryings(); } u32 GetNumPhysicalAttributes() const { @@ -2334,17 +2540,31 @@ private: return std::min<u32>(device.GetMaxVaryings(), Maxwell::NumVaryings); } + bool IsRenderTargetEnabled(u32 render_target) const { + for (u32 component = 0; component < 4; ++component) { + if (header.ps.IsColorComponentOutputEnabled(render_target, component)) { + return true; + } + } + return false; + } + const Device& device; const ShaderIR& ir; + const Registry& registry; const ShaderType stage; - const std::string suffix; + const std::string_view identifier; + const std::string_view suffix; const Header header; + std::unordered_map<u8, VaryingTFB> transform_feedback; ShaderWriter code; + + std::optional<u32> max_input_vertices; }; -std::string GetFlowVariable(u32 i) { - return fmt::format("flow_var_{}", i); +std::string GetFlowVariable(u32 index) { + return fmt::format("flow_var{}", index); } class ExprDecompiler { @@ -2531,7 +2751,7 @@ void GLSLDecompiler::DecompileAST() { } // Anonymous namespace -ShaderEntries GetEntries(const VideoCommon::Shader::ShaderIR& ir) { +ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) { ShaderEntries entries; for (const auto& cbuf : ir.GetConstantBuffers()) { entries.const_buffers.emplace_back(cbuf.second.GetMaxOffset(), cbuf.second.IsIndirect(), @@ -2547,33 +2767,20 @@ ShaderEntries GetEntries(const VideoCommon::Shader::ShaderIR& ir) { for (const auto& image : ir.GetImages()) { entries.images.emplace_back(image); } - entries.clip_distances = ir.GetClipDistances(); + const auto clip_distances = ir.GetClipDistances(); + for (std::size_t i = 0; i < std::size(clip_distances); 
++i) { + entries.clip_distances = (clip_distances[i] ? 1U : 0U) << i; + } entries.shader_length = ir.GetLength(); return entries; } -std::string GetCommonDeclarations() { - return R"(#define ftoi floatBitsToInt -#define ftou floatBitsToUint -#define itof intBitsToFloat -#define utof uintBitsToFloat - -bvec2 HalfFloatNanComparison(bvec2 comparison, vec2 pair1, vec2 pair2) { - bvec2 is_nan1 = isnan(pair1); - bvec2 is_nan2 = isnan(pair2); - return bvec2(comparison.x || is_nan1.x || is_nan2.x, comparison.y || is_nan1.y || is_nan2.y); -} - -const float fswzadd_modifiers_a[] = float[4](-1.0f, 1.0f, -1.0f, 0.0f ); -const float fswzadd_modifiers_b[] = float[4](-1.0f, -1.0f, 1.0f, -1.0f ); -)"; -} - -std::string Decompile(const Device& device, const ShaderIR& ir, ShaderType stage, - const std::string& suffix) { - GLSLDecompiler decompiler(device, ir, stage, suffix); +std::string DecompileShader(const Device& device, const ShaderIR& ir, const Registry& registry, + ShaderType stage, std::string_view identifier, + std::string_view suffix) { + GLSLDecompiler decompiler(device, ir, registry, stage, identifier, suffix); decompiler.Decompile(); return decompiler.GetResult(); } -} // namespace OpenGL::GLShader +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h index 7876f48d6..e7dbd810c 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.h +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h @@ -6,22 +6,18 @@ #include <array> #include <string> +#include <string_view> #include <utility> #include <vector> #include "common/common_types.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/engines/shader_type.h" +#include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" -namespace VideoCommon::Shader { -class ShaderIR; -} - namespace OpenGL { -class Device; -} -namespace OpenGL::GLShader { +class Device; using Maxwell = 
Tegra::Engines::Maxwell3D::Regs; using SamplerEntry = VideoCommon::Shader::Sampler; @@ -74,15 +70,15 @@ struct ShaderEntries { std::vector<GlobalMemoryEntry> global_memory_entries; std::vector<SamplerEntry> samplers; std::vector<ImageEntry> images; - std::array<bool, Maxwell::NumClipDistances> clip_distances{}; + u32 clip_distances{}; std::size_t shader_length{}; }; -ShaderEntries GetEntries(const VideoCommon::Shader::ShaderIR& ir); - -std::string GetCommonDeclarations(); +ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir); -std::string Decompile(const Device& device, const VideoCommon::Shader::ShaderIR& ir, - Tegra::Engines::ShaderType stage, const std::string& suffix); +std::string DecompileShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir, + const VideoCommon::Shader::Registry& registry, + Tegra::Engines::ShaderType stage, std::string_view identifier, + std::string_view suffix = {}); -} // namespace OpenGL::GLShader +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp index 1fc204f6f..9e95a122b 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp @@ -31,32 +31,24 @@ namespace { using ShaderCacheVersionHash = std::array<u8, 64>; -enum class TransferableEntryKind : u32 { - Raw, - Usage, -}; - struct ConstBufferKey { - u32 cbuf{}; - u32 offset{}; - u32 value{}; + u32 cbuf = 0; + u32 offset = 0; + u32 value = 0; }; struct BoundSamplerKey { - u32 offset{}; - Tegra::Engines::SamplerDescriptor sampler{}; + u32 offset = 0; + Tegra::Engines::SamplerDescriptor sampler; }; struct BindlessSamplerKey { - u32 cbuf{}; - u32 offset{}; - Tegra::Engines::SamplerDescriptor sampler{}; + u32 cbuf = 0; + u32 offset = 0; + Tegra::Engines::SamplerDescriptor sampler; }; -constexpr u32 NativeVersion = 12; - -// Making sure sizes doesn't change by accident 
-static_assert(sizeof(ProgramVariant) == 20); +constexpr u32 NativeVersion = 20; ShaderCacheVersionHash GetShaderCacheVersionHash() { ShaderCacheVersionHash hash{}; @@ -67,61 +59,124 @@ ShaderCacheVersionHash GetShaderCacheVersionHash() { } // Anonymous namespace -ShaderDiskCacheRaw::ShaderDiskCacheRaw(u64 unique_identifier, ShaderType type, ProgramCode code, - ProgramCode code_b) - : unique_identifier{unique_identifier}, type{type}, code{std::move(code)}, code_b{std::move( - code_b)} {} +ShaderDiskCacheEntry::ShaderDiskCacheEntry() = default; -ShaderDiskCacheRaw::ShaderDiskCacheRaw() = default; +ShaderDiskCacheEntry::~ShaderDiskCacheEntry() = default; -ShaderDiskCacheRaw::~ShaderDiskCacheRaw() = default; - -bool ShaderDiskCacheRaw::Load(FileUtil::IOFile& file) { - if (file.ReadBytes(&unique_identifier, sizeof(u64)) != sizeof(u64) || - file.ReadBytes(&type, sizeof(u32)) != sizeof(u32)) { +bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) { + if (file.ReadBytes(&type, sizeof(u32)) != sizeof(u32)) { return false; } - u32 code_size{}; - u32 code_size_b{}; + u32 code_size; + u32 code_size_b; if (file.ReadBytes(&code_size, sizeof(u32)) != sizeof(u32) || file.ReadBytes(&code_size_b, sizeof(u32)) != sizeof(u32)) { return false; } - code.resize(code_size); code_b.resize(code_size_b); - if (file.ReadArray(code.data(), code_size) != code_size) + if (file.ReadArray(code.data(), code_size) != code_size) { return false; - + } if (HasProgramA() && file.ReadArray(code_b.data(), code_size_b) != code_size_b) { return false; } + + u8 is_texture_handler_size_known; + u32 texture_handler_size_value; + u32 num_keys; + u32 num_bound_samplers; + u32 num_bindless_samplers; + if (file.ReadArray(&unique_identifier, 1) != 1 || file.ReadArray(&bound_buffer, 1) != 1 || + file.ReadArray(&is_texture_handler_size_known, 1) != 1 || + file.ReadArray(&texture_handler_size_value, 1) != 1 || + file.ReadArray(&graphics_info, 1) != 1 || file.ReadArray(&compute_info, 1) != 1 || + 
file.ReadArray(&num_keys, 1) != 1 || file.ReadArray(&num_bound_samplers, 1) != 1 || + file.ReadArray(&num_bindless_samplers, 1) != 1) { + return false; + } + if (is_texture_handler_size_known) { + texture_handler_size = texture_handler_size_value; + } + + std::vector<ConstBufferKey> flat_keys(num_keys); + std::vector<BoundSamplerKey> flat_bound_samplers(num_bound_samplers); + std::vector<BindlessSamplerKey> flat_bindless_samplers(num_bindless_samplers); + if (file.ReadArray(flat_keys.data(), flat_keys.size()) != flat_keys.size() || + file.ReadArray(flat_bound_samplers.data(), flat_bound_samplers.size()) != + flat_bound_samplers.size() || + file.ReadArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) != + flat_bindless_samplers.size()) { + return false; + } + for (const auto& key : flat_keys) { + keys.insert({{key.cbuf, key.offset}, key.value}); + } + for (const auto& key : flat_bound_samplers) { + bound_samplers.emplace(key.offset, key.sampler); + } + for (const auto& key : flat_bindless_samplers) { + bindless_samplers.insert({{key.cbuf, key.offset}, key.sampler}); + } + return true; } -bool ShaderDiskCacheRaw::Save(FileUtil::IOFile& file) const { - if (file.WriteObject(unique_identifier) != 1 || file.WriteObject(static_cast<u32>(type)) != 1 || +bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const { + if (file.WriteObject(static_cast<u32>(type)) != 1 || file.WriteObject(static_cast<u32>(code.size())) != 1 || file.WriteObject(static_cast<u32>(code_b.size())) != 1) { return false; } - - if (file.WriteArray(code.data(), code.size()) != code.size()) + if (file.WriteArray(code.data(), code.size()) != code.size()) { return false; - + } if (HasProgramA() && file.WriteArray(code_b.data(), code_b.size()) != code_b.size()) { return false; } - return true; + + if (file.WriteObject(unique_identifier) != 1 || file.WriteObject(bound_buffer) != 1 || + file.WriteObject(static_cast<u8>(texture_handler_size.has_value())) != 1 || + 
file.WriteObject(texture_handler_size.value_or(0)) != 1 || + file.WriteObject(graphics_info) != 1 || file.WriteObject(compute_info) != 1 || + file.WriteObject(static_cast<u32>(keys.size())) != 1 || + file.WriteObject(static_cast<u32>(bound_samplers.size())) != 1 || + file.WriteObject(static_cast<u32>(bindless_samplers.size())) != 1) { + return false; + } + + std::vector<ConstBufferKey> flat_keys; + flat_keys.reserve(keys.size()); + for (const auto& [address, value] : keys) { + flat_keys.push_back(ConstBufferKey{address.first, address.second, value}); + } + + std::vector<BoundSamplerKey> flat_bound_samplers; + flat_bound_samplers.reserve(bound_samplers.size()); + for (const auto& [address, sampler] : bound_samplers) { + flat_bound_samplers.push_back(BoundSamplerKey{address, sampler}); + } + + std::vector<BindlessSamplerKey> flat_bindless_samplers; + flat_bindless_samplers.reserve(bindless_samplers.size()); + for (const auto& [address, sampler] : bindless_samplers) { + flat_bindless_samplers.push_back( + BindlessSamplerKey{address.first, address.second, sampler}); + } + + return file.WriteArray(flat_keys.data(), flat_keys.size()) == flat_keys.size() && + file.WriteArray(flat_bound_samplers.data(), flat_bound_samplers.size()) == + flat_bound_samplers.size() && + file.WriteArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) == + flat_bindless_samplers.size(); } ShaderDiskCacheOpenGL::ShaderDiskCacheOpenGL(Core::System& system) : system{system} {} ShaderDiskCacheOpenGL::~ShaderDiskCacheOpenGL() = default; -std::optional<std::pair<std::vector<ShaderDiskCacheRaw>, std::vector<ShaderDiskCacheUsage>>> -ShaderDiskCacheOpenGL::LoadTransferable() { +std::optional<std::vector<ShaderDiskCacheEntry>> ShaderDiskCacheOpenGL::LoadTransferable() { // Skip games without title id const bool has_title_id = system.CurrentProcess()->GetTitleID() != 0; if (!Settings::values.use_disk_shader_cache || !has_title_id) { @@ -130,17 +185,14 @@ 
ShaderDiskCacheOpenGL::LoadTransferable() { FileUtil::IOFile file(GetTransferablePath(), "rb"); if (!file.IsOpen()) { - LOG_INFO(Render_OpenGL, "No transferable shader cache found for game with title id={}", - GetTitleID()); + LOG_INFO(Render_OpenGL, "No transferable shader cache found"); is_usable = true; return {}; } u32 version{}; if (file.ReadBytes(&version, sizeof(version)) != sizeof(version)) { - LOG_ERROR(Render_OpenGL, - "Failed to get transferable cache version for title id={}, skipping", - GetTitleID()); + LOG_ERROR(Render_OpenGL, "Failed to get transferable cache version, skipping it"); return {}; } @@ -158,105 +210,42 @@ ShaderDiskCacheOpenGL::LoadTransferable() { } // Version is valid, load the shaders - constexpr const char error_loading[] = "Failed to load transferable raw entry, skipping"; - std::vector<ShaderDiskCacheRaw> raws; - std::vector<ShaderDiskCacheUsage> usages; + std::vector<ShaderDiskCacheEntry> entries; while (file.Tell() < file.GetSize()) { - TransferableEntryKind kind{}; - if (file.ReadBytes(&kind, sizeof(u32)) != sizeof(u32)) { - LOG_ERROR(Render_OpenGL, "Failed to read transferable file, skipping"); - return {}; - } - - switch (kind) { - case TransferableEntryKind::Raw: { - ShaderDiskCacheRaw entry; - if (!entry.Load(file)) { - LOG_ERROR(Render_OpenGL, error_loading); - return {}; - } - transferable.insert({entry.GetUniqueIdentifier(), {}}); - raws.push_back(std::move(entry)); - break; - } - case TransferableEntryKind::Usage: { - ShaderDiskCacheUsage usage; - - u32 num_keys{}; - u32 num_bound_samplers{}; - u32 num_bindless_samplers{}; - if (file.ReadArray(&usage.unique_identifier, 1) != 1 || - file.ReadArray(&usage.variant, 1) != 1 || - file.ReadArray(&usage.bound_buffer, 1) != 1 || file.ReadArray(&num_keys, 1) != 1 || - file.ReadArray(&num_bound_samplers, 1) != 1 || - file.ReadArray(&num_bindless_samplers, 1) != 1) { - LOG_ERROR(Render_OpenGL, error_loading); - return {}; - } - - std::vector<ConstBufferKey> keys(num_keys); - 
std::vector<BoundSamplerKey> bound_samplers(num_bound_samplers); - std::vector<BindlessSamplerKey> bindless_samplers(num_bindless_samplers); - if (file.ReadArray(keys.data(), keys.size()) != keys.size() || - file.ReadArray(bound_samplers.data(), bound_samplers.size()) != - bound_samplers.size() || - file.ReadArray(bindless_samplers.data(), bindless_samplers.size()) != - bindless_samplers.size()) { - LOG_ERROR(Render_OpenGL, error_loading); - return {}; - } - for (const auto& key : keys) { - usage.keys.insert({{key.cbuf, key.offset}, key.value}); - } - for (const auto& key : bound_samplers) { - usage.bound_samplers.emplace(key.offset, key.sampler); - } - for (const auto& key : bindless_samplers) { - usage.bindless_samplers.insert({{key.cbuf, key.offset}, key.sampler}); - } - - usages.push_back(std::move(usage)); - break; - } - default: - LOG_ERROR(Render_OpenGL, "Unknown transferable shader cache entry kind={}, skipping", - static_cast<u32>(kind)); + ShaderDiskCacheEntry& entry = entries.emplace_back(); + if (!entry.Load(file)) { + LOG_ERROR(Render_OpenGL, "Failed to load transferable raw entry, skipping"); return {}; } } is_usable = true; - return {{std::move(raws), std::move(usages)}}; + return {std::move(entries)}; } -std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump> -ShaderDiskCacheOpenGL::LoadPrecompiled() { +std::vector<ShaderDiskCachePrecompiled> ShaderDiskCacheOpenGL::LoadPrecompiled() { if (!is_usable) { return {}; } - std::string path = GetPrecompiledPath(); - FileUtil::IOFile file(path, "rb"); + FileUtil::IOFile file(GetPrecompiledPath(), "rb"); if (!file.IsOpen()) { - LOG_INFO(Render_OpenGL, "No precompiled shader cache found for game with title id={}", - GetTitleID()); + LOG_INFO(Render_OpenGL, "No precompiled shader cache found"); return {}; } - const auto result = LoadPrecompiledFile(file); - if (!result) { - LOG_INFO(Render_OpenGL, - "Failed to load precompiled cache for game with title id={}, removing", - GetTitleID()); - file.Close(); - 
InvalidatePrecompiled(); - return {}; + if (const auto result = LoadPrecompiledFile(file)) { + return *result; } - return *result; + + LOG_INFO(Render_OpenGL, "Failed to load precompiled cache"); + file.Close(); + InvalidatePrecompiled(); + return {}; } -std::optional<std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>> -ShaderDiskCacheOpenGL::LoadPrecompiledFile(FileUtil::IOFile& file) { +std::optional<std::vector<ShaderDiskCachePrecompiled>> ShaderDiskCacheOpenGL::LoadPrecompiledFile( + FileUtil::IOFile& file) { // Read compressed file from disk and decompress to virtual precompiled cache file std::vector<u8> compressed(file.GetSize()); file.ReadBytes(compressed.data(), compressed.size()); @@ -275,58 +264,22 @@ ShaderDiskCacheOpenGL::LoadPrecompiledFile(FileUtil::IOFile& file) { return {}; } - ShaderDumpsMap dumps; + std::vector<ShaderDiskCachePrecompiled> entries; while (precompiled_cache_virtual_file_offset < precompiled_cache_virtual_file.GetSize()) { - u32 num_keys{}; - u32 num_bound_samplers{}; - u32 num_bindless_samplers{}; - ShaderDiskCacheUsage usage; - if (!LoadObjectFromPrecompiled(usage.unique_identifier) || - !LoadObjectFromPrecompiled(usage.variant) || - !LoadObjectFromPrecompiled(usage.bound_buffer) || - !LoadObjectFromPrecompiled(num_keys) || - !LoadObjectFromPrecompiled(num_bound_samplers) || - !LoadObjectFromPrecompiled(num_bindless_samplers)) { - return {}; - } - std::vector<ConstBufferKey> keys(num_keys); - std::vector<BoundSamplerKey> bound_samplers(num_bound_samplers); - std::vector<BindlessSamplerKey> bindless_samplers(num_bindless_samplers); - if (!LoadArrayFromPrecompiled(keys.data(), keys.size()) || - !LoadArrayFromPrecompiled(bound_samplers.data(), bound_samplers.size()) != - bound_samplers.size() || - !LoadArrayFromPrecompiled(bindless_samplers.data(), bindless_samplers.size()) != - bindless_samplers.size()) { - return {}; - } - for (const auto& key : keys) { - usage.keys.insert({{key.cbuf, key.offset}, key.value}); - } - for 
(const auto& key : bound_samplers) { - usage.bound_samplers.emplace(key.offset, key.sampler); - } - for (const auto& key : bindless_samplers) { - usage.bindless_samplers.insert({{key.cbuf, key.offset}, key.sampler}); - } - - ShaderDiskCacheDump dump; - if (!LoadObjectFromPrecompiled(dump.binary_format)) { - return {}; - } - - u32 binary_length{}; - if (!LoadObjectFromPrecompiled(binary_length)) { + u32 binary_size; + auto& entry = entries.emplace_back(); + if (!LoadObjectFromPrecompiled(entry.unique_identifier) || + !LoadObjectFromPrecompiled(entry.binary_format) || + !LoadObjectFromPrecompiled(binary_size)) { return {}; } - dump.binary.resize(binary_length); - if (!LoadArrayFromPrecompiled(dump.binary.data(), dump.binary.size())) { + entry.binary.resize(binary_size); + if (!LoadArrayFromPrecompiled(entry.binary.data(), entry.binary.size())) { return {}; } - - dumps.emplace(std::move(usage), dump); } - return dumps; + return entries; } void ShaderDiskCacheOpenGL::InvalidateTransferable() { @@ -346,13 +299,13 @@ void ShaderDiskCacheOpenGL::InvalidatePrecompiled() { } } -void ShaderDiskCacheOpenGL::SaveRaw(const ShaderDiskCacheRaw& entry) { +void ShaderDiskCacheOpenGL::SaveEntry(const ShaderDiskCacheEntry& entry) { if (!is_usable) { return; } - const u64 id = entry.GetUniqueIdentifier(); - if (transferable.find(id) != transferable.end()) { + const u64 id = entry.unique_identifier; + if (stored_transferable.find(id) != stored_transferable.end()) { // The shader already exists return; } @@ -361,71 +314,17 @@ void ShaderDiskCacheOpenGL::SaveRaw(const ShaderDiskCacheRaw& entry) { if (!file.IsOpen()) { return; } - if (file.WriteObject(TransferableEntryKind::Raw) != 1 || !entry.Save(file)) { + if (!entry.Save(file)) { LOG_ERROR(Render_OpenGL, "Failed to save raw transferable cache entry, removing"); file.Close(); InvalidateTransferable(); return; } - transferable.insert({id, {}}); -} -void ShaderDiskCacheOpenGL::SaveUsage(const ShaderDiskCacheUsage& usage) { - if 
(!is_usable) { - return; - } - - const auto it = transferable.find(usage.unique_identifier); - ASSERT_MSG(it != transferable.end(), "Saving shader usage without storing raw previously"); - - auto& usages{it->second}; - if (usages.find(usage) != usages.end()) { - // Skip this variant since the shader is already stored. - return; - } - usages.insert(usage); - - FileUtil::IOFile file = AppendTransferableFile(); - if (!file.IsOpen()) - return; - const auto Close = [&] { - LOG_ERROR(Render_OpenGL, "Failed to save usage transferable cache entry, removing"); - file.Close(); - InvalidateTransferable(); - }; - - if (file.WriteObject(TransferableEntryKind::Usage) != 1 || - file.WriteObject(usage.unique_identifier) != 1 || file.WriteObject(usage.variant) != 1 || - file.WriteObject(usage.bound_buffer) != 1 || - file.WriteObject(static_cast<u32>(usage.keys.size())) != 1 || - file.WriteObject(static_cast<u32>(usage.bound_samplers.size())) != 1 || - file.WriteObject(static_cast<u32>(usage.bindless_samplers.size())) != 1) { - Close(); - return; - } - for (const auto& [pair, value] : usage.keys) { - const auto [cbuf, offset] = pair; - if (file.WriteObject(ConstBufferKey{cbuf, offset, value}) != 1) { - Close(); - return; - } - } - for (const auto& [offset, sampler] : usage.bound_samplers) { - if (file.WriteObject(BoundSamplerKey{offset, sampler}) != 1) { - Close(); - return; - } - } - for (const auto& [pair, sampler] : usage.bindless_samplers) { - const auto [cbuf, offset] = pair; - if (file.WriteObject(BindlessSamplerKey{cbuf, offset, sampler}) != 1) { - Close(); - return; - } - } + stored_transferable.insert(id); } -void ShaderDiskCacheOpenGL::SaveDump(const ShaderDiskCacheUsage& usage, GLuint program) { +void ShaderDiskCacheOpenGL::SavePrecompiled(u64 unique_identifier, GLuint program) { if (!is_usable) { return; } @@ -437,51 +336,19 @@ void ShaderDiskCacheOpenGL::SaveDump(const ShaderDiskCacheUsage& usage, GLuint p SavePrecompiledHeaderToVirtualPrecompiledCache(); } - GLint 
binary_length{}; + GLint binary_length; glGetProgramiv(program, GL_PROGRAM_BINARY_LENGTH, &binary_length); - GLenum binary_format{}; + GLenum binary_format; std::vector<u8> binary(binary_length); glGetProgramBinary(program, binary_length, nullptr, &binary_format, binary.data()); - const auto Close = [&] { + if (!SaveObjectToPrecompiled(unique_identifier) || !SaveObjectToPrecompiled(binary_format) || + !SaveObjectToPrecompiled(static_cast<u32>(binary.size())) || + !SaveArrayToPrecompiled(binary.data(), binary.size())) { LOG_ERROR(Render_OpenGL, "Failed to save binary program file in shader={:016X}, removing", - usage.unique_identifier); + unique_identifier); InvalidatePrecompiled(); - }; - - if (!SaveObjectToPrecompiled(usage.unique_identifier) || - !SaveObjectToPrecompiled(usage.variant) || !SaveObjectToPrecompiled(usage.bound_buffer) || - !SaveObjectToPrecompiled(static_cast<u32>(usage.keys.size())) || - !SaveObjectToPrecompiled(static_cast<u32>(usage.bound_samplers.size())) || - !SaveObjectToPrecompiled(static_cast<u32>(usage.bindless_samplers.size()))) { - Close(); - return; - } - for (const auto& [pair, value] : usage.keys) { - const auto [cbuf, offset] = pair; - if (SaveObjectToPrecompiled(ConstBufferKey{cbuf, offset, value}) != 1) { - Close(); - return; - } - } - for (const auto& [offset, sampler] : usage.bound_samplers) { - if (SaveObjectToPrecompiled(BoundSamplerKey{offset, sampler}) != 1) { - Close(); - return; - } - } - for (const auto& [pair, sampler] : usage.bindless_samplers) { - const auto [cbuf, offset] = pair; - if (SaveObjectToPrecompiled(BindlessSamplerKey{cbuf, offset, sampler}) != 1) { - Close(); - return; - } - } - if (!SaveObjectToPrecompiled(static_cast<u32>(binary_format)) || - !SaveObjectToPrecompiled(static_cast<u32>(binary_length)) || - !SaveArrayToPrecompiled(binary.data(), binary.size())) { - Close(); } } @@ -534,7 +401,6 @@ void ShaderDiskCacheOpenGL::SaveVirtualPrecompiledFile() { if (file.WriteBytes(compressed.data(), 
compressed.size()) != compressed.size()) { LOG_ERROR(Render_OpenGL, "Failed to write precompiled cache version in path={}", precompiled_path); - return; } } diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.h b/src/video_core/renderer_opengl/gl_shader_disk_cache.h index ef2371f6d..d5be52e40 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h @@ -19,8 +19,7 @@ #include "common/common_types.h" #include "core/file_sys/vfs_vector.h" #include "video_core/engines/shader_type.h" -#include "video_core/renderer_opengl/gl_shader_gen.h" -#include "video_core/shader/const_buffer_locker.h" +#include "video_core/shader/registry.h" namespace Core { class System; @@ -32,139 +31,39 @@ class IOFile; namespace OpenGL { -struct ShaderDiskCacheUsage; -struct ShaderDiskCacheDump; - using ProgramCode = std::vector<u64>; -using ShaderDumpsMap = std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>; - -/// Describes the different variants a program can be compiled with. -struct ProgramVariant final { - ProgramVariant() = default; - - /// Graphics constructor. - explicit constexpr ProgramVariant(GLenum primitive_mode) noexcept - : primitive_mode{primitive_mode} {} - - /// Compute constructor. - explicit constexpr ProgramVariant(u32 block_x, u32 block_y, u32 block_z, u32 shared_memory_size, - u32 local_memory_size) noexcept - : block_x{block_x}, block_y{static_cast<u16>(block_y)}, block_z{static_cast<u16>(block_z)}, - shared_memory_size{shared_memory_size}, local_memory_size{local_memory_size} {} - - // Graphics specific parameters. - GLenum primitive_mode{}; - - // Compute specific parameters. 
- u32 block_x{}; - u16 block_y{}; - u16 block_z{}; - u32 shared_memory_size{}; - u32 local_memory_size{}; - - bool operator==(const ProgramVariant& rhs) const noexcept { - return std::tie(primitive_mode, block_x, block_y, block_z, shared_memory_size, - local_memory_size) == std::tie(rhs.primitive_mode, rhs.block_x, rhs.block_y, - rhs.block_z, rhs.shared_memory_size, - rhs.local_memory_size); - } - - bool operator!=(const ProgramVariant& rhs) const noexcept { - return !operator==(rhs); - } -}; -static_assert(std::is_trivially_copyable_v<ProgramVariant>); - -/// Describes how a shader is used. -struct ShaderDiskCacheUsage { - u64 unique_identifier{}; - ProgramVariant variant; - u32 bound_buffer{}; - VideoCommon::Shader::KeyMap keys; - VideoCommon::Shader::BoundSamplerMap bound_samplers; - VideoCommon::Shader::BindlessSamplerMap bindless_samplers; - - bool operator==(const ShaderDiskCacheUsage& rhs) const { - return std::tie(unique_identifier, variant, keys, bound_samplers, bindless_samplers) == - std::tie(rhs.unique_identifier, rhs.variant, rhs.keys, rhs.bound_samplers, - rhs.bindless_samplers); - } - - bool operator!=(const ShaderDiskCacheUsage& rhs) const { - return !operator==(rhs); - } -}; - -} // namespace OpenGL - -namespace std { - -template <> -struct hash<OpenGL::ProgramVariant> { - std::size_t operator()(const OpenGL::ProgramVariant& variant) const noexcept { - return (static_cast<std::size_t>(variant.primitive_mode) << 6) ^ - static_cast<std::size_t>(variant.block_x) ^ - (static_cast<std::size_t>(variant.block_y) << 32) ^ - (static_cast<std::size_t>(variant.block_z) << 48) ^ - (static_cast<std::size_t>(variant.shared_memory_size) << 16) ^ - (static_cast<std::size_t>(variant.local_memory_size) << 36); - } -}; - -template <> -struct hash<OpenGL::ShaderDiskCacheUsage> { - std::size_t operator()(const OpenGL::ShaderDiskCacheUsage& usage) const noexcept { - return static_cast<std::size_t>(usage.unique_identifier) ^ - 
std::hash<OpenGL::ProgramVariant>{}(usage.variant); - } -}; - -} // namespace std - -namespace OpenGL { -/// Describes a shader how it's used by the guest GPU -class ShaderDiskCacheRaw { -public: - explicit ShaderDiskCacheRaw(u64 unique_identifier, Tegra::Engines::ShaderType type, - ProgramCode code, ProgramCode code_b = {}); - ShaderDiskCacheRaw(); - ~ShaderDiskCacheRaw(); +/// Describes a shader and how it's used by the guest GPU +struct ShaderDiskCacheEntry { + ShaderDiskCacheEntry(); + ~ShaderDiskCacheEntry(); bool Load(FileUtil::IOFile& file); bool Save(FileUtil::IOFile& file) const; - u64 GetUniqueIdentifier() const { - return unique_identifier; - } - bool HasProgramA() const { return !code.empty() && !code_b.empty(); } - Tegra::Engines::ShaderType GetType() const { - return type; - } - - const ProgramCode& GetCode() const { - return code; - } - - const ProgramCode& GetCodeB() const { - return code_b; - } - -private: - u64 unique_identifier{}; Tegra::Engines::ShaderType type{}; ProgramCode code; ProgramCode code_b; + + u64 unique_identifier = 0; + std::optional<u32> texture_handler_size; + u32 bound_buffer = 0; + VideoCommon::Shader::GraphicsInfo graphics_info; + VideoCommon::Shader::ComputeInfo compute_info; + VideoCommon::Shader::KeyMap keys; + VideoCommon::Shader::BoundSamplerMap bound_samplers; + VideoCommon::Shader::BindlessSamplerMap bindless_samplers; }; /// Contains an OpenGL dumped binary program -struct ShaderDiskCacheDump { - GLenum binary_format{}; +struct ShaderDiskCachePrecompiled { + u64 unique_identifier = 0; + GLenum binary_format = 0; std::vector<u8> binary; }; @@ -174,11 +73,10 @@ public: ~ShaderDiskCacheOpenGL(); /// Loads transferable cache. If file has a old version or on failure, it deletes the file. - std::optional<std::pair<std::vector<ShaderDiskCacheRaw>, std::vector<ShaderDiskCacheUsage>>> - LoadTransferable(); + std::optional<std::vector<ShaderDiskCacheEntry>> LoadTransferable(); /// Loads current game's precompiled cache. 
Invalidates on failure. - std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump> LoadPrecompiled(); + std::vector<ShaderDiskCachePrecompiled> LoadPrecompiled(); /// Removes the transferable (and precompiled) cache file. void InvalidateTransferable(); @@ -187,21 +85,18 @@ public: void InvalidatePrecompiled(); /// Saves a raw dump to the transferable file. Checks for collisions. - void SaveRaw(const ShaderDiskCacheRaw& entry); - - /// Saves shader usage to the transferable file. Does not check for collisions. - void SaveUsage(const ShaderDiskCacheUsage& usage); + void SaveEntry(const ShaderDiskCacheEntry& entry); /// Saves a dump entry to the precompiled file. Does not check for collisions. - void SaveDump(const ShaderDiskCacheUsage& usage, GLuint program); + void SavePrecompiled(u64 unique_identifier, GLuint program); /// Serializes virtual precompiled shader cache file to real file void SaveVirtualPrecompiledFile(); private: /// Loads the transferable cache. Returns empty on failure. 
- std::optional<std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>> - LoadPrecompiledFile(FileUtil::IOFile& file); + std::optional<std::vector<ShaderDiskCachePrecompiled>> LoadPrecompiledFile( + FileUtil::IOFile& file); /// Opens current game's transferable file and write it's header if it doesn't exist FileUtil::IOFile AppendTransferableFile() const; @@ -270,7 +165,7 @@ private: std::size_t precompiled_cache_virtual_file_offset = 0; // Stored transferable shaders - std::unordered_map<u64, std::unordered_set<ShaderDiskCacheUsage>> transferable; + std::unordered_set<u64> stored_transferable; // The cache has been loaded at boot bool is_usable{}; diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp deleted file mode 100644 index 34946fb47..000000000 --- a/src/video_core/renderer_opengl/gl_shader_gen.cpp +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. 
- -#include <string> - -#include <fmt/format.h> - -#include "video_core/engines/maxwell_3d.h" -#include "video_core/engines/shader_type.h" -#include "video_core/renderer_opengl/gl_device.h" -#include "video_core/renderer_opengl/gl_shader_decompiler.h" -#include "video_core/renderer_opengl/gl_shader_gen.h" -#include "video_core/shader/shader_ir.h" - -namespace OpenGL::GLShader { - -using Tegra::Engines::Maxwell3D; -using Tegra::Engines::ShaderType; -using VideoCommon::Shader::CompileDepth; -using VideoCommon::Shader::CompilerSettings; -using VideoCommon::Shader::ProgramCode; -using VideoCommon::Shader::ShaderIR; - -std::string GenerateVertexShader(const Device& device, const ShaderIR& ir, const ShaderIR* ir_b) { - std::string out = GetCommonDeclarations(); - out += fmt::format(R"( -layout (std140, binding = {}) uniform vs_config {{ - float y_direction; -}}; - -)", - EmulationUniformBlockBinding); - out += Decompile(device, ir, ShaderType::Vertex, "vertex"); - if (ir_b) { - out += Decompile(device, *ir_b, ShaderType::Vertex, "vertex_b"); - } - - out += R"( -void main() { - gl_Position = vec4(0.0f, 0.0f, 0.0f, 1.0f); - execute_vertex(); -)"; - if (ir_b) { - out += " execute_vertex_b();"; - } - out += "}\n"; - return out; -} - -std::string GenerateGeometryShader(const Device& device, const ShaderIR& ir) { - std::string out = GetCommonDeclarations(); - out += fmt::format(R"( -layout (std140, binding = {}) uniform gs_config {{ - float y_direction; -}}; - -)", - EmulationUniformBlockBinding); - out += Decompile(device, ir, ShaderType::Geometry, "geometry"); - - out += R"( -void main() { - execute_geometry(); -} -)"; - return out; -} - -std::string GenerateFragmentShader(const Device& device, const ShaderIR& ir) { - std::string out = GetCommonDeclarations(); - out += fmt::format(R"( -layout (location = 0) out vec4 FragColor0; -layout (location = 1) out vec4 FragColor1; -layout (location = 2) out vec4 FragColor2; -layout (location = 3) out vec4 FragColor3; -layout (location 
= 4) out vec4 FragColor4; -layout (location = 5) out vec4 FragColor5; -layout (location = 6) out vec4 FragColor6; -layout (location = 7) out vec4 FragColor7; - -layout (std140, binding = {}) uniform fs_config {{ - float y_direction; -}}; - -)", - EmulationUniformBlockBinding); - out += Decompile(device, ir, ShaderType::Fragment, "fragment"); - - out += R"( -void main() { - execute_fragment(); -} -)"; - return out; -} - -std::string GenerateComputeShader(const Device& device, const ShaderIR& ir) { - std::string out = GetCommonDeclarations(); - out += Decompile(device, ir, ShaderType::Compute, "compute"); - out += R"( -void main() { - execute_compute(); -} -)"; - return out; -} - -} // namespace OpenGL::GLShader diff --git a/src/video_core/renderer_opengl/gl_shader_gen.h b/src/video_core/renderer_opengl/gl_shader_gen.h deleted file mode 100644 index cba2be9f9..000000000 --- a/src/video_core/renderer_opengl/gl_shader_gen.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. 
- -#pragma once - -#include <vector> - -#include "common/common_types.h" -#include "video_core/renderer_opengl/gl_shader_decompiler.h" -#include "video_core/shader/shader_ir.h" - -namespace OpenGL { -class Device; -} - -namespace OpenGL::GLShader { - -using VideoCommon::Shader::ProgramCode; -using VideoCommon::Shader::ShaderIR; - -/// Generates the GLSL vertex shader program source code for the given VS program -std::string GenerateVertexShader(const Device& device, const ShaderIR& ir, const ShaderIR* ir_b); - -/// Generates the GLSL geometry shader program source code for the given GS program -std::string GenerateGeometryShader(const Device& device, const ShaderIR& ir); - -/// Generates the GLSL fragment shader program source code for the given FS program -std::string GenerateFragmentShader(const Device& device, const ShaderIR& ir); - -/// Generates the GLSL compute shader program source code for the given CS program -std::string GenerateComputeShader(const Device& device, const ShaderIR& ir); - -} // namespace OpenGL::GLShader diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp index 75d3fac04..9c7b0adbd 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.cpp +++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp @@ -2,45 +2,52 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. 
+#include <glad/glad.h> + #include "common/common_types.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_opengl/gl_shader_manager.h" namespace OpenGL::GLShader { -using Tegra::Engines::Maxwell3D; - -ProgramManager::ProgramManager() { - pipeline.Create(); -} +ProgramManager::ProgramManager() = default; ProgramManager::~ProgramManager() = default; -void ProgramManager::ApplyTo(OpenGLState& state) { - UpdatePipeline(); - state.draw.shader_program = 0; - state.draw.program_pipeline = pipeline.handle; +void ProgramManager::Create() { + graphics_pipeline.Create(); + glBindProgramPipeline(graphics_pipeline.handle); } -void ProgramManager::UpdatePipeline() { +void ProgramManager::BindGraphicsPipeline() { + if (!is_graphics_bound) { + is_graphics_bound = true; + glUseProgram(0); + } + // Avoid updating the pipeline when values have no changed if (old_state == current_state) { return; } // Workaround for AMD bug - constexpr GLenum all_used_stages{GL_VERTEX_SHADER_BIT | GL_GEOMETRY_SHADER_BIT | - GL_FRAGMENT_SHADER_BIT}; - glUseProgramStages(pipeline.handle, all_used_stages, 0); - - glUseProgramStages(pipeline.handle, GL_VERTEX_SHADER_BIT, current_state.vertex_shader); - glUseProgramStages(pipeline.handle, GL_GEOMETRY_SHADER_BIT, current_state.geometry_shader); - glUseProgramStages(pipeline.handle, GL_FRAGMENT_SHADER_BIT, current_state.fragment_shader); + static constexpr GLenum all_used_stages{GL_VERTEX_SHADER_BIT | GL_GEOMETRY_SHADER_BIT | + GL_FRAGMENT_SHADER_BIT}; + const GLuint handle = graphics_pipeline.handle; + glUseProgramStages(handle, all_used_stages, 0); + glUseProgramStages(handle, GL_VERTEX_SHADER_BIT, current_state.vertex_shader); + glUseProgramStages(handle, GL_GEOMETRY_SHADER_BIT, current_state.geometry_shader); + glUseProgramStages(handle, GL_FRAGMENT_SHADER_BIT, current_state.fragment_shader); old_state = current_state; } -void MaxwellUniformData::SetFromRegs(const Maxwell3D& maxwell) { +void 
ProgramManager::BindComputeShader(GLuint program) { + is_graphics_bound = false; + glUseProgram(program); +} + +void MaxwellUniformData::SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell) { const auto& regs = maxwell.regs; // Y_NEGATE controls what value S2R returns for the Y_DIRECTION system value. diff --git a/src/video_core/renderer_opengl/gl_shader_manager.h b/src/video_core/renderer_opengl/gl_shader_manager.h index 478c165ce..d2e47f2a9 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.h +++ b/src/video_core/renderer_opengl/gl_shader_manager.h @@ -9,7 +9,6 @@ #include <glad/glad.h> #include "video_core/renderer_opengl/gl_resource_manager.h" -#include "video_core/renderer_opengl/gl_state.h" #include "video_core/renderer_opengl/maxwell_to_gl.h" namespace OpenGL::GLShader { @@ -32,49 +31,47 @@ public: explicit ProgramManager(); ~ProgramManager(); - void ApplyTo(OpenGLState& state); + void Create(); - void UseProgrammableVertexShader(GLuint program) { + /// Updates the graphics pipeline and binds it. + void BindGraphicsPipeline(); + + /// Binds a compute shader. 
+ void BindComputeShader(GLuint program); + + void UseVertexShader(GLuint program) { current_state.vertex_shader = program; } - void UseProgrammableGeometryShader(GLuint program) { + void UseGeometryShader(GLuint program) { current_state.geometry_shader = program; } - void UseProgrammableFragmentShader(GLuint program) { + void UseFragmentShader(GLuint program) { current_state.fragment_shader = program; } - void UseTrivialGeometryShader() { - current_state.geometry_shader = 0; - } - - void UseTrivialFragmentShader() { - current_state.fragment_shader = 0; - } - private: struct PipelineState { - bool operator==(const PipelineState& rhs) const { + bool operator==(const PipelineState& rhs) const noexcept { return vertex_shader == rhs.vertex_shader && fragment_shader == rhs.fragment_shader && geometry_shader == rhs.geometry_shader; } - bool operator!=(const PipelineState& rhs) const { + bool operator!=(const PipelineState& rhs) const noexcept { return !operator==(rhs); } - GLuint vertex_shader{}; - GLuint fragment_shader{}; - GLuint geometry_shader{}; + GLuint vertex_shader = 0; + GLuint fragment_shader = 0; + GLuint geometry_shader = 0; }; - void UpdatePipeline(); - - OGLPipeline pipeline; + OGLPipeline graphics_pipeline; + OGLPipeline compute_pipeline; PipelineState current_state; PipelineState old_state; + bool is_graphics_bound = true; }; } // namespace OpenGL::GLShader diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp deleted file mode 100644 index 7d3bc1a1f..000000000 --- a/src/video_core/renderer_opengl/gl_state.cpp +++ /dev/null @@ -1,569 +0,0 @@ -// Copyright 2015 Citra Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. 
- -#include <algorithm> -#include <iterator> -#include <glad/glad.h> -#include "common/assert.h" -#include "common/logging/log.h" -#include "common/microprofile.h" -#include "video_core/renderer_opengl/gl_state.h" - -MICROPROFILE_DEFINE(OpenGL_State, "OpenGL", "State Change", MP_RGB(192, 128, 128)); - -namespace OpenGL { - -using Maxwell = Tegra::Engines::Maxwell3D::Regs; - -OpenGLState OpenGLState::cur_state; - -namespace { - -template <typename T> -bool UpdateValue(T& current_value, const T new_value) { - const bool changed = current_value != new_value; - current_value = new_value; - return changed; -} - -template <typename T1, typename T2> -bool UpdateTie(T1 current_value, const T2 new_value) { - const bool changed = current_value != new_value; - current_value = new_value; - return changed; -} - -template <typename T> -std::optional<std::pair<GLuint, GLsizei>> UpdateArray(T& current_values, const T& new_values) { - std::optional<std::size_t> first; - std::size_t last; - for (std::size_t i = 0; i < std::size(current_values); ++i) { - if (!UpdateValue(current_values[i], new_values[i])) { - continue; - } - if (!first) { - first = i; - } - last = i; - } - if (!first) { - return std::nullopt; - } - return std::make_pair(static_cast<GLuint>(*first), static_cast<GLsizei>(last - *first + 1)); -} - -void Enable(GLenum cap, bool enable) { - if (enable) { - glEnable(cap); - } else { - glDisable(cap); - } -} - -void Enable(GLenum cap, GLuint index, bool enable) { - if (enable) { - glEnablei(cap, index); - } else { - glDisablei(cap, index); - } -} - -void Enable(GLenum cap, bool& current_value, bool new_value) { - if (UpdateValue(current_value, new_value)) { - Enable(cap, new_value); - } -} - -void Enable(GLenum cap, GLuint index, bool& current_value, bool new_value) { - if (UpdateValue(current_value, new_value)) { - Enable(cap, index, new_value); - } -} - -} // Anonymous namespace - -OpenGLState::OpenGLState() = default; - -void OpenGLState::SetDefaultViewports() { - 
viewports.fill(Viewport{}); - - depth_clamp.far_plane = false; - depth_clamp.near_plane = false; -} - -void OpenGLState::ApplyFramebufferState() { - if (UpdateValue(cur_state.draw.read_framebuffer, draw.read_framebuffer)) { - glBindFramebuffer(GL_READ_FRAMEBUFFER, draw.read_framebuffer); - } - if (UpdateValue(cur_state.draw.draw_framebuffer, draw.draw_framebuffer)) { - glBindFramebuffer(GL_DRAW_FRAMEBUFFER, draw.draw_framebuffer); - } -} - -void OpenGLState::ApplyVertexArrayState() { - if (UpdateValue(cur_state.draw.vertex_array, draw.vertex_array)) { - glBindVertexArray(draw.vertex_array); - } -} - -void OpenGLState::ApplyShaderProgram() { - if (UpdateValue(cur_state.draw.shader_program, draw.shader_program)) { - glUseProgram(draw.shader_program); - } -} - -void OpenGLState::ApplyProgramPipeline() { - if (UpdateValue(cur_state.draw.program_pipeline, draw.program_pipeline)) { - glBindProgramPipeline(draw.program_pipeline); - } -} - -void OpenGLState::ApplyClipDistances() { - for (std::size_t i = 0; i < clip_distance.size(); ++i) { - Enable(GL_CLIP_DISTANCE0 + static_cast<GLenum>(i), cur_state.clip_distance[i], - clip_distance[i]); - } -} - -void OpenGLState::ApplyPointSize() { - Enable(GL_PROGRAM_POINT_SIZE, cur_state.point.program_control, point.program_control); - Enable(GL_POINT_SPRITE, cur_state.point.sprite, point.sprite); - if (UpdateValue(cur_state.point.size, point.size)) { - glPointSize(point.size); - } -} - -void OpenGLState::ApplyFragmentColorClamp() { - if (UpdateValue(cur_state.fragment_color_clamp.enabled, fragment_color_clamp.enabled)) { - glClampColor(GL_CLAMP_FRAGMENT_COLOR_ARB, - fragment_color_clamp.enabled ? 
GL_TRUE : GL_FALSE); - } -} - -void OpenGLState::ApplyMultisample() { - Enable(GL_SAMPLE_ALPHA_TO_COVERAGE, cur_state.multisample_control.alpha_to_coverage, - multisample_control.alpha_to_coverage); - Enable(GL_SAMPLE_ALPHA_TO_ONE, cur_state.multisample_control.alpha_to_one, - multisample_control.alpha_to_one); -} - -void OpenGLState::ApplyDepthClamp() { - if (depth_clamp.far_plane == cur_state.depth_clamp.far_plane && - depth_clamp.near_plane == cur_state.depth_clamp.near_plane) { - return; - } - cur_state.depth_clamp = depth_clamp; - - UNIMPLEMENTED_IF_MSG(depth_clamp.far_plane != depth_clamp.near_plane, - "Unimplemented Depth Clamp Separation!"); - - Enable(GL_DEPTH_CLAMP, depth_clamp.far_plane || depth_clamp.near_plane); -} - -void OpenGLState::ApplySRgb() { - if (cur_state.framebuffer_srgb.enabled == framebuffer_srgb.enabled) - return; - cur_state.framebuffer_srgb.enabled = framebuffer_srgb.enabled; - if (framebuffer_srgb.enabled) { - glEnable(GL_FRAMEBUFFER_SRGB); - } else { - glDisable(GL_FRAMEBUFFER_SRGB); - } -} - -void OpenGLState::ApplyCulling() { - Enable(GL_CULL_FACE, cur_state.cull.enabled, cull.enabled); - - if (UpdateValue(cur_state.cull.mode, cull.mode)) { - glCullFace(cull.mode); - } - - if (UpdateValue(cur_state.cull.front_face, cull.front_face)) { - glFrontFace(cull.front_face); - } -} - -void OpenGLState::ApplyRasterizerDiscard() { - Enable(GL_RASTERIZER_DISCARD, cur_state.rasterizer_discard, rasterizer_discard); -} - -void OpenGLState::ApplyColorMask() { - if (!dirty.color_mask) { - return; - } - dirty.color_mask = false; - - for (std::size_t i = 0; i < Maxwell::NumRenderTargets; ++i) { - const auto& updated = color_mask[i]; - auto& current = cur_state.color_mask[i]; - if (updated.red_enabled != current.red_enabled || - updated.green_enabled != current.green_enabled || - updated.blue_enabled != current.blue_enabled || - updated.alpha_enabled != current.alpha_enabled) { - current = updated; - glColorMaski(static_cast<GLuint>(i), 
updated.red_enabled, updated.green_enabled, - updated.blue_enabled, updated.alpha_enabled); - } - } -} - -void OpenGLState::ApplyDepth() { - Enable(GL_DEPTH_TEST, cur_state.depth.test_enabled, depth.test_enabled); - - if (cur_state.depth.test_func != depth.test_func) { - cur_state.depth.test_func = depth.test_func; - glDepthFunc(depth.test_func); - } - - if (cur_state.depth.write_mask != depth.write_mask) { - cur_state.depth.write_mask = depth.write_mask; - glDepthMask(depth.write_mask); - } -} - -void OpenGLState::ApplyPrimitiveRestart() { - Enable(GL_PRIMITIVE_RESTART, cur_state.primitive_restart.enabled, primitive_restart.enabled); - - if (cur_state.primitive_restart.index != primitive_restart.index) { - cur_state.primitive_restart.index = primitive_restart.index; - glPrimitiveRestartIndex(primitive_restart.index); - } -} - -void OpenGLState::ApplyStencilTest() { - if (!dirty.stencil_state) { - return; - } - dirty.stencil_state = false; - - Enable(GL_STENCIL_TEST, cur_state.stencil.test_enabled, stencil.test_enabled); - - const auto ConfigStencil = [](GLenum face, const auto& config, auto& current) { - if (current.test_func != config.test_func || current.test_ref != config.test_ref || - current.test_mask != config.test_mask) { - current.test_func = config.test_func; - current.test_ref = config.test_ref; - current.test_mask = config.test_mask; - glStencilFuncSeparate(face, config.test_func, config.test_ref, config.test_mask); - } - if (current.action_depth_fail != config.action_depth_fail || - current.action_depth_pass != config.action_depth_pass || - current.action_stencil_fail != config.action_stencil_fail) { - current.action_depth_fail = config.action_depth_fail; - current.action_depth_pass = config.action_depth_pass; - current.action_stencil_fail = config.action_stencil_fail; - glStencilOpSeparate(face, config.action_stencil_fail, config.action_depth_fail, - config.action_depth_pass); - } - if (current.write_mask != config.write_mask) { - current.write_mask = 
config.write_mask; - glStencilMaskSeparate(face, config.write_mask); - } - }; - ConfigStencil(GL_FRONT, stencil.front, cur_state.stencil.front); - ConfigStencil(GL_BACK, stencil.back, cur_state.stencil.back); -} - -void OpenGLState::ApplyViewport() { - for (GLuint i = 0; i < static_cast<GLuint>(Maxwell::NumViewports); ++i) { - const auto& updated = viewports[i]; - auto& current = cur_state.viewports[i]; - - if (current.x != updated.x || current.y != updated.y || current.width != updated.width || - current.height != updated.height) { - current.x = updated.x; - current.y = updated.y; - current.width = updated.width; - current.height = updated.height; - glViewportIndexedf(i, static_cast<GLfloat>(updated.x), static_cast<GLfloat>(updated.y), - static_cast<GLfloat>(updated.width), - static_cast<GLfloat>(updated.height)); - } - if (current.depth_range_near != updated.depth_range_near || - current.depth_range_far != updated.depth_range_far) { - current.depth_range_near = updated.depth_range_near; - current.depth_range_far = updated.depth_range_far; - glDepthRangeIndexed(i, updated.depth_range_near, updated.depth_range_far); - } - - Enable(GL_SCISSOR_TEST, i, current.scissor.enabled, updated.scissor.enabled); - - if (current.scissor.x != updated.scissor.x || current.scissor.y != updated.scissor.y || - current.scissor.width != updated.scissor.width || - current.scissor.height != updated.scissor.height) { - current.scissor.x = updated.scissor.x; - current.scissor.y = updated.scissor.y; - current.scissor.width = updated.scissor.width; - current.scissor.height = updated.scissor.height; - glScissorIndexed(i, updated.scissor.x, updated.scissor.y, updated.scissor.width, - updated.scissor.height); - } - } -} - -void OpenGLState::ApplyGlobalBlending() { - const Blend& updated = blend[0]; - Blend& current = cur_state.blend[0]; - - Enable(GL_BLEND, current.enabled, updated.enabled); - - if (current.src_rgb_func != updated.src_rgb_func || - current.dst_rgb_func != updated.dst_rgb_func 
|| current.src_a_func != updated.src_a_func || - current.dst_a_func != updated.dst_a_func) { - current.src_rgb_func = updated.src_rgb_func; - current.dst_rgb_func = updated.dst_rgb_func; - current.src_a_func = updated.src_a_func; - current.dst_a_func = updated.dst_a_func; - glBlendFuncSeparate(updated.src_rgb_func, updated.dst_rgb_func, updated.src_a_func, - updated.dst_a_func); - } - - if (current.rgb_equation != updated.rgb_equation || current.a_equation != updated.a_equation) { - current.rgb_equation = updated.rgb_equation; - current.a_equation = updated.a_equation; - glBlendEquationSeparate(updated.rgb_equation, updated.a_equation); - } -} - -void OpenGLState::ApplyTargetBlending(std::size_t target, bool force) { - const Blend& updated = blend[target]; - Blend& current = cur_state.blend[target]; - - if (current.enabled != updated.enabled || force) { - current.enabled = updated.enabled; - Enable(GL_BLEND, static_cast<GLuint>(target), updated.enabled); - } - - if (UpdateTie(std::tie(current.src_rgb_func, current.dst_rgb_func, current.src_a_func, - current.dst_a_func), - std::tie(updated.src_rgb_func, updated.dst_rgb_func, updated.src_a_func, - updated.dst_a_func))) { - glBlendFuncSeparatei(static_cast<GLuint>(target), updated.src_rgb_func, - updated.dst_rgb_func, updated.src_a_func, updated.dst_a_func); - } - - if (UpdateTie(std::tie(current.rgb_equation, current.a_equation), - std::tie(updated.rgb_equation, updated.a_equation))) { - glBlendEquationSeparatei(static_cast<GLuint>(target), updated.rgb_equation, - updated.a_equation); - } -} - -void OpenGLState::ApplyBlending() { - if (!dirty.blend_state) { - return; - } - dirty.blend_state = false; - - if (independant_blend.enabled) { - const bool force = independant_blend.enabled != cur_state.independant_blend.enabled; - for (std::size_t target = 0; target < Maxwell::NumRenderTargets; ++target) { - ApplyTargetBlending(target, force); - } - } else { - ApplyGlobalBlending(); - } - cur_state.independant_blend.enabled 
= independant_blend.enabled; - - if (UpdateTie( - std::tie(cur_state.blend_color.red, cur_state.blend_color.green, - cur_state.blend_color.blue, cur_state.blend_color.alpha), - std::tie(blend_color.red, blend_color.green, blend_color.blue, blend_color.alpha))) { - glBlendColor(blend_color.red, blend_color.green, blend_color.blue, blend_color.alpha); - } -} - -void OpenGLState::ApplyLogicOp() { - Enable(GL_COLOR_LOGIC_OP, cur_state.logic_op.enabled, logic_op.enabled); - - if (UpdateValue(cur_state.logic_op.operation, logic_op.operation)) { - glLogicOp(logic_op.operation); - } -} - -void OpenGLState::ApplyPolygonOffset() { - if (!dirty.polygon_offset) { - return; - } - dirty.polygon_offset = false; - - Enable(GL_POLYGON_OFFSET_FILL, cur_state.polygon_offset.fill_enable, - polygon_offset.fill_enable); - Enable(GL_POLYGON_OFFSET_LINE, cur_state.polygon_offset.line_enable, - polygon_offset.line_enable); - Enable(GL_POLYGON_OFFSET_POINT, cur_state.polygon_offset.point_enable, - polygon_offset.point_enable); - - if (UpdateTie(std::tie(cur_state.polygon_offset.factor, cur_state.polygon_offset.units, - cur_state.polygon_offset.clamp), - std::tie(polygon_offset.factor, polygon_offset.units, polygon_offset.clamp))) { - if (GLAD_GL_EXT_polygon_offset_clamp && polygon_offset.clamp != 0) { - glPolygonOffsetClamp(polygon_offset.factor, polygon_offset.units, polygon_offset.clamp); - } else { - UNIMPLEMENTED_IF_MSG(polygon_offset.clamp != 0, - "Unimplemented Depth polygon offset clamp."); - glPolygonOffset(polygon_offset.factor, polygon_offset.units); - } - } -} - -void OpenGLState::ApplyAlphaTest() { - Enable(GL_ALPHA_TEST, cur_state.alpha_test.enabled, alpha_test.enabled); - if (UpdateTie(std::tie(cur_state.alpha_test.func, cur_state.alpha_test.ref), - std::tie(alpha_test.func, alpha_test.ref))) { - glAlphaFunc(alpha_test.func, alpha_test.ref); - } -} - -void OpenGLState::ApplyClipControl() { - if (UpdateTie(std::tie(cur_state.clip_control.origin, 
cur_state.clip_control.depth_mode), - std::tie(clip_control.origin, clip_control.depth_mode))) { - glClipControl(clip_control.origin, clip_control.depth_mode); - } -} - -void OpenGLState::ApplyRenderBuffer() { - if (cur_state.renderbuffer != renderbuffer) { - cur_state.renderbuffer = renderbuffer; - glBindRenderbuffer(GL_RENDERBUFFER, renderbuffer); - } -} - -void OpenGLState::ApplyTextures() { - const std::size_t size = std::size(textures); - for (std::size_t i = 0; i < size; ++i) { - if (UpdateValue(cur_state.textures[i], textures[i])) { - // BindTextureUnit doesn't support binding null textures, skip those binds. - // TODO(Rodrigo): Stop using null textures - if (textures[i] != 0) { - glBindTextureUnit(static_cast<GLuint>(i), textures[i]); - } - } - } -} - -void OpenGLState::ApplySamplers() { - const std::size_t size = std::size(samplers); - for (std::size_t i = 0; i < size; ++i) { - if (UpdateValue(cur_state.samplers[i], samplers[i])) { - glBindSampler(static_cast<GLuint>(i), samplers[i]); - } - } -} - -void OpenGLState::ApplyImages() { - if (const auto update = UpdateArray(cur_state.images, images)) { - glBindImageTextures(update->first, update->second, images.data() + update->first); - } -} - -void OpenGLState::Apply() { - MICROPROFILE_SCOPE(OpenGL_State); - ApplyFramebufferState(); - ApplyVertexArrayState(); - ApplyShaderProgram(); - ApplyProgramPipeline(); - ApplyClipDistances(); - ApplyPointSize(); - ApplyFragmentColorClamp(); - ApplyMultisample(); - ApplyRasterizerDiscard(); - ApplyColorMask(); - ApplyDepthClamp(); - ApplyViewport(); - ApplyStencilTest(); - ApplySRgb(); - ApplyCulling(); - ApplyDepth(); - ApplyPrimitiveRestart(); - ApplyBlending(); - ApplyLogicOp(); - ApplyTextures(); - ApplySamplers(); - ApplyImages(); - ApplyPolygonOffset(); - ApplyAlphaTest(); - ApplyClipControl(); - ApplyRenderBuffer(); -} - -void OpenGLState::EmulateViewportWithScissor() { - auto& current = viewports[0]; - if (current.scissor.enabled) { - const GLint left = 
std::max(current.x, current.scissor.x); - const GLint right = - std::max(current.x + current.width, current.scissor.x + current.scissor.width); - const GLint bottom = std::max(current.y, current.scissor.y); - const GLint top = - std::max(current.y + current.height, current.scissor.y + current.scissor.height); - current.scissor.x = std::max(left, 0); - current.scissor.y = std::max(bottom, 0); - current.scissor.width = std::max(right - left, 0); - current.scissor.height = std::max(top - bottom, 0); - } else { - current.scissor.enabled = true; - current.scissor.x = current.x; - current.scissor.y = current.y; - current.scissor.width = current.width; - current.scissor.height = current.height; - } -} - -OpenGLState& OpenGLState::UnbindTexture(GLuint handle) { - for (auto& texture : textures) { - if (texture == handle) { - texture = 0; - } - } - return *this; -} - -OpenGLState& OpenGLState::ResetSampler(GLuint handle) { - for (auto& sampler : samplers) { - if (sampler == handle) { - sampler = 0; - } - } - return *this; -} - -OpenGLState& OpenGLState::ResetProgram(GLuint handle) { - if (draw.shader_program == handle) { - draw.shader_program = 0; - } - return *this; -} - -OpenGLState& OpenGLState::ResetPipeline(GLuint handle) { - if (draw.program_pipeline == handle) { - draw.program_pipeline = 0; - } - return *this; -} - -OpenGLState& OpenGLState::ResetVertexArray(GLuint handle) { - if (draw.vertex_array == handle) { - draw.vertex_array = 0; - } - return *this; -} - -OpenGLState& OpenGLState::ResetFramebuffer(GLuint handle) { - if (draw.read_framebuffer == handle) { - draw.read_framebuffer = 0; - } - if (draw.draw_framebuffer == handle) { - draw.draw_framebuffer = 0; - } - return *this; -} - -OpenGLState& OpenGLState::ResetRenderbuffer(GLuint handle) { - if (renderbuffer == handle) { - renderbuffer = 0; - } - return *this; -} - -} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h deleted file mode 100644 
index bce662f2c..000000000 --- a/src/video_core/renderer_opengl/gl_state.h +++ /dev/null @@ -1,251 +0,0 @@ -// Copyright 2015 Citra Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <array> -#include <type_traits> -#include <glad/glad.h> -#include "video_core/engines/maxwell_3d.h" - -namespace OpenGL { - -class OpenGLState { -public: - struct { - bool enabled = false; // GL_FRAMEBUFFER_SRGB - } framebuffer_srgb; - - struct { - bool alpha_to_coverage = false; // GL_ALPHA_TO_COVERAGE - bool alpha_to_one = false; // GL_ALPHA_TO_ONE - } multisample_control; - - struct { - bool enabled = false; // GL_CLAMP_FRAGMENT_COLOR_ARB - } fragment_color_clamp; - - struct { - bool far_plane = false; - bool near_plane = false; - } depth_clamp; // GL_DEPTH_CLAMP - - struct { - bool enabled = false; // GL_CULL_FACE - GLenum mode = GL_BACK; // GL_CULL_FACE_MODE - GLenum front_face = GL_CCW; // GL_FRONT_FACE - } cull; - - struct { - bool test_enabled = false; // GL_DEPTH_TEST - GLboolean write_mask = GL_TRUE; // GL_DEPTH_WRITEMASK - GLenum test_func = GL_LESS; // GL_DEPTH_FUNC - } depth; - - struct { - bool enabled = false; - GLuint index = 0; - } primitive_restart; // GL_PRIMITIVE_RESTART - - bool rasterizer_discard = false; // GL_RASTERIZER_DISCARD - - struct ColorMask { - GLboolean red_enabled = GL_TRUE; - GLboolean green_enabled = GL_TRUE; - GLboolean blue_enabled = GL_TRUE; - GLboolean alpha_enabled = GL_TRUE; - }; - std::array<ColorMask, Tegra::Engines::Maxwell3D::Regs::NumRenderTargets> - color_mask; // GL_COLOR_WRITEMASK - - struct { - bool test_enabled = false; // GL_STENCIL_TEST - struct { - GLenum test_func = GL_ALWAYS; // GL_STENCIL_FUNC - GLint test_ref = 0; // GL_STENCIL_REF - GLuint test_mask = 0xFFFFFFFF; // GL_STENCIL_VALUE_MASK - GLuint write_mask = 0xFFFFFFFF; // GL_STENCIL_WRITEMASK - GLenum action_stencil_fail = GL_KEEP; // GL_STENCIL_FAIL - GLenum action_depth_fail = GL_KEEP; // 
GL_STENCIL_PASS_DEPTH_FAIL - GLenum action_depth_pass = GL_KEEP; // GL_STENCIL_PASS_DEPTH_PASS - } front, back; - } stencil; - - struct Blend { - bool enabled = false; // GL_BLEND - GLenum rgb_equation = GL_FUNC_ADD; // GL_BLEND_EQUATION_RGB - GLenum a_equation = GL_FUNC_ADD; // GL_BLEND_EQUATION_ALPHA - GLenum src_rgb_func = GL_ONE; // GL_BLEND_SRC_RGB - GLenum dst_rgb_func = GL_ZERO; // GL_BLEND_DST_RGB - GLenum src_a_func = GL_ONE; // GL_BLEND_SRC_ALPHA - GLenum dst_a_func = GL_ZERO; // GL_BLEND_DST_ALPHA - }; - std::array<Blend, Tegra::Engines::Maxwell3D::Regs::NumRenderTargets> blend; - - struct { - bool enabled = false; - } independant_blend; - - struct { - GLclampf red = 0.0f; - GLclampf green = 0.0f; - GLclampf blue = 0.0f; - GLclampf alpha = 0.0f; - } blend_color; // GL_BLEND_COLOR - - struct { - bool enabled = false; // GL_LOGIC_OP_MODE - GLenum operation = GL_COPY; - } logic_op; - - static constexpr std::size_t NumSamplers = 32 * 5; - static constexpr std::size_t NumImages = 8 * 5; - std::array<GLuint, NumSamplers> textures = {}; - std::array<GLuint, NumSamplers> samplers = {}; - std::array<GLuint, NumImages> images = {}; - - struct { - GLuint read_framebuffer = 0; // GL_READ_FRAMEBUFFER_BINDING - GLuint draw_framebuffer = 0; // GL_DRAW_FRAMEBUFFER_BINDING - GLuint vertex_array = 0; // GL_VERTEX_ARRAY_BINDING - GLuint shader_program = 0; // GL_CURRENT_PROGRAM - GLuint program_pipeline = 0; // GL_PROGRAM_PIPELINE_BINDING - } draw; - - struct Viewport { - GLint x = 0; - GLint y = 0; - GLint width = 0; - GLint height = 0; - GLfloat depth_range_near = 0.0f; // GL_DEPTH_RANGE - GLfloat depth_range_far = 1.0f; // GL_DEPTH_RANGE - struct { - bool enabled = false; // GL_SCISSOR_TEST - GLint x = 0; - GLint y = 0; - GLsizei width = 0; - GLsizei height = 0; - } scissor; - }; - std::array<Viewport, Tegra::Engines::Maxwell3D::Regs::NumViewports> viewports; - - struct { - bool program_control = false; // GL_PROGRAM_POINT_SIZE - bool sprite = false; // GL_POINT_SPRITE 
- GLfloat size = 1.0f; // GL_POINT_SIZE - } point; - - struct { - bool point_enable = false; - bool line_enable = false; - bool fill_enable = false; - GLfloat units = 0.0f; - GLfloat factor = 0.0f; - GLfloat clamp = 0.0f; - } polygon_offset; - - struct { - bool enabled = false; // GL_ALPHA_TEST - GLenum func = GL_ALWAYS; // GL_ALPHA_TEST_FUNC - GLfloat ref = 0.0f; // GL_ALPHA_TEST_REF - } alpha_test; - - std::array<bool, 8> clip_distance = {}; // GL_CLIP_DISTANCE - - struct { - GLenum origin = GL_LOWER_LEFT; - GLenum depth_mode = GL_NEGATIVE_ONE_TO_ONE; - } clip_control; - - GLuint renderbuffer{}; // GL_RENDERBUFFER_BINDING - - OpenGLState(); - - /// Get the currently active OpenGL state - static OpenGLState GetCurState() { - return cur_state; - } - - void SetDefaultViewports(); - /// Apply this state as the current OpenGL state - void Apply(); - - void ApplyFramebufferState(); - void ApplyVertexArrayState(); - void ApplyShaderProgram(); - void ApplyProgramPipeline(); - void ApplyClipDistances(); - void ApplyPointSize(); - void ApplyFragmentColorClamp(); - void ApplyMultisample(); - void ApplySRgb(); - void ApplyCulling(); - void ApplyRasterizerDiscard(); - void ApplyColorMask(); - void ApplyDepth(); - void ApplyPrimitiveRestart(); - void ApplyStencilTest(); - void ApplyViewport(); - void ApplyTargetBlending(std::size_t target, bool force); - void ApplyGlobalBlending(); - void ApplyBlending(); - void ApplyLogicOp(); - void ApplyTextures(); - void ApplySamplers(); - void ApplyImages(); - void ApplyDepthClamp(); - void ApplyPolygonOffset(); - void ApplyAlphaTest(); - void ApplyClipControl(); - void ApplyRenderBuffer(); - - /// Resets any references to the given resource - OpenGLState& UnbindTexture(GLuint handle); - OpenGLState& ResetSampler(GLuint handle); - OpenGLState& ResetProgram(GLuint handle); - OpenGLState& ResetPipeline(GLuint handle); - OpenGLState& ResetVertexArray(GLuint handle); - OpenGLState& ResetFramebuffer(GLuint handle); - OpenGLState& 
ResetRenderbuffer(GLuint handle); - - /// Viewport does not affects glClearBuffer so emulate viewport using scissor test - void EmulateViewportWithScissor(); - - void MarkDirtyBlendState() { - dirty.blend_state = true; - } - - void MarkDirtyStencilState() { - dirty.stencil_state = true; - } - - void MarkDirtyPolygonOffset() { - dirty.polygon_offset = true; - } - - void MarkDirtyColorMask() { - dirty.color_mask = true; - } - - void AllDirty() { - dirty.blend_state = true; - dirty.stencil_state = true; - dirty.polygon_offset = true; - dirty.color_mask = true; - } - -private: - static OpenGLState cur_state; - - struct { - bool blend_state; - bool stencil_state; - bool viewport_state; - bool polygon_offset; - bool color_mask; - } dirty{}; -}; -static_assert(std::is_trivially_copyable_v<OpenGLState>); - -} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_state_tracker.cpp b/src/video_core/renderer_opengl/gl_state_tracker.cpp new file mode 100644 index 000000000..255ac3147 --- /dev/null +++ b/src/video_core/renderer_opengl/gl_state_tracker.cpp @@ -0,0 +1,247 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include <algorithm> +#include <array> +#include <cstddef> + +#include "common/common_types.h" +#include "core/core.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/gpu.h" +#include "video_core/renderer_opengl/gl_state_tracker.h" + +#define OFF(field_name) MAXWELL3D_REG_INDEX(field_name) +#define NUM(field_name) (sizeof(Maxwell3D::Regs::field_name) / sizeof(u32)) + +namespace OpenGL { + +namespace { + +using namespace Dirty; +using namespace VideoCommon::Dirty; +using Tegra::Engines::Maxwell3D; +using Regs = Maxwell3D::Regs; +using Tables = Maxwell3D::DirtyState::Tables; +using Table = Maxwell3D::DirtyState::Table; + +void SetupDirtyColorMasks(Tables& tables) { + tables[0][OFF(color_mask_common)] = ColorMaskCommon; + for (std::size_t rt = 0; rt < Regs::NumRenderTargets; ++rt) { + const std::size_t offset = OFF(color_mask) + rt * NUM(color_mask[0]); + FillBlock(tables[0], offset, NUM(color_mask[0]), ColorMask0 + rt); + } + + FillBlock(tables[1], OFF(color_mask), NUM(color_mask), ColorMasks); +} + +void SetupDirtyVertexArrays(Tables& tables) { + static constexpr std::size_t num_array = 3; + static constexpr std::size_t instance_base_offset = 3; + for (std::size_t i = 0; i < Regs::NumVertexArrays; ++i) { + const std::size_t array_offset = OFF(vertex_array) + i * NUM(vertex_array[0]); + const std::size_t limit_offset = OFF(vertex_array_limit) + i * NUM(vertex_array_limit[0]); + + FillBlock(tables, array_offset, num_array, VertexBuffer0 + i, VertexBuffers); + FillBlock(tables, limit_offset, NUM(vertex_array_limit), VertexBuffer0 + i, VertexBuffers); + + const std::size_t instance_array_offset = array_offset + instance_base_offset; + tables[0][instance_array_offset] = static_cast<u8>(VertexInstance0 + i); + tables[1][instance_array_offset] = VertexInstances; + + const std::size_t instance_offset = OFF(instanced_arrays) + i; + tables[0][instance_offset] = static_cast<u8>(VertexInstance0 + i); + tables[1][instance_offset] = VertexInstances; + } +} + 
+void SetupDirtyVertexFormat(Tables& tables) { + for (std::size_t i = 0; i < Regs::NumVertexAttributes; ++i) { + const std::size_t offset = OFF(vertex_attrib_format) + i * NUM(vertex_attrib_format[0]); + FillBlock(tables[0], offset, NUM(vertex_attrib_format[0]), VertexFormat0 + i); + } + + FillBlock(tables[1], OFF(vertex_attrib_format), Regs::NumVertexAttributes, VertexFormats); +} + +void SetupDirtyViewports(Tables& tables) { + for (std::size_t i = 0; i < Regs::NumViewports; ++i) { + const std::size_t transf_offset = OFF(viewport_transform) + i * NUM(viewport_transform[0]); + const std::size_t viewport_offset = OFF(viewports) + i * NUM(viewports[0]); + + FillBlock(tables[0], transf_offset, NUM(viewport_transform[0]), Viewport0 + i); + FillBlock(tables[0], viewport_offset, NUM(viewports[0]), Viewport0 + i); + } + + FillBlock(tables[1], OFF(viewport_transform), NUM(viewport_transform), Viewports); + FillBlock(tables[1], OFF(viewports), NUM(viewports), Viewports); + + tables[0][OFF(viewport_transform_enabled)] = ViewportTransform; + tables[1][OFF(viewport_transform_enabled)] = Viewports; +} + +void SetupDirtyScissors(Tables& tables) { + for (std::size_t i = 0; i < Regs::NumViewports; ++i) { + const std::size_t offset = OFF(scissor_test) + i * NUM(scissor_test[0]); + FillBlock(tables[0], offset, NUM(scissor_test[0]), Scissor0 + i); + } + FillBlock(tables[1], OFF(scissor_test), NUM(scissor_test), Scissors); +} + +void SetupDirtyShaders(Tables& tables) { + FillBlock(tables[0], OFF(shader_config[0]), NUM(shader_config[0]) * Regs::MaxShaderProgram, + Shaders); +} + +void SetupDirtyPolygonModes(Tables& tables) { + tables[0][OFF(polygon_mode_front)] = PolygonModeFront; + tables[0][OFF(polygon_mode_back)] = PolygonModeBack; + + tables[1][OFF(polygon_mode_front)] = PolygonModes; + tables[1][OFF(polygon_mode_back)] = PolygonModes; + tables[0][OFF(fill_rectangle)] = PolygonModes; +} + +void SetupDirtyDepthTest(Tables& tables) { + auto& table = tables[0]; + 
table[OFF(depth_test_enable)] = DepthTest; + table[OFF(depth_write_enabled)] = DepthMask; + table[OFF(depth_test_func)] = DepthTest; +} + +void SetupDirtyStencilTest(Tables& tables) { + static constexpr std::array offsets = { + OFF(stencil_enable), OFF(stencil_front_func_func), OFF(stencil_front_func_ref), + OFF(stencil_front_func_mask), OFF(stencil_front_op_fail), OFF(stencil_front_op_zfail), + OFF(stencil_front_op_zpass), OFF(stencil_front_mask), OFF(stencil_two_side_enable), + OFF(stencil_back_func_func), OFF(stencil_back_func_ref), OFF(stencil_back_func_mask), + OFF(stencil_back_op_fail), OFF(stencil_back_op_zfail), OFF(stencil_back_op_zpass), + OFF(stencil_back_mask)}; + for (const auto offset : offsets) { + tables[0][offset] = StencilTest; + } +} + +void SetupDirtyAlphaTest(Tables& tables) { + auto& table = tables[0]; + table[OFF(alpha_test_ref)] = AlphaTest; + table[OFF(alpha_test_func)] = AlphaTest; + table[OFF(alpha_test_enabled)] = AlphaTest; +} + +void SetupDirtyBlend(Tables& tables) { + FillBlock(tables[0], OFF(blend_color), NUM(blend_color), BlendColor); + + tables[0][OFF(independent_blend_enable)] = BlendIndependentEnabled; + + for (std::size_t i = 0; i < Regs::NumRenderTargets; ++i) { + const std::size_t offset = OFF(independent_blend) + i * NUM(independent_blend[0]); + FillBlock(tables[0], offset, NUM(independent_blend[0]), BlendState0 + i); + + tables[0][OFF(blend.enable) + i] = static_cast<u8>(BlendState0 + i); + } + FillBlock(tables[1], OFF(independent_blend), NUM(independent_blend), BlendStates); + FillBlock(tables[1], OFF(blend), NUM(blend), BlendStates); +} + +void SetupDirtyPrimitiveRestart(Tables& tables) { + FillBlock(tables[0], OFF(primitive_restart), NUM(primitive_restart), PrimitiveRestart); +} + +void SetupDirtyPolygonOffset(Tables& tables) { + auto& table = tables[0]; + table[OFF(polygon_offset_fill_enable)] = PolygonOffset; + table[OFF(polygon_offset_line_enable)] = PolygonOffset; + table[OFF(polygon_offset_point_enable)] = 
PolygonOffset; + table[OFF(polygon_offset_factor)] = PolygonOffset; + table[OFF(polygon_offset_units)] = PolygonOffset; + table[OFF(polygon_offset_clamp)] = PolygonOffset; +} + +void SetupDirtyMultisampleControl(Tables& tables) { + FillBlock(tables[0], OFF(multisample_control), NUM(multisample_control), MultisampleControl); +} + +void SetupDirtyRasterizeEnable(Tables& tables) { + tables[0][OFF(rasterize_enable)] = RasterizeEnable; +} + +void SetupDirtyFramebufferSRGB(Tables& tables) { + tables[0][OFF(framebuffer_srgb)] = FramebufferSRGB; +} + +void SetupDirtyLogicOp(Tables& tables) { + FillBlock(tables[0], OFF(logic_op), NUM(logic_op), LogicOp); +} + +void SetupDirtyFragmentClampColor(Tables& tables) { + tables[0][OFF(frag_color_clamp)] = FragmentClampColor; +} + +void SetupDirtyPointSize(Tables& tables) { + tables[0][OFF(vp_point_size)] = PointSize; + tables[0][OFF(point_size)] = PointSize; + tables[0][OFF(point_sprite_enable)] = PointSize; +} + +void SetupDirtyClipControl(Tables& tables) { + auto& table = tables[0]; + table[OFF(screen_y_control)] = ClipControl; + table[OFF(depth_mode)] = ClipControl; +} + +void SetupDirtyDepthClampEnabled(Tables& tables) { + tables[0][OFF(view_volume_clip_control)] = DepthClampEnabled; +} + +void SetupDirtyMisc(Tables& tables) { + auto& table = tables[0]; + + table[OFF(clip_distance_enabled)] = ClipDistances; + + table[OFF(front_face)] = FrontFace; + + table[OFF(cull_test_enabled)] = CullTest; + table[OFF(cull_face)] = CullTest; +} + +} // Anonymous namespace + +StateTracker::StateTracker(Core::System& system) : system{system} {} + +void StateTracker::Initialize() { + auto& dirty = system.GPU().Maxwell3D().dirty; + auto& tables = dirty.tables; + SetupDirtyRenderTargets(tables); + SetupDirtyColorMasks(tables); + SetupDirtyViewports(tables); + SetupDirtyScissors(tables); + SetupDirtyVertexArrays(tables); + SetupDirtyVertexFormat(tables); + SetupDirtyShaders(tables); + SetupDirtyPolygonModes(tables); + SetupDirtyDepthTest(tables); + 
SetupDirtyStencilTest(tables); + SetupDirtyAlphaTest(tables); + SetupDirtyBlend(tables); + SetupDirtyPrimitiveRestart(tables); + SetupDirtyPolygonOffset(tables); + SetupDirtyMultisampleControl(tables); + SetupDirtyRasterizeEnable(tables); + SetupDirtyFramebufferSRGB(tables); + SetupDirtyLogicOp(tables); + SetupDirtyFragmentClampColor(tables); + SetupDirtyPointSize(tables); + SetupDirtyClipControl(tables); + SetupDirtyDepthClampEnabled(tables); + SetupDirtyMisc(tables); + + auto& store = dirty.on_write_stores; + store[VertexBuffers] = true; + for (std::size_t i = 0; i < Regs::NumVertexArrays; ++i) { + store[VertexBuffer0 + i] = true; + } +} + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_state_tracker.h b/src/video_core/renderer_opengl/gl_state_tracker.h new file mode 100644 index 000000000..b882d75c3 --- /dev/null +++ b/src/video_core/renderer_opengl/gl_state_tracker.h @@ -0,0 +1,215 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#pragma once + +#include <limits> + +#include <glad/glad.h> + +#include "common/common_types.h" +#include "core/core.h" +#include "video_core/dirty_flags.h" +#include "video_core/engines/maxwell_3d.h" + +namespace Core { +class System; +} + +namespace OpenGL { + +namespace Dirty { + +enum : u8 { + First = VideoCommon::Dirty::LastCommonEntry, + + VertexFormats, + VertexFormat0, + VertexFormat31 = VertexFormat0 + 31, + + VertexBuffers, + VertexBuffer0, + VertexBuffer31 = VertexBuffer0 + 31, + + VertexInstances, + VertexInstance0, + VertexInstance31 = VertexInstance0 + 31, + + ViewportTransform, + Viewports, + Viewport0, + Viewport15 = Viewport0 + 15, + + Scissors, + Scissor0, + Scissor15 = Scissor0 + 15, + + ColorMaskCommon, + ColorMasks, + ColorMask0, + ColorMask7 = ColorMask0 + 7, + + BlendColor, + BlendIndependentEnabled, + BlendStates, + BlendState0, + BlendState7 = BlendState0 + 7, + + Shaders, + ClipDistances, + + PolygonModes, + PolygonModeFront, + PolygonModeBack, + + ColorMask, + FrontFace, + CullTest, + DepthMask, + DepthTest, + StencilTest, + AlphaTest, + PrimitiveRestart, + PolygonOffset, + MultisampleControl, + RasterizeEnable, + FramebufferSRGB, + LogicOp, + FragmentClampColor, + PointSize, + ClipControl, + DepthClampEnabled, + + Last +}; +static_assert(Last <= std::numeric_limits<u8>::max()); + +} // namespace Dirty + +class StateTracker { +public: + explicit StateTracker(Core::System& system); + + void Initialize(); + + void BindIndexBuffer(GLuint new_index_buffer) { + if (index_buffer == new_index_buffer) { + return; + } + index_buffer = new_index_buffer; + glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, new_index_buffer); + } + + void NotifyScreenDrawVertexArray() { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + flags[OpenGL::Dirty::VertexFormats] = true; + flags[OpenGL::Dirty::VertexFormat0 + 0] = true; + flags[OpenGL::Dirty::VertexFormat0 + 1] = true; + + flags[OpenGL::Dirty::VertexBuffers] = true; + flags[OpenGL::Dirty::VertexBuffer0] = true; + 
+ flags[OpenGL::Dirty::VertexInstances] = true; + flags[OpenGL::Dirty::VertexInstance0 + 0] = true; + flags[OpenGL::Dirty::VertexInstance0 + 1] = true; + } + + void NotifyPolygonModes() { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + flags[OpenGL::Dirty::PolygonModes] = true; + flags[OpenGL::Dirty::PolygonModeFront] = true; + flags[OpenGL::Dirty::PolygonModeBack] = true; + } + + void NotifyViewport0() { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + flags[OpenGL::Dirty::Viewports] = true; + flags[OpenGL::Dirty::Viewport0] = true; + } + + void NotifyScissor0() { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + flags[OpenGL::Dirty::Scissors] = true; + flags[OpenGL::Dirty::Scissor0] = true; + } + + void NotifyColorMask0() { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + flags[OpenGL::Dirty::ColorMasks] = true; + flags[OpenGL::Dirty::ColorMask0] = true; + } + + void NotifyBlend0() { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + flags[OpenGL::Dirty::BlendStates] = true; + flags[OpenGL::Dirty::BlendState0] = true; + } + + void NotifyFramebuffer() { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + flags[VideoCommon::Dirty::RenderTargets] = true; + } + + void NotifyFrontFace() { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + flags[OpenGL::Dirty::FrontFace] = true; + } + + void NotifyCullTest() { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + flags[OpenGL::Dirty::CullTest] = true; + } + + void NotifyDepthMask() { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + flags[OpenGL::Dirty::DepthMask] = true; + } + + void NotifyDepthTest() { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + flags[OpenGL::Dirty::DepthTest] = true; + } + + void NotifyStencilTest() { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + flags[OpenGL::Dirty::StencilTest] = true; + } + + void NotifyPolygonOffset() { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + flags[OpenGL::Dirty::PolygonOffset] = true; + } + 
+ void NotifyRasterizeEnable() { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + flags[OpenGL::Dirty::RasterizeEnable] = true; + } + + void NotifyFramebufferSRGB() { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + flags[OpenGL::Dirty::FramebufferSRGB] = true; + } + + void NotifyLogicOp() { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + flags[OpenGL::Dirty::LogicOp] = true; + } + + void NotifyClipControl() { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + flags[OpenGL::Dirty::ClipControl] = true; + } + + void NotifyAlphaTest() { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + flags[OpenGL::Dirty::AlphaTest] = true; + } + +private: + Core::System& system; + + GLuint index_buffer = 0; +}; + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp index 35ba334e4..6ec328c53 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp +++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp @@ -7,7 +7,6 @@ #include "common/alignment.h" #include "common/assert.h" #include "common/microprofile.h" -#include "video_core/renderer_opengl/gl_state.h" #include "video_core/renderer_opengl/gl_stream_buffer.h" MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning", diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index cf934b0d8..f424e3000 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -10,7 +10,7 @@ #include "core/core.h" #include "video_core/morton.h" #include "video_core/renderer_opengl/gl_resource_manager.h" -#include "video_core/renderer_opengl/gl_state.h" +#include "video_core/renderer_opengl/gl_state_tracker.h" #include "video_core/renderer_opengl/gl_texture_cache.h" #include "video_core/renderer_opengl/utils.h" #include "video_core/texture_cache/surface_base.h" @@ -53,6 +53,7 
@@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format {GL_R8UI, GL_RED_INTEGER, GL_UNSIGNED_BYTE, false}, // R8UI {GL_RGBA16F, GL_RGBA, GL_HALF_FLOAT, false}, // RGBA16F {GL_RGBA16, GL_RGBA, GL_UNSIGNED_SHORT, false}, // RGBA16U + {GL_RGBA16_SNORM, GL_RGBA, GL_SHORT, false}, // RGBA16S {GL_RGBA16UI, GL_RGBA_INTEGER, GL_UNSIGNED_SHORT, false}, // RGBA16UI {GL_R11F_G11F_B10F, GL_RGB, GL_UNSIGNED_INT_10F_11F_11F_REV, false}, // R11FG11FB10F {GL_RGBA32UI, GL_RGBA_INTEGER, GL_UNSIGNED_INT, false}, // RGBA32UI @@ -397,6 +398,7 @@ CachedSurfaceView::CachedSurfaceView(CachedSurface& surface, const ViewParams& p const bool is_proxy) : VideoCommon::ViewBase(params), surface{surface}, is_proxy{is_proxy} { target = GetTextureTarget(params.target); + format = GetFormatTuple(surface.GetSurfaceParams().pixel_format).internal_format; if (!is_proxy) { texture_view = CreateTextureView(); } @@ -467,25 +469,20 @@ void CachedSurfaceView::ApplySwizzle(SwizzleSource x_source, SwizzleSource y_sou } OGLTextureView CachedSurfaceView::CreateTextureView() const { - const auto& owner_params = surface.GetSurfaceParams(); OGLTextureView texture_view; texture_view.Create(); - const GLuint handle{texture_view.handle}; - const FormatTuple& tuple{GetFormatTuple(owner_params.pixel_format)}; - - glTextureView(handle, target, surface.texture.handle, tuple.internal_format, params.base_level, + glTextureView(texture_view.handle, target, surface.texture.handle, format, params.base_level, params.num_levels, params.base_layer, params.num_layers); - - ApplyTextureDefaults(owner_params, handle); + ApplyTextureDefaults(surface.GetSurfaceParams(), texture_view.handle); return texture_view; } TextureCacheOpenGL::TextureCacheOpenGL(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - const Device& device) - : TextureCacheBase{system, rasterizer} { + const Device& device, StateTracker& state_tracker) + : TextureCacheBase{system, rasterizer}, state_tracker{state_tracker} 
{ src_framebuffer.Create(); dst_framebuffer.Create(); } @@ -519,25 +516,26 @@ void TextureCacheOpenGL::ImageBlit(View& src_view, View& dst_view, const Tegra::Engines::Fermi2D::Config& copy_config) { const auto& src_params{src_view->GetSurfaceParams()}; const auto& dst_params{dst_view->GetSurfaceParams()}; + UNIMPLEMENTED_IF(src_params.target == SurfaceTarget::Texture3D); + UNIMPLEMENTED_IF(dst_params.target == SurfaceTarget::Texture3D); - OpenGLState prev_state{OpenGLState::GetCurState()}; - SCOPE_EXIT({ - prev_state.AllDirty(); - prev_state.Apply(); - }); - - OpenGLState state; - state.draw.read_framebuffer = src_framebuffer.handle; - state.draw.draw_framebuffer = dst_framebuffer.handle; - state.framebuffer_srgb.enabled = dst_params.srgb_conversion; - state.AllDirty(); - state.Apply(); + state_tracker.NotifyScissor0(); + state_tracker.NotifyFramebuffer(); + state_tracker.NotifyRasterizeEnable(); + state_tracker.NotifyFramebufferSRGB(); - u32 buffers{}; + if (dst_params.srgb_conversion) { + glEnable(GL_FRAMEBUFFER_SRGB); + } else { + glDisable(GL_FRAMEBUFFER_SRGB); + } + glDisable(GL_RASTERIZER_DISCARD); + glDisablei(GL_SCISSOR_TEST, 0); - UNIMPLEMENTED_IF(src_params.target == SurfaceTarget::Texture3D); - UNIMPLEMENTED_IF(dst_params.target == SurfaceTarget::Texture3D); + glBindFramebuffer(GL_READ_FRAMEBUFFER, src_framebuffer.handle); + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, dst_framebuffer.handle); + GLenum buffers = 0; if (src_params.type == SurfaceType::ColorTexture) { src_view->Attach(GL_COLOR_ATTACHMENT0, GL_READ_FRAMEBUFFER); glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index 8e13ab38b..6658c6ffd 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -27,6 +27,7 @@ using VideoCommon::ViewParams; class CachedSurfaceView; class 
CachedSurface; class TextureCacheOpenGL; +class StateTracker; using Surface = std::shared_ptr<CachedSurface>; using View = std::shared_ptr<CachedSurfaceView>; @@ -96,6 +97,10 @@ public: return texture_view.handle; } + GLenum GetFormat() const { + return format; + } + const SurfaceParams& GetSurfaceParams() const { return surface.GetSurfaceParams(); } @@ -113,6 +118,7 @@ private: CachedSurface& surface; GLenum target{}; + GLenum format{}; OGLTextureView texture_view; u32 swizzle{}; @@ -122,7 +128,7 @@ private: class TextureCacheOpenGL final : public TextureCacheBase { public: explicit TextureCacheOpenGL(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - const Device& device); + const Device& device, StateTracker& state_tracker); ~TextureCacheOpenGL(); protected: @@ -139,6 +145,8 @@ protected: private: GLuint FetchPBO(std::size_t buffer_size); + StateTracker& state_tracker; + OGLFramebuffer src_framebuffer; OGLFramebuffer dst_framebuffer; std::unordered_map<u32, OGLBuffer> copy_pbo_cache; diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h index 7ed505628..89f0e04ef 100644 --- a/src/video_core/renderer_opengl/maxwell_to_gl.h +++ b/src/video_core/renderer_opengl/maxwell_to_gl.h @@ -92,8 +92,32 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) { } case Maxwell::VertexAttribute::Type::UnsignedScaled: switch (attrib.size) { + case Maxwell::VertexAttribute::Size::Size_8: case Maxwell::VertexAttribute::Size::Size_8_8: + case Maxwell::VertexAttribute::Size::Size_8_8_8: + case Maxwell::VertexAttribute::Size::Size_8_8_8_8: return GL_UNSIGNED_BYTE; + case Maxwell::VertexAttribute::Size::Size_16: + case Maxwell::VertexAttribute::Size::Size_16_16: + case Maxwell::VertexAttribute::Size::Size_16_16_16: + case Maxwell::VertexAttribute::Size::Size_16_16_16_16: + return GL_UNSIGNED_SHORT; + default: + LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); + return {}; + } + 
case Maxwell::VertexAttribute::Type::SignedScaled: + switch (attrib.size) { + case Maxwell::VertexAttribute::Size::Size_8: + case Maxwell::VertexAttribute::Size::Size_8_8: + case Maxwell::VertexAttribute::Size::Size_8_8_8: + case Maxwell::VertexAttribute::Size::Size_8_8_8_8: + return GL_BYTE; + case Maxwell::VertexAttribute::Size::Size_16: + case Maxwell::VertexAttribute::Size::Size_16_16: + case Maxwell::VertexAttribute::Size::Size_16_16_16: + case Maxwell::VertexAttribute::Size::Size_16_16_16_16: + return GL_SHORT; default: LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); return {}; @@ -401,24 +425,24 @@ inline GLenum StencilOp(Maxwell::StencilOp stencil) { return GL_KEEP; } -inline GLenum FrontFace(Maxwell::Cull::FrontFace front_face) { +inline GLenum FrontFace(Maxwell::FrontFace front_face) { switch (front_face) { - case Maxwell::Cull::FrontFace::ClockWise: + case Maxwell::FrontFace::ClockWise: return GL_CW; - case Maxwell::Cull::FrontFace::CounterClockWise: + case Maxwell::FrontFace::CounterClockWise: return GL_CCW; } LOG_ERROR(Render_OpenGL, "Unimplemented front face cull={}", static_cast<u32>(front_face)); return GL_CCW; } -inline GLenum CullFace(Maxwell::Cull::CullFace cull_face) { +inline GLenum CullFace(Maxwell::CullFace cull_face) { switch (cull_face) { - case Maxwell::Cull::CullFace::Front: + case Maxwell::CullFace::Front: return GL_FRONT; - case Maxwell::Cull::CullFace::Back: + case Maxwell::CullFace::Back: return GL_BACK; - case Maxwell::Cull::CullFace::FrontAndBack: + case Maxwell::CullFace::FrontAndBack: return GL_FRONT_AND_BACK; } LOG_ERROR(Render_OpenGL, "Unimplemented cull face={}", static_cast<u32>(cull_face)); @@ -464,5 +488,18 @@ inline GLenum LogicOp(Maxwell::LogicOperation operation) { return GL_COPY; } +inline GLenum PolygonMode(Maxwell::PolygonMode polygon_mode) { + switch (polygon_mode) { + case Maxwell::PolygonMode::Point: + return GL_POINT; + case Maxwell::PolygonMode::Line: + return GL_LINE; + case 
Maxwell::PolygonMode::Fill: + return GL_FILL; + } + UNREACHABLE_MSG("Invalid polygon mode={}", static_cast<int>(polygon_mode)); + return GL_FILL; +} + } // namespace MaxwellToGL } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index a4340b502..fca5e3ec0 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -5,8 +5,11 @@ #include <algorithm> #include <cstddef> #include <cstdlib> +#include <cstring> #include <memory> + #include <glad/glad.h> + #include "common/assert.h" #include "common/logging/log.h" #include "common/microprofile.h" @@ -20,10 +23,13 @@ #include "core/telemetry_session.h" #include "video_core/morton.h" #include "video_core/renderer_opengl/gl_rasterizer.h" +#include "video_core/renderer_opengl/gl_shader_manager.h" #include "video_core/renderer_opengl/renderer_opengl.h" namespace OpenGL { +namespace { + // If the size of this is too small, it ends up creating a soft cap on FPS as the renderer will have // to wait on available presentation frames. constexpr std::size_t SWAP_CHAIN_SIZE = 3; @@ -40,133 +46,13 @@ struct Frame { bool is_srgb{}; /// Framebuffer is sRGB or RGB }; -/** - * For smooth Vsync rendering, we want to always present the latest frame that the core generates, - * but also make sure that rendering happens at the pace that the frontend dictates. 
This is a - * helper class that the renderer uses to sync frames between the render thread and the presentation - * thread - */ -class FrameMailbox { -public: - std::mutex swap_chain_lock; - std::condition_variable present_cv; - std::array<Frame, SWAP_CHAIN_SIZE> swap_chain{}; - std::queue<Frame*> free_queue; - std::deque<Frame*> present_queue; - Frame* previous_frame{}; - - FrameMailbox() { - for (auto& frame : swap_chain) { - free_queue.push(&frame); - } - } - - ~FrameMailbox() { - // lock the mutex and clear out the present and free_queues and notify any people who are - // blocked to prevent deadlock on shutdown - std::scoped_lock lock{swap_chain_lock}; - std::queue<Frame*>().swap(free_queue); - present_queue.clear(); - present_cv.notify_all(); - } - - void ReloadPresentFrame(Frame* frame, u32 height, u32 width) { - frame->present.Release(); - frame->present.Create(); - GLint previous_draw_fbo{}; - glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &previous_draw_fbo); - glBindFramebuffer(GL_FRAMEBUFFER, frame->present.handle); - glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, - frame->color.handle); - if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) { - LOG_CRITICAL(Render_OpenGL, "Failed to recreate present FBO!"); - } - glBindFramebuffer(GL_DRAW_FRAMEBUFFER, previous_draw_fbo); - frame->color_reloaded = false; - } - - void ReloadRenderFrame(Frame* frame, u32 width, u32 height) { - OpenGLState prev_state = OpenGLState::GetCurState(); - OpenGLState state = OpenGLState::GetCurState(); - - // Recreate the color texture attachment - frame->color.Release(); - frame->color.Create(); - state.renderbuffer = frame->color.handle; - state.Apply(); - glRenderbufferStorage(GL_RENDERBUFFER, frame->is_srgb ? 
GL_SRGB8 : GL_RGB8, width, height); - - // Recreate the FBO for the render target - frame->render.Release(); - frame->render.Create(); - state.draw.read_framebuffer = frame->render.handle; - state.draw.draw_framebuffer = frame->render.handle; - state.Apply(); - glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, - frame->color.handle); - if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) { - LOG_CRITICAL(Render_OpenGL, "Failed to recreate render FBO!"); - } - prev_state.Apply(); - frame->width = width; - frame->height = height; - frame->color_reloaded = true; - } - - Frame* GetRenderFrame() { - std::unique_lock lock{swap_chain_lock}; - - // If theres no free frames, we will reuse the oldest render frame - if (free_queue.empty()) { - auto frame = present_queue.back(); - present_queue.pop_back(); - return frame; - } - - Frame* frame = free_queue.front(); - free_queue.pop(); - return frame; - } - - void ReleaseRenderFrame(Frame* frame) { - std::unique_lock lock{swap_chain_lock}; - present_queue.push_front(frame); - present_cv.notify_one(); - } - - Frame* TryGetPresentFrame(int timeout_ms) { - std::unique_lock lock{swap_chain_lock}; - // wait for new entries in the present_queue - present_cv.wait_for(lock, std::chrono::milliseconds(timeout_ms), - [&] { return !present_queue.empty(); }); - if (present_queue.empty()) { - // timed out waiting for a frame to draw so return the previous frame - return previous_frame; - } - - // free the previous frame and add it back to the free queue - if (previous_frame) { - free_queue.push(previous_frame); - } +constexpr char VERTEX_SHADER[] = R"( +#version 430 core - // the newest entries are pushed to the front of the queue - Frame* frame = present_queue.front(); - present_queue.pop_front(); - // remove all old entries from the present queue and move them back to the free_queue - for (auto f : present_queue) { - free_queue.push(f); - } - present_queue.clear(); - previous_frame = frame; 
- return frame; - } +out gl_PerVertex { + vec4 gl_Position; }; -namespace { - -constexpr char vertex_shader[] = R"( -#version 430 core - layout (location = 0) in vec2 vert_position; layout (location = 1) in vec2 vert_tex_coord; layout (location = 0) out vec2 frag_tex_coord; @@ -187,7 +73,7 @@ void main() { } )"; -constexpr char fragment_shader[] = R"( +constexpr char FRAGMENT_SHADER[] = R"( #version 430 core layout (location = 0) in vec2 frag_tex_coord; @@ -196,7 +82,7 @@ layout (location = 0) out vec4 color; layout (binding = 0) uniform sampler2D color_texture; void main() { - color = texture(color_texture, frag_tex_coord); + color = vec4(texture(color_texture, frag_tex_coord).rgb, 1.0f); } )"; @@ -205,13 +91,31 @@ constexpr GLint TexCoordLocation = 1; constexpr GLint ModelViewMatrixLocation = 0; struct ScreenRectVertex { - constexpr ScreenRectVertex(GLfloat x, GLfloat y, GLfloat u, GLfloat v) - : position{{x, y}}, tex_coord{{u, v}} {} + constexpr ScreenRectVertex(u32 x, u32 y, GLfloat u, GLfloat v) + : position{{static_cast<GLfloat>(x), static_cast<GLfloat>(y)}}, tex_coord{{u, v}} {} std::array<GLfloat, 2> position; std::array<GLfloat, 2> tex_coord; }; +/// Returns true if any debug tool is attached +bool HasDebugTool() { + const bool nsight = std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED"); + if (nsight) { + return true; + } + + GLint num_extensions; + glGetIntegerv(GL_NUM_EXTENSIONS, &num_extensions); + for (GLuint index = 0; index < static_cast<GLuint>(num_extensions); ++index) { + const auto name = reinterpret_cast<const char*>(glGetStringi(GL_EXTENSIONS, index)); + if (!std::strcmp(name, "GL_EXT_debug_tool")) { + return true; + } + } + return false; +} + /** * Defines a 1:1 pixel ortographic projection matrix with (0,0) on the top-left * corner and (width, height) on the lower-bottom. 
@@ -295,6 +199,153 @@ void APIENTRY DebugHandler(GLenum source, GLenum type, GLuint id, GLenum severit } // Anonymous namespace +/** + * For smooth Vsync rendering, we want to always present the latest frame that the core generates, + * but also make sure that rendering happens at the pace that the frontend dictates. This is a + * helper class that the renderer uses to sync frames between the render thread and the presentation + * thread + */ +class FrameMailbox { +public: + std::mutex swap_chain_lock; + std::condition_variable present_cv; + std::array<Frame, SWAP_CHAIN_SIZE> swap_chain{}; + std::queue<Frame*> free_queue; + std::deque<Frame*> present_queue; + Frame* previous_frame{}; + + FrameMailbox() : has_debug_tool{HasDebugTool()} { + for (auto& frame : swap_chain) { + free_queue.push(&frame); + } + } + + ~FrameMailbox() { + // lock the mutex and clear out the present and free_queues and notify any people who are + // blocked to prevent deadlock on shutdown + std::scoped_lock lock{swap_chain_lock}; + std::queue<Frame*>().swap(free_queue); + present_queue.clear(); + present_cv.notify_all(); + } + + void ReloadPresentFrame(Frame* frame, u32 height, u32 width) { + frame->present.Release(); + frame->present.Create(); + GLint previous_draw_fbo{}; + glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &previous_draw_fbo); + glBindFramebuffer(GL_FRAMEBUFFER, frame->present.handle); + glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, + frame->color.handle); + if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) { + LOG_CRITICAL(Render_OpenGL, "Failed to recreate present FBO!"); + } + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, previous_draw_fbo); + frame->color_reloaded = false; + } + + void ReloadRenderFrame(Frame* frame, u32 width, u32 height) { + // Recreate the color texture attachment + frame->color.Release(); + frame->color.Create(); + const GLenum internal_format = frame->is_srgb ? 
GL_SRGB8 : GL_RGB8; + glNamedRenderbufferStorage(frame->color.handle, internal_format, width, height); + + // Recreate the FBO for the render target + frame->render.Release(); + frame->render.Create(); + glBindFramebuffer(GL_FRAMEBUFFER, frame->render.handle); + glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, + frame->color.handle); + if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) { + LOG_CRITICAL(Render_OpenGL, "Failed to recreate render FBO!"); + } + + frame->width = width; + frame->height = height; + frame->color_reloaded = true; + } + + Frame* GetRenderFrame() { + std::unique_lock lock{swap_chain_lock}; + + // If theres no free frames, we will reuse the oldest render frame + if (free_queue.empty()) { + auto frame = present_queue.back(); + present_queue.pop_back(); + return frame; + } + + Frame* frame = free_queue.front(); + free_queue.pop(); + return frame; + } + + void ReleaseRenderFrame(Frame* frame) { + std::unique_lock lock{swap_chain_lock}; + present_queue.push_front(frame); + present_cv.notify_one(); + + DebugNotifyNextFrame(); + } + + Frame* TryGetPresentFrame(int timeout_ms) { + DebugWaitForNextFrame(); + + std::unique_lock lock{swap_chain_lock}; + // wait for new entries in the present_queue + present_cv.wait_for(lock, std::chrono::milliseconds(timeout_ms), + [&] { return !present_queue.empty(); }); + if (present_queue.empty()) { + // timed out waiting for a frame to draw so return the previous frame + return previous_frame; + } + + // free the previous frame and add it back to the free queue + if (previous_frame) { + free_queue.push(previous_frame); + } + + // the newest entries are pushed to the front of the queue + Frame* frame = present_queue.front(); + present_queue.pop_front(); + // remove all old entries from the present queue and move them back to the free_queue + for (auto f : present_queue) { + free_queue.push(f); + } + present_queue.clear(); + previous_frame = frame; + return frame; 
+ } + +private: + std::mutex debug_synch_mutex; + std::condition_variable debug_synch_condition; + std::atomic_int frame_for_debug{}; + const bool has_debug_tool; // When true, using a GPU debugger, so keep frames in lock-step + + /// Signal that a new frame is available (called from GPU thread) + void DebugNotifyNextFrame() { + if (!has_debug_tool) { + return; + } + frame_for_debug++; + std::lock_guard lock{debug_synch_mutex}; + debug_synch_condition.notify_one(); + } + + /// Wait for a new frame to be available (called from presentation thread) + void DebugWaitForNextFrame() { + if (!has_debug_tool) { + return; + } + const int last_frame = frame_for_debug; + std::unique_lock lock{debug_synch_mutex}; + debug_synch_condition.wait(lock, + [this, last_frame] { return frame_for_debug > last_frame; }); + } +}; + RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system) : VideoCore::RendererBase{emu_window}, emu_window{emu_window}, system{system}, frame_mailbox{std::make_unique<FrameMailbox>()} {} @@ -311,11 +362,6 @@ void RendererOpenGL::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { return; } - // Maintain the rasterizer's state as a priority - OpenGLState prev_state = OpenGLState::GetCurState(); - state.AllDirty(); - state.Apply(); - PrepareRendertarget(framebuffer); RenderScreenshot(); @@ -358,8 +404,7 @@ void RendererOpenGL::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { frame->is_srgb = screen_info.display_srgb; frame_mailbox->ReloadRenderFrame(frame, layout.width, layout.height); } - state.draw.draw_framebuffer = frame->render.handle; - state.Apply(); + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, frame->render.handle); DrawScreen(layout); // Create a fence for the frontend to wait on and swap this frame to OffTex frame->render_fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); @@ -368,10 +413,6 @@ void RendererOpenGL::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { m_current_frame++; 
rasterizer->TickFrame(); } - - // Restore the rasterizer state - prev_state.AllDirty(); - prev_state.Apply(); } void RendererOpenGL::PrepareRendertarget(const Tegra::FramebufferConfig* framebuffer) { @@ -442,31 +483,24 @@ void RendererOpenGL::InitOpenGLObjects() { glClearColor(Settings::values.bg_red, Settings::values.bg_green, Settings::values.bg_blue, 0.0f); - // Link shaders and get variable locations - shader.CreateFromSource(vertex_shader, nullptr, fragment_shader); - state.draw.shader_program = shader.handle; - state.AllDirty(); - state.Apply(); + // Create shader programs + OGLShader vertex_shader; + vertex_shader.Create(VERTEX_SHADER, GL_VERTEX_SHADER); + + OGLShader fragment_shader; + fragment_shader.Create(FRAGMENT_SHADER, GL_FRAGMENT_SHADER); + + vertex_program.Create(true, false, vertex_shader.handle); + fragment_program.Create(true, false, fragment_shader.handle); + + // Create program pipeline + program_manager.Create(); // Generate VBO handle for drawing vertex_buffer.Create(); - // Generate VAO - vertex_array.Create(); - state.draw.vertex_array = vertex_array.handle; - // Attach vertex data to VAO glNamedBufferData(vertex_buffer.handle, sizeof(ScreenRectVertex) * 4, nullptr, GL_STREAM_DRAW); - glVertexArrayAttribFormat(vertex_array.handle, PositionLocation, 2, GL_FLOAT, GL_FALSE, - offsetof(ScreenRectVertex, position)); - glVertexArrayAttribFormat(vertex_array.handle, TexCoordLocation, 2, GL_FLOAT, GL_FALSE, - offsetof(ScreenRectVertex, tex_coord)); - glVertexArrayAttribBinding(vertex_array.handle, PositionLocation, 0); - glVertexArrayAttribBinding(vertex_array.handle, TexCoordLocation, 0); - glEnableVertexArrayAttrib(vertex_array.handle, PositionLocation); - glEnableVertexArrayAttrib(vertex_array.handle, TexCoordLocation); - glVertexArrayVertexBuffer(vertex_array.handle, 0, vertex_buffer.handle, 0, - sizeof(ScreenRectVertex)); // Allocate textures for the screen screen_info.texture.resource.Create(GL_TEXTURE_2D); @@ -499,7 +533,8 @@ void 
RendererOpenGL::CreateRasterizer() { if (rasterizer) { return; } - rasterizer = std::make_unique<RasterizerOpenGL>(system, emu_window, screen_info); + rasterizer = std::make_unique<RasterizerOpenGL>(system, emu_window, screen_info, + program_manager, state_tracker); } void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture, @@ -538,8 +573,19 @@ void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture, glTextureStorage2D(texture.resource.handle, 1, internal_format, texture.width, texture.height); } -void RendererOpenGL::DrawScreenTriangles(const ScreenInfo& screen_info, float x, float y, float w, - float h) { +void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { + if (renderer_settings.set_background_color) { + // Update background color before drawing + glClearColor(Settings::values.bg_red, Settings::values.bg_green, Settings::values.bg_blue, + 0.0f); + } + + // Set projection matrix + const std::array ortho_matrix = + MakeOrthographicMatrix(static_cast<float>(layout.width), static_cast<float>(layout.height)); + glProgramUniformMatrix3x2fv(vertex_program.handle, ModelViewMatrixLocation, 1, GL_FALSE, + std::data(ortho_matrix)); + const auto& texcoords = screen_info.display_texcoords; auto left = texcoords.left; auto right = texcoords.right; @@ -571,46 +617,79 @@ void RendererOpenGL::DrawScreenTriangles(const ScreenInfo& screen_info, float x, static_cast<f32>(screen_info.texture.height); } + const auto& screen = layout.screen; const std::array vertices = { - ScreenRectVertex(x, y, texcoords.top * scale_u, left * scale_v), - ScreenRectVertex(x + w, y, texcoords.bottom * scale_u, left * scale_v), - ScreenRectVertex(x, y + h, texcoords.top * scale_u, right * scale_v), - ScreenRectVertex(x + w, y + h, texcoords.bottom * scale_u, right * scale_v), + ScreenRectVertex(screen.left, screen.top, texcoords.top * scale_u, left * scale_v), + ScreenRectVertex(screen.right, screen.top, texcoords.bottom * scale_u, left * scale_v), + 
ScreenRectVertex(screen.left, screen.bottom, texcoords.top * scale_u, right * scale_v), + ScreenRectVertex(screen.right, screen.bottom, texcoords.bottom * scale_u, right * scale_v), }; - - state.textures[0] = screen_info.display_texture; - state.framebuffer_srgb.enabled = screen_info.display_srgb; - state.AllDirty(); - state.Apply(); glNamedBufferSubData(vertex_buffer.handle, 0, sizeof(vertices), std::data(vertices)); - glDrawArrays(GL_TRIANGLE_STRIP, 0, 4); - // Restore default state - state.framebuffer_srgb.enabled = false; - state.textures[0] = 0; - state.AllDirty(); - state.Apply(); -} -void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { - if (renderer_settings.set_background_color) { - // Update background color before drawing - glClearColor(Settings::values.bg_red, Settings::values.bg_green, Settings::values.bg_blue, - 0.0f); + // TODO: Signal state tracker about these changes + state_tracker.NotifyScreenDrawVertexArray(); + state_tracker.NotifyPolygonModes(); + state_tracker.NotifyViewport0(); + state_tracker.NotifyScissor0(); + state_tracker.NotifyColorMask0(); + state_tracker.NotifyBlend0(); + state_tracker.NotifyFramebuffer(); + state_tracker.NotifyFrontFace(); + state_tracker.NotifyCullTest(); + state_tracker.NotifyDepthTest(); + state_tracker.NotifyStencilTest(); + state_tracker.NotifyPolygonOffset(); + state_tracker.NotifyRasterizeEnable(); + state_tracker.NotifyFramebufferSRGB(); + state_tracker.NotifyLogicOp(); + state_tracker.NotifyClipControl(); + state_tracker.NotifyAlphaTest(); + + program_manager.UseVertexShader(vertex_program.handle); + program_manager.UseGeometryShader(0); + program_manager.UseFragmentShader(fragment_program.handle); + program_manager.BindGraphicsPipeline(); + + glEnable(GL_CULL_FACE); + if (screen_info.display_srgb) { + glEnable(GL_FRAMEBUFFER_SRGB); + } else { + glDisable(GL_FRAMEBUFFER_SRGB); } + glDisable(GL_COLOR_LOGIC_OP); + glDisable(GL_DEPTH_TEST); + glDisable(GL_STENCIL_TEST); + 
glDisable(GL_POLYGON_OFFSET_FILL); + glDisable(GL_RASTERIZER_DISCARD); + glDisable(GL_ALPHA_TEST); + glDisablei(GL_BLEND, 0); + glDisablei(GL_SCISSOR_TEST, 0); + glPolygonMode(GL_FRONT_AND_BACK, GL_FILL); + glCullFace(GL_BACK); + glFrontFace(GL_CW); + glColorMaski(0, GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE); + glClipControl(GL_LOWER_LEFT, GL_ZERO_TO_ONE); + glViewportIndexedf(0, 0.0f, 0.0f, static_cast<GLfloat>(layout.width), + static_cast<GLfloat>(layout.height)); + glDepthRangeIndexed(0, 0.0, 0.0); + + glEnableVertexAttribArray(PositionLocation); + glEnableVertexAttribArray(TexCoordLocation); + glVertexAttribDivisor(PositionLocation, 0); + glVertexAttribDivisor(TexCoordLocation, 0); + glVertexAttribFormat(PositionLocation, 2, GL_FLOAT, GL_FALSE, + offsetof(ScreenRectVertex, position)); + glVertexAttribFormat(TexCoordLocation, 2, GL_FLOAT, GL_FALSE, + offsetof(ScreenRectVertex, tex_coord)); + glVertexAttribBinding(PositionLocation, 0); + glVertexAttribBinding(TexCoordLocation, 0); + glBindVertexBuffer(0, vertex_buffer.handle, 0, sizeof(ScreenRectVertex)); + + glBindTextureUnit(0, screen_info.display_texture); + glBindSampler(0, 0); - const auto& screen = layout.screen; - - glViewport(0, 0, layout.width, layout.height); glClear(GL_COLOR_BUFFER_BIT); - - // Set projection matrix - const std::array ortho_matrix = - MakeOrthographicMatrix(static_cast<float>(layout.width), static_cast<float>(layout.height)); - glUniformMatrix3x2fv(ModelViewMatrixLocation, 1, GL_FALSE, ortho_matrix.data()); - - DrawScreenTriangles(screen_info, static_cast<float>(screen.left), - static_cast<float>(screen.top), static_cast<float>(screen.GetWidth()), - static_cast<float>(screen.GetHeight())); + glDrawArrays(GL_TRIANGLE_STRIP, 0, 4); } void RendererOpenGL::TryPresent(int timeout_ms) { @@ -653,13 +732,14 @@ void RendererOpenGL::RenderScreenshot() { return; } + GLint old_read_fb; + GLint old_draw_fb; + glGetIntegerv(GL_READ_FRAMEBUFFER_BINDING, &old_read_fb); + 
glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &old_draw_fb); + // Draw the current frame to the screenshot framebuffer screenshot_framebuffer.Create(); - GLuint old_read_fb = state.draw.read_framebuffer; - GLuint old_draw_fb = state.draw.draw_framebuffer; - state.draw.read_framebuffer = state.draw.draw_framebuffer = screenshot_framebuffer.handle; - state.AllDirty(); - state.Apply(); + glBindFramebuffer(GL_FRAMEBUFFER, screenshot_framebuffer.handle); Layout::FramebufferLayout layout{renderer_settings.screenshot_framebuffer_layout}; @@ -676,12 +756,11 @@ void RendererOpenGL::RenderScreenshot() { renderer_settings.screenshot_bits); screenshot_framebuffer.Release(); - state.draw.read_framebuffer = old_read_fb; - state.draw.draw_framebuffer = old_draw_fb; - state.AllDirty(); - state.Apply(); glDeleteRenderbuffers(1, &renderbuffer); + glBindFramebuffer(GL_READ_FRAMEBUFFER, old_read_fb); + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, old_draw_fb); + renderer_settings.screenshot_complete_callback(); renderer_settings.screenshot_requested = false; } diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h index d45e69cbc..33073ce5b 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.h +++ b/src/video_core/renderer_opengl/renderer_opengl.h @@ -10,7 +10,8 @@ #include "common/math_util.h" #include "video_core/renderer_base.h" #include "video_core/renderer_opengl/gl_resource_manager.h" -#include "video_core/renderer_opengl/gl_state.h" +#include "video_core/renderer_opengl/gl_shader_manager.h" +#include "video_core/renderer_opengl/gl_state_tracker.h" namespace Core { class System; @@ -76,8 +77,6 @@ private: /// Draws the emulated screens to the emulator window. void DrawScreen(const Layout::FramebufferLayout& layout); - void DrawScreenTriangles(const ScreenInfo& screen_info, float x, float y, float w, float h); - void RenderScreenshot(); /// Loads framebuffer from emulated memory into the active OpenGL texture. 
@@ -93,17 +92,20 @@ private: Core::Frontend::EmuWindow& emu_window; Core::System& system; - OpenGLState state; + StateTracker state_tracker{system}; // OpenGL object IDs - OGLVertexArray vertex_array; OGLBuffer vertex_buffer; - OGLProgram shader; + OGLProgram vertex_program; + OGLProgram fragment_program; OGLFramebuffer screenshot_framebuffer; /// Display information for Switch screen ScreenInfo screen_info; + /// Global dummy shader pipeline + GLShader::ProgramManager program_manager; + /// OpenGL framebuffer data std::vector<u8> gl_framebuffer_data; diff --git a/src/video_core/renderer_opengl/utils.cpp b/src/video_core/renderer_opengl/utils.cpp index ac99e6385..b751086fa 100644 --- a/src/video_core/renderer_opengl/utils.cpp +++ b/src/video_core/renderer_opengl/utils.cpp @@ -9,6 +9,7 @@ #include <glad/glad.h> #include "common/common_types.h" +#include "video_core/renderer_opengl/gl_state_tracker.h" #include "video_core/renderer_opengl/utils.h" namespace OpenGL { @@ -20,12 +21,12 @@ struct VertexArrayPushBuffer::Entry { GLsizei stride{}; }; -VertexArrayPushBuffer::VertexArrayPushBuffer() = default; +VertexArrayPushBuffer::VertexArrayPushBuffer(StateTracker& state_tracker) + : state_tracker{state_tracker} {} VertexArrayPushBuffer::~VertexArrayPushBuffer() = default; -void VertexArrayPushBuffer::Setup(GLuint vao_) { - vao = vao_; +void VertexArrayPushBuffer::Setup() { index_buffer = nullptr; vertex_buffers.clear(); } @@ -41,13 +42,11 @@ void VertexArrayPushBuffer::SetVertexBuffer(GLuint binding_index, const GLuint* void VertexArrayPushBuffer::Bind() { if (index_buffer) { - glVertexArrayElementBuffer(vao, *index_buffer); + state_tracker.BindIndexBuffer(*index_buffer); } - // TODO(Rodrigo): Find a way to ARB_multi_bind this for (const auto& entry : vertex_buffers) { - glVertexArrayVertexBuffer(vao, entry.binding_index, *entry.buffer, entry.offset, - entry.stride); + glBindVertexBuffer(entry.binding_index, *entry.buffer, entry.offset, entry.stride); } } diff --git 
a/src/video_core/renderer_opengl/utils.h b/src/video_core/renderer_opengl/utils.h index 3ad7c02d4..47ee3177b 100644 --- a/src/video_core/renderer_opengl/utils.h +++ b/src/video_core/renderer_opengl/utils.h @@ -11,12 +11,14 @@ namespace OpenGL { +class StateTracker; + class VertexArrayPushBuffer final { public: - explicit VertexArrayPushBuffer(); + explicit VertexArrayPushBuffer(StateTracker& state_tracker); ~VertexArrayPushBuffer(); - void Setup(GLuint vao_); + void Setup(); void SetIndexBuffer(const GLuint* buffer); @@ -28,7 +30,8 @@ public: private: struct Entry; - GLuint vao{}; + StateTracker& state_tracker; + const GLuint* index_buffer{}; std::vector<Entry> vertex_buffers; }; diff --git a/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp b/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp index 4e3ff231e..2bb376555 100644 --- a/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp +++ b/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp @@ -112,19 +112,18 @@ constexpr FixedPipelineState::Rasterizer GetRasterizerState(const Maxwell& regs) const auto& clip = regs.view_volume_clip_control; const bool depth_clamp_enabled = clip.depth_clamp_near == 1 || clip.depth_clamp_far == 1; - Maxwell::Cull::FrontFace front_face = regs.cull.front_face; + Maxwell::FrontFace front_face = regs.front_face; if (regs.screen_y_control.triangle_rast_flip != 0 && regs.viewport_transform[0].scale_y > 0.0f) { - if (front_face == Maxwell::Cull::FrontFace::CounterClockWise) - front_face = Maxwell::Cull::FrontFace::ClockWise; - else if (front_face == Maxwell::Cull::FrontFace::ClockWise) - front_face = Maxwell::Cull::FrontFace::CounterClockWise; + if (front_face == Maxwell::FrontFace::CounterClockWise) + front_face = Maxwell::FrontFace::ClockWise; + else if (front_face == Maxwell::FrontFace::ClockWise) + front_face = Maxwell::FrontFace::CounterClockWise; } const bool gl_ndc = regs.depth_mode == Maxwell::DepthMode::MinusOneToOne; - return 
FixedPipelineState::Rasterizer(regs.cull.enabled, depth_bias_enabled, - depth_clamp_enabled, gl_ndc, regs.cull.cull_face, - front_face); + return FixedPipelineState::Rasterizer(regs.cull_test_enabled, depth_bias_enabled, + depth_clamp_enabled, gl_ndc, regs.cull_face, front_face); } } // Anonymous namespace diff --git a/src/video_core/renderer_vulkan/fixed_pipeline_state.h b/src/video_core/renderer_vulkan/fixed_pipeline_state.h index 87056ef37..4c8ba7f90 100644 --- a/src/video_core/renderer_vulkan/fixed_pipeline_state.h +++ b/src/video_core/renderer_vulkan/fixed_pipeline_state.h @@ -171,8 +171,8 @@ struct FixedPipelineState { struct Rasterizer { constexpr Rasterizer(bool cull_enable, bool depth_bias_enable, bool depth_clamp_enable, - bool ndc_minus_one_to_one, Maxwell::Cull::CullFace cull_face, - Maxwell::Cull::FrontFace front_face) + bool ndc_minus_one_to_one, Maxwell::CullFace cull_face, + Maxwell::FrontFace front_face) : cull_enable{cull_enable}, depth_bias_enable{depth_bias_enable}, depth_clamp_enable{depth_clamp_enable}, ndc_minus_one_to_one{ndc_minus_one_to_one}, cull_face{cull_face}, front_face{front_face} {} @@ -182,8 +182,8 @@ struct FixedPipelineState { bool depth_bias_enable; bool depth_clamp_enable; bool ndc_minus_one_to_one; - Maxwell::Cull::CullFace cull_face; - Maxwell::Cull::FrontFace front_face; + Maxwell::CullFace cull_face; + Maxwell::FrontFace front_face; std::size_t Hash() const noexcept; diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp index ef66dd141..f93447610 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp @@ -120,11 +120,12 @@ struct FormatTuple { {vk::Format::eA8B8G8R8UintPack32, Attachable | Storage}, // ABGR8UI {vk::Format::eB5G6R5UnormPack16, {}}, // B5G6R5U {vk::Format::eA2B10G10R10UnormPack32, Attachable | Storage}, // A2B10G10R10U - {vk::Format::eA1R5G5B5UnormPack16, Attachable | Storage}, // A1B5G5R5U 
(flipped with swizzle) + {vk::Format::eA1R5G5B5UnormPack16, Attachable}, // A1B5G5R5U (flipped with swizzle) {vk::Format::eR8Unorm, Attachable | Storage}, // R8U {vk::Format::eR8Uint, Attachable | Storage}, // R8UI {vk::Format::eR16G16B16A16Sfloat, Attachable | Storage}, // RGBA16F {vk::Format::eR16G16B16A16Unorm, Attachable | Storage}, // RGBA16U + {vk::Format::eR16G16B16A16Snorm, Attachable | Storage}, // RGBA16S {vk::Format::eR16G16B16A16Uint, Attachable | Storage}, // RGBA16UI {vk::Format::eB10G11R11UfloatPack32, Attachable | Storage}, // R11FG11FB10F {vk::Format::eR32G32B32A32Uint, Attachable | Storage}, // RGBA32UI @@ -256,6 +257,8 @@ vk::ShaderStageFlagBits ShaderStage(Tegra::Engines::ShaderType stage) { return vk::ShaderStageFlagBits::eGeometry; case Tegra::Engines::ShaderType::Fragment: return vk::ShaderStageFlagBits::eFragment; + case Tegra::Engines::ShaderType::Compute: + return vk::ShaderStageFlagBits::eCompute; } UNIMPLEMENTED_MSG("Unimplemented shader stage={}", static_cast<u32>(stage)); return {}; @@ -331,6 +334,8 @@ vk::Format VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttr return vk::Format::eR16G16B16Unorm; case Maxwell::VertexAttribute::Size::Size_16_16_16_16: return vk::Format::eR16G16B16A16Unorm; + case Maxwell::VertexAttribute::Size::Size_10_10_10_2: + return vk::Format::eA2B10G10R10UnormPack32; default: break; } @@ -364,6 +369,10 @@ vk::Format VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttr return vk::Format::eR8G8B8A8Uint; case Maxwell::VertexAttribute::Size::Size_32: return vk::Format::eR32Uint; + case Maxwell::VertexAttribute::Size::Size_32_32: + return vk::Format::eR32G32Uint; + case Maxwell::VertexAttribute::Size::Size_32_32_32: + return vk::Format::eR32G32B32Uint; case Maxwell::VertexAttribute::Size::Size_32_32_32_32: return vk::Format::eR32G32B32A32Uint; default: @@ -371,8 +380,22 @@ vk::Format VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttr } case 
Maxwell::VertexAttribute::Type::UnsignedScaled: switch (size) { + case Maxwell::VertexAttribute::Size::Size_8: + return vk::Format::eR8Uscaled; case Maxwell::VertexAttribute::Size::Size_8_8: return vk::Format::eR8G8Uscaled; + case Maxwell::VertexAttribute::Size::Size_8_8_8: + return vk::Format::eR8G8B8Uscaled; + case Maxwell::VertexAttribute::Size::Size_8_8_8_8: + return vk::Format::eR8G8B8A8Uscaled; + case Maxwell::VertexAttribute::Size::Size_16: + return vk::Format::eR16Uscaled; + case Maxwell::VertexAttribute::Size::Size_16_16: + return vk::Format::eR16G16Uscaled; + case Maxwell::VertexAttribute::Size::Size_16_16_16: + return vk::Format::eR16G16B16Uscaled; + case Maxwell::VertexAttribute::Size::Size_16_16_16_16: + return vk::Format::eR16G16B16A16Uscaled; default: break; } @@ -572,24 +595,24 @@ vk::BlendFactor BlendFactor(Maxwell::Blend::Factor factor) { return {}; } -vk::FrontFace FrontFace(Maxwell::Cull::FrontFace front_face) { +vk::FrontFace FrontFace(Maxwell::FrontFace front_face) { switch (front_face) { - case Maxwell::Cull::FrontFace::ClockWise: + case Maxwell::FrontFace::ClockWise: return vk::FrontFace::eClockwise; - case Maxwell::Cull::FrontFace::CounterClockWise: + case Maxwell::FrontFace::CounterClockWise: return vk::FrontFace::eCounterClockwise; } UNIMPLEMENTED_MSG("Unimplemented front face={}", static_cast<u32>(front_face)); return {}; } -vk::CullModeFlags CullFace(Maxwell::Cull::CullFace cull_face) { +vk::CullModeFlags CullFace(Maxwell::CullFace cull_face) { switch (cull_face) { - case Maxwell::Cull::CullFace::Front: + case Maxwell::CullFace::Front: return vk::CullModeFlagBits::eFront; - case Maxwell::Cull::CullFace::Back: + case Maxwell::CullFace::Back: return vk::CullModeFlagBits::eBack; - case Maxwell::Cull::CullFace::FrontAndBack: + case Maxwell::CullFace::FrontAndBack: return vk::CullModeFlagBits::eFrontAndBack; } UNIMPLEMENTED_MSG("Unimplemented cull face={}", static_cast<u32>(cull_face)); diff --git 
a/src/video_core/renderer_vulkan/maxwell_to_vk.h b/src/video_core/renderer_vulkan/maxwell_to_vk.h index 7e9678b7b..24f6ab544 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.h +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.h @@ -54,9 +54,9 @@ vk::BlendOp BlendEquation(Maxwell::Blend::Equation equation); vk::BlendFactor BlendFactor(Maxwell::Blend::Factor factor); -vk::FrontFace FrontFace(Maxwell::Cull::FrontFace front_face); +vk::FrontFace FrontFace(Maxwell::FrontFace front_face); -vk::CullModeFlags CullFace(Maxwell::Cull::CullFace cull_face); +vk::CullModeFlags CullFace(Maxwell::CullFace cull_face); vk::ComponentSwizzle SwizzleSource(Tegra::Texture::SwizzleSource swizzle); diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp index ddc62bc97..42bb01418 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp +++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -27,6 +27,7 @@ #include "video_core/renderer_vulkan/vk_rasterizer.h" #include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_scheduler.h" +#include "video_core/renderer_vulkan/vk_state_tracker.h" #include "video_core/renderer_vulkan/vk_swapchain.h" namespace Vulkan { @@ -177,10 +178,13 @@ bool RendererVulkan::Init() { swapchain = std::make_unique<VKSwapchain>(surface, *device); swapchain->Create(framebuffer.width, framebuffer.height, false); - scheduler = std::make_unique<VKScheduler>(*device, *resource_manager); + state_tracker = std::make_unique<StateTracker>(system); + + scheduler = std::make_unique<VKScheduler>(*device, *resource_manager, *state_tracker); rasterizer = std::make_unique<RasterizerVulkan>(system, render_window, screen_info, *device, - *resource_manager, *memory_manager, *scheduler); + *resource_manager, *memory_manager, + *state_tracker, *scheduler); blit_screen = std::make_unique<VKBlitScreen>(system, render_window, *rasterizer, *device, *resource_manager, 
*memory_manager, *swapchain, diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.h b/src/video_core/renderer_vulkan/renderer_vulkan.h index f513397f0..3da08d2e4 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.h +++ b/src/video_core/renderer_vulkan/renderer_vulkan.h @@ -4,8 +4,10 @@ #pragma once +#include <memory> #include <optional> #include <vector> + #include "video_core/renderer_base.h" #include "video_core/renderer_vulkan/declarations.h" @@ -15,6 +17,7 @@ class System; namespace Vulkan { +class StateTracker; class VKBlitScreen; class VKDevice; class VKFence; @@ -61,6 +64,7 @@ private: std::unique_ptr<VKSwapchain> swapchain; std::unique_ptr<VKMemoryManager> memory_manager; std::unique_ptr<VKResourceManager> resource_manager; + std::unique_ptr<StateTracker> state_tracker; std::unique_ptr<VKScheduler> scheduler; std::unique_ptr<VKBlitScreen> blit_screen; }; diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp index 9d5b8de7a..60f57d83e 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp @@ -73,7 +73,7 @@ UniqueDescriptorUpdateTemplate VKComputePipeline::CreateDescriptorUpdateTemplate std::vector<vk::DescriptorUpdateTemplateEntry> template_entries; u32 binding = 0; u32 offset = 0; - FillDescriptorUpdateTemplateEntries(device, entries, binding, offset, template_entries); + FillDescriptorUpdateTemplateEntries(entries, binding, offset, template_entries); if (template_entries.empty()) { // If the shader doesn't use descriptor sets, skip template creation. 
return UniqueDescriptorUpdateTemplate{}; diff --git a/src/video_core/renderer_vulkan/vk_device.cpp b/src/video_core/renderer_vulkan/vk_device.cpp index 886bde3b9..28d2fbc4f 100644 --- a/src/video_core/renderer_vulkan/vk_device.cpp +++ b/src/video_core/renderer_vulkan/vk_device.cpp @@ -107,8 +107,7 @@ bool VKDevice::Create(const vk::DispatchLoaderDynamic& dldi, vk::Instance instan features.occlusionQueryPrecise = true; features.fragmentStoresAndAtomics = true; features.shaderImageGatherExtended = true; - features.shaderStorageImageReadWithoutFormat = - is_shader_storage_img_read_without_format_supported; + features.shaderStorageImageReadWithoutFormat = is_formatless_image_load_supported; features.shaderStorageImageWriteWithoutFormat = true; features.textureCompressionASTC_LDR = is_optimal_astc_supported; @@ -148,6 +147,15 @@ bool VKDevice::Create(const vk::DispatchLoaderDynamic& dldi, vk::Instance instan LOG_INFO(Render_Vulkan, "Device doesn't support uint8 indexes"); } + vk::PhysicalDeviceTransformFeedbackFeaturesEXT transform_feedback; + if (ext_transform_feedback) { + transform_feedback.transformFeedback = true; + transform_feedback.geometryStreams = true; + SetNext(next, transform_feedback); + } else { + LOG_INFO(Render_Vulkan, "Device doesn't support transform feedbacks"); + } + if (!ext_depth_range_unrestricted) { LOG_INFO(Render_Vulkan, "Device doesn't support depth range unrestricted"); } @@ -385,7 +393,7 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami } }; - extensions.reserve(14); + extensions.reserve(15); extensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME); extensions.push_back(VK_KHR_16BIT_STORAGE_EXTENSION_NAME); extensions.push_back(VK_KHR_8BIT_STORAGE_EXTENSION_NAME); @@ -397,18 +405,22 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami [[maybe_unused]] const bool nsight = std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED"); - bool khr_shader_float16_int8{}; - 
bool ext_subgroup_size_control{}; + bool has_khr_shader_float16_int8{}; + bool has_ext_subgroup_size_control{}; + bool has_ext_transform_feedback{}; for (const auto& extension : physical.enumerateDeviceExtensionProperties(nullptr, dldi)) { Test(extension, khr_uniform_buffer_standard_layout, VK_KHR_UNIFORM_BUFFER_STANDARD_LAYOUT_EXTENSION_NAME, true); - Test(extension, khr_shader_float16_int8, VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME, false); + Test(extension, has_khr_shader_float16_int8, VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME, + false); Test(extension, ext_depth_range_unrestricted, VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME, true); Test(extension, ext_index_type_uint8, VK_EXT_INDEX_TYPE_UINT8_EXTENSION_NAME, true); Test(extension, ext_shader_viewport_index_layer, VK_EXT_SHADER_VIEWPORT_INDEX_LAYER_EXTENSION_NAME, true); - Test(extension, ext_subgroup_size_control, VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME, + Test(extension, has_ext_subgroup_size_control, VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME, + false); + Test(extension, has_ext_transform_feedback, VK_EXT_TRANSFORM_FEEDBACK_EXTENSION_NAME, false); if (Settings::values.renderer_debug) { Test(extension, nv_device_diagnostic_checkpoints, @@ -416,13 +428,13 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami } } - if (khr_shader_float16_int8) { + if (has_khr_shader_float16_int8) { is_float16_supported = GetFeatures<vk::PhysicalDeviceFloat16Int8FeaturesKHR>(physical, dldi).shaderFloat16; extensions.push_back(VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME); } - if (ext_subgroup_size_control) { + if (has_ext_subgroup_size_control) { const auto features = GetFeatures<vk::PhysicalDeviceSubgroupSizeControlFeaturesEXT>(physical, dldi); const auto properties = @@ -439,6 +451,20 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami is_warp_potentially_bigger = true; } + if (has_ext_transform_feedback) { + const auto features = + 
GetFeatures<vk::PhysicalDeviceTransformFeedbackFeaturesEXT>(physical, dldi); + const auto properties = + GetProperties<vk::PhysicalDeviceTransformFeedbackPropertiesEXT>(physical, dldi); + + if (features.transformFeedback && features.geometryStreams && + properties.maxTransformFeedbackStreams >= 4 && properties.maxTransformFeedbackBuffers && + properties.transformFeedbackQueries && properties.transformFeedbackDraw) { + extensions.push_back(VK_EXT_TRANSFORM_FEEDBACK_EXTENSION_NAME); + ext_transform_feedback = true; + } + } + return extensions; } @@ -467,8 +493,7 @@ void VKDevice::SetupFamilies(const vk::DispatchLoaderDynamic& dldi, vk::SurfaceK void VKDevice::SetupFeatures(const vk::DispatchLoaderDynamic& dldi) { const auto supported_features{physical.getFeatures(dldi)}; - is_shader_storage_img_read_without_format_supported = - supported_features.shaderStorageImageReadWithoutFormat; + is_formatless_image_load_supported = supported_features.shaderStorageImageReadWithoutFormat; is_optimal_astc_supported = IsOptimalAstcSupported(supported_features, dldi); } @@ -510,6 +535,7 @@ std::unordered_map<vk::Format, vk::FormatProperties> VKDevice::GetFormatProperti vk::Format::eR32G32Sfloat, vk::Format::eR32G32Uint, vk::Format::eR16G16B16A16Uint, + vk::Format::eR16G16B16A16Snorm, vk::Format::eR16G16B16A16Unorm, vk::Format::eR16G16Unorm, vk::Format::eR16G16Snorm, diff --git a/src/video_core/renderer_vulkan/vk_device.h b/src/video_core/renderer_vulkan/vk_device.h index 2c27ad730..6e656517f 100644 --- a/src/video_core/renderer_vulkan/vk_device.h +++ b/src/video_core/renderer_vulkan/vk_device.h @@ -122,11 +122,6 @@ public: return properties.limits.maxPushConstantsSize; } - /// Returns true if Shader storage Image Read Without Format supported. - bool IsShaderStorageImageReadWithoutFormatSupported() const { - return is_shader_storage_img_read_without_format_supported; - } - /// Returns true if ASTC is natively supported. 
bool IsOptimalAstcSupported() const { return is_optimal_astc_supported; @@ -147,6 +142,11 @@ public: return (guest_warp_stages & stage) != vk::ShaderStageFlags{}; } + /// Returns true if formatless image load is supported. + bool IsFormatlessImageLoadSupported() const { + return is_formatless_image_load_supported; + } + /// Returns true if the device supports VK_EXT_scalar_block_layout. bool IsKhrUniformBufferStandardLayoutSupported() const { return khr_uniform_buffer_standard_layout; @@ -167,6 +167,11 @@ public: return ext_shader_viewport_index_layer; } + /// Returns true if the device supports VK_EXT_transform_feedback. + bool IsExtTransformFeedbackSupported() const { + return ext_transform_feedback; + } + /// Returns true if the device supports VK_NV_device_diagnostic_checkpoints. bool IsNvDeviceDiagnosticCheckpoints() const { return nv_device_diagnostic_checkpoints; @@ -214,26 +219,26 @@ private: static std::unordered_map<vk::Format, vk::FormatProperties> GetFormatProperties( const vk::DispatchLoaderDynamic& dldi, vk::PhysicalDevice physical); - const vk::PhysicalDevice physical; ///< Physical device. - vk::DispatchLoaderDynamic dld; ///< Device function pointers. - vk::PhysicalDeviceProperties properties; ///< Device properties. - UniqueDevice logical; ///< Logical device. - vk::Queue graphics_queue; ///< Main graphics queue. - vk::Queue present_queue; ///< Main present queue. - u32 graphics_family{}; ///< Main graphics queue family index. - u32 present_family{}; ///< Main present queue family index. - vk::DriverIdKHR driver_id{}; ///< Driver ID. - vk::ShaderStageFlags guest_warp_stages{}; ///< Stages where the guest warp size can be forced. - bool is_optimal_astc_supported{}; ///< Support for native ASTC. - bool is_float16_supported{}; ///< Support for float16 arithmetics. - bool is_warp_potentially_bigger{}; ///< Host warp size can be bigger than guest. + const vk::PhysicalDevice physical; ///< Physical device. 
+ vk::DispatchLoaderDynamic dld; ///< Device function pointers. + vk::PhysicalDeviceProperties properties; ///< Device properties. + UniqueDevice logical; ///< Logical device. + vk::Queue graphics_queue; ///< Main graphics queue. + vk::Queue present_queue; ///< Main present queue. + u32 graphics_family{}; ///< Main graphics queue family index. + u32 present_family{}; ///< Main present queue family index. + vk::DriverIdKHR driver_id{}; ///< Driver ID. + vk::ShaderStageFlags guest_warp_stages{}; ///< Stages where the guest warp size can be forced. + bool is_optimal_astc_supported{}; ///< Support for native ASTC. + bool is_float16_supported{}; ///< Support for float16 arithmetics. + bool is_warp_potentially_bigger{}; ///< Host warp size can be bigger than guest. + bool is_formatless_image_load_supported{}; ///< Support for shader image read without format. bool khr_uniform_buffer_standard_layout{}; ///< Support for std430 on UBOs. bool ext_index_type_uint8{}; ///< Support for VK_EXT_index_type_uint8. bool ext_depth_range_unrestricted{}; ///< Support for VK_EXT_depth_range_unrestricted. bool ext_shader_viewport_index_layer{}; ///< Support for VK_EXT_shader_viewport_index_layer. + bool ext_transform_feedback{}; ///< Support for VK_EXT_transform_feedback. bool nv_device_diagnostic_checkpoints{}; ///< Support for VK_NV_device_diagnostic_checkpoints. - bool is_shader_storage_img_read_without_format_supported{}; ///< Support for shader storage - ///< image read without format // Telemetry parameters std::string vendor_name; ///< Device's driver name. 
diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index b155dfb49..6a02403c1 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -97,8 +97,7 @@ UniqueDescriptorUpdateTemplate VKGraphicsPipeline::CreateDescriptorUpdateTemplat u32 offset = 0; for (const auto& stage : program) { if (stage) { - FillDescriptorUpdateTemplateEntries(device, stage->entries, binding, offset, - template_entries); + FillDescriptorUpdateTemplateEntries(stage->entries, binding, offset, template_entries); } } if (template_entries.empty()) { diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 7ddf7d3ee..557b9d662 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -36,6 +36,13 @@ using Tegra::Engines::ShaderType; namespace { +// C++20's using enum +constexpr auto eUniformBuffer = vk::DescriptorType::eUniformBuffer; +constexpr auto eStorageBuffer = vk::DescriptorType::eStorageBuffer; +constexpr auto eUniformTexelBuffer = vk::DescriptorType::eUniformTexelBuffer; +constexpr auto eCombinedImageSampler = vk::DescriptorType::eCombinedImageSampler; +constexpr auto eStorageImage = vk::DescriptorType::eStorageImage; + constexpr VideoCommon::Shader::CompilerSettings compiler_settings{ VideoCommon::Shader::CompileDepth::FullDecompile}; @@ -119,23 +126,32 @@ ShaderType GetShaderType(Maxwell::ShaderProgram program) { } } +template <vk::DescriptorType descriptor_type, class Container> +void AddBindings(std::vector<vk::DescriptorSetLayoutBinding>& bindings, u32& binding, + vk::ShaderStageFlags stage_flags, const Container& container) { + const u32 num_entries = static_cast<u32>(std::size(container)); + for (std::size_t i = 0; i < num_entries; ++i) { + u32 count = 1; + if constexpr (descriptor_type == 
eCombinedImageSampler) { + // Combined image samplers can be arrayed. + count = container[i].Size(); + } + bindings.emplace_back(binding++, descriptor_type, count, stage_flags, nullptr); + } +} + u32 FillDescriptorLayout(const ShaderEntries& entries, std::vector<vk::DescriptorSetLayoutBinding>& bindings, Maxwell::ShaderProgram program_type, u32 base_binding) { const ShaderType stage = GetStageFromProgram(program_type); - const vk::ShaderStageFlags stage_flags = MaxwellToVK::ShaderStage(stage); + const vk::ShaderStageFlags flags = MaxwellToVK::ShaderStage(stage); u32 binding = base_binding; - const auto AddBindings = [&](vk::DescriptorType descriptor_type, std::size_t num_entries) { - for (std::size_t i = 0; i < num_entries; ++i) { - bindings.emplace_back(binding++, descriptor_type, 1, stage_flags, nullptr); - } - }; - AddBindings(vk::DescriptorType::eUniformBuffer, entries.const_buffers.size()); - AddBindings(vk::DescriptorType::eStorageBuffer, entries.global_buffers.size()); - AddBindings(vk::DescriptorType::eUniformTexelBuffer, entries.texel_buffers.size()); - AddBindings(vk::DescriptorType::eCombinedImageSampler, entries.samplers.size()); - AddBindings(vk::DescriptorType::eStorageImage, entries.images.size()); + AddBindings<eUniformBuffer>(bindings, binding, flags, entries.const_buffers); + AddBindings<eStorageBuffer>(bindings, binding, flags, entries.global_buffers); + AddBindings<eUniformTexelBuffer>(bindings, binding, flags, entries.texel_buffers); + AddBindings<eCombinedImageSampler>(bindings, binding, flags, entries.samplers); + AddBindings<eStorageImage>(bindings, binding, flags, entries.images); return binding; } @@ -145,8 +161,8 @@ CachedShader::CachedShader(Core::System& system, Tegra::Engines::ShaderType stag GPUVAddr gpu_addr, VAddr cpu_addr, u8* host_ptr, ProgramCode program_code, u32 main_offset) : RasterizerCacheObject{host_ptr}, gpu_addr{gpu_addr}, cpu_addr{cpu_addr}, - program_code{std::move(program_code)}, locker{stage, GetEngine(system, 
stage)}, - shader_ir{this->program_code, main_offset, compiler_settings, locker}, + program_code{std::move(program_code)}, registry{stage, GetEngine(system, stage)}, + shader_ir{this->program_code, main_offset, compiler_settings, registry}, entries{GenerateShaderEntries(shader_ir)} {} CachedShader::~CachedShader() = default; @@ -163,24 +179,19 @@ Tegra::Engines::ConstBufferEngineInterface& CachedShader::GetEngine( VKPipelineCache::VKPipelineCache(Core::System& system, RasterizerVulkan& rasterizer, const VKDevice& device, VKScheduler& scheduler, VKDescriptorPool& descriptor_pool, - VKUpdateDescriptorQueue& update_descriptor_queue) + VKUpdateDescriptorQueue& update_descriptor_queue, + VKRenderPassCache& renderpass_cache) : RasterizerCache{rasterizer}, system{system}, device{device}, scheduler{scheduler}, descriptor_pool{descriptor_pool}, update_descriptor_queue{update_descriptor_queue}, - renderpass_cache(device) {} + renderpass_cache{renderpass_cache} {} VKPipelineCache::~VKPipelineCache() = default; std::array<Shader, Maxwell::MaxShaderProgram> VKPipelineCache::GetShaders() { const auto& gpu = system.GPU().Maxwell3D(); - auto& dirty = system.GPU().Maxwell3D().dirty.shaders; - if (!dirty) { - return last_shaders; - } - dirty = false; std::array<Shader, Maxwell::MaxShaderProgram> shaders; for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) { - const auto& shader_config = gpu.regs.shader_config[index]; const auto program{static_cast<Maxwell::ShaderProgram>(index)}; // Skip stages that are not enabled @@ -262,9 +273,9 @@ VKComputePipeline& VKPipelineCache::GetComputePipeline(const ComputePipelineCach specialization.workgroup_size = key.workgroup_size; specialization.shared_memory_size = key.shared_memory_size; - const SPIRVShader spirv_shader{ - Decompile(device, shader->GetIR(), ShaderType::Compute, specialization), - shader->GetEntries()}; + const SPIRVShader spirv_shader{Decompile(device, shader->GetIR(), ShaderType::Compute, + 
shader->GetRegistry(), specialization), + shader->GetEntries()}; entry = std::make_unique<VKComputePipeline>(device, scheduler, descriptor_pool, update_descriptor_queue, spirv_shader); return *entry; @@ -313,8 +324,7 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) { const auto& gpu = system.GPU().Maxwell3D(); Specialization specialization; - specialization.primitive_topology = fixed_state.input_assembly.topology; - if (specialization.primitive_topology == Maxwell::PrimitiveTopology::Points) { + if (fixed_state.input_assembly.topology == Maxwell::PrimitiveTopology::Points) { ASSERT(fixed_state.input_assembly.point_size != 0.0f); specialization.point_size = fixed_state.input_assembly.point_size; } @@ -322,9 +332,6 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) { specialization.attribute_types[i] = fixed_state.vertex_input.attributes[i].type; } specialization.ndc_minus_one_to_one = fixed_state.rasterizer.ndc_minus_one_to_one; - specialization.tessellation.primitive = fixed_state.tessellation.primitive; - specialization.tessellation.spacing = fixed_state.tessellation.spacing; - specialization.tessellation.clockwise = fixed_state.tessellation.clockwise; SPIRVProgram program; std::vector<vk::DescriptorSetLayoutBinding> bindings; @@ -345,8 +352,9 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) { const std::size_t stage = index == 0 ? 
0 : index - 1; // Stage indices are 0 - 5 const auto program_type = GetShaderType(program_enum); const auto& entries = shader->GetEntries(); - program[stage] = {Decompile(device, shader->GetIR(), program_type, specialization), - entries}; + program[stage] = { + Decompile(device, shader->GetIR(), program_type, shader->GetRegistry(), specialization), + entries}; if (program_enum == Maxwell::ShaderProgram::VertexA) { // VertexB was combined with VertexA, so we skip the VertexB iteration @@ -361,32 +369,45 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) { return {std::move(program), std::move(bindings)}; } -void FillDescriptorUpdateTemplateEntries( - const VKDevice& device, const ShaderEntries& entries, u32& binding, u32& offset, - std::vector<vk::DescriptorUpdateTemplateEntry>& template_entries) { - static constexpr auto entry_size = static_cast<u32>(sizeof(DescriptorUpdateEntry)); - const auto AddEntry = [&](vk::DescriptorType descriptor_type, std::size_t count_) { - const u32 count = static_cast<u32>(count_); - if (descriptor_type == vk::DescriptorType::eUniformTexelBuffer && - device.GetDriverID() == vk::DriverIdKHR::eNvidiaProprietary) { - // Nvidia has a bug where updating multiple uniform texels at once causes the driver to - // crash. 
- for (u32 i = 0; i < count; ++i) { - template_entries.emplace_back(binding + i, 0, 1, descriptor_type, - offset + i * entry_size, entry_size); - } - } else if (count != 0) { - template_entries.emplace_back(binding, 0, count, descriptor_type, offset, entry_size); +template <vk::DescriptorType descriptor_type, class Container> +void AddEntry(std::vector<vk::DescriptorUpdateTemplateEntry>& template_entries, u32& binding, + u32& offset, const Container& container) { + static constexpr u32 entry_size = static_cast<u32>(sizeof(DescriptorUpdateEntry)); + const u32 count = static_cast<u32>(std::size(container)); + + if constexpr (descriptor_type == eCombinedImageSampler) { + for (u32 i = 0; i < count; ++i) { + const u32 num_samplers = container[i].Size(); + template_entries.emplace_back(binding, 0, num_samplers, descriptor_type, offset, + entry_size); + ++binding; + offset += num_samplers * entry_size; } - offset += count * entry_size; - binding += count; - }; + return; + } - AddEntry(vk::DescriptorType::eUniformBuffer, entries.const_buffers.size()); - AddEntry(vk::DescriptorType::eStorageBuffer, entries.global_buffers.size()); - AddEntry(vk::DescriptorType::eUniformTexelBuffer, entries.texel_buffers.size()); - AddEntry(vk::DescriptorType::eCombinedImageSampler, entries.samplers.size()); - AddEntry(vk::DescriptorType::eStorageImage, entries.images.size()); + if constexpr (descriptor_type == eUniformTexelBuffer) { + // Nvidia has a bug where updating multiple uniform texels at once causes the driver to + // crash. 
+ for (u32 i = 0; i < count; ++i) { + template_entries.emplace_back(binding + i, 0, 1, descriptor_type, + offset + i * entry_size, entry_size); + } + } else if (count > 0) { + template_entries.emplace_back(binding, 0, count, descriptor_type, offset, entry_size); + } + offset += count * entry_size; + binding += count; +} + +void FillDescriptorUpdateTemplateEntries( + const ShaderEntries& entries, u32& binding, u32& offset, + std::vector<vk::DescriptorUpdateTemplateEntry>& template_entries) { + AddEntry<eUniformBuffer>(template_entries, offset, binding, entries.const_buffers); + AddEntry<eStorageBuffer>(template_entries, offset, binding, entries.global_buffers); + AddEntry<eUniformTexelBuffer>(template_entries, offset, binding, entries.texel_buffers); + AddEntry<eCombinedImageSampler>(template_entries, offset, binding, entries.samplers); + AddEntry<eStorageImage>(template_entries, offset, binding, entries.images); } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.h b/src/video_core/renderer_vulkan/vk_pipeline_cache.h index 8678fc9c3..c4c112290 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.h +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.h @@ -25,7 +25,7 @@ #include "video_core/renderer_vulkan/vk_renderpass_cache.h" #include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_shader_decompiler.h" -#include "video_core/shader/const_buffer_locker.h" +#include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" #include "video_core/surface.h" @@ -132,6 +132,10 @@ public: return shader_ir; } + const VideoCommon::Shader::Registry& GetRegistry() const { + return registry; + } + const VideoCommon::Shader::ShaderIR& GetIR() const { return shader_ir; } @@ -147,7 +151,7 @@ private: GPUVAddr gpu_addr{}; VAddr cpu_addr{}; ProgramCode program_code; - VideoCommon::Shader::ConstBufferLocker locker; + VideoCommon::Shader::Registry registry; 
VideoCommon::Shader::ShaderIR shader_ir; ShaderEntries entries; }; @@ -157,7 +161,8 @@ public: explicit VKPipelineCache(Core::System& system, RasterizerVulkan& rasterizer, const VKDevice& device, VKScheduler& scheduler, VKDescriptorPool& descriptor_pool, - VKUpdateDescriptorQueue& update_descriptor_queue); + VKUpdateDescriptorQueue& update_descriptor_queue, + VKRenderPassCache& renderpass_cache); ~VKPipelineCache(); std::array<Shader, Maxwell::MaxShaderProgram> GetShaders(); @@ -180,8 +185,7 @@ private: VKScheduler& scheduler; VKDescriptorPool& descriptor_pool; VKUpdateDescriptorQueue& update_descriptor_queue; - - VKRenderPassCache renderpass_cache; + VKRenderPassCache& renderpass_cache; std::array<Shader, Maxwell::MaxShaderProgram> last_shaders; @@ -194,7 +198,7 @@ private: }; void FillDescriptorUpdateTemplateEntries( - const VKDevice& device, const ShaderEntries& entries, u32& binding, u32& offset, + const ShaderEntries& entries, u32& binding, u32& offset, std::vector<vk::DescriptorUpdateTemplateEntry>& template_entries); } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 3bf86da87..58c69b786 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -36,6 +36,7 @@ #include "video_core/renderer_vulkan/vk_sampler_cache.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" +#include "video_core/renderer_vulkan/vk_state_tracker.h" #include "video_core/renderer_vulkan/vk_texture_cache.h" #include "video_core/renderer_vulkan/vk_update_descriptor.h" @@ -105,17 +106,20 @@ void TransitionImages(const std::vector<ImageView>& views, vk::PipelineStageFlag template <typename Engine, typename Entry> Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry, - std::size_t stage) { + std::size_t stage, std::size_t index = 0) { const 
auto stage_type = static_cast<Tegra::Engines::ShaderType>(stage); if (entry.IsBindless()) { const Tegra::Texture::TextureHandle tex_handle = engine.AccessConstBuffer32(stage_type, entry.GetBuffer(), entry.GetOffset()); return engine.GetTextureInfo(tex_handle); } + const auto& gpu_profile = engine.AccessGuestDriverProfile(); + const u32 entry_offset = static_cast<u32>(index * gpu_profile.GetTextureHandlerSize()); + const u32 offset = entry.GetOffset() + entry_offset; if constexpr (std::is_same_v<Engine, Tegra::Engines::Maxwell3D>) { - return engine.GetStageTexture(stage_type, entry.GetOffset()); + return engine.GetStageTexture(stage_type, offset); } else { - return engine.GetTexture(entry.GetOffset()); + return engine.GetTexture(offset); } } @@ -277,17 +281,19 @@ void RasterizerVulkan::DrawParameters::Draw(vk::CommandBuffer cmdbuf, RasterizerVulkan::RasterizerVulkan(Core::System& system, Core::Frontend::EmuWindow& renderer, VKScreenInfo& screen_info, const VKDevice& device, VKResourceManager& resource_manager, - VKMemoryManager& memory_manager, VKScheduler& scheduler) + VKMemoryManager& memory_manager, StateTracker& state_tracker, + VKScheduler& scheduler) : RasterizerAccelerated{system.Memory()}, system{system}, render_window{renderer}, screen_info{screen_info}, device{device}, resource_manager{resource_manager}, - memory_manager{memory_manager}, scheduler{scheduler}, + memory_manager{memory_manager}, state_tracker{state_tracker}, scheduler{scheduler}, staging_pool(device, memory_manager, scheduler), descriptor_pool(device), - update_descriptor_queue(device, scheduler), + update_descriptor_queue(device, scheduler), renderpass_cache(device), quad_array_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue), uint8_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue), texture_cache(system, *this, device, resource_manager, memory_manager, scheduler, staging_pool), - pipeline_cache(system, *this, device, scheduler, 
descriptor_pool, update_descriptor_queue), + pipeline_cache(system, *this, device, scheduler, descriptor_pool, update_descriptor_queue, + renderpass_cache), buffer_cache(*this, system, device, memory_manager, scheduler, staging_pool), sampler_cache(device), query_cache(system, *this, device, scheduler) { scheduler.SetQueryCache(query_cache); @@ -342,6 +348,8 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) { [&pipeline](auto cmdbuf, auto& dld) { cmdbuf.setCheckpointNV(&pipeline, dld); }); } + BeginTransformFeedback(); + const auto pipeline_layout = pipeline.GetLayout(); const auto descriptor_set = pipeline.CommitDescriptorSet(); scheduler.Record([pipeline_layout, descriptor_set, draw_params](auto cmdbuf, auto& dld) { @@ -351,18 +359,23 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) { } draw_params.Draw(cmdbuf, dld); }); + + EndTransformFeedback(); } void RasterizerVulkan::Clear() { MICROPROFILE_SCOPE(Vulkan_Clearing); - query_cache.UpdateCounters(); - const auto& gpu = system.GPU().Maxwell3D(); if (!system.GPU().Maxwell3D().ShouldExecute()) { return; } + sampled_views.clear(); + image_views.clear(); + + query_cache.UpdateCounters(); + const auto& regs = gpu.regs; const bool use_color = regs.clear_buffers.R || regs.clear_buffers.G || regs.clear_buffers.B || regs.clear_buffers.A; @@ -371,52 +384,54 @@ void RasterizerVulkan::Clear() { if (!use_color && !use_depth && !use_stencil) { return; } - // Clearing images requires to be out of a renderpass - scheduler.RequestOutsideRenderPassOperationContext(); - // TODO(Rodrigo): Implement clears rendering a quad or using beginning a renderpass. 
+ [[maybe_unused]] const auto texceptions = UpdateAttachments(); + DEBUG_ASSERT(texceptions.none()); + SetupImageTransitions(0, color_attachments, zeta_attachment); - if (use_color) { - View color_view; - { - MICROPROFILE_SCOPE(Vulkan_RenderTargets); - color_view = texture_cache.GetColorBufferSurface(regs.clear_buffers.RT.Value(), false); - } + const vk::RenderPass renderpass = renderpass_cache.GetRenderPass(GetRenderPassParams(0)); + const auto [framebuffer, render_area] = ConfigureFramebuffers(renderpass); + scheduler.RequestRenderpass({renderpass, framebuffer, {{0, 0}, render_area}, 0, nullptr}); - color_view->Transition(vk::ImageLayout::eTransferDstOptimal, - vk::PipelineStageFlagBits::eTransfer, - vk::AccessFlagBits::eTransferWrite); + const auto& scissor = regs.scissor_test[0]; + const vk::Offset2D scissor_offset(scissor.min_x, scissor.min_y); + vk::Extent2D scissor_extent{scissor.max_x - scissor.min_x, scissor.max_y - scissor.min_y}; + scissor_extent.width = std::min(scissor_extent.width, render_area.width); + scissor_extent.height = std::min(scissor_extent.height, render_area.height); + const u32 layer = regs.clear_buffers.layer; + const vk::ClearRect clear_rect({scissor_offset, scissor_extent}, layer, 1); + + if (use_color) { const std::array clear_color = {regs.clear_color[0], regs.clear_color[1], regs.clear_color[2], regs.clear_color[3]}; - const vk::ClearColorValue clear(clear_color); - scheduler.Record([image = color_view->GetImage(), - subresource = color_view->GetImageSubresourceRange(), - clear](auto cmdbuf, auto& dld) { - cmdbuf.clearColorImage(image, vk::ImageLayout::eTransferDstOptimal, clear, subresource, - dld); + const vk::ClearValue clear_value{clear_color}; + const u32 color_attachment = regs.clear_buffers.RT; + scheduler.Record([color_attachment, clear_value, clear_rect](auto cmdbuf, auto& dld) { + const vk::ClearAttachment attachment(vk::ImageAspectFlagBits::eColor, color_attachment, + clear_value); + cmdbuf.clearAttachments(1, 
&attachment, 1, &clear_rect, dld); }); } - if (use_depth || use_stencil) { - View zeta_surface; - { - MICROPROFILE_SCOPE(Vulkan_RenderTargets); - zeta_surface = texture_cache.GetDepthBufferSurface(false); - } - zeta_surface->Transition(vk::ImageLayout::eTransferDstOptimal, - vk::PipelineStageFlagBits::eTransfer, - vk::AccessFlagBits::eTransferWrite); - - const vk::ClearDepthStencilValue clear(regs.clear_depth, - static_cast<u32>(regs.clear_stencil)); - scheduler.Record([image = zeta_surface->GetImage(), - subresource = zeta_surface->GetImageSubresourceRange(), - clear](auto cmdbuf, auto& dld) { - cmdbuf.clearDepthStencilImage(image, vk::ImageLayout::eTransferDstOptimal, clear, - subresource, dld); - }); + if (!use_depth && !use_stencil) { + return; + } + vk::ImageAspectFlags aspect_flags; + if (use_depth) { + aspect_flags |= vk::ImageAspectFlagBits::eDepth; + } + if (use_stencil) { + aspect_flags |= vk::ImageAspectFlagBits::eStencil; } + + scheduler.Record([clear_depth = regs.clear_depth, clear_stencil = regs.clear_stencil, + clear_rect, aspect_flags](auto cmdbuf, auto& dld) { + const vk::ClearDepthStencilValue clear_zeta(clear_depth, clear_stencil); + const vk::ClearValue clear_value{clear_zeta}; + const vk::ClearAttachment attachment(aspect_flags, 0, clear_value); + cmdbuf.clearAttachments(1, &attachment, 1, &clear_rect, dld); + }); } void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) { @@ -533,8 +548,6 @@ bool RasterizerVulkan::AccelerateDisplay(const Tegra::FramebufferConfig& config, // Verify that the cached surface is the same size and format as the requested framebuffer const auto& params{surface->GetSurfaceParams()}; - const auto& pixel_format{ - VideoCore::Surface::PixelFormatFromGPUPixelFormat(config.pixel_format)}; ASSERT_MSG(params.width == config.width, "Framebuffer width is different"); ASSERT_MSG(params.height == config.height, "Framebuffer height is different"); @@ -545,6 +558,10 @@ bool RasterizerVulkan::AccelerateDisplay(const 
Tegra::FramebufferConfig& config, return true; } +void RasterizerVulkan::SetupDirtyFlags() { + state_tracker.Initialize(); +} + void RasterizerVulkan::FlushWork() { static constexpr u32 DRAWS_TO_DISPATCH = 4096; @@ -568,9 +585,9 @@ void RasterizerVulkan::FlushWork() { RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() { MICROPROFILE_SCOPE(Vulkan_RenderTargets); - auto& dirty = system.GPU().Maxwell3D().dirty; - const bool update_rendertargets = dirty.render_settings; - dirty.render_settings = false; + auto& dirty = system.GPU().Maxwell3D().dirty.flags; + const bool update_rendertargets = dirty[VideoCommon::Dirty::RenderTargets]; + dirty[VideoCommon::Dirty::RenderTargets] = false; texture_cache.GuardRenderTargets(true); @@ -720,13 +737,51 @@ void RasterizerVulkan::SetupImageTransitions( } void RasterizerVulkan::UpdateDynamicStates() { - auto& gpu = system.GPU().Maxwell3D(); - UpdateViewportsState(gpu); - UpdateScissorsState(gpu); - UpdateDepthBias(gpu); - UpdateBlendConstants(gpu); - UpdateDepthBounds(gpu); - UpdateStencilFaces(gpu); + auto& regs = system.GPU().Maxwell3D().regs; + UpdateViewportsState(regs); + UpdateScissorsState(regs); + UpdateDepthBias(regs); + UpdateBlendConstants(regs); + UpdateDepthBounds(regs); + UpdateStencilFaces(regs); +} + +void RasterizerVulkan::BeginTransformFeedback() { + const auto& regs = system.GPU().Maxwell3D().regs; + if (regs.tfb_enabled == 0) { + return; + } + + UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) || + regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) || + regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry)); + + UNIMPLEMENTED_IF(regs.tfb_bindings[1].buffer_enable); + UNIMPLEMENTED_IF(regs.tfb_bindings[2].buffer_enable); + UNIMPLEMENTED_IF(regs.tfb_bindings[3].buffer_enable); + + const auto& binding = regs.tfb_bindings[0]; + UNIMPLEMENTED_IF(binding.buffer_enable == 0); + UNIMPLEMENTED_IF(binding.buffer_offset != 0); + + const GPUVAddr 
gpu_addr = binding.Address(); + const std::size_t size = binding.buffer_size; + const auto [buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true); + + scheduler.Record([buffer = *buffer, offset = offset, size](auto cmdbuf, auto& dld) { + cmdbuf.bindTransformFeedbackBuffersEXT(0, {buffer}, {offset}, {size}, dld); + cmdbuf.beginTransformFeedbackEXT(0, {}, {}, dld); + }); +} + +void RasterizerVulkan::EndTransformFeedback() { + const auto& regs = system.GPU().Maxwell3D().regs; + if (regs.tfb_enabled == 0) { + return; + } + + scheduler.Record( + [](auto cmdbuf, auto& dld) { cmdbuf.endTransformFeedbackEXT(0, {}, {}, dld); }); } void RasterizerVulkan::SetupVertexArrays(FixedPipelineState::VertexInput& vertex_input, @@ -836,14 +891,16 @@ void RasterizerVulkan::SetupGraphicsTextures(const ShaderEntries& entries, std:: MICROPROFILE_SCOPE(Vulkan_Textures); const auto& gpu = system.GPU().Maxwell3D(); for (const auto& entry : entries.samplers) { - const auto texture = GetTextureInfo(gpu, entry, stage); - SetupTexture(texture, entry); + for (std::size_t i = 0; i < entry.Size(); ++i) { + const auto texture = GetTextureInfo(gpu, entry, stage, i); + SetupTexture(texture, entry); + } } } void RasterizerVulkan::SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage) { MICROPROFILE_SCOPE(Vulkan_Images); - const auto& gpu = system.GPU().KeplerCompute(); + const auto& gpu = system.GPU().Maxwell3D(); for (const auto& entry : entries.images) { const auto tic = GetTextureInfo(gpu, entry, stage).tic; SetupImage(tic, entry); @@ -886,8 +943,10 @@ void RasterizerVulkan::SetupComputeTextures(const ShaderEntries& entries) { MICROPROFILE_SCOPE(Vulkan_Textures); const auto& gpu = system.GPU().KeplerCompute(); for (const auto& entry : entries.samplers) { - const auto texture = GetTextureInfo(gpu, entry, ComputeShaderIndex); - SetupTexture(texture, entry); + for (std::size_t i = 0; i < entry.Size(); ++i) { + const auto texture = GetTextureInfo(gpu, entry, 
ComputeShaderIndex, i); + SetupTexture(texture, entry); + } } } @@ -902,6 +961,13 @@ void RasterizerVulkan::SetupComputeImages(const ShaderEntries& entries) { void RasterizerVulkan::SetupConstBuffer(const ConstBufferEntry& entry, const Tegra::Engines::ConstBufferInfo& buffer) { + if (!buffer.enabled) { + // Set values to zero to unbind buffers + update_descriptor_queue.AddBuffer(buffer_cache.GetEmptyBuffer(sizeof(float)), 0, + sizeof(float)); + return; + } + // Align the size to avoid bad std140 interactions const std::size_t size = Common::AlignUp(CalculateConstBufferSize(entry, buffer), 4 * sizeof(float)); @@ -972,12 +1038,10 @@ void RasterizerVulkan::SetupImage(const Tegra::Texture::TICEntry& tic, const Ima image_views.push_back(ImageView{std::move(view), image_layout}); } -void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D& gpu) { - if (!gpu.dirty.viewport_transform && scheduler.TouchViewports()) { +void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs) { + if (!state_tracker.TouchViewports()) { return; } - gpu.dirty.viewport_transform = false; - const auto& regs = gpu.regs; const std::array viewports{ GetViewportState(device, regs, 0), GetViewportState(device, regs, 1), GetViewportState(device, regs, 2), GetViewportState(device, regs, 3), @@ -992,12 +1056,10 @@ void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D& gpu) { }); } -void RasterizerVulkan::UpdateScissorsState(Tegra::Engines::Maxwell3D& gpu) { - if (!gpu.dirty.scissor_test && scheduler.TouchScissors()) { +void RasterizerVulkan::UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs) { + if (!state_tracker.TouchScissors()) { return; } - gpu.dirty.scissor_test = false; - const auto& regs = gpu.regs; const std::array scissors = { GetScissorState(regs, 0), GetScissorState(regs, 1), GetScissorState(regs, 2), GetScissorState(regs, 3), GetScissorState(regs, 4), GetScissorState(regs, 5), @@ -1010,46 +1072,39 @@ void 
RasterizerVulkan::UpdateScissorsState(Tegra::Engines::Maxwell3D& gpu) { }); } -void RasterizerVulkan::UpdateDepthBias(Tegra::Engines::Maxwell3D& gpu) { - if (!gpu.dirty.polygon_offset && scheduler.TouchDepthBias()) { +void RasterizerVulkan::UpdateDepthBias(Tegra::Engines::Maxwell3D::Regs& regs) { + if (!state_tracker.TouchDepthBias()) { return; } - gpu.dirty.polygon_offset = false; - const auto& regs = gpu.regs; scheduler.Record([constant = regs.polygon_offset_units, clamp = regs.polygon_offset_clamp, factor = regs.polygon_offset_factor](auto cmdbuf, auto& dld) { cmdbuf.setDepthBias(constant, clamp, factor / 2.0f, dld); }); } -void RasterizerVulkan::UpdateBlendConstants(Tegra::Engines::Maxwell3D& gpu) { - if (!gpu.dirty.blend_state && scheduler.TouchBlendConstants()) { +void RasterizerVulkan::UpdateBlendConstants(Tegra::Engines::Maxwell3D::Regs& regs) { + if (!state_tracker.TouchBlendConstants()) { return; } - gpu.dirty.blend_state = false; - const std::array blend_color = {gpu.regs.blend_color.r, gpu.regs.blend_color.g, - gpu.regs.blend_color.b, gpu.regs.blend_color.a}; + const std::array blend_color = {regs.blend_color.r, regs.blend_color.g, regs.blend_color.b, + regs.blend_color.a}; scheduler.Record([blend_color](auto cmdbuf, auto& dld) { cmdbuf.setBlendConstants(blend_color.data(), dld); }); } -void RasterizerVulkan::UpdateDepthBounds(Tegra::Engines::Maxwell3D& gpu) { - if (!gpu.dirty.depth_bounds_values && scheduler.TouchDepthBounds()) { +void RasterizerVulkan::UpdateDepthBounds(Tegra::Engines::Maxwell3D::Regs& regs) { + if (!state_tracker.TouchDepthBounds()) { return; } - gpu.dirty.depth_bounds_values = false; - const auto& regs = gpu.regs; scheduler.Record([min = regs.depth_bounds[0], max = regs.depth_bounds[1]]( auto cmdbuf, auto& dld) { cmdbuf.setDepthBounds(min, max, dld); }); } -void RasterizerVulkan::UpdateStencilFaces(Tegra::Engines::Maxwell3D& gpu) { - if (!gpu.dirty.stencil_test && scheduler.TouchStencilValues()) { +void 
RasterizerVulkan::UpdateStencilFaces(Tegra::Engines::Maxwell3D::Regs& regs) { + if (!state_tracker.TouchStencilProperties()) { return; } - gpu.dirty.stencil_test = false; - const auto& regs = gpu.regs; if (regs.stencil_two_side_enable) { // Separate values per face scheduler.Record( @@ -1100,7 +1155,7 @@ std::size_t RasterizerVulkan::CalculateVertexArraysSize() const { // This implementation assumes that all attributes are used in the shader. const GPUVAddr start{regs.vertex_array[index].StartAddress()}; const GPUVAddr end{regs.vertex_array_limit[index].LimitAddress()}; - DEBUG_ASSERT(end > start); + DEBUG_ASSERT(end >= start); size += (end - start + 1) * regs.vertex_array[index].enable; } diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index 4dc8af6e8..3185868e9 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -96,6 +96,7 @@ struct hash<Vulkan::FramebufferCacheKey> { namespace Vulkan { +class StateTracker; class BufferBindings; struct ImageView { @@ -108,7 +109,7 @@ public: explicit RasterizerVulkan(Core::System& system, Core::Frontend::EmuWindow& render_window, VKScreenInfo& screen_info, const VKDevice& device, VKResourceManager& resource_manager, VKMemoryManager& memory_manager, - VKScheduler& scheduler); + StateTracker& state_tracker, VKScheduler& scheduler); ~RasterizerVulkan() override; void Draw(bool is_indexed, bool is_instanced) override; @@ -127,6 +128,7 @@ public: const Tegra::Engines::Fermi2D::Config& copy_config) override; bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, u32 pixel_stride) override; + void SetupDirtyFlags() override; /// Maximum supported size that a constbuffer can have in bytes. 
static constexpr std::size_t MaxConstbufferSize = 0x10000; @@ -167,6 +169,10 @@ private: void UpdateDynamicStates(); + void BeginTransformFeedback(); + + void EndTransformFeedback(); + bool WalkAttachmentOverlaps(const CachedSurfaceView& attachment); void SetupVertexArrays(FixedPipelineState::VertexInput& vertex_input, @@ -215,12 +221,12 @@ private: void SetupImage(const Tegra::Texture::TICEntry& tic, const ImageEntry& entry); - void UpdateViewportsState(Tegra::Engines::Maxwell3D& gpu); - void UpdateScissorsState(Tegra::Engines::Maxwell3D& gpu); - void UpdateDepthBias(Tegra::Engines::Maxwell3D& gpu); - void UpdateBlendConstants(Tegra::Engines::Maxwell3D& gpu); - void UpdateDepthBounds(Tegra::Engines::Maxwell3D& gpu); - void UpdateStencilFaces(Tegra::Engines::Maxwell3D& gpu); + void UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs); + void UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs); + void UpdateDepthBias(Tegra::Engines::Maxwell3D::Regs& regs); + void UpdateBlendConstants(Tegra::Engines::Maxwell3D::Regs& regs); + void UpdateDepthBounds(Tegra::Engines::Maxwell3D::Regs& regs); + void UpdateStencilFaces(Tegra::Engines::Maxwell3D::Regs& regs); std::size_t CalculateGraphicsStreamBufferSize(bool is_indexed) const; @@ -241,11 +247,13 @@ private: const VKDevice& device; VKResourceManager& resource_manager; VKMemoryManager& memory_manager; + StateTracker& state_tracker; VKScheduler& scheduler; VKStagingBufferPool staging_pool; VKDescriptorPool descriptor_pool; VKUpdateDescriptorQueue update_descriptor_queue; + VKRenderPassCache renderpass_cache; QuadArrayPass quad_array_pass; Uint8Pass uint8_pass; diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index 92bd6c344..b61d4fe63 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -2,6 +2,12 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. 
+#include <memory> +#include <mutex> +#include <optional> +#include <thread> +#include <utility> + #include "common/assert.h" #include "common/microprofile.h" #include "video_core/renderer_vulkan/declarations.h" @@ -9,6 +15,7 @@ #include "video_core/renderer_vulkan/vk_query_cache.h" #include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_scheduler.h" +#include "video_core/renderer_vulkan/vk_state_tracker.h" namespace Vulkan { @@ -29,9 +36,10 @@ void VKScheduler::CommandChunk::ExecuteAll(vk::CommandBuffer cmdbuf, last = nullptr; } -VKScheduler::VKScheduler(const VKDevice& device, VKResourceManager& resource_manager) - : device{device}, resource_manager{resource_manager}, next_fence{ - &resource_manager.CommitFence()} { +VKScheduler::VKScheduler(const VKDevice& device, VKResourceManager& resource_manager, + StateTracker& state_tracker) + : device{device}, resource_manager{resource_manager}, state_tracker{state_tracker}, + next_fence{&resource_manager.CommitFence()} { AcquireNewChunk(); AllocateNewContext(); worker_thread = std::thread(&VKScheduler::WorkerThread, this); @@ -157,12 +165,7 @@ void VKScheduler::AllocateNewContext() { void VKScheduler::InvalidateState() { state.graphics_pipeline = nullptr; - state.viewports = false; - state.scissors = false; - state.depth_bias = false; - state.blend_constants = false; - state.depth_bounds = false; - state.stencil_values = false; + state_tracker.InvalidateCommandBufferState(); } void VKScheduler::EndPendingOperations() { diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index 62fd7858b..c7cc291c3 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -17,6 +17,7 @@ namespace Vulkan { +class StateTracker; class VKDevice; class VKFence; class VKQueryCache; @@ -43,7 +44,8 @@ private: /// OpenGL-like operations on Vulkan command buffers. 
class VKScheduler { public: - explicit VKScheduler(const VKDevice& device, VKResourceManager& resource_manager); + explicit VKScheduler(const VKDevice& device, VKResourceManager& resource_manager, + StateTracker& state_tracker); ~VKScheduler(); /// Sends the current execution context to the GPU. @@ -74,36 +76,6 @@ public: query_cache = &query_cache_; } - /// Returns true when viewports have been set in the current command buffer. - bool TouchViewports() { - return std::exchange(state.viewports, true); - } - - /// Returns true when scissors have been set in the current command buffer. - bool TouchScissors() { - return std::exchange(state.scissors, true); - } - - /// Returns true when depth bias have been set in the current command buffer. - bool TouchDepthBias() { - return std::exchange(state.depth_bias, true); - } - - /// Returns true when blend constants have been set in the current command buffer. - bool TouchBlendConstants() { - return std::exchange(state.blend_constants, true); - } - - /// Returns true when depth bounds have been set in the current command buffer. - bool TouchDepthBounds() { - return std::exchange(state.depth_bounds, true); - } - - /// Returns true when stencil values have been set in the current command buffer. - bool TouchStencilValues() { - return std::exchange(state.stencil_values, true); - } - /// Send work to a separate thread. 
template <typename T> void Record(T&& command) { @@ -217,6 +189,8 @@ private: const VKDevice& device; VKResourceManager& resource_manager; + StateTracker& state_tracker; + VKQueryCache* query_cache = nullptr; vk::CommandBuffer current_cmdbuf; @@ -226,12 +200,6 @@ private: struct State { std::optional<vk::RenderPassBeginInfo> renderpass; vk::Pipeline graphics_pipeline; - bool viewports = false; - bool scissors = false; - bool depth_bias = false; - bool blend_constants = false; - bool depth_bounds = false; - bool stencil_values = false; } state; std::unique_ptr<CommandChunk> chunk; diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp index 2da622d15..51ecb5567 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp @@ -5,7 +5,9 @@ #include <functional> #include <limits> #include <map> +#include <optional> #include <type_traits> +#include <unordered_map> #include <utility> #include <fmt/format.h> @@ -24,6 +26,7 @@ #include "video_core/renderer_vulkan/vk_shader_decompiler.h" #include "video_core/shader/node.h" #include "video_core/shader/shader_ir.h" +#include "video_core/shader/transform_feedback.h" namespace Vulkan { @@ -69,8 +72,9 @@ struct TexelBuffer { struct SampledImage { Id image_type{}; - Id sampled_image_type{}; - Id sampler{}; + Id sampler_type{}; + Id sampler_pointer_type{}; + Id variable{}; }; struct StorageImage { @@ -92,6 +96,12 @@ struct VertexIndices { std::optional<u32> clip_distances; }; +struct GenericVaryingDescription { + Id id = nullptr; + u32 first_element = 0; + bool is_scalar = false; +}; + spv::Dim GetSamplerDim(const Sampler& sampler) { ASSERT(!sampler.IsBuffer()); switch (sampler.GetType()) { @@ -265,9 +275,13 @@ bool IsPrecise(Operation operand) { class SPIRVDecompiler final : public Sirit::Module { public: explicit SPIRVDecompiler(const VKDevice& device, const ShaderIR& ir, ShaderType stage, 
- const Specialization& specialization) + const Registry& registry, const Specialization& specialization) : Module(0x00010300), device{device}, ir{ir}, stage{stage}, header{ir.GetHeader()}, - specialization{specialization} { + registry{registry}, specialization{specialization} { + if (stage != ShaderType::Compute) { + transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo()); + } + AddCapability(spv::Capability::Shader); AddCapability(spv::Capability::UniformAndStorageBuffer16BitAccess); AddCapability(spv::Capability::ImageQuery); @@ -285,6 +299,15 @@ public: AddExtension("SPV_KHR_variable_pointers"); AddExtension("SPV_KHR_shader_draw_parameters"); + if (!transform_feedback.empty()) { + if (device.IsExtTransformFeedbackSupported()) { + AddCapability(spv::Capability::TransformFeedback); + } else { + LOG_ERROR(Render_Vulkan, "Shader requires transform feedbacks but these are not " + "supported on this device"); + } + } + if (ir.UsesLayer() || ir.UsesViewportIndex()) { if (ir.UsesViewportIndex()) { AddCapability(spv::Capability::MultiViewport); @@ -295,7 +318,7 @@ public: } } - if (device.IsShaderStorageImageReadWithoutFormatSupported()) { + if (device.IsFormatlessImageLoadSupported()) { AddCapability(spv::Capability::StorageImageReadWithoutFormat); } @@ -317,25 +340,29 @@ public: AddExecutionMode(main, spv::ExecutionMode::OutputVertices, header.common2.threads_per_input_primitive); break; - case ShaderType::TesselationEval: + case ShaderType::TesselationEval: { + const auto& info = registry.GetGraphicsInfo(); AddCapability(spv::Capability::Tessellation); AddEntryPoint(spv::ExecutionModel::TessellationEvaluation, main, "main", interfaces); - AddExecutionMode(main, GetExecutionMode(specialization.tessellation.primitive)); - AddExecutionMode(main, GetExecutionMode(specialization.tessellation.spacing)); - AddExecutionMode(main, specialization.tessellation.clockwise + AddExecutionMode(main, GetExecutionMode(info.tessellation_primitive)); + 
AddExecutionMode(main, GetExecutionMode(info.tessellation_spacing)); + AddExecutionMode(main, info.tessellation_clockwise ? spv::ExecutionMode::VertexOrderCw : spv::ExecutionMode::VertexOrderCcw); break; - case ShaderType::Geometry: + } + case ShaderType::Geometry: { + const auto& info = registry.GetGraphicsInfo(); AddCapability(spv::Capability::Geometry); AddEntryPoint(spv::ExecutionModel::Geometry, main, "main", interfaces); - AddExecutionMode(main, GetExecutionMode(specialization.primitive_topology)); + AddExecutionMode(main, GetExecutionMode(info.primitive_topology)); AddExecutionMode(main, GetExecutionMode(header.common3.output_topology)); AddExecutionMode(main, spv::ExecutionMode::OutputVertices, header.common4.max_output_vertices); // TODO(Rodrigo): Where can we get this info from? AddExecutionMode(main, spv::ExecutionMode::Invocations, 1U); break; + } case ShaderType::Fragment: AddEntryPoint(spv::ExecutionModel::Fragment, main, "main", interfaces); AddExecutionMode(main, spv::ExecutionMode::OriginUpperLeft); @@ -544,7 +571,8 @@ private: if (stage != ShaderType::Geometry) { return; } - const u32 num_input = GetNumPrimitiveTopologyVertices(specialization.primitive_topology); + const auto& info = registry.GetGraphicsInfo(); + const u32 num_input = GetNumPrimitiveTopologyVertices(info.primitive_topology); DeclareInputVertexArray(num_input); DeclareOutputVertex(); } @@ -741,12 +769,34 @@ private: } void DeclareOutputAttributes() { + if (stage == ShaderType::Compute || stage == ShaderType::Fragment) { + return; + } + + UNIMPLEMENTED_IF(registry.GetGraphicsInfo().tfb_enabled && stage != ShaderType::Vertex); for (const auto index : ir.GetOutputAttributes()) { if (!IsGenericAttribute(index)) { continue; } - const u32 location = GetGenericAttributeLocation(index); - Id type = t_float4; + DeclareOutputAttribute(index); + } + } + + void DeclareOutputAttribute(Attribute::Index index) { + static constexpr std::string_view swizzle = "xyzw"; + + const u32 location = 
GetGenericAttributeLocation(index); + u8 element = 0; + while (element < 4) { + const std::size_t remainder = 4 - element; + + std::size_t num_components = remainder; + const std::optional tfb = GetTransformFeedbackInfo(index, element); + if (tfb) { + num_components = tfb->components; + } + + Id type = GetTypeVectorDefinitionLut(Type::Float).at(num_components - 1); Id varying_default = v_varying_default; if (IsOutputAttributeArray()) { const u32 num = GetNumOutputVertices(); @@ -759,15 +809,47 @@ private: } type = TypePointer(spv::StorageClass::Output, type); + std::string name = fmt::format("out_attr{}", location); + if (num_components < 4 || element > 0) { + name = fmt::format("{}_{}", name, swizzle.substr(element, num_components)); + } + const Id id = OpVariable(type, spv::StorageClass::Output, varying_default); - Name(AddGlobalVariable(id), fmt::format("out_attr{}", location)); - output_attributes.emplace(index, id); + Name(AddGlobalVariable(id), name); + + GenericVaryingDescription description; + description.id = id; + description.first_element = element; + description.is_scalar = num_components == 1; + for (u32 i = 0; i < num_components; ++i) { + const u8 offset = static_cast<u8>(static_cast<u32>(index) * 4 + element + i); + output_attributes.emplace(offset, description); + } interfaces.push_back(id); Decorate(id, spv::Decoration::Location, location); + if (element > 0) { + Decorate(id, spv::Decoration::Component, static_cast<u32>(element)); + } + if (tfb && device.IsExtTransformFeedbackSupported()) { + Decorate(id, spv::Decoration::XfbBuffer, static_cast<u32>(tfb->buffer)); + Decorate(id, spv::Decoration::XfbStride, static_cast<u32>(tfb->stride)); + Decorate(id, spv::Decoration::Offset, static_cast<u32>(tfb->offset)); + } + + element = static_cast<u8>(static_cast<std::size_t>(element) + num_components); } } + std::optional<VaryingTFB> GetTransformFeedbackInfo(Attribute::Index index, u8 element = 0) { + const u8 location = 
static_cast<u8>(static_cast<u32>(index) * 4 + element); + const auto it = transform_feedback.find(location); + if (it == transform_feedback.end()) { + return {}; + } + return it->second; + } + u32 DeclareConstantBuffers(u32 binding) { for (const auto& [index, size] : ir.GetConstantBuffers()) { const Id type = device.IsKhrUniformBufferStandardLayoutSupported() ? t_cbuf_scalar_ubo @@ -833,16 +915,20 @@ private: constexpr int sampled = 1; constexpr auto format = spv::ImageFormat::Unknown; const Id image_type = TypeImage(t_float, dim, depth, arrayed, ms, sampled, format); - const Id sampled_image_type = TypeSampledImage(image_type); - const Id pointer_type = - TypePointer(spv::StorageClass::UniformConstant, sampled_image_type); + const Id sampler_type = TypeSampledImage(image_type); + const Id sampler_pointer_type = + TypePointer(spv::StorageClass::UniformConstant, sampler_type); + const Id type = sampler.IsIndexed() + ? TypeArray(sampler_type, Constant(t_uint, sampler.Size())) + : sampler_type; + const Id pointer_type = TypePointer(spv::StorageClass::UniformConstant, type); const Id id = OpVariable(pointer_type, spv::StorageClass::UniformConstant); AddGlobalVariable(Name(id, fmt::format("sampler_{}", sampler.GetIndex()))); Decorate(id, spv::Decoration::Binding, binding++); Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET); - sampled_images.emplace(sampler.GetIndex(), - SampledImage{image_type, sampled_image_type, id}); + sampled_images.emplace(sampler.GetIndex(), SampledImage{image_type, sampler_type, + sampler_pointer_type, id}); } return binding; } @@ -893,7 +979,7 @@ private: u32 GetNumInputVertices() const { switch (stage) { case ShaderType::Geometry: - return GetNumPrimitiveTopologyVertices(specialization.primitive_topology); + return GetNumPrimitiveTopologyVertices(registry.GetGraphicsInfo().primitive_topology); case ShaderType::TesselationControl: case ShaderType::TesselationEval: return NumInputPatches; @@ -1341,8 +1427,14 @@ private: } default: if 
(IsGenericAttribute(attribute)) { - const Id composite = output_attributes.at(attribute); - return {ArrayPass(t_out_float, composite, {element}), Type::Float}; + const u8 offset = static_cast<u8>(static_cast<u8>(attribute) * 4 + element); + const GenericVaryingDescription description = output_attributes.at(offset); + const Id composite = description.id; + std::vector<u32> indices; + if (!description.is_scalar) { + indices.push_back(element - description.first_element); + } + return {ArrayPass(t_out_float, composite, indices), Type::Float}; } UNIMPLEMENTED_MSG("Unhandled output attribute: {}", static_cast<u32>(attribute)); @@ -1525,7 +1617,12 @@ private: ASSERT(!meta.sampler.IsBuffer()); const auto& entry = sampled_images.at(meta.sampler.GetIndex()); - return OpLoad(entry.sampled_image_type, entry.sampler); + Id sampler = entry.variable; + if (meta.sampler.IsIndexed()) { + const Id index = AsInt(Visit(meta.index)); + sampler = OpAccessChain(entry.sampler_pointer_type, sampler, index); + } + return OpLoad(entry.sampler_type, sampler); } Id GetTextureImage(Operation operation) { @@ -1783,7 +1880,7 @@ private: } Expression ImageLoad(Operation operation) { - if (!device.IsShaderStorageImageReadWithoutFormatSupported()) { + if (!device.IsFormatlessImageLoadSupported()) { return {v_float_zero, Type::Float}; } @@ -2211,16 +2308,14 @@ private: switch (specialization.attribute_types.at(location)) { case Maxwell::VertexAttribute::Type::SignedNorm: case Maxwell::VertexAttribute::Type::UnsignedNorm: + case Maxwell::VertexAttribute::Type::UnsignedScaled: + case Maxwell::VertexAttribute::Type::SignedScaled: case Maxwell::VertexAttribute::Type::Float: return {Type::Float, t_in_float, t_in_float4}; case Maxwell::VertexAttribute::Type::SignedInt: return {Type::Int, t_in_int, t_in_int4}; case Maxwell::VertexAttribute::Type::UnsignedInt: return {Type::Uint, t_in_uint, t_in_uint4}; - case Maxwell::VertexAttribute::Type::UnsignedScaled: - case 
Maxwell::VertexAttribute::Type::SignedScaled: - UNIMPLEMENTED(); - return {Type::Float, t_in_float, t_in_float4}; default: UNREACHABLE(); return {Type::Float, t_in_float, t_in_float4}; @@ -2250,11 +2345,11 @@ private: std::array<Id, 4> GetTypeVectorDefinitionLut(Type type) const { switch (type) { case Type::Float: - return {nullptr, t_float2, t_float3, t_float4}; + return {t_float, t_float2, t_float3, t_float4}; case Type::Int: - return {nullptr, t_int2, t_int3, t_int4}; + return {t_int, t_int2, t_int3, t_int4}; case Type::Uint: - return {nullptr, t_uint2, t_uint3, t_uint4}; + return {t_uint, t_uint2, t_uint3, t_uint4}; default: UNIMPLEMENTED(); return {}; @@ -2487,7 +2582,9 @@ private: const ShaderIR& ir; const ShaderType stage; const Tegra::Shader::Header header; + const Registry& registry; const Specialization& specialization; + std::unordered_map<u8, VaryingTFB> transform_feedback; const Id t_void = Name(TypeVoid(), "void"); @@ -2576,7 +2673,7 @@ private: Id shared_memory{}; std::array<Id, INTERNAL_FLAGS_COUNT> internal_flags{}; std::map<Attribute::Index, Id> input_attributes; - std::map<Attribute::Index, Id> output_attributes; + std::unordered_map<u8, GenericVaryingDescription> output_attributes; std::map<u32, Id> constant_buffers; std::map<GlobalMemoryBase, Id> global_buffers; std::map<u32, TexelBuffer> texel_buffers; @@ -2862,8 +2959,9 @@ ShaderEntries GenerateShaderEntries(const VideoCommon::Shader::ShaderIR& ir) { } std::vector<u32> Decompile(const VKDevice& device, const VideoCommon::Shader::ShaderIR& ir, - ShaderType stage, const Specialization& specialization) { - return SPIRVDecompiler(device, ir, stage, specialization).Assemble(); + ShaderType stage, const VideoCommon::Shader::Registry& registry, + const Specialization& specialization) { + return SPIRVDecompiler(device, ir, stage, registry, specialization).Assemble(); } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.h 
b/src/video_core/renderer_vulkan/vk_shader_decompiler.h index f5dc14d9e..ffea4709e 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.h +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.h @@ -15,6 +15,7 @@ #include "common/common_types.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/engines/shader_type.h" +#include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" namespace Vulkan { @@ -91,17 +92,9 @@ struct Specialization final { u32 shared_memory_size{}; // Graphics specific - Maxwell::PrimitiveTopology primitive_topology{}; std::optional<float> point_size{}; std::array<Maxwell::VertexAttribute::Type, Maxwell::NumVertexAttributes> attribute_types{}; bool ndc_minus_one_to_one{}; - - // Tessellation specific - struct { - Maxwell::TessellationPrimitive primitive{}; - Maxwell::TessellationSpacing spacing{}; - bool clockwise{}; - } tessellation; }; // Old gcc versions don't consider this trivially copyable. // static_assert(std::is_trivially_copyable_v<Specialization>); @@ -114,6 +107,8 @@ struct SPIRVShader { ShaderEntries GenerateShaderEntries(const VideoCommon::Shader::ShaderIR& ir); std::vector<u32> Decompile(const VKDevice& device, const VideoCommon::Shader::ShaderIR& ir, - Tegra::Engines::ShaderType stage, const Specialization& specialization); + Tegra::Engines::ShaderType stage, + const VideoCommon::Shader::Registry& registry, + const Specialization& specialization); } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp index 171d78afc..374959f82 100644 --- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp +++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp @@ -73,7 +73,8 @@ VKBuffer* VKStagingBufferPool::TryGetReservedBuffer(std::size_t size, bool host_ VKBuffer& VKStagingBufferPool::CreateStagingBuffer(std::size_t size, bool host_visible) { const auto usage = 
vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst | - vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eIndexBuffer; + vk::BufferUsageFlagBits::eUniformBuffer | vk::BufferUsageFlagBits::eStorageBuffer | + vk::BufferUsageFlagBits::eIndexBuffer; const u32 log2 = Common::Log2Ceil64(size); const vk::BufferCreateInfo buffer_ci({}, 1ULL << log2, usage, vk::SharingMode::eExclusive, 0, nullptr); @@ -99,7 +100,6 @@ void VKStagingBufferPool::ReleaseCache(bool host_visible) { } u64 VKStagingBufferPool::ReleaseLevel(StagingBuffersCache& cache, std::size_t log2) { - static constexpr u64 epochs_to_destroy = 180; static constexpr std::size_t deletions_per_tick = 16; auto& staging = cache[log2]; @@ -107,6 +107,7 @@ u64 VKStagingBufferPool::ReleaseLevel(StagingBuffersCache& cache, std::size_t lo const std::size_t old_size = entries.size(); const auto is_deleteable = [this](const auto& entry) { + static constexpr u64 epochs_to_destroy = 180; return entry.last_epoch + epochs_to_destroy < epoch && !entry.watch.IsUsed(); }; const std::size_t begin_offset = staging.delete_index; diff --git a/src/video_core/renderer_vulkan/vk_state_tracker.cpp b/src/video_core/renderer_vulkan/vk_state_tracker.cpp new file mode 100644 index 000000000..94a89e388 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_state_tracker.cpp @@ -0,0 +1,99 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include <algorithm> +#include <cstddef> +#include <iterator> + +#include "common/common_types.h" +#include "core/core.h" +#include "video_core/dirty_flags.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/gpu.h" +#include "video_core/renderer_vulkan/vk_state_tracker.h" + +#define OFF(field_name) MAXWELL3D_REG_INDEX(field_name) +#define NUM(field_name) (sizeof(Maxwell3D::Regs::field_name) / sizeof(u32)) + +namespace Vulkan { + +namespace { + +using namespace Dirty; +using namespace VideoCommon::Dirty; +using Tegra::Engines::Maxwell3D; +using Regs = Maxwell3D::Regs; +using Tables = Maxwell3D::DirtyState::Tables; +using Table = Maxwell3D::DirtyState::Table; +using Flags = Maxwell3D::DirtyState::Flags; + +Flags MakeInvalidationFlags() { + Flags flags{}; + flags[Viewports] = true; + flags[Scissors] = true; + flags[DepthBias] = true; + flags[BlendConstants] = true; + flags[DepthBounds] = true; + flags[StencilProperties] = true; + return flags; +} + +void SetupDirtyViewports(Tables& tables) { + FillBlock(tables[0], OFF(viewport_transform), NUM(viewport_transform), Viewports); + FillBlock(tables[0], OFF(viewports), NUM(viewports), Viewports); + tables[0][OFF(viewport_transform_enabled)] = Viewports; +} + +void SetupDirtyScissors(Tables& tables) { + FillBlock(tables[0], OFF(scissor_test), NUM(scissor_test), Scissors); +} + +void SetupDirtyDepthBias(Tables& tables) { + auto& table = tables[0]; + table[OFF(polygon_offset_units)] = DepthBias; + table[OFF(polygon_offset_clamp)] = DepthBias; + table[OFF(polygon_offset_factor)] = DepthBias; +} + +void SetupDirtyBlendConstants(Tables& tables) { + FillBlock(tables[0], OFF(blend_color), NUM(blend_color), BlendConstants); +} + +void SetupDirtyDepthBounds(Tables& tables) { + FillBlock(tables[0], OFF(depth_bounds), NUM(depth_bounds), DepthBounds); +} + +void SetupDirtyStencilProperties(Tables& tables) { + auto& table = tables[0]; + table[OFF(stencil_two_side_enable)] = StencilProperties; + 
table[OFF(stencil_front_func_ref)] = StencilProperties; + table[OFF(stencil_front_mask)] = StencilProperties; + table[OFF(stencil_front_func_mask)] = StencilProperties; + table[OFF(stencil_back_func_ref)] = StencilProperties; + table[OFF(stencil_back_mask)] = StencilProperties; + table[OFF(stencil_back_func_mask)] = StencilProperties; +} + +} // Anonymous namespace + +StateTracker::StateTracker(Core::System& system) + : system{system}, invalidation_flags{MakeInvalidationFlags()} {} + +void StateTracker::Initialize() { + auto& dirty = system.GPU().Maxwell3D().dirty; + auto& tables = dirty.tables; + SetupDirtyRenderTargets(tables); + SetupDirtyViewports(tables); + SetupDirtyScissors(tables); + SetupDirtyDepthBias(tables); + SetupDirtyBlendConstants(tables); + SetupDirtyDepthBounds(tables); + SetupDirtyStencilProperties(tables); +} + +void StateTracker::InvalidateCommandBufferState() { + system.GPU().Maxwell3D().dirty.flags |= invalidation_flags; +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_state_tracker.h b/src/video_core/renderer_vulkan/vk_state_tracker.h new file mode 100644 index 000000000..03bc415b2 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_state_tracker.h @@ -0,0 +1,79 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#pragma once + +#include <cstddef> +#include <limits> + +#include "common/common_types.h" +#include "core/core.h" +#include "video_core/dirty_flags.h" +#include "video_core/engines/maxwell_3d.h" + +namespace Vulkan { + +namespace Dirty { + +enum : u8 { + First = VideoCommon::Dirty::LastCommonEntry, + + Viewports, + Scissors, + DepthBias, + BlendConstants, + DepthBounds, + StencilProperties, + + Last +}; +static_assert(Last <= std::numeric_limits<u8>::max()); + +} // namespace Dirty + +class StateTracker { +public: + explicit StateTracker(Core::System& system); + + void Initialize(); + + void InvalidateCommandBufferState(); + + bool TouchViewports() { + return Exchange(Dirty::Viewports, false); + } + + bool TouchScissors() { + return Exchange(Dirty::Scissors, false); + } + + bool TouchDepthBias() { + return Exchange(Dirty::DepthBias, false); + } + + bool TouchBlendConstants() { + return Exchange(Dirty::BlendConstants, false); + } + + bool TouchDepthBounds() { + return Exchange(Dirty::DepthBounds, false); + } + + bool TouchStencilProperties() { + return Exchange(Dirty::StencilProperties, false); + } + +private: + bool Exchange(std::size_t id, bool new_value) const noexcept { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + const bool is_dirty = flags[id]; + flags[id] = new_value; + return is_dirty; + } + + Core::System& system; + Tegra::Engines::Maxwell3D::DirtyState::Flags invalidation_flags; +}; + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_swapchain.cpp b/src/video_core/renderer_vulkan/vk_swapchain.cpp index f47b691a8..9e73fa9cd 100644 --- a/src/video_core/renderer_vulkan/vk_swapchain.cpp +++ b/src/video_core/renderer_vulkan/vk_swapchain.cpp @@ -141,11 +141,6 @@ void VKSwapchain::CreateSwapchain(const vk::SurfaceCapabilitiesKHR& capabilities const vk::SurfaceFormatKHR surface_format{ChooseSwapSurfaceFormat(formats, srgb)}; const vk::PresentModeKHR present_mode{ChooseSwapPresentMode(present_modes)}; - extent = 
ChooseSwapExtent(capabilities, width, height); - - current_width = extent.width; - current_height = extent.height; - current_srgb = srgb; u32 requested_image_count{capabilities.minImageCount + 1}; if (capabilities.maxImageCount > 0 && requested_image_count > capabilities.maxImageCount) { @@ -153,10 +148,9 @@ void VKSwapchain::CreateSwapchain(const vk::SurfaceCapabilitiesKHR& capabilities } vk::SwapchainCreateInfoKHR swapchain_ci( - {}, surface, requested_image_count, surface_format.format, surface_format.colorSpace, - extent, 1, vk::ImageUsageFlagBits::eColorAttachment, {}, {}, {}, - capabilities.currentTransform, vk::CompositeAlphaFlagBitsKHR::eOpaque, present_mode, false, - {}); + {}, surface, requested_image_count, surface_format.format, surface_format.colorSpace, {}, 1, + vk::ImageUsageFlagBits::eColorAttachment, {}, {}, {}, capabilities.currentTransform, + vk::CompositeAlphaFlagBitsKHR::eOpaque, present_mode, false, {}); const u32 graphics_family{device.GetGraphicsFamily()}; const u32 present_family{device.GetPresentFamily()}; @@ -169,9 +163,18 @@ void VKSwapchain::CreateSwapchain(const vk::SurfaceCapabilitiesKHR& capabilities swapchain_ci.imageSharingMode = vk::SharingMode::eExclusive; } + // Request the size again to reduce the possibility of a TOCTOU race condition. + const auto updated_capabilities = physical_device.getSurfaceCapabilitiesKHR(surface, dld); + swapchain_ci.imageExtent = ChooseSwapExtent(updated_capabilities, width, height); + // Don't add code within this and the swapchain creation. 
const auto dev{device.GetLogical()}; swapchain = dev.createSwapchainKHRUnique(swapchain_ci, nullptr, dld); + extent = swapchain_ci.imageExtent; + current_width = extent.width; + current_height = extent.height; + current_srgb = srgb; + images = dev.getSwapchainImagesKHR(*swapchain, dld); image_count = static_cast<u32>(images.size()); image_format = surface_format.format; diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 51b0d38a6..26175921b 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -22,6 +22,7 @@ #include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_memory_manager.h" #include "video_core/renderer_vulkan/vk_rasterizer.h" +#include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" #include "video_core/renderer_vulkan/vk_texture_cache.h" #include "video_core/surface.h" @@ -51,6 +52,9 @@ vk::ImageType SurfaceTargetToImage(SurfaceTarget target) { return vk::ImageType::e2D; case SurfaceTarget::Texture3D: return vk::ImageType::e3D; + case SurfaceTarget::TextureBuffer: + UNREACHABLE(); + return {}; } UNREACHABLE_MSG("Unknown texture target={}", static_cast<u32>(target)); return {}; @@ -272,7 +276,6 @@ void CachedSurface::UploadImage(const std::vector<u8>& staging_buffer) { for (u32 level = 0; level < params.num_levels; ++level) { vk::BufferImageCopy copy = GetBufferImageCopy(level); - const auto& dld = device.GetDispatchLoader(); if (image->GetAspectMask() == (vk::ImageAspectFlagBits::eDepth | vk::ImageAspectFlagBits::eStencil)) { vk::BufferImageCopy depth = copy; @@ -421,7 +424,6 @@ void VKTextureCache::ImageCopy(Surface& src_surface, Surface& dst_surface, dst_base_layer, num_layers, copy_params.dest_level, 1, vk::PipelineStageFlagBits::eTransfer, vk::AccessFlagBits::eTransferWrite, vk::ImageLayout::eTransferDstOptimal); - 
const auto& dld{device.GetDispatchLoader()}; const vk::ImageSubresourceLayers src_subresource( src_surface->GetAspectMask(), copy_params.source_level, copy_params.source_z, num_layers); const vk::ImageSubresourceLayers dst_subresource( @@ -457,7 +459,6 @@ void VKTextureCache::ImageBlit(View& src_view, View& dst_view, dst_view->GetImageSubresourceLayers(), {dst_top_left, dst_bot_right}); const bool is_linear = copy_config.filter == Tegra::Engines::Fermi2D::Filter::Linear; - const auto& dld{device.GetDispatchLoader()}; scheduler.Record([src_image = src_view->GetImage(), dst_image = dst_view->GetImage(), blit, is_linear](auto cmdbuf, auto& dld) { cmdbuf.blitImage(src_image, vk::ImageLayout::eTransferSrcOptimal, dst_image, diff --git a/src/video_core/shader/const_buffer_locker.cpp b/src/video_core/shader/const_buffer_locker.cpp deleted file mode 100644 index 0638be8cb..000000000 --- a/src/video_core/shader/const_buffer_locker.cpp +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. 
- -#include <algorithm> -#include <tuple> - -#include "common/common_types.h" -#include "video_core/engines/maxwell_3d.h" -#include "video_core/engines/shader_type.h" -#include "video_core/shader/const_buffer_locker.h" - -namespace VideoCommon::Shader { - -using Tegra::Engines::SamplerDescriptor; - -ConstBufferLocker::ConstBufferLocker(Tegra::Engines::ShaderType shader_stage) - : stage{shader_stage} {} - -ConstBufferLocker::ConstBufferLocker(Tegra::Engines::ShaderType shader_stage, - Tegra::Engines::ConstBufferEngineInterface& engine) - : stage{shader_stage}, engine{&engine} {} - -ConstBufferLocker::~ConstBufferLocker() = default; - -std::optional<u32> ConstBufferLocker::ObtainKey(u32 buffer, u32 offset) { - const std::pair<u32, u32> key = {buffer, offset}; - const auto iter = keys.find(key); - if (iter != keys.end()) { - return iter->second; - } - if (!engine) { - return std::nullopt; - } - const u32 value = engine->AccessConstBuffer32(stage, buffer, offset); - keys.emplace(key, value); - return value; -} - -std::optional<SamplerDescriptor> ConstBufferLocker::ObtainBoundSampler(u32 offset) { - const u32 key = offset; - const auto iter = bound_samplers.find(key); - if (iter != bound_samplers.end()) { - return iter->second; - } - if (!engine) { - return std::nullopt; - } - const SamplerDescriptor value = engine->AccessBoundSampler(stage, offset); - bound_samplers.emplace(key, value); - return value; -} - -std::optional<Tegra::Engines::SamplerDescriptor> ConstBufferLocker::ObtainBindlessSampler( - u32 buffer, u32 offset) { - const std::pair key = {buffer, offset}; - const auto iter = bindless_samplers.find(key); - if (iter != bindless_samplers.end()) { - return iter->second; - } - if (!engine) { - return std::nullopt; - } - const SamplerDescriptor value = engine->AccessBindlessSampler(stage, buffer, offset); - bindless_samplers.emplace(key, value); - return value; -} - -std::optional<u32> ConstBufferLocker::ObtainBoundBuffer() { - if (bound_buffer_saved) { - return 
bound_buffer; - } - if (!engine) { - return std::nullopt; - } - bound_buffer_saved = true; - bound_buffer = engine->GetBoundBuffer(); - return bound_buffer; -} - -void ConstBufferLocker::InsertKey(u32 buffer, u32 offset, u32 value) { - keys.insert_or_assign({buffer, offset}, value); -} - -void ConstBufferLocker::InsertBoundSampler(u32 offset, SamplerDescriptor sampler) { - bound_samplers.insert_or_assign(offset, sampler); -} - -void ConstBufferLocker::InsertBindlessSampler(u32 buffer, u32 offset, SamplerDescriptor sampler) { - bindless_samplers.insert_or_assign({buffer, offset}, sampler); -} - -void ConstBufferLocker::SetBoundBuffer(u32 buffer) { - bound_buffer_saved = true; - bound_buffer = buffer; -} - -bool ConstBufferLocker::IsConsistent() const { - if (!engine) { - return false; - } - return std::all_of(keys.begin(), keys.end(), - [this](const auto& pair) { - const auto [cbuf, offset] = pair.first; - const auto value = pair.second; - return value == engine->AccessConstBuffer32(stage, cbuf, offset); - }) && - std::all_of(bound_samplers.begin(), bound_samplers.end(), - [this](const auto& sampler) { - const auto [key, value] = sampler; - return value == engine->AccessBoundSampler(stage, key); - }) && - std::all_of(bindless_samplers.begin(), bindless_samplers.end(), - [this](const auto& sampler) { - const auto [cbuf, offset] = sampler.first; - const auto value = sampler.second; - return value == engine->AccessBindlessSampler(stage, cbuf, offset); - }); -} - -bool ConstBufferLocker::HasEqualKeys(const ConstBufferLocker& rhs) const { - return std::tie(keys, bound_samplers, bindless_samplers) == - std::tie(rhs.keys, rhs.bound_samplers, rhs.bindless_samplers); -} - -} // namespace VideoCommon::Shader diff --git a/src/video_core/shader/const_buffer_locker.h b/src/video_core/shader/const_buffer_locker.h deleted file mode 100644 index d3ea11087..000000000 --- a/src/video_core/shader/const_buffer_locker.h +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright 2019 yuzu Emulator 
Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <optional> -#include <unordered_map> -#include "common/common_types.h" -#include "common/hash.h" -#include "video_core/engines/const_buffer_engine_interface.h" -#include "video_core/engines/shader_type.h" -#include "video_core/guest_driver.h" - -namespace VideoCommon::Shader { - -using KeyMap = std::unordered_map<std::pair<u32, u32>, u32, Common::PairHash>; -using BoundSamplerMap = std::unordered_map<u32, Tegra::Engines::SamplerDescriptor>; -using BindlessSamplerMap = - std::unordered_map<std::pair<u32, u32>, Tegra::Engines::SamplerDescriptor, Common::PairHash>; - -/** - * The ConstBufferLocker is a class use to interface the 3D and compute engines with the shader - * compiler. with it, the shader can obtain required data from GPU state and store it for disk - * shader compilation. - */ -class ConstBufferLocker { -public: - explicit ConstBufferLocker(Tegra::Engines::ShaderType shader_stage); - - explicit ConstBufferLocker(Tegra::Engines::ShaderType shader_stage, - Tegra::Engines::ConstBufferEngineInterface& engine); - - ~ConstBufferLocker(); - - /// Retrieves a key from the locker, if it's registered, it will give the registered value, if - /// not it will obtain it from maxwell3d and register it. - std::optional<u32> ObtainKey(u32 buffer, u32 offset); - - std::optional<Tegra::Engines::SamplerDescriptor> ObtainBoundSampler(u32 offset); - - std::optional<Tegra::Engines::SamplerDescriptor> ObtainBindlessSampler(u32 buffer, u32 offset); - - std::optional<u32> ObtainBoundBuffer(); - - /// Inserts a key. - void InsertKey(u32 buffer, u32 offset, u32 value); - - /// Inserts a bound sampler key. - void InsertBoundSampler(u32 offset, Tegra::Engines::SamplerDescriptor sampler); - - /// Inserts a bindless sampler key. 
- void InsertBindlessSampler(u32 buffer, u32 offset, Tegra::Engines::SamplerDescriptor sampler); - - /// Set the bound buffer for this locker. - void SetBoundBuffer(u32 buffer); - - /// Checks keys and samplers against engine's current const buffers. Returns true if they are - /// the same value, false otherwise; - bool IsConsistent() const; - - /// Returns true if the keys are equal to the other ones in the locker. - bool HasEqualKeys(const ConstBufferLocker& rhs) const; - - /// Gives an getter to the const buffer keys in the database. - const KeyMap& GetKeys() const { - return keys; - } - - /// Gets samplers database. - const BoundSamplerMap& GetBoundSamplers() const { - return bound_samplers; - } - - /// Gets bindless samplers database. - const BindlessSamplerMap& GetBindlessSamplers() const { - return bindless_samplers; - } - - /// Gets bound buffer used on this shader - u32 GetBoundBuffer() const { - return bound_buffer; - } - - /// Obtains access to the guest driver's profile. - VideoCore::GuestDriverProfile* AccessGuestDriverProfile() const { - if (engine) { - return &engine->AccessGuestDriverProfile(); - } - return nullptr; - } - -private: - const Tegra::Engines::ShaderType stage; - Tegra::Engines::ConstBufferEngineInterface* engine = nullptr; - KeyMap keys; - BoundSamplerMap bound_samplers; - BindlessSamplerMap bindless_samplers; - bool bound_buffer_saved{}; - u32 bound_buffer{}; -}; - -} // namespace VideoCommon::Shader diff --git a/src/video_core/shader/control_flow.cpp b/src/video_core/shader/control_flow.cpp index 0229733b6..2e2711350 100644 --- a/src/video_core/shader/control_flow.cpp +++ b/src/video_core/shader/control_flow.cpp @@ -13,6 +13,7 @@ #include "common/common_types.h" #include "video_core/shader/ast.h" #include "video_core/shader/control_flow.h" +#include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" namespace VideoCommon::Shader { @@ -64,11 +65,11 @@ struct BlockInfo { }; struct CFGRebuildState { - explicit 
CFGRebuildState(const ProgramCode& program_code, u32 start, ConstBufferLocker& locker) - : program_code{program_code}, locker{locker}, start{start} {} + explicit CFGRebuildState(const ProgramCode& program_code, u32 start, Registry& registry) + : program_code{program_code}, registry{registry}, start{start} {} const ProgramCode& program_code; - ConstBufferLocker& locker; + Registry& registry; u32 start{}; std::vector<BlockInfo> block_info; std::list<u32> inspect_queries; @@ -438,7 +439,7 @@ std::pair<ParseResult, ParseInfo> ParseCode(CFGRebuildState& state, u32 address) const s32 pc_target = offset + result.relative_position; std::vector<CaseBranch> branches; for (u32 i = 0; i < result.entries; i++) { - auto key = state.locker.ObtainKey(result.buffer, result.offset + i * 4); + auto key = state.registry.ObtainKey(result.buffer, result.offset + i * 4); if (!key) { return {ParseResult::AbnormalFlow, parse_info}; } @@ -656,14 +657,14 @@ void DecompileShader(CFGRebuildState& state) { std::unique_ptr<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u32 start_address, const CompilerSettings& settings, - ConstBufferLocker& locker) { + Registry& registry) { auto result_out = std::make_unique<ShaderCharacteristics>(); if (settings.depth == CompileDepth::BruteForce) { result_out->settings.depth = CompileDepth::BruteForce; return result_out; } - CFGRebuildState state{program_code, start_address, locker}; + CFGRebuildState state{program_code, start_address, registry}; // Inspect Code and generate blocks state.labels.clear(); state.labels.emplace(start_address); diff --git a/src/video_core/shader/control_flow.h b/src/video_core/shader/control_flow.h index 5304998b9..62a3510d8 100644 --- a/src/video_core/shader/control_flow.h +++ b/src/video_core/shader/control_flow.h @@ -12,6 +12,7 @@ #include "video_core/engines/shader_bytecode.h" #include "video_core/shader/ast.h" #include "video_core/shader/compiler_settings.h" +#include "video_core/shader/registry.h" #include 
"video_core/shader/shader_ir.h" namespace VideoCommon::Shader { @@ -111,6 +112,6 @@ struct ShaderCharacteristics { std::unique_ptr<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u32 start_address, const CompilerSettings& settings, - ConstBufferLocker& locker); + Registry& registry); } // namespace VideoCommon::Shader diff --git a/src/video_core/shader/decode.cpp b/src/video_core/shader/decode.cpp index 6b697ed5d..87ac9ac6c 100644 --- a/src/video_core/shader/decode.cpp +++ b/src/video_core/shader/decode.cpp @@ -34,13 +34,9 @@ constexpr bool IsSchedInstruction(u32 offset, u32 main_offset) { return (absolute_offset % SchedPeriod) == 0; } -void DeduceTextureHandlerSize(VideoCore::GuestDriverProfile* gpu_driver, +void DeduceTextureHandlerSize(VideoCore::GuestDriverProfile& gpu_driver, const std::list<Sampler>& used_samplers) { - if (gpu_driver == nullptr) { - LOG_CRITICAL(HW_GPU, "GPU driver profile has not been created yet"); - return; - } - if (gpu_driver->TextureHandlerSizeKnown() || used_samplers.size() <= 1) { + if (gpu_driver.IsTextureHandlerSizeKnown() || used_samplers.size() <= 1) { return; } u32 count{}; @@ -53,17 +49,13 @@ void DeduceTextureHandlerSize(VideoCore::GuestDriverProfile* gpu_driver, bound_offsets.emplace_back(sampler.GetOffset()); } if (count > 1) { - gpu_driver->DeduceTextureHandlerSize(std::move(bound_offsets)); + gpu_driver.DeduceTextureHandlerSize(std::move(bound_offsets)); } } std::optional<u32> TryDeduceSamplerSize(const Sampler& sampler_to_deduce, - VideoCore::GuestDriverProfile* gpu_driver, + VideoCore::GuestDriverProfile& gpu_driver, const std::list<Sampler>& used_samplers) { - if (gpu_driver == nullptr) { - LOG_CRITICAL(HW_GPU, "GPU Driver profile has not been created yet"); - return std::nullopt; - } const u32 base_offset = sampler_to_deduce.GetOffset(); u32 max_offset{std::numeric_limits<u32>::max()}; for (const auto& sampler : used_samplers) { @@ -77,7 +69,7 @@ std::optional<u32> TryDeduceSamplerSize(const Sampler& 
sampler_to_deduce, if (max_offset == std::numeric_limits<u32>::max()) { return std::nullopt; } - return ((max_offset - base_offset) * 4) / gpu_driver->GetTextureHandlerSize(); + return ((max_offset - base_offset) * 4) / gpu_driver.GetTextureHandlerSize(); } } // Anonymous namespace @@ -149,7 +141,7 @@ void ShaderIR::Decode() { std::memcpy(&header, program_code.data(), sizeof(Tegra::Shader::Header)); decompiled = false; - auto info = ScanFlow(program_code, main_offset, settings, locker); + auto info = ScanFlow(program_code, main_offset, settings, registry); auto& shader_info = *info; coverage_begin = shader_info.start; coverage_end = shader_info.end; @@ -364,7 +356,7 @@ u32 ShaderIR::DecodeInstr(NodeBlock& bb, u32 pc) { void ShaderIR::PostDecode() { // Deduce texture handler size if needed - auto gpu_driver = locker.AccessGuestDriverProfile(); + auto gpu_driver = registry.AccessGuestDriverProfile(); DeduceTextureHandlerSize(gpu_driver, used_samplers); // Deduce Indexed Samplers if (!uses_indexed_samplers) { diff --git a/src/video_core/shader/decode/bfe.cpp b/src/video_core/shader/decode/bfe.cpp index e02bcd097..8e3b46e8e 100644 --- a/src/video_core/shader/decode/bfe.cpp +++ b/src/video_core/shader/decode/bfe.cpp @@ -17,33 +17,60 @@ u32 ShaderIR::DecodeBfe(NodeBlock& bb, u32 pc) { const Instruction instr = {program_code[pc]}; const auto opcode = OpCode::Decode(instr); - UNIMPLEMENTED_IF(instr.bfe.negate_b); - Node op_a = GetRegister(instr.gpr8); - op_a = GetOperandAbsNegInteger(op_a, false, instr.bfe.negate_a, false); - - switch (opcode->get().GetId()) { - case OpCode::Id::BFE_IMM: { - UNIMPLEMENTED_IF_MSG(instr.generates_cc, - "Condition codes generation in BFE is not implemented"); + Node op_b = [&] { + switch (opcode->get().GetId()) { + case OpCode::Id::BFE_R: + return GetRegister(instr.gpr20); + case OpCode::Id::BFE_C: + return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset()); + case OpCode::Id::BFE_IMM: + return 
Immediate(instr.alu.GetSignedImm20_20()); + default: + UNREACHABLE(); + return Immediate(0); + } + }(); - const Node inner_shift_imm = Immediate(static_cast<u32>(instr.bfe.GetLeftShiftValue())); - const Node outer_shift_imm = - Immediate(static_cast<u32>(instr.bfe.GetLeftShiftValue() + instr.bfe.shift_position)); + UNIMPLEMENTED_IF_MSG(instr.bfe.rd_cc, "Condition codes in BFE is not implemented"); - const Node inner_shift = - Operation(OperationCode::ILogicalShiftLeft, NO_PRECISE, op_a, inner_shift_imm); - const Node outer_shift = - Operation(OperationCode::ILogicalShiftRight, NO_PRECISE, inner_shift, outer_shift_imm); + const bool is_signed = instr.bfe.is_signed; - SetInternalFlagsFromInteger(bb, outer_shift, instr.generates_cc); - SetRegister(bb, instr.gpr0, outer_shift); - break; - } - default: - UNIMPLEMENTED_MSG("Unhandled BFE instruction: {}", opcode->get().GetName()); + // using reverse parallel method in + // https://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel + // note for later if possible to implement faster method. 
+ if (instr.bfe.brev) { + const auto swap = [&](u32 s, u32 mask) { + Node v1 = + SignedOperation(OperationCode::ILogicalShiftRight, is_signed, op_a, Immediate(s)); + if (mask != 0) { + v1 = SignedOperation(OperationCode::IBitwiseAnd, is_signed, std::move(v1), + Immediate(mask)); + } + Node v2 = op_a; + if (mask != 0) { + v2 = SignedOperation(OperationCode::IBitwiseAnd, is_signed, std::move(v2), + Immediate(mask)); + } + v2 = SignedOperation(OperationCode::ILogicalShiftLeft, is_signed, std::move(v2), + Immediate(s)); + return SignedOperation(OperationCode::IBitwiseOr, is_signed, std::move(v1), + std::move(v2)); + }; + op_a = swap(1, 0x55555555U); + op_a = swap(2, 0x33333333U); + op_a = swap(4, 0x0F0F0F0FU); + op_a = swap(8, 0x00FF00FFU); + op_a = swap(16, 0); } + const auto offset = SignedOperation(OperationCode::IBitfieldExtract, is_signed, op_b, + Immediate(0), Immediate(8)); + const auto bits = SignedOperation(OperationCode::IBitfieldExtract, is_signed, op_b, + Immediate(8), Immediate(8)); + auto result = SignedOperation(OperationCode::IBitfieldExtract, is_signed, op_a, offset, bits); + SetRegister(bb, instr.gpr0, std::move(result)); + return pc; } diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp index bee7d8cad..48350e042 100644 --- a/src/video_core/shader/decode/texture.cpp +++ b/src/video_core/shader/decode/texture.cpp @@ -12,6 +12,7 @@ #include "common/logging/log.h" #include "video_core/engines/shader_bytecode.h" #include "video_core/shader/node_helper.h" +#include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" namespace VideoCommon::Shader { @@ -359,8 +360,8 @@ ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo(std::optional<SamplerInfo> sample if (sampler_info) { return *sampler_info; } - const auto sampler = - buffer ? locker.ObtainBindlessSampler(*buffer, offset) : locker.ObtainBoundSampler(offset); + const auto sampler = buffer ? 
registry.ObtainBindlessSampler(*buffer, offset) + : registry.ObtainBoundSampler(offset); if (!sampler) { LOG_WARNING(HW_GPU, "Unknown sampler info"); return SamplerInfo{TextureType::Texture2D, false, false, false}; diff --git a/src/video_core/shader/decode/xmad.cpp b/src/video_core/shader/decode/xmad.cpp index 206961909..fbd7e9a17 100644 --- a/src/video_core/shader/decode/xmad.cpp +++ b/src/video_core/shader/decode/xmad.cpp @@ -12,6 +12,7 @@ namespace VideoCommon::Shader { using Tegra::Shader::Instruction; using Tegra::Shader::OpCode; +using Tegra::Shader::PredCondition; u32 ShaderIR::DecodeXmad(NodeBlock& bb, u32 pc) { const Instruction instr = {program_code[pc]}; @@ -63,15 +64,18 @@ u32 ShaderIR::DecodeXmad(NodeBlock& bb, u32 pc) { } }(); - op_a = BitfieldExtract(op_a, instr.xmad.high_a ? 16 : 0, 16); + op_a = SignedOperation(OperationCode::IBitfieldExtract, is_signed_a, std::move(op_a), + instr.xmad.high_a ? Immediate(16) : Immediate(0), Immediate(16)); const Node original_b = op_b; - op_b = BitfieldExtract(op_b, is_high_b ? 16 : 0, 16); + op_b = SignedOperation(OperationCode::IBitfieldExtract, is_signed_b, std::move(op_b), + is_high_b ? Immediate(16) : Immediate(0), Immediate(16)); - // TODO(Rodrigo): Use an appropiate sign for this operation - Node product = Operation(OperationCode::IMul, NO_PRECISE, op_a, op_b); + // we already check sign_a and sign_b is difference or not before so just use one in here. 
+ Node product = SignedOperation(OperationCode::IMul, is_signed_a, op_a, op_b); if (is_psl) { - product = Operation(OperationCode::ILogicalShiftLeft, NO_PRECISE, product, Immediate(16)); + product = + SignedOperation(OperationCode::ILogicalShiftLeft, is_signed_a, product, Immediate(16)); } SetTemporary(bb, 0, product); product = GetTemporary(0); @@ -88,12 +92,40 @@ u32 ShaderIR::DecodeXmad(NodeBlock& bb, u32 pc) { return BitfieldExtract(original_c, 16, 16); case Tegra::Shader::XmadMode::CBcc: { const Node shifted_b = SignedOperation(OperationCode::ILogicalShiftLeft, is_signed_b, - NO_PRECISE, original_b, Immediate(16)); - return SignedOperation(OperationCode::IAdd, is_signed_c, NO_PRECISE, original_c, - shifted_b); + original_b, Immediate(16)); + return SignedOperation(OperationCode::IAdd, is_signed_c, original_c, shifted_b); + } + case Tegra::Shader::XmadMode::CSfu: { + const Node comp_a = GetPredicateComparisonInteger(PredCondition::Equal, is_signed_a, + op_a, Immediate(0)); + const Node comp_b = GetPredicateComparisonInteger(PredCondition::Equal, is_signed_b, + op_b, Immediate(0)); + const Node comp = Operation(OperationCode::LogicalOr, comp_a, comp_b); + + const Node comp_minus_a = GetPredicateComparisonInteger( + PredCondition::NotEqual, is_signed_a, + SignedOperation(OperationCode::IBitwiseAnd, is_signed_a, op_a, + Immediate(0x80000000)), + Immediate(0)); + const Node comp_minus_b = GetPredicateComparisonInteger( + PredCondition::NotEqual, is_signed_b, + SignedOperation(OperationCode::IBitwiseAnd, is_signed_b, op_b, + Immediate(0x80000000)), + Immediate(0)); + + Node new_c = Operation( + OperationCode::Select, comp_minus_a, + SignedOperation(OperationCode::IAdd, is_signed_c, original_c, Immediate(-65536)), + original_c); + new_c = Operation( + OperationCode::Select, comp_minus_b, + SignedOperation(OperationCode::IAdd, is_signed_c, new_c, Immediate(-65536)), + std::move(new_c)); + + return Operation(OperationCode::Select, comp, original_c, std::move(new_c)); } 
default: - UNIMPLEMENTED_MSG("Unhandled XMAD mode: {}", static_cast<u32>(instr.xmad.mode.Value())); + UNREACHABLE(); return Immediate(0); } }(); @@ -102,18 +134,19 @@ u32 ShaderIR::DecodeXmad(NodeBlock& bb, u32 pc) { op_c = GetTemporary(1); // TODO(Rodrigo): Use an appropiate sign for this operation - Node sum = Operation(OperationCode::IAdd, product, op_c); + Node sum = SignedOperation(OperationCode::IAdd, is_signed_a, product, std::move(op_c)); SetTemporary(bb, 2, sum); sum = GetTemporary(2); if (is_merge) { - const Node a = BitfieldExtract(sum, 0, 16); - const Node b = - Operation(OperationCode::ILogicalShiftLeft, NO_PRECISE, original_b, Immediate(16)); - sum = Operation(OperationCode::IBitwiseOr, NO_PRECISE, a, b); + const Node a = SignedOperation(OperationCode::IBitfieldExtract, is_signed_a, std::move(sum), + Immediate(0), Immediate(16)); + const Node b = SignedOperation(OperationCode::ILogicalShiftLeft, is_signed_b, original_b, + Immediate(16)); + sum = SignedOperation(OperationCode::IBitwiseOr, is_signed_a, a, b); } SetInternalFlagsFromInteger(bb, sum, instr.generates_cc); - SetRegister(bb, instr.gpr0, sum); + SetRegister(bb, instr.gpr0, std::move(sum)); return pc; } diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h index a0a7b9111..a1828546e 100644 --- a/src/video_core/shader/node.h +++ b/src/video_core/shader/node.h @@ -299,7 +299,7 @@ private: u32 index{}; ///< Emulated index given for the this sampler. u32 offset{}; ///< Offset in the const buffer from where the sampler is being read. u32 buffer{}; ///< Buffer where the bindless sampler is being read (unused on bound samplers). - u32 size{}; ///< Size of the sampler if indexed. + u32 size{1}; ///< Size of the sampler. Tegra::Shader::TextureType type{}; ///< The type used to sample this texture (Texture2D, etc) bool is_array{}; ///< Whether the texture is being sampled as an array texture or not. 
diff --git a/src/video_core/shader/node_helper.cpp b/src/video_core/shader/node_helper.cpp index b3dcd291c..76c56abb5 100644 --- a/src/video_core/shader/node_helper.cpp +++ b/src/video_core/shader/node_helper.cpp @@ -68,6 +68,8 @@ OperationCode SignedToUnsignedCode(OperationCode operation_code, bool is_signed) return OperationCode::UBitwiseXor; case OperationCode::IBitwiseNot: return OperationCode::UBitwiseNot; + case OperationCode::IBitfieldExtract: + return OperationCode::UBitfieldExtract; case OperationCode::IBitfieldInsert: return OperationCode::UBitfieldInsert; case OperationCode::IBitCount: diff --git a/src/video_core/shader/registry.cpp b/src/video_core/shader/registry.cpp new file mode 100644 index 000000000..af70b3f35 --- /dev/null +++ b/src/video_core/shader/registry.cpp @@ -0,0 +1,161 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <algorithm> +#include <tuple> + +#include "common/assert.h" +#include "common/common_types.h" +#include "video_core/engines/kepler_compute.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/engines/shader_type.h" +#include "video_core/shader/registry.h" + +namespace VideoCommon::Shader { + +using Tegra::Engines::ConstBufferEngineInterface; +using Tegra::Engines::SamplerDescriptor; +using Tegra::Engines::ShaderType; + +namespace { + +GraphicsInfo MakeGraphicsInfo(ShaderType shader_stage, ConstBufferEngineInterface& engine) { + if (shader_stage == ShaderType::Compute) { + return {}; + } + auto& graphics = static_cast<Tegra::Engines::Maxwell3D&>(engine); + + GraphicsInfo info; + info.tfb_layouts = graphics.regs.tfb_layouts; + info.tfb_varying_locs = graphics.regs.tfb_varying_locs; + info.primitive_topology = graphics.regs.draw.topology; + info.tessellation_primitive = graphics.regs.tess_mode.prim; + info.tessellation_spacing = graphics.regs.tess_mode.spacing; + info.tfb_enabled = graphics.regs.tfb_enabled; + 
info.tessellation_clockwise = graphics.regs.tess_mode.cw; + return info; +} + +ComputeInfo MakeComputeInfo(ShaderType shader_stage, ConstBufferEngineInterface& engine) { + if (shader_stage != ShaderType::Compute) { + return {}; + } + auto& compute = static_cast<Tegra::Engines::KeplerCompute&>(engine); + const auto& launch = compute.launch_description; + + ComputeInfo info; + info.workgroup_size = {launch.block_dim_x, launch.block_dim_y, launch.block_dim_z}; + info.local_memory_size_in_words = launch.local_pos_alloc; + info.shared_memory_size_in_words = launch.shared_alloc; + return info; +} + +} // Anonymous namespace + +Registry::Registry(Tegra::Engines::ShaderType shader_stage, const SerializedRegistryInfo& info) + : stage{shader_stage}, stored_guest_driver_profile{info.guest_driver_profile}, + bound_buffer{info.bound_buffer}, graphics_info{info.graphics}, compute_info{info.compute} {} + +Registry::Registry(Tegra::Engines::ShaderType shader_stage, + Tegra::Engines::ConstBufferEngineInterface& engine) + : stage{shader_stage}, engine{&engine}, bound_buffer{engine.GetBoundBuffer()}, + graphics_info{MakeGraphicsInfo(shader_stage, engine)}, compute_info{MakeComputeInfo( + shader_stage, engine)} {} + +Registry::~Registry() = default; + +std::optional<u32> Registry::ObtainKey(u32 buffer, u32 offset) { + const std::pair<u32, u32> key = {buffer, offset}; + const auto iter = keys.find(key); + if (iter != keys.end()) { + return iter->second; + } + if (!engine) { + return std::nullopt; + } + const u32 value = engine->AccessConstBuffer32(stage, buffer, offset); + keys.emplace(key, value); + return value; +} + +std::optional<SamplerDescriptor> Registry::ObtainBoundSampler(u32 offset) { + const u32 key = offset; + const auto iter = bound_samplers.find(key); + if (iter != bound_samplers.end()) { + return iter->second; + } + if (!engine) { + return std::nullopt; + } + const SamplerDescriptor value = engine->AccessBoundSampler(stage, offset); + bound_samplers.emplace(key, value); 
+ return value; +} + +std::optional<Tegra::Engines::SamplerDescriptor> Registry::ObtainBindlessSampler(u32 buffer, + u32 offset) { + const std::pair key = {buffer, offset}; + const auto iter = bindless_samplers.find(key); + if (iter != bindless_samplers.end()) { + return iter->second; + } + if (!engine) { + return std::nullopt; + } + const SamplerDescriptor value = engine->AccessBindlessSampler(stage, buffer, offset); + bindless_samplers.emplace(key, value); + return value; +} + +void Registry::InsertKey(u32 buffer, u32 offset, u32 value) { + keys.insert_or_assign({buffer, offset}, value); +} + +void Registry::InsertBoundSampler(u32 offset, SamplerDescriptor sampler) { + bound_samplers.insert_or_assign(offset, sampler); +} + +void Registry::InsertBindlessSampler(u32 buffer, u32 offset, SamplerDescriptor sampler) { + bindless_samplers.insert_or_assign({buffer, offset}, sampler); +} + +bool Registry::IsConsistent() const { + if (!engine) { + return true; + } + return std::all_of(keys.begin(), keys.end(), + [this](const auto& pair) { + const auto [cbuf, offset] = pair.first; + const auto value = pair.second; + return value == engine->AccessConstBuffer32(stage, cbuf, offset); + }) && + std::all_of(bound_samplers.begin(), bound_samplers.end(), + [this](const auto& sampler) { + const auto [key, value] = sampler; + return value == engine->AccessBoundSampler(stage, key); + }) && + std::all_of(bindless_samplers.begin(), bindless_samplers.end(), + [this](const auto& sampler) { + const auto [cbuf, offset] = sampler.first; + const auto value = sampler.second; + return value == engine->AccessBindlessSampler(stage, cbuf, offset); + }); +} + +bool Registry::HasEqualKeys(const Registry& rhs) const { + return std::tie(keys, bound_samplers, bindless_samplers) == + std::tie(rhs.keys, rhs.bound_samplers, rhs.bindless_samplers); +} + +const GraphicsInfo& Registry::GetGraphicsInfo() const { + ASSERT(stage != Tegra::Engines::ShaderType::Compute); + return graphics_info; +} + +const 
ComputeInfo& Registry::GetComputeInfo() const { + ASSERT(stage == Tegra::Engines::ShaderType::Compute); + return compute_info; +} + +} // namespace VideoCommon::Shader diff --git a/src/video_core/shader/registry.h b/src/video_core/shader/registry.h new file mode 100644 index 000000000..0c80d35fd --- /dev/null +++ b/src/video_core/shader/registry.h @@ -0,0 +1,137 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <array> +#include <optional> +#include <type_traits> +#include <unordered_map> +#include <utility> + +#include "common/common_types.h" +#include "common/hash.h" +#include "video_core/engines/const_buffer_engine_interface.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/engines/shader_type.h" +#include "video_core/guest_driver.h" + +namespace VideoCommon::Shader { + +using KeyMap = std::unordered_map<std::pair<u32, u32>, u32, Common::PairHash>; +using BoundSamplerMap = std::unordered_map<u32, Tegra::Engines::SamplerDescriptor>; +using BindlessSamplerMap = + std::unordered_map<std::pair<u32, u32>, Tegra::Engines::SamplerDescriptor, Common::PairHash>; + +struct GraphicsInfo { + using Maxwell = Tegra::Engines::Maxwell3D::Regs; + + std::array<Maxwell::TransformFeedbackLayout, Maxwell::NumTransformFeedbackBuffers> + tfb_layouts{}; + std::array<std::array<u8, 128>, Maxwell::NumTransformFeedbackBuffers> tfb_varying_locs{}; + Maxwell::PrimitiveTopology primitive_topology{}; + Maxwell::TessellationPrimitive tessellation_primitive{}; + Maxwell::TessellationSpacing tessellation_spacing{}; + bool tfb_enabled = false; + bool tessellation_clockwise = false; +}; +static_assert(std::is_trivially_copyable_v<GraphicsInfo> && + std::is_standard_layout_v<GraphicsInfo>); + +struct ComputeInfo { + std::array<u32, 3> workgroup_size{}; + u32 shared_memory_size_in_words = 0; + u32 local_memory_size_in_words = 0; +}; 
+static_assert(std::is_trivially_copyable_v<ComputeInfo> && std::is_standard_layout_v<ComputeInfo>); + +struct SerializedRegistryInfo { + VideoCore::GuestDriverProfile guest_driver_profile; + u32 bound_buffer = 0; + GraphicsInfo graphics; + ComputeInfo compute; +}; + +/** + * The Registry is a class use to interface the 3D and compute engines with the shader compiler. + * With it, the shader can obtain required data from GPU state and store it for disk shader + * compilation. + */ +class Registry { +public: + explicit Registry(Tegra::Engines::ShaderType shader_stage, const SerializedRegistryInfo& info); + + explicit Registry(Tegra::Engines::ShaderType shader_stage, + Tegra::Engines::ConstBufferEngineInterface& engine); + + ~Registry(); + + /// Retrieves a key from the registry, if it's registered, it will give the registered value, if + /// not it will obtain it from maxwell3d and register it. + std::optional<u32> ObtainKey(u32 buffer, u32 offset); + + std::optional<Tegra::Engines::SamplerDescriptor> ObtainBoundSampler(u32 offset); + + std::optional<Tegra::Engines::SamplerDescriptor> ObtainBindlessSampler(u32 buffer, u32 offset); + + /// Inserts a key. + void InsertKey(u32 buffer, u32 offset, u32 value); + + /// Inserts a bound sampler key. + void InsertBoundSampler(u32 offset, Tegra::Engines::SamplerDescriptor sampler); + + /// Inserts a bindless sampler key. + void InsertBindlessSampler(u32 buffer, u32 offset, Tegra::Engines::SamplerDescriptor sampler); + + /// Checks keys and samplers against engine's current const buffers. + /// Returns true if they are the same value, false otherwise. + bool IsConsistent() const; + + /// Returns true if the keys are equal to the other ones in the registry. 
+ bool HasEqualKeys(const Registry& rhs) const; + + /// Returns graphics information from this shader + const GraphicsInfo& GetGraphicsInfo() const; + + /// Returns compute information from this shader + const ComputeInfo& GetComputeInfo() const; + + /// Gives an getter to the const buffer keys in the database. + const KeyMap& GetKeys() const { + return keys; + } + + /// Gets samplers database. + const BoundSamplerMap& GetBoundSamplers() const { + return bound_samplers; + } + + /// Gets bindless samplers database. + const BindlessSamplerMap& GetBindlessSamplers() const { + return bindless_samplers; + } + + /// Gets bound buffer used on this shader + u32 GetBoundBuffer() const { + return bound_buffer; + } + + /// Obtains access to the guest driver's profile. + VideoCore::GuestDriverProfile& AccessGuestDriverProfile() { + return engine ? engine->AccessGuestDriverProfile() : stored_guest_driver_profile; + } + +private: + const Tegra::Engines::ShaderType stage; + VideoCore::GuestDriverProfile stored_guest_driver_profile; + Tegra::Engines::ConstBufferEngineInterface* engine = nullptr; + KeyMap keys; + BoundSamplerMap bound_samplers; + BindlessSamplerMap bindless_samplers; + u32 bound_buffer; + GraphicsInfo graphics_info; + ComputeInfo compute_info; +}; + +} // namespace VideoCommon::Shader diff --git a/src/video_core/shader/shader_ir.cpp b/src/video_core/shader/shader_ir.cpp index 3a5d280a9..425927777 100644 --- a/src/video_core/shader/shader_ir.cpp +++ b/src/video_core/shader/shader_ir.cpp @@ -11,6 +11,7 @@ #include "common/logging/log.h" #include "video_core/engines/shader_bytecode.h" #include "video_core/shader/node_helper.h" +#include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" namespace VideoCommon::Shader { @@ -24,8 +25,8 @@ using Tegra::Shader::PredOperation; using Tegra::Shader::Register; ShaderIR::ShaderIR(const ProgramCode& program_code, u32 main_offset, CompilerSettings settings, - ConstBufferLocker& locker) - : 
program_code{program_code}, main_offset{main_offset}, settings{settings}, locker{locker} { + Registry& registry) + : program_code{program_code}, main_offset{main_offset}, settings{settings}, registry{registry} { Decode(); PostDecode(); } diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h index b0851c3be..dde036b40 100644 --- a/src/video_core/shader/shader_ir.h +++ b/src/video_core/shader/shader_ir.h @@ -18,8 +18,8 @@ #include "video_core/engines/shader_header.h" #include "video_core/shader/ast.h" #include "video_core/shader/compiler_settings.h" -#include "video_core/shader/const_buffer_locker.h" #include "video_core/shader/node.h" +#include "video_core/shader/registry.h" namespace VideoCommon::Shader { @@ -69,7 +69,7 @@ struct GlobalMemoryUsage { class ShaderIR final { public: explicit ShaderIR(const ProgramCode& program_code, u32 main_offset, CompilerSettings settings, - ConstBufferLocker& locker); + Registry& registry); ~ShaderIR(); const std::map<u32, NodeBlock>& GetBasicBlocks() const { @@ -414,7 +414,7 @@ private: const ProgramCode& program_code; const u32 main_offset; const CompilerSettings settings; - ConstBufferLocker& locker; + Registry& registry; bool decompiled{}; bool disable_flow_stack{}; diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp index face8c943..10739b37d 100644 --- a/src/video_core/shader/track.cpp +++ b/src/video_core/shader/track.cpp @@ -81,26 +81,20 @@ std::tuple<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, cons MakeTrackSampler<BindlessSamplerNode>(cbuf->GetIndex(), immediate->GetValue()); return {tracked, track}; } else if (const auto operation = std::get_if<OperationNode>(&*offset)) { - auto bound_buffer = locker.ObtainBoundBuffer(); - if (!bound_buffer) { + const u32 bound_buffer = registry.GetBoundBuffer(); + if (bound_buffer != cbuf->GetIndex()) { return {}; } - if (*bound_buffer != cbuf->GetIndex()) { - return {}; - } - auto pair = 
DecoupleIndirectRead(*operation); + const auto pair = DecoupleIndirectRead(*operation); if (!pair) { return {}; } auto [gpr, base_offset] = *pair; const auto offset_inm = std::get_if<ImmediateNode>(&*base_offset); - auto gpu_driver = locker.AccessGuestDriverProfile(); - if (gpu_driver == nullptr) { - return {}; - } + const auto& gpu_driver = registry.AccessGuestDriverProfile(); const u32 bindless_cv = NewCustomVariable(); - const Node op = Operation(OperationCode::UDiv, NO_PRECISE, gpr, - Immediate(gpu_driver->GetTextureHandlerSize())); + const Node op = + Operation(OperationCode::UDiv, gpr, Immediate(gpu_driver.GetTextureHandlerSize())); const Node cv_node = GetCustomVariable(bindless_cv); Node amend_op = Operation(OperationCode::Assign, cv_node, std::move(op)); @@ -157,13 +151,21 @@ std::tuple<Node, u32, u32> ShaderIR::TrackCbuf(Node tracked, const NodeBlock& co if (gpr->GetIndex() == Tegra::Shader::Register::ZeroIndex) { return {}; } - // Reduce the cursor in one to avoid infinite loops when the instruction sets the same - // register that it uses as operand - const auto [source, new_cursor] = TrackRegister(gpr, code, cursor - 1); - if (!source) { - return {}; + s64 current_cursor = cursor; + while (current_cursor > 0) { + // Reduce the cursor in one to avoid infinite loops when the instruction sets the same + // register that it uses as operand + const auto [source, new_cursor] = TrackRegister(gpr, code, current_cursor - 1); + current_cursor = new_cursor; + if (!source) { + continue; + } + const auto [base_address, index, offset] = TrackCbuf(source, code, current_cursor); + if (base_address != nullptr) { + return {base_address, index, offset}; + } } - return TrackCbuf(source, code, new_cursor); + return {}; } if (const auto operation = std::get_if<OperationNode>(&*tracked)) { for (std::size_t i = operation->GetOperandsCount(); i > 0; --i) { diff --git a/src/video_core/shader/transform_feedback.cpp b/src/video_core/shader/transform_feedback.cpp new file mode 
100644 index 000000000..22a933761 --- /dev/null +++ b/src/video_core/shader/transform_feedback.cpp @@ -0,0 +1,115 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <algorithm> +#include <array> +#include <unordered_map> + +#include "common/assert.h" +#include "common/common_types.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/shader/registry.h" +#include "video_core/shader/transform_feedback.h" + +namespace VideoCommon::Shader { + +namespace { + +using Maxwell = Tegra::Engines::Maxwell3D::Regs; + +// TODO(Rodrigo): Change this to constexpr std::unordered_set in C++20 + +/// Attribute offsets that describe a vector +constexpr std::array VECTORS = { + 28, // gl_Position + 32, // Generic 0 + 36, // Generic 1 + 40, // Generic 2 + 44, // Generic 3 + 48, // Generic 4 + 52, // Generic 5 + 56, // Generic 6 + 60, // Generic 7 + 64, // Generic 8 + 68, // Generic 9 + 72, // Generic 10 + 76, // Generic 11 + 80, // Generic 12 + 84, // Generic 13 + 88, // Generic 14 + 92, // Generic 15 + 96, // Generic 16 + 100, // Generic 17 + 104, // Generic 18 + 108, // Generic 19 + 112, // Generic 20 + 116, // Generic 21 + 120, // Generic 22 + 124, // Generic 23 + 128, // Generic 24 + 132, // Generic 25 + 136, // Generic 26 + 140, // Generic 27 + 144, // Generic 28 + 148, // Generic 29 + 152, // Generic 30 + 156, // Generic 31 + 160, // gl_FrontColor + 164, // gl_FrontSecondaryColor + 160, // gl_BackColor + 164, // gl_BackSecondaryColor + 192, // gl_TexCoord[0] + 196, // gl_TexCoord[1] + 200, // gl_TexCoord[2] + 204, // gl_TexCoord[3] + 208, // gl_TexCoord[4] + 212, // gl_TexCoord[5] + 216, // gl_TexCoord[6] + 220, // gl_TexCoord[7] +}; +} // namespace + +std::unordered_map<u8, VaryingTFB> BuildTransformFeedback(const GraphicsInfo& info) { + + std::unordered_map<u8, VaryingTFB> tfb; + + for (std::size_t buffer = 0; buffer < Maxwell::NumTransformFeedbackBuffers; ++buffer) { + 
const auto& locations = info.tfb_varying_locs[buffer]; + const auto& layout = info.tfb_layouts[buffer]; + const std::size_t varying_count = layout.varying_count; + + std::size_t highest = 0; + + for (std::size_t offset = 0; offset < varying_count; ++offset) { + const std::size_t base_offset = offset; + const u8 location = locations[offset]; + + VaryingTFB varying; + varying.buffer = layout.stream; + varying.stride = layout.stride; + varying.offset = offset * sizeof(u32); + varying.components = 1; + + if (std::find(VECTORS.begin(), VECTORS.end(), location / 4 * 4) != VECTORS.end()) { + UNIMPLEMENTED_IF_MSG(location % 4 != 0, "Unaligned TFB"); + + const u8 base_index = location / 4; + while (offset + 1 < varying_count && base_index == locations[offset + 1] / 4) { + ++offset; + ++varying.components; + } + } + + [[maybe_unused]] const bool inserted = tfb.emplace(location, varying).second; + UNIMPLEMENTED_IF_MSG(!inserted, "Varying already stored"); + + highest = std::max(highest, (base_offset + varying.components) * sizeof(u32)); + } + + UNIMPLEMENTED_IF(highest != layout.stride); + } + return tfb; +} + +} // namespace VideoCommon::Shader diff --git a/src/video_core/shader/transform_feedback.h b/src/video_core/shader/transform_feedback.h new file mode 100644 index 000000000..77d05f64c --- /dev/null +++ b/src/video_core/shader/transform_feedback.h @@ -0,0 +1,23 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#pragma once + +#include <unordered_map> + +#include "common/common_types.h" +#include "video_core/shader/registry.h" + +namespace VideoCommon::Shader { + +struct VaryingTFB { + std::size_t buffer; + std::size_t stride; + std::size_t offset; + std::size_t components; +}; + +std::unordered_map<u8, VaryingTFB> BuildTransformFeedback(const GraphicsInfo& info); + +} // namespace VideoCommon::Shader diff --git a/src/video_core/surface.cpp b/src/video_core/surface.cpp index 9707c353d..cc7181229 100644 --- a/src/video_core/surface.cpp +++ b/src/video_core/surface.cpp @@ -111,6 +111,8 @@ PixelFormat PixelFormatFromRenderTargetFormat(Tegra::RenderTargetFormat format) return PixelFormat::RGBA16F; case Tegra::RenderTargetFormat::RGBA16_UNORM: return PixelFormat::RGBA16U; + case Tegra::RenderTargetFormat::RGBA16_SNORM: + return PixelFormat::RGBA16S; case Tegra::RenderTargetFormat::RGBA16_UINT: return PixelFormat::RGBA16UI; case Tegra::RenderTargetFormat::RGBA32_FLOAT: diff --git a/src/video_core/surface.h b/src/video_core/surface.h index d88109e5a..ae8817465 100644 --- a/src/video_core/surface.h +++ b/src/video_core/surface.h @@ -25,82 +25,83 @@ enum class PixelFormat { R8UI = 7, RGBA16F = 8, RGBA16U = 9, - RGBA16UI = 10, - R11FG11FB10F = 11, - RGBA32UI = 12, - DXT1 = 13, - DXT23 = 14, - DXT45 = 15, - DXN1 = 16, // This is also known as BC4 - DXN2UNORM = 17, - DXN2SNORM = 18, - BC7U = 19, - BC6H_UF16 = 20, - BC6H_SF16 = 21, - ASTC_2D_4X4 = 22, - BGRA8 = 23, - RGBA32F = 24, - RG32F = 25, - R32F = 26, - R16F = 27, - R16U = 28, - R16S = 29, - R16UI = 30, - R16I = 31, - RG16 = 32, - RG16F = 33, - RG16UI = 34, - RG16I = 35, - RG16S = 36, - RGB32F = 37, - RGBA8_SRGB = 38, - RG8U = 39, - RG8S = 40, - RG32UI = 41, - RGBX16F = 42, - R32UI = 43, - R32I = 44, - ASTC_2D_8X8 = 45, - ASTC_2D_8X5 = 46, - ASTC_2D_5X4 = 47, - BGRA8_SRGB = 48, - DXT1_SRGB = 49, - DXT23_SRGB = 50, - DXT45_SRGB = 51, - BC7U_SRGB = 52, - R4G4B4A4U = 53, - ASTC_2D_4X4_SRGB = 54, - ASTC_2D_8X8_SRGB = 55, - 
ASTC_2D_8X5_SRGB = 56, - ASTC_2D_5X4_SRGB = 57, - ASTC_2D_5X5 = 58, - ASTC_2D_5X5_SRGB = 59, - ASTC_2D_10X8 = 60, - ASTC_2D_10X8_SRGB = 61, - ASTC_2D_6X6 = 62, - ASTC_2D_6X6_SRGB = 63, - ASTC_2D_10X10 = 64, - ASTC_2D_10X10_SRGB = 65, - ASTC_2D_12X12 = 66, - ASTC_2D_12X12_SRGB = 67, - ASTC_2D_8X6 = 68, - ASTC_2D_8X6_SRGB = 69, - ASTC_2D_6X5 = 70, - ASTC_2D_6X5_SRGB = 71, - E5B9G9R9F = 72, + RGBA16S = 10, + RGBA16UI = 11, + R11FG11FB10F = 12, + RGBA32UI = 13, + DXT1 = 14, + DXT23 = 15, + DXT45 = 16, + DXN1 = 17, // This is also known as BC4 + DXN2UNORM = 18, + DXN2SNORM = 19, + BC7U = 20, + BC6H_UF16 = 21, + BC6H_SF16 = 22, + ASTC_2D_4X4 = 23, + BGRA8 = 24, + RGBA32F = 25, + RG32F = 26, + R32F = 27, + R16F = 28, + R16U = 29, + R16S = 30, + R16UI = 31, + R16I = 32, + RG16 = 33, + RG16F = 34, + RG16UI = 35, + RG16I = 36, + RG16S = 37, + RGB32F = 38, + RGBA8_SRGB = 39, + RG8U = 40, + RG8S = 41, + RG32UI = 42, + RGBX16F = 43, + R32UI = 44, + R32I = 45, + ASTC_2D_8X8 = 46, + ASTC_2D_8X5 = 47, + ASTC_2D_5X4 = 48, + BGRA8_SRGB = 49, + DXT1_SRGB = 50, + DXT23_SRGB = 51, + DXT45_SRGB = 52, + BC7U_SRGB = 53, + R4G4B4A4U = 54, + ASTC_2D_4X4_SRGB = 55, + ASTC_2D_8X8_SRGB = 56, + ASTC_2D_8X5_SRGB = 57, + ASTC_2D_5X4_SRGB = 58, + ASTC_2D_5X5 = 59, + ASTC_2D_5X5_SRGB = 60, + ASTC_2D_10X8 = 61, + ASTC_2D_10X8_SRGB = 62, + ASTC_2D_6X6 = 63, + ASTC_2D_6X6_SRGB = 64, + ASTC_2D_10X10 = 65, + ASTC_2D_10X10_SRGB = 66, + ASTC_2D_12X12 = 67, + ASTC_2D_12X12_SRGB = 68, + ASTC_2D_8X6 = 69, + ASTC_2D_8X6_SRGB = 70, + ASTC_2D_6X5 = 71, + ASTC_2D_6X5_SRGB = 72, + E5B9G9R9F = 73, MaxColorFormat, // Depth formats - Z32F = 73, - Z16 = 74, + Z32F = 74, + Z16 = 75, MaxDepthFormat, // DepthStencil formats - Z24S8 = 75, - S8Z24 = 76, - Z32FS8 = 77, + Z24S8 = 76, + S8Z24 = 77, + Z32FS8 = 78, MaxDepthStencilFormat, @@ -138,6 +139,7 @@ constexpr std::array<u32, MaxPixelFormat> compression_factor_shift_table = {{ 0, // R8UI 0, // RGBA16F 0, // RGBA16U + 0, // RGBA16S 0, // RGBA16UI 0, // R11FG11FB10F 0, // 
RGBA32UI @@ -235,6 +237,7 @@ constexpr std::array<u32, MaxPixelFormat> block_width_table = {{ 1, // R8UI 1, // RGBA16F 1, // RGBA16U + 1, // RGBA16S 1, // RGBA16UI 1, // R11FG11FB10F 1, // RGBA32UI @@ -324,6 +327,7 @@ constexpr std::array<u32, MaxPixelFormat> block_height_table = {{ 1, // R8UI 1, // RGBA16F 1, // RGBA16U + 1, // RGBA16S 1, // RGBA16UI 1, // R11FG11FB10F 1, // RGBA32UI @@ -413,6 +417,7 @@ constexpr std::array<u32, MaxPixelFormat> bpp_table = {{ 8, // R8UI 64, // RGBA16F 64, // RGBA16U + 64, // RGBA16S 64, // RGBA16UI 32, // R11FG11FB10F 128, // RGBA32UI @@ -517,6 +522,7 @@ constexpr std::array<SurfaceCompression, MaxPixelFormat> compression_type_table SurfaceCompression::None, // R8UI SurfaceCompression::None, // RGBA16F SurfaceCompression::None, // RGBA16U + SurfaceCompression::None, // RGBA16S SurfaceCompression::None, // RGBA16UI SurfaceCompression::None, // R11FG11FB10F SurfaceCompression::None, // RGBA32UI diff --git a/src/video_core/texture_cache/format_lookup_table.cpp b/src/video_core/texture_cache/format_lookup_table.cpp index cc3ad8417..e151c26c4 100644 --- a/src/video_core/texture_cache/format_lookup_table.cpp +++ b/src/video_core/texture_cache/format_lookup_table.cpp @@ -41,7 +41,7 @@ struct Table { ComponentType alpha_component; bool is_srgb; }; -constexpr std::array<Table, 75> DefinitionTable = {{ +constexpr std::array<Table, 76> DefinitionTable = {{ {TextureFormat::A8R8G8B8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ABGR8U}, {TextureFormat::A8R8G8B8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::ABGR8S}, {TextureFormat::A8R8G8B8, C, UINT, UINT, UINT, UINT, PixelFormat::ABGR8UI}, @@ -61,6 +61,7 @@ constexpr std::array<Table, 75> DefinitionTable = {{ {TextureFormat::G8R8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::RG8U}, {TextureFormat::G8R8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::RG8S}, + {TextureFormat::R16_G16_B16_A16, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::RGBA16S}, {TextureFormat::R16_G16_B16_A16, C, UNORM, UNORM, UNORM, 
UNORM, PixelFormat::RGBA16U}, {TextureFormat::R16_G16_B16_A16, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::RGBA16F}, {TextureFormat::R16_G16_B16_A16, C, UINT, UINT, UINT, UINT, PixelFormat::RGBA16UI}, diff --git a/src/video_core/texture_cache/surface_params.cpp b/src/video_core/texture_cache/surface_params.cpp index f00839313..9931c5ef7 100644 --- a/src/video_core/texture_cache/surface_params.cpp +++ b/src/video_core/texture_cache/surface_params.cpp @@ -113,8 +113,10 @@ SurfaceParams SurfaceParams::CreateForTexture(const FormatLookupTable& lookup_ta params.height = tic.Height(); params.depth = tic.Depth(); params.pitch = params.is_tiled ? 0 : tic.Pitch(); - if (params.target == SurfaceTarget::TextureCubemap || - params.target == SurfaceTarget::TextureCubeArray) { + if (params.target == SurfaceTarget::Texture2D && params.depth > 1) { + params.depth = 1; + } else if (params.target == SurfaceTarget::TextureCubemap || + params.target == SurfaceTarget::TextureCubeArray) { params.depth *= 6; } params.num_levels = tic.max_mip_level + 1; diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index c70e4aec2..6cdbe63d0 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -22,6 +22,7 @@ #include "core/core.h" #include "core/memory.h" #include "core/settings.h" +#include "video_core/dirty_flags.h" #include "video_core/engines/fermi_2d.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/gpu.h" @@ -103,6 +104,11 @@ public: if (!cache_addr) { return GetNullSurface(SurfaceParams::ExpectedTarget(entry)); } + + if (!IsTypeCompatible(tic.texture_type, entry)) { + return GetNullSurface(SurfaceParams::ExpectedTarget(entry)); + } + const auto params{SurfaceParams::CreateForTexture(format_lookup_table, tic, entry)}; const auto [surface, view] = GetSurface(gpu_addr, cache_addr, params, true, false); if (guard_samplers) { @@ -142,11 +148,10 @@ public: TView 
GetDepthBufferSurface(bool preserve_contents) { std::lock_guard lock{mutex}; auto& maxwell3d = system.GPU().Maxwell3D(); - - if (!maxwell3d.dirty.depth_buffer) { + if (!maxwell3d.dirty.flags[VideoCommon::Dirty::ZetaBuffer]) { return depth_buffer.view; } - maxwell3d.dirty.depth_buffer = false; + maxwell3d.dirty.flags[VideoCommon::Dirty::ZetaBuffer] = false; const auto& regs{maxwell3d.regs}; const auto gpu_addr{regs.zeta.Address()}; @@ -175,10 +180,10 @@ public: std::lock_guard lock{mutex}; ASSERT(index < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets); auto& maxwell3d = system.GPU().Maxwell3D(); - if (!maxwell3d.dirty.render_target[index]) { + if (!maxwell3d.dirty.flags[VideoCommon::Dirty::ColorBuffer0 + index]) { return render_targets[index].view; } - maxwell3d.dirty.render_target[index] = false; + maxwell3d.dirty.flags[VideoCommon::Dirty::ColorBuffer0 + index] = false; const auto& regs{maxwell3d.regs}; if (index >= regs.rt_control.count || regs.rt[index].Address() == 0 || @@ -320,14 +325,14 @@ protected: virtual void BufferCopy(TSurface& src_surface, TSurface& dst_surface) = 0; void ManageRenderTargetUnregister(TSurface& surface) { - auto& maxwell3d = system.GPU().Maxwell3D(); + auto& dirty = system.GPU().Maxwell3D().dirty; const u32 index = surface->GetRenderTarget(); if (index == DEPTH_RT) { - maxwell3d.dirty.depth_buffer = true; + dirty.flags[VideoCommon::Dirty::ZetaBuffer] = true; } else { - maxwell3d.dirty.render_target[index] = true; + dirty.flags[VideoCommon::Dirty::ColorBuffer0 + index] = true; } - maxwell3d.dirty.render_settings = true; + dirty.flags[VideoCommon::Dirty::RenderTargets] = true; } void Register(TSurface surface) { @@ -914,13 +919,15 @@ private: params.width = 1; params.height = 1; params.depth = 1; + if (target == SurfaceTarget::TextureCubemap || target == SurfaceTarget::TextureCubeArray) { + params.depth = 6; + } params.pitch = 4; params.num_levels = 1; params.emulated_levels = 1; - params.pixel_format = 
VideoCore::Surface::PixelFormat::RGBA16F; + params.pixel_format = VideoCore::Surface::PixelFormat::R8U; params.type = VideoCore::Surface::SurfaceType::ColorTexture; auto surface = CreateSurface(0ULL, params); - invalid_memory.clear(); invalid_memory.resize(surface->GetHostSizeInBytes(), 0U); surface->UploadTexture(invalid_memory); surface->MarkAsModified(false, Tick()); @@ -1082,6 +1089,36 @@ private: return siblings_table[static_cast<std::size_t>(format)]; } + /// Returns true the shader sampler entry is compatible with the TIC texture type. + static bool IsTypeCompatible(Tegra::Texture::TextureType tic_type, + const VideoCommon::Shader::Sampler& entry) { + const auto shader_type = entry.GetType(); + switch (tic_type) { + case Tegra::Texture::TextureType::Texture1D: + case Tegra::Texture::TextureType::Texture1DArray: + return shader_type == Tegra::Shader::TextureType::Texture1D; + case Tegra::Texture::TextureType::Texture1DBuffer: + // TODO(Rodrigo): Assume as valid for now + return true; + case Tegra::Texture::TextureType::Texture2D: + case Tegra::Texture::TextureType::Texture2DNoMipmap: + return shader_type == Tegra::Shader::TextureType::Texture2D; + case Tegra::Texture::TextureType::Texture2DArray: + return shader_type == Tegra::Shader::TextureType::Texture2D || + shader_type == Tegra::Shader::TextureType::TextureCube; + case Tegra::Texture::TextureType::Texture3D: + return shader_type == Tegra::Shader::TextureType::Texture3D; + case Tegra::Texture::TextureType::TextureCubeArray: + case Tegra::Texture::TextureType::TextureCubemap: + if (shader_type == Tegra::Shader::TextureType::TextureCube) { + return true; + } + return shader_type == Tegra::Shader::TextureType::Texture2D && entry.IsArray(); + } + UNREACHABLE(); + return true; + } + struct FramebufferTargetInfo { TSurface target; TView view; diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp index 33bd31865..062b4f252 100644 --- a/src/video_core/textures/astc.cpp +++ 
b/src/video_core/textures/astc.cpp @@ -17,26 +17,37 @@ #include <algorithm> #include <cassert> -#include <cstdint> #include <cstring> #include <vector> +#include "common/common_types.h" + #include "video_core/textures/astc.h" +namespace { + +/// Count the number of bits set in a number. +constexpr u32 Popcnt(u32 n) { + u32 c = 0; + for (; n; c++) { + n &= n - 1; + } + return c; +} + +} // Anonymous namespace + class InputBitStream { public: - explicit InputBitStream(const unsigned char* ptr, int start_offset = 0) + explicit InputBitStream(const u8* ptr, std::size_t start_offset = 0) : m_CurByte(ptr), m_NextBit(start_offset % 8) {} - ~InputBitStream() = default; - - int GetBitsRead() const { + std::size_t GetBitsRead() const { return m_BitsRead; } - int ReadBit() { - - int bit = *m_CurByte >> m_NextBit++; + u32 ReadBit() { + u32 bit = *m_CurByte >> m_NextBit++; while (m_NextBit >= 8) { m_NextBit -= 8; m_CurByte++; @@ -46,57 +57,66 @@ public: return bit & 1; } - unsigned int ReadBits(unsigned int nBits) { - unsigned int ret = 0; - for (unsigned int i = 0; i < nBits; i++) { + u32 ReadBits(std::size_t nBits) { + u32 ret = 0; + for (std::size_t i = 0; i < nBits; ++i) { + ret |= (ReadBit() & 1) << i; + } + return ret; + } + + template <std::size_t nBits> + u32 ReadBits() { + u32 ret = 0; + for (std::size_t i = 0; i < nBits; ++i) { ret |= (ReadBit() & 1) << i; } return ret; } private: - const unsigned char* m_CurByte; - int m_NextBit = 0; - int m_BitsRead = 0; + const u8* m_CurByte; + std::size_t m_NextBit = 0; + std::size_t m_BitsRead = 0; }; class OutputBitStream { public: - explicit OutputBitStream(unsigned char* ptr, int nBits = 0, int start_offset = 0) + explicit OutputBitStream(u8* ptr, s32 nBits = 0, s32 start_offset = 0) : m_NumBits(nBits), m_CurByte(ptr), m_NextBit(start_offset % 8) {} ~OutputBitStream() = default; - int GetBitsWritten() const { + s32 GetBitsWritten() const { return m_BitsWritten; } - void WriteBitsR(unsigned int val, unsigned int nBits) { - for 
(unsigned int i = 0; i < nBits; i++) { + void WriteBitsR(u32 val, u32 nBits) { + for (u32 i = 0; i < nBits; i++) { WriteBit((val >> (nBits - i - 1)) & 1); } } - void WriteBits(unsigned int val, unsigned int nBits) { - for (unsigned int i = 0; i < nBits; i++) { + void WriteBits(u32 val, u32 nBits) { + for (u32 i = 0; i < nBits; i++) { WriteBit((val >> i) & 1); } } private: - void WriteBit(int b) { + void WriteBit(s32 b) { if (done) return; - const unsigned int mask = 1 << m_NextBit++; + const u32 mask = 1 << m_NextBit++; // clear the bit - *m_CurByte &= static_cast<unsigned char>(~mask); + *m_CurByte &= static_cast<u8>(~mask); // Write the bit, if necessary if (b) - *m_CurByte |= static_cast<unsigned char>(mask); + *m_CurByte |= static_cast<u8>(mask); // Next byte? if (m_NextBit >= 8) { @@ -107,10 +127,10 @@ private: done = done || ++m_BitsWritten >= m_NumBits; } - int m_BitsWritten = 0; - const int m_NumBits; - unsigned char* m_CurByte; - int m_NextBit = 0; + s32 m_BitsWritten = 0; + const s32 m_NumBits; + u8* m_CurByte; + s32 m_NextBit = 0; bool done = false; }; @@ -123,20 +143,20 @@ public: Bits(const Bits&) = delete; Bits& operator=(const Bits&) = delete; - uint8_t operator[](uint32_t bitPos) const { - return static_cast<uint8_t>((m_Bits >> bitPos) & 1); + u8 operator[](u32 bitPos) const { + return static_cast<u8>((m_Bits >> bitPos) & 1); } - IntType operator()(uint32_t start, uint32_t end) const { + IntType operator()(u32 start, u32 end) const { if (start == end) { return (*this)[start]; } else if (start > end) { - uint32_t t = start; + u32 t = start; start = end; end = t; } - uint64_t mask = (1 << (end - start + 1)) - 1; + u64 mask = (1 << (end - start + 1)) - 1; return (m_Bits >> start) & static_cast<IntType>(mask); } @@ -144,273 +164,236 @@ private: const IntType& m_Bits; }; -enum EIntegerEncoding { eIntegerEncoding_JustBits, eIntegerEncoding_Quint, eIntegerEncoding_Trit }; - -class IntegerEncodedValue { -private: - const EIntegerEncoding m_Encoding; - const 
uint32_t m_NumBits; - uint32_t m_BitValue; - union { - uint32_t m_QuintValue; - uint32_t m_TritValue; - }; +enum class IntegerEncoding { JustBits, Qus32, Trit }; -public: - // Jank, but we're not doing any heavy lifting in this class, so it's - // probably OK. It allows us to use these in std::vectors... - IntegerEncodedValue& operator=(const IntegerEncodedValue& other) { - new (this) IntegerEncodedValue(other); - return *this; - } +struct IntegerEncodedValue { + constexpr IntegerEncodedValue() = default; - IntegerEncodedValue(EIntegerEncoding encoding, uint32_t numBits) - : m_Encoding(encoding), m_NumBits(numBits) {} + constexpr IntegerEncodedValue(IntegerEncoding encoding_, u32 num_bits_) + : encoding{encoding_}, num_bits{num_bits_} {} - EIntegerEncoding GetEncoding() const { - return m_Encoding; - } - uint32_t BaseBitLength() const { - return m_NumBits; - } - - uint32_t GetBitValue() const { - return m_BitValue; - } - void SetBitValue(uint32_t val) { - m_BitValue = val; - } - - uint32_t GetTritValue() const { - return m_TritValue; - } - void SetTritValue(uint32_t val) { - m_TritValue = val; - } - - uint32_t GetQuintValue() const { - return m_QuintValue; - } - void SetQuintValue(uint32_t val) { - m_QuintValue = val; - } - - bool MatchesEncoding(const IntegerEncodedValue& other) const { - return m_Encoding == other.m_Encoding && m_NumBits == other.m_NumBits; + constexpr bool MatchesEncoding(const IntegerEncodedValue& other) const { + return encoding == other.encoding && num_bits == other.num_bits; } // Returns the number of bits required to encode nVals values. 
- uint32_t GetBitLength(uint32_t nVals) const { - uint32_t totalBits = m_NumBits * nVals; - if (m_Encoding == eIntegerEncoding_Trit) { + u32 GetBitLength(u32 nVals) const { + u32 totalBits = num_bits * nVals; + if (encoding == IntegerEncoding::Trit) { totalBits += (nVals * 8 + 4) / 5; - } else if (m_Encoding == eIntegerEncoding_Quint) { + } else if (encoding == IntegerEncoding::Qus32) { totalBits += (nVals * 7 + 2) / 3; } return totalBits; } - // Count the number of bits set in a number. - static inline uint32_t Popcnt(uint32_t n) { - uint32_t c; - for (c = 0; n; c++) { - n &= n - 1; + IntegerEncoding encoding{}; + u32 num_bits = 0; + u32 bit_value = 0; + union { + u32 qus32_value = 0; + u32 trit_value; + }; +}; + +static void DecodeTritBlock(InputBitStream& bits, std::vector<IntegerEncodedValue>& result, + u32 nBitsPerValue) { + // Implement the algorithm in section C.2.12 + u32 m[5]; + u32 t[5]; + u32 T; + + // Read the trit encoded block according to + // table C.2.14 + m[0] = bits.ReadBits(nBitsPerValue); + T = bits.ReadBits<2>(); + m[1] = bits.ReadBits(nBitsPerValue); + T |= bits.ReadBits<2>() << 2; + m[2] = bits.ReadBits(nBitsPerValue); + T |= bits.ReadBit() << 4; + m[3] = bits.ReadBits(nBitsPerValue); + T |= bits.ReadBits<2>() << 5; + m[4] = bits.ReadBits(nBitsPerValue); + T |= bits.ReadBit() << 7; + + u32 C = 0; + + Bits<u32> Tb(T); + if (Tb(2, 4) == 7) { + C = (Tb(5, 7) << 2) | Tb(0, 1); + t[4] = t[3] = 2; + } else { + C = Tb(0, 4); + if (Tb(5, 6) == 3) { + t[4] = 2; + t[3] = Tb[7]; + } else { + t[4] = Tb[7]; + t[3] = Tb(5, 6); } - return c; } - // Returns a new instance of this struct that corresponds to the - // can take no more than maxval values - static IntegerEncodedValue CreateEncoding(uint32_t maxVal) { - while (maxVal > 0) { - uint32_t check = maxVal + 1; - - // Is maxVal a power of two? - if (!(check & (check - 1))) { - return IntegerEncodedValue(eIntegerEncoding_JustBits, Popcnt(maxVal)); - } - - // Is maxVal of the type 3*2^n - 1? 
- if ((check % 3 == 0) && !((check / 3) & ((check / 3) - 1))) { - return IntegerEncodedValue(eIntegerEncoding_Trit, Popcnt(check / 3 - 1)); - } + Bits<u32> Cb(C); + if (Cb(0, 1) == 3) { + t[2] = 2; + t[1] = Cb[4]; + t[0] = (Cb[3] << 1) | (Cb[2] & ~Cb[3]); + } else if (Cb(2, 3) == 3) { + t[2] = 2; + t[1] = 2; + t[0] = Cb(0, 1); + } else { + t[2] = Cb[4]; + t[1] = Cb(2, 3); + t[0] = (Cb[1] << 1) | (Cb[0] & ~Cb[1]); + } - // Is maxVal of the type 5*2^n - 1? - if ((check % 5 == 0) && !((check / 5) & ((check / 5) - 1))) { - return IntegerEncodedValue(eIntegerEncoding_Quint, Popcnt(check / 5 - 1)); - } + for (std::size_t i = 0; i < 5; ++i) { + IntegerEncodedValue& val = result.emplace_back(IntegerEncoding::Trit, nBitsPerValue); + val.bit_value = m[i]; + val.trit_value = t[i]; + } +} - // Apparently it can't be represented with a bounded integer sequence... - // just iterate. - maxVal--; +static void DecodeQus32Block(InputBitStream& bits, std::vector<IntegerEncodedValue>& result, + u32 nBitsPerValue) { + // Implement the algorithm in section C.2.12 + u32 m[3]; + u32 q[3]; + u32 Q; + + // Read the trit encoded block according to + // table C.2.15 + m[0] = bits.ReadBits(nBitsPerValue); + Q = bits.ReadBits<3>(); + m[1] = bits.ReadBits(nBitsPerValue); + Q |= bits.ReadBits<2>() << 3; + m[2] = bits.ReadBits(nBitsPerValue); + Q |= bits.ReadBits<2>() << 5; + + Bits<u32> Qb(Q); + if (Qb(1, 2) == 3 && Qb(5, 6) == 0) { + q[0] = q[1] = 4; + q[2] = (Qb[0] << 2) | ((Qb[4] & ~Qb[0]) << 1) | (Qb[3] & ~Qb[0]); + } else { + u32 C = 0; + if (Qb(1, 2) == 3) { + q[2] = 4; + C = (Qb(3, 4) << 3) | ((~Qb(5, 6) & 3) << 1) | Qb[0]; + } else { + q[2] = Qb(5, 6); + C = Qb(0, 4); } - return IntegerEncodedValue(eIntegerEncoding_JustBits, 0); - } - - // Fills result with the values that are encoded in the given - // bitstream. We must know beforehand what the maximum possible - // value is, and how many values we're decoding. 
- static void DecodeIntegerSequence(std::vector<IntegerEncodedValue>& result, - InputBitStream& bits, uint32_t maxRange, uint32_t nValues) { - // Determine encoding parameters - IntegerEncodedValue val = IntegerEncodedValue::CreateEncoding(maxRange); - - // Start decoding - uint32_t nValsDecoded = 0; - while (nValsDecoded < nValues) { - switch (val.GetEncoding()) { - case eIntegerEncoding_Quint: - DecodeQuintBlock(bits, result, val.BaseBitLength()); - nValsDecoded += 3; - break; - case eIntegerEncoding_Trit: - DecodeTritBlock(bits, result, val.BaseBitLength()); - nValsDecoded += 5; - break; - - case eIntegerEncoding_JustBits: - val.SetBitValue(bits.ReadBits(val.BaseBitLength())); - result.push_back(val); - nValsDecoded++; - break; - } + Bits<u32> Cb(C); + if (Cb(0, 2) == 5) { + q[1] = 4; + q[0] = Cb(3, 4); + } else { + q[1] = Cb(3, 4); + q[0] = Cb(0, 2); } } -private: - static void DecodeTritBlock(InputBitStream& bits, std::vector<IntegerEncodedValue>& result, - uint32_t nBitsPerValue) { - // Implement the algorithm in section C.2.12 - uint32_t m[5]; - uint32_t t[5]; - uint32_t T; - - // Read the trit encoded block according to - // table C.2.14 - m[0] = bits.ReadBits(nBitsPerValue); - T = bits.ReadBits(2); - m[1] = bits.ReadBits(nBitsPerValue); - T |= bits.ReadBits(2) << 2; - m[2] = bits.ReadBits(nBitsPerValue); - T |= bits.ReadBit() << 4; - m[3] = bits.ReadBits(nBitsPerValue); - T |= bits.ReadBits(2) << 5; - m[4] = bits.ReadBits(nBitsPerValue); - T |= bits.ReadBit() << 7; - - uint32_t C = 0; - - Bits<uint32_t> Tb(T); - if (Tb(2, 4) == 7) { - C = (Tb(5, 7) << 2) | Tb(0, 1); - t[4] = t[3] = 2; - } else { - C = Tb(0, 4); - if (Tb(5, 6) == 3) { - t[4] = 2; - t[3] = Tb[7]; - } else { - t[4] = Tb[7]; - t[3] = Tb(5, 6); - } + for (std::size_t i = 0; i < 3; ++i) { + IntegerEncodedValue& val = result.emplace_back(IntegerEncoding::Qus32, nBitsPerValue); + val.bit_value = m[i]; + val.qus32_value = q[i]; + } +} + +// Returns a new instance of this struct that corresponds to 
the +// can take no more than maxval values +static constexpr IntegerEncodedValue CreateEncoding(u32 maxVal) { + while (maxVal > 0) { + u32 check = maxVal + 1; + + // Is maxVal a power of two? + if (!(check & (check - 1))) { + return IntegerEncodedValue(IntegerEncoding::JustBits, Popcnt(maxVal)); } - Bits<uint32_t> Cb(C); - if (Cb(0, 1) == 3) { - t[2] = 2; - t[1] = Cb[4]; - t[0] = (Cb[3] << 1) | (Cb[2] & ~Cb[3]); - } else if (Cb(2, 3) == 3) { - t[2] = 2; - t[1] = 2; - t[0] = Cb(0, 1); - } else { - t[2] = Cb[4]; - t[1] = Cb(2, 3); - t[0] = (Cb[1] << 1) | (Cb[0] & ~Cb[1]); + // Is maxVal of the type 3*2^n - 1? + if ((check % 3 == 0) && !((check / 3) & ((check / 3) - 1))) { + return IntegerEncodedValue(IntegerEncoding::Trit, Popcnt(check / 3 - 1)); } - for (uint32_t i = 0; i < 5; i++) { - IntegerEncodedValue val(eIntegerEncoding_Trit, nBitsPerValue); - val.SetBitValue(m[i]); - val.SetTritValue(t[i]); - result.push_back(val); + // Is maxVal of the type 5*2^n - 1? + if ((check % 5 == 0) && !((check / 5) & ((check / 5) - 1))) { + return IntegerEncodedValue(IntegerEncoding::Qus32, Popcnt(check / 5 - 1)); } + + // Apparently it can't be represented with a bounded integer sequence... + // just iterate. 
+ maxVal--; } + return IntegerEncodedValue(IntegerEncoding::JustBits, 0); +} - static void DecodeQuintBlock(InputBitStream& bits, std::vector<IntegerEncodedValue>& result, - uint32_t nBitsPerValue) { - // Implement the algorithm in section C.2.12 - uint32_t m[3]; - uint32_t q[3]; - uint32_t Q; - - // Read the trit encoded block according to - // table C.2.15 - m[0] = bits.ReadBits(nBitsPerValue); - Q = bits.ReadBits(3); - m[1] = bits.ReadBits(nBitsPerValue); - Q |= bits.ReadBits(2) << 3; - m[2] = bits.ReadBits(nBitsPerValue); - Q |= bits.ReadBits(2) << 5; - - Bits<uint32_t> Qb(Q); - if (Qb(1, 2) == 3 && Qb(5, 6) == 0) { - q[0] = q[1] = 4; - q[2] = (Qb[0] << 2) | ((Qb[4] & ~Qb[0]) << 1) | (Qb[3] & ~Qb[0]); - } else { - uint32_t C = 0; - if (Qb(1, 2) == 3) { - q[2] = 4; - C = (Qb(3, 4) << 3) | ((~Qb(5, 6) & 3) << 1) | Qb[0]; - } else { - q[2] = Qb(5, 6); - C = Qb(0, 4); - } +static constexpr std::array<IntegerEncodedValue, 256> MakeEncodedValues() { + std::array<IntegerEncodedValue, 256> encodings{}; + for (std::size_t i = 0; i < encodings.size(); ++i) { + encodings[i] = CreateEncoding(static_cast<u32>(i)); + } + return encodings; +} - Bits<uint32_t> Cb(C); - if (Cb(0, 2) == 5) { - q[1] = 4; - q[0] = Cb(3, 4); - } else { - q[1] = Cb(3, 4); - q[0] = Cb(0, 2); - } - } +static constexpr std::array EncodingsValues = MakeEncodedValues(); + +// Fills result with the values that are encoded in the given +// bitstream. We must know beforehand what the maximum possible +// value is, and how many values we're decoding. 
+static void DecodeIntegerSequence(std::vector<IntegerEncodedValue>& result, InputBitStream& bits, + u32 maxRange, u32 nValues) { + // Determine encoding parameters + IntegerEncodedValue val = EncodingsValues[maxRange]; + + // Start decoding + u32 nValsDecoded = 0; + while (nValsDecoded < nValues) { + switch (val.encoding) { + case IntegerEncoding::Qus32: + DecodeQus32Block(bits, result, val.num_bits); + nValsDecoded += 3; + break; + + case IntegerEncoding::Trit: + DecodeTritBlock(bits, result, val.num_bits); + nValsDecoded += 5; + break; - for (uint32_t i = 0; i < 3; i++) { - IntegerEncodedValue val(eIntegerEncoding_Quint, nBitsPerValue); - val.m_BitValue = m[i]; - val.m_QuintValue = q[i]; + case IntegerEncoding::JustBits: + val.bit_value = bits.ReadBits(val.num_bits); result.push_back(val); + nValsDecoded++; + break; } } -}; +} namespace ASTCC { struct TexelWeightParams { - uint32_t m_Width = 0; - uint32_t m_Height = 0; + u32 m_Width = 0; + u32 m_Height = 0; bool m_bDualPlane = false; - uint32_t m_MaxWeight = 0; + u32 m_MaxWeight = 0; bool m_bError = false; bool m_bVoidExtentLDR = false; bool m_bVoidExtentHDR = false; - uint32_t GetPackedBitSize() const { + u32 GetPackedBitSize() const { // How many indices do we have? - uint32_t nIdxs = m_Height * m_Width; + u32 nIdxs = m_Height * m_Width; if (m_bDualPlane) { nIdxs *= 2; } - return IntegerEncodedValue::CreateEncoding(m_MaxWeight).GetBitLength(nIdxs); + return EncodingsValues[m_MaxWeight].GetBitLength(nIdxs); } - uint32_t GetNumWeightValues() const { - uint32_t ret = m_Width * m_Height; + u32 GetNumWeightValues() const { + u32 ret = m_Width * m_Height; if (m_bDualPlane) { ret *= 2; } @@ -422,7 +405,7 @@ static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) { TexelWeightParams params; // Read the entire block mode all at once - uint16_t modeBits = static_cast<uint16_t>(strm.ReadBits(11)); + u16 modeBits = static_cast<u16>(strm.ReadBits<11>()); // Does this match the void extent block mode? 
if ((modeBits & 0x01FF) == 0x1FC) { @@ -457,7 +440,7 @@ static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) { // of the block mode. Layout is determined by a number // between 0 and 9 corresponding to table C.2.8 of the // ASTC spec. - uint32_t layout = 0; + u32 layout = 0; if ((modeBits & 0x1) || (modeBits & 0x2)) { // layout is in [0-4] @@ -509,7 +492,7 @@ static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) { assert(layout < 10); // Determine R - uint32_t R = !!(modeBits & 0x10); + u32 R = !!(modeBits & 0x10); if (layout < 5) { R |= (modeBits & 0x3) << 1; } else { @@ -520,54 +503,54 @@ static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) { // Determine width & height switch (layout) { case 0: { - uint32_t A = (modeBits >> 5) & 0x3; - uint32_t B = (modeBits >> 7) & 0x3; + u32 A = (modeBits >> 5) & 0x3; + u32 B = (modeBits >> 7) & 0x3; params.m_Width = B + 4; params.m_Height = A + 2; break; } case 1: { - uint32_t A = (modeBits >> 5) & 0x3; - uint32_t B = (modeBits >> 7) & 0x3; + u32 A = (modeBits >> 5) & 0x3; + u32 B = (modeBits >> 7) & 0x3; params.m_Width = B + 8; params.m_Height = A + 2; break; } case 2: { - uint32_t A = (modeBits >> 5) & 0x3; - uint32_t B = (modeBits >> 7) & 0x3; + u32 A = (modeBits >> 5) & 0x3; + u32 B = (modeBits >> 7) & 0x3; params.m_Width = A + 2; params.m_Height = B + 8; break; } case 3: { - uint32_t A = (modeBits >> 5) & 0x3; - uint32_t B = (modeBits >> 7) & 0x1; + u32 A = (modeBits >> 5) & 0x3; + u32 B = (modeBits >> 7) & 0x1; params.m_Width = A + 2; params.m_Height = B + 6; break; } case 4: { - uint32_t A = (modeBits >> 5) & 0x3; - uint32_t B = (modeBits >> 7) & 0x1; + u32 A = (modeBits >> 5) & 0x3; + u32 B = (modeBits >> 7) & 0x1; params.m_Width = B + 2; params.m_Height = A + 2; break; } case 5: { - uint32_t A = (modeBits >> 5) & 0x3; + u32 A = (modeBits >> 5) & 0x3; params.m_Width = 12; params.m_Height = A + 2; break; } case 6: { - uint32_t A = (modeBits >> 5) & 0x3; + u32 A = (modeBits >> 5) & 0x3; 
params.m_Width = A + 2; params.m_Height = 12; break; @@ -586,15 +569,15 @@ static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) { } case 9: { - uint32_t A = (modeBits >> 5) & 0x3; - uint32_t B = (modeBits >> 9) & 0x3; + u32 A = (modeBits >> 5) & 0x3; + u32 B = (modeBits >> 9) & 0x3; params.m_Width = A + 6; params.m_Height = B + 6; break; } default: - assert(!"Don't know this layout..."); + assert(false && "Don't know this layout..."); params.m_bError = true; break; } @@ -605,10 +588,10 @@ static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) { bool H = (layout != 9) && (modeBits & 0x200); if (H) { - const uint32_t maxWeights[6] = {9, 11, 15, 19, 23, 31}; + const u32 maxWeights[6] = {9, 11, 15, 19, 23, 31}; params.m_MaxWeight = maxWeights[R - 2]; } else { - const uint32_t maxWeights[6] = {1, 2, 3, 4, 5, 7}; + const u32 maxWeights[6] = {1, 2, 3, 4, 5, 7}; params.m_MaxWeight = maxWeights[R - 2]; } @@ -617,32 +600,32 @@ static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) { return params; } -static void FillVoidExtentLDR(InputBitStream& strm, uint32_t* const outBuf, uint32_t blockWidth, - uint32_t blockHeight) { +static void FillVoidExtentLDR(InputBitStream& strm, u32* const outBuf, u32 blockWidth, + u32 blockHeight) { // Don't actually care about the void extent, just read the bits... 
- for (int i = 0; i < 4; ++i) { - strm.ReadBits(13); + for (s32 i = 0; i < 4; ++i) { + strm.ReadBits<13>(); } // Decode the RGBA components and renormalize them to the range [0, 255] - uint16_t r = static_cast<uint16_t>(strm.ReadBits(16)); - uint16_t g = static_cast<uint16_t>(strm.ReadBits(16)); - uint16_t b = static_cast<uint16_t>(strm.ReadBits(16)); - uint16_t a = static_cast<uint16_t>(strm.ReadBits(16)); + u16 r = static_cast<u16>(strm.ReadBits<16>()); + u16 g = static_cast<u16>(strm.ReadBits<16>()); + u16 b = static_cast<u16>(strm.ReadBits<16>()); + u16 a = static_cast<u16>(strm.ReadBits<16>()); - uint32_t rgba = (r >> 8) | (g & 0xFF00) | (static_cast<uint32_t>(b) & 0xFF00) << 8 | - (static_cast<uint32_t>(a) & 0xFF00) << 16; + u32 rgba = (r >> 8) | (g & 0xFF00) | (static_cast<u32>(b) & 0xFF00) << 8 | + (static_cast<u32>(a) & 0xFF00) << 16; - for (uint32_t j = 0; j < blockHeight; j++) { - for (uint32_t i = 0; i < blockWidth; i++) { + for (u32 j = 0; j < blockHeight; j++) { + for (u32 i = 0; i < blockWidth; i++) { outBuf[j * blockWidth + i] = rgba; } } } -static void FillError(uint32_t* outBuf, uint32_t blockWidth, uint32_t blockHeight) { - for (uint32_t j = 0; j < blockHeight; j++) { - for (uint32_t i = 0; i < blockWidth; i++) { +static void FillError(u32* outBuf, u32 blockWidth, u32 blockHeight) { + for (u32 j = 0; j < blockHeight; j++) { + for (u32 i = 0; i < blockWidth; i++) { outBuf[j * blockWidth + i] = 0xFFFF00FF; } } @@ -651,18 +634,18 @@ static void FillError(uint32_t* outBuf, uint32_t blockWidth, uint32_t blockHeigh // Replicates low numBits such that [(toBit - 1):(toBit - 1 - fromBit)] // is the same as [(numBits - 1):0] and repeats all the way down. 
template <typename IntType> -static IntType Replicate(const IntType& val, uint32_t numBits, uint32_t toBit) { +static IntType Replicate(IntType val, u32 numBits, u32 toBit) { if (numBits == 0) return 0; if (toBit == 0) return 0; IntType v = val & static_cast<IntType>((1 << numBits) - 1); IntType res = v; - uint32_t reslen = numBits; + u32 reslen = numBits; while (reslen < toBit) { - uint32_t comp = 0; + u32 comp = 0; if (numBits > toBit - reslen) { - uint32_t newshift = toBit - reslen; + u32 newshift = toBit - reslen; comp = numBits - newshift; numBits = newshift; } @@ -675,14 +658,14 @@ static IntType Replicate(const IntType& val, uint32_t numBits, uint32_t toBit) { class Pixel { protected: - using ChannelType = int16_t; - uint8_t m_BitDepth[4] = {8, 8, 8, 8}; - int16_t color[4] = {}; + using ChannelType = s16; + u8 m_BitDepth[4] = {8, 8, 8, 8}; + s16 color[4] = {}; public: Pixel() = default; - Pixel(uint32_t a, uint32_t r, uint32_t g, uint32_t b, unsigned bitDepth = 8) - : m_BitDepth{uint8_t(bitDepth), uint8_t(bitDepth), uint8_t(bitDepth), uint8_t(bitDepth)}, + Pixel(u32 a, u32 r, u32 g, u32 b, u32 bitDepth = 8) + : m_BitDepth{u8(bitDepth), u8(bitDepth), u8(bitDepth), u8(bitDepth)}, color{static_cast<ChannelType>(a), static_cast<ChannelType>(r), static_cast<ChannelType>(g), static_cast<ChannelType>(b)} {} @@ -691,22 +674,22 @@ public: // significant bits when going from larger to smaller bit depth // or by repeating the most significant bits when going from // smaller to larger bit depths. 
- void ChangeBitDepth(const uint8_t (&depth)[4]) { - for (uint32_t i = 0; i < 4; i++) { + void ChangeBitDepth(const u8 (&depth)[4]) { + for (u32 i = 0; i < 4; i++) { Component(i) = ChangeBitDepth(Component(i), m_BitDepth[i], depth[i]); m_BitDepth[i] = depth[i]; } } template <typename IntType> - static float ConvertChannelToFloat(IntType channel, uint8_t bitDepth) { + static float ConvertChannelToFloat(IntType channel, u8 bitDepth) { float denominator = static_cast<float>((1 << bitDepth) - 1); return static_cast<float>(channel) / denominator; } // Changes the bit depth of a single component. See the comment // above for how we do this. - static ChannelType ChangeBitDepth(Pixel::ChannelType val, uint8_t oldDepth, uint8_t newDepth) { + static ChannelType ChangeBitDepth(Pixel::ChannelType val, u8 oldDepth, u8 newDepth) { assert(newDepth <= 8); assert(oldDepth <= 8); @@ -722,16 +705,15 @@ public: if (newDepth == 0) { return 0xFF; } else { - uint8_t bitsWasted = static_cast<uint8_t>(oldDepth - newDepth); - uint16_t v = static_cast<uint16_t>(val); - v = static_cast<uint16_t>((v + (1 << (bitsWasted - 1))) >> bitsWasted); - v = ::std::min<uint16_t>(::std::max<uint16_t>(0, v), - static_cast<uint16_t>((1 << newDepth) - 1)); - return static_cast<uint8_t>(v); + u8 bitsWasted = static_cast<u8>(oldDepth - newDepth); + u16 v = static_cast<u16>(val); + v = static_cast<u16>((v + (1 << (bitsWasted - 1))) >> bitsWasted); + v = ::std::min<u16>(::std::max<u16>(0, v), static_cast<u16>((1 << newDepth) - 1)); + return static_cast<u8>(v); } } - assert(!"We shouldn't get here."); + assert(false && "We shouldn't get here."); return 0; } @@ -759,15 +741,15 @@ public: ChannelType& B() { return color[3]; } - const ChannelType& Component(uint32_t idx) const { + const ChannelType& Component(u32 idx) const { return color[idx]; } - ChannelType& Component(uint32_t idx) { + ChannelType& Component(u32 idx) { return color[idx]; } - void GetBitDepth(uint8_t (&outDepth)[4]) const { - for (int i = 0; i < 
4; i++) { + void GetBitDepth(u8 (&outDepth)[4]) const { + for (s32 i = 0; i < 4; i++) { outDepth[i] = m_BitDepth[i]; } } @@ -776,12 +758,12 @@ public: // and then pack each channel into an R8G8B8A8 32-bit integer. We assume // that the architecture is little-endian, so the alpha channel will end // up in the most-significant byte. - uint32_t Pack() const { + u32 Pack() const { Pixel eightBit(*this); - const uint8_t eightBitDepth[4] = {8, 8, 8, 8}; + const u8 eightBitDepth[4] = {8, 8, 8, 8}; eightBit.ChangeBitDepth(eightBitDepth); - uint32_t r = 0; + u32 r = 0; r |= eightBit.A(); r <<= 8; r |= eightBit.B(); @@ -794,7 +776,7 @@ public: // Clamps the pixel to the range [0,255] void ClampByte() { - for (uint32_t i = 0; i < 4; i++) { + for (u32 i = 0; i < 4; i++) { color[i] = (color[i] < 0) ? 0 : ((color[i] > 255) ? 255 : color[i]); } } @@ -804,24 +786,24 @@ public: } }; -static void DecodeColorValues(uint32_t* out, uint8_t* data, const uint32_t* modes, - const uint32_t nPartitions, const uint32_t nBitsForColorData) { +static void DecodeColorValues(u32* out, u8* data, const u32* modes, const u32 nPartitions, + const u32 nBitsForColorData) { // First figure out how many color values we have - uint32_t nValues = 0; - for (uint32_t i = 0; i < nPartitions; i++) { + u32 nValues = 0; + for (u32 i = 0; i < nPartitions; i++) { nValues += ((modes[i] >> 2) + 1) << 1; } // Then based on the number of values and the remaining number of bits, // figure out the max value for each of them... 
- uint32_t range = 256; + u32 range = 256; while (--range > 0) { - IntegerEncodedValue val = IntegerEncodedValue::CreateEncoding(range); - uint32_t bitLength = val.GetBitLength(nValues); + IntegerEncodedValue val = EncodingsValues[range]; + u32 bitLength = val.GetBitLength(nValues); if (bitLength <= nBitsForColorData) { // Find the smallest possible range that matches the given encoding while (--range > 0) { - IntegerEncodedValue newval = IntegerEncodedValue::CreateEncoding(range); + IntegerEncodedValue newval = EncodingsValues[range]; if (!newval.MatchesEncoding(val)) { break; } @@ -835,12 +817,14 @@ static void DecodeColorValues(uint32_t* out, uint8_t* data, const uint32_t* mode // We now have enough to decode our integer sequence. std::vector<IntegerEncodedValue> decodedColorValues; + decodedColorValues.reserve(32); + InputBitStream colorStream(data); - IntegerEncodedValue::DecodeIntegerSequence(decodedColorValues, colorStream, range, nValues); + DecodeIntegerSequence(decodedColorValues, colorStream, range, nValues); // Once we have the decoded values, we need to dequantize them to the 0-255 range // This procedure is outlined in ASTC spec C.2.13 - uint32_t outIdx = 0; + u32 outIdx = 0; for (auto itr = decodedColorValues.begin(); itr != decodedColorValues.end(); ++itr) { // Have we already decoded all that we need? if (outIdx >= nValues) { @@ -848,25 +832,25 @@ static void DecodeColorValues(uint32_t* out, uint8_t* data, const uint32_t* mode } const IntegerEncodedValue& val = *itr; - uint32_t bitlen = val.BaseBitLength(); - uint32_t bitval = val.GetBitValue(); + u32 bitlen = val.num_bits; + u32 bitval = val.bit_value; assert(bitlen >= 1); - uint32_t A = 0, B = 0, C = 0, D = 0; + u32 A = 0, B = 0, C = 0, D = 0; // A is just the lsb replicated 9 times. 
A = Replicate(bitval & 1, 1, 9); - switch (val.GetEncoding()) { + switch (val.encoding) { // Replicate bits - case eIntegerEncoding_JustBits: + case IntegerEncoding::JustBits: out[outIdx++] = Replicate(bitval, bitlen, 8); break; // Use algorithm in C.2.13 - case eIntegerEncoding_Trit: { + case IntegerEncoding::Trit: { - D = val.GetTritValue(); + D = val.trit_value; switch (bitlen) { case 1: { @@ -876,48 +860,48 @@ static void DecodeColorValues(uint32_t* out, uint8_t* data, const uint32_t* mode case 2: { C = 93; // B = b000b0bb0 - uint32_t b = (bitval >> 1) & 1; + u32 b = (bitval >> 1) & 1; B = (b << 8) | (b << 4) | (b << 2) | (b << 1); } break; case 3: { C = 44; // B = cb000cbcb - uint32_t cb = (bitval >> 1) & 3; + u32 cb = (bitval >> 1) & 3; B = (cb << 7) | (cb << 2) | cb; } break; case 4: { C = 22; // B = dcb000dcb - uint32_t dcb = (bitval >> 1) & 7; + u32 dcb = (bitval >> 1) & 7; B = (dcb << 6) | dcb; } break; case 5: { C = 11; // B = edcb000ed - uint32_t edcb = (bitval >> 1) & 0xF; + u32 edcb = (bitval >> 1) & 0xF; B = (edcb << 5) | (edcb >> 2); } break; case 6: { C = 5; // B = fedcb000f - uint32_t fedcb = (bitval >> 1) & 0x1F; + u32 fedcb = (bitval >> 1) & 0x1F; B = (fedcb << 4) | (fedcb >> 4); } break; default: - assert(!"Unsupported trit encoding for color values!"); + assert(false && "Unsupported trit encoding for color values!"); break; } // switch(bitlen) - } // case eIntegerEncoding_Trit + } // case IntegerEncoding::Trit break; - case eIntegerEncoding_Quint: { + case IntegerEncoding::Qus32: { - D = val.GetQuintValue(); + D = val.qus32_value; switch (bitlen) { case 1: { @@ -927,41 +911,41 @@ static void DecodeColorValues(uint32_t* out, uint8_t* data, const uint32_t* mode case 2: { C = 54; // B = b0000bb00 - uint32_t b = (bitval >> 1) & 1; + u32 b = (bitval >> 1) & 1; B = (b << 8) | (b << 3) | (b << 2); } break; case 3: { C = 26; // B = cb0000cbc - uint32_t cb = (bitval >> 1) & 3; + u32 cb = (bitval >> 1) & 3; B = (cb << 7) | (cb << 1) | (cb >> 1); } 
break; case 4: { C = 13; // B = dcb0000dc - uint32_t dcb = (bitval >> 1) & 7; + u32 dcb = (bitval >> 1) & 7; B = (dcb << 6) | (dcb >> 1); } break; case 5: { C = 6; // B = edcb0000e - uint32_t edcb = (bitval >> 1) & 0xF; + u32 edcb = (bitval >> 1) & 0xF; B = (edcb << 5) | (edcb >> 3); } break; default: - assert(!"Unsupported quint encoding for color values!"); + assert(false && "Unsupported quint encoding for color values!"); break; } // switch(bitlen) - } // case eIntegerEncoding_Quint + } // case IntegerEncoding::Qus32 break; - } // switch(val.GetEncoding()) + } // switch(val.encoding) - if (val.GetEncoding() != eIntegerEncoding_JustBits) { - uint32_t T = D * C + B; + if (val.encoding != IntegerEncoding::JustBits) { + u32 T = D * C + B; T ^= A; T = (A & 0x80) | (T >> 2); out[outIdx++] = T; @@ -969,31 +953,31 @@ static void DecodeColorValues(uint32_t* out, uint8_t* data, const uint32_t* mode } // Make sure that each of our values is in the proper range... - for (uint32_t i = 0; i < nValues; i++) { + for (u32 i = 0; i < nValues; i++) { assert(out[i] <= 255); } } -static uint32_t UnquantizeTexelWeight(const IntegerEncodedValue& val) { - uint32_t bitval = val.GetBitValue(); - uint32_t bitlen = val.BaseBitLength(); +static u32 UnquantizeTexelWeight(const IntegerEncodedValue& val) { + u32 bitval = val.bit_value; + u32 bitlen = val.num_bits; - uint32_t A = Replicate(bitval & 1, 1, 7); - uint32_t B = 0, C = 0, D = 0; + u32 A = Replicate(bitval & 1, 1, 7); + u32 B = 0, C = 0, D = 0; - uint32_t result = 0; - switch (val.GetEncoding()) { - case eIntegerEncoding_JustBits: + u32 result = 0; + switch (val.encoding) { + case IntegerEncoding::JustBits: result = Replicate(bitval, bitlen, 6); break; - case eIntegerEncoding_Trit: { - D = val.GetTritValue(); + case IntegerEncoding::Trit: { + D = val.trit_value; assert(D < 3); switch (bitlen) { case 0: { - uint32_t results[3] = {0, 32, 63}; + u32 results[3] = {0, 32, 63}; result = results[D]; } break; @@ -1003,29 +987,29 @@ static 
uint32_t UnquantizeTexelWeight(const IntegerEncodedValue& val) { case 2: { C = 23; - uint32_t b = (bitval >> 1) & 1; + u32 b = (bitval >> 1) & 1; B = (b << 6) | (b << 2) | b; } break; case 3: { C = 11; - uint32_t cb = (bitval >> 1) & 3; + u32 cb = (bitval >> 1) & 3; B = (cb << 5) | cb; } break; default: - assert(!"Invalid trit encoding for texel weight"); + assert(false && "Invalid trit encoding for texel weight"); break; } } break; - case eIntegerEncoding_Quint: { - D = val.GetQuintValue(); + case IntegerEncoding::Qus32: { + D = val.qus32_value; assert(D < 5); switch (bitlen) { case 0: { - uint32_t results[5] = {0, 16, 32, 47, 63}; + u32 results[5] = {0, 16, 32, 47, 63}; result = results[D]; } break; @@ -1035,18 +1019,18 @@ static uint32_t UnquantizeTexelWeight(const IntegerEncodedValue& val) { case 2: { C = 13; - uint32_t b = (bitval >> 1) & 1; + u32 b = (bitval >> 1) & 1; B = (b << 6) | (b << 1); } break; default: - assert(!"Invalid quint encoding for texel weight"); + assert(false && "Invalid quint encoding for texel weight"); break; } } break; } - if (val.GetEncoding() != eIntegerEncoding_JustBits && bitlen > 0) { + if (val.encoding != IntegerEncoding::JustBits && bitlen > 0) { // Decode the value... 
result = D * C + B; result ^= A; @@ -1063,12 +1047,11 @@ static uint32_t UnquantizeTexelWeight(const IntegerEncodedValue& val) { return result; } -static void UnquantizeTexelWeights(uint32_t out[2][144], - const std::vector<IntegerEncodedValue>& weights, - const TexelWeightParams& params, const uint32_t blockWidth, - const uint32_t blockHeight) { - uint32_t weightIdx = 0; - uint32_t unquantized[2][144]; +static void UnquantizeTexelWeights(u32 out[2][144], const std::vector<IntegerEncodedValue>& weights, + const TexelWeightParams& params, const u32 blockWidth, + const u32 blockHeight) { + u32 weightIdx = 0; + u32 unquantized[2][144]; for (auto itr = weights.begin(); itr != weights.end(); ++itr) { unquantized[0][weightIdx] = UnquantizeTexelWeight(*itr); @@ -1086,34 +1069,34 @@ static void UnquantizeTexelWeights(uint32_t out[2][144], } // Do infill if necessary (Section C.2.18) ... - uint32_t Ds = (1024 + (blockWidth / 2)) / (blockWidth - 1); - uint32_t Dt = (1024 + (blockHeight / 2)) / (blockHeight - 1); + u32 Ds = (1024 + (blockWidth / 2)) / (blockWidth - 1); + u32 Dt = (1024 + (blockHeight / 2)) / (blockHeight - 1); - const uint32_t kPlaneScale = params.m_bDualPlane ? 2U : 1U; - for (uint32_t plane = 0; plane < kPlaneScale; plane++) - for (uint32_t t = 0; t < blockHeight; t++) - for (uint32_t s = 0; s < blockWidth; s++) { - uint32_t cs = Ds * s; - uint32_t ct = Dt * t; + const u32 kPlaneScale = params.m_bDualPlane ? 
2U : 1U; + for (u32 plane = 0; plane < kPlaneScale; plane++) + for (u32 t = 0; t < blockHeight; t++) + for (u32 s = 0; s < blockWidth; s++) { + u32 cs = Ds * s; + u32 ct = Dt * t; - uint32_t gs = (cs * (params.m_Width - 1) + 32) >> 6; - uint32_t gt = (ct * (params.m_Height - 1) + 32) >> 6; + u32 gs = (cs * (params.m_Width - 1) + 32) >> 6; + u32 gt = (ct * (params.m_Height - 1) + 32) >> 6; - uint32_t js = gs >> 4; - uint32_t fs = gs & 0xF; + u32 js = gs >> 4; + u32 fs = gs & 0xF; - uint32_t jt = gt >> 4; - uint32_t ft = gt & 0x0F; + u32 jt = gt >> 4; + u32 ft = gt & 0x0F; - uint32_t w11 = (fs * ft + 8) >> 4; - uint32_t w10 = ft - w11; - uint32_t w01 = fs - w11; - uint32_t w00 = 16 - fs - ft + w11; + u32 w11 = (fs * ft + 8) >> 4; + u32 w10 = ft - w11; + u32 w01 = fs - w11; + u32 w00 = 16 - fs - ft + w11; - uint32_t v0 = js + jt * params.m_Width; + u32 v0 = js + jt * params.m_Width; #define FIND_TEXEL(tidx, bidx) \ - uint32_t p##bidx = 0; \ + u32 p##bidx = 0; \ do { \ if ((tidx) < (params.m_Width * params.m_Height)) { \ p##bidx = unquantized[plane][(tidx)]; \ @@ -1133,7 +1116,7 @@ static void UnquantizeTexelWeights(uint32_t out[2][144], } // Transfers a bit as described in C.2.14 -static inline void BitTransferSigned(int32_t& a, int32_t& b) { +static inline void BitTransferSigned(s32& a, s32& b) { b >>= 1; b |= a & 0x80; a >>= 1; @@ -1144,14 +1127,14 @@ static inline void BitTransferSigned(int32_t& a, int32_t& b) { // Adds more precision to the blue channel as described // in C.2.14 -static inline Pixel BlueContract(int32_t a, int32_t r, int32_t g, int32_t b) { - return Pixel(static_cast<int16_t>(a), static_cast<int16_t>((r + b) >> 1), - static_cast<int16_t>((g + b) >> 1), static_cast<int16_t>(b)); +static inline Pixel BlueContract(s32 a, s32 r, s32 g, s32 b) { + return Pixel(static_cast<s16>(a), static_cast<s16>((r + b) >> 1), + static_cast<s16>((g + b) >> 1), static_cast<s16>(b)); } // Partition selection functions as specified in // C.2.21 -static inline uint32_t 
hash52(uint32_t p) { +static inline u32 hash52(u32 p) { p ^= p >> 15; p -= p << 17; p += p << 7; @@ -1165,8 +1148,7 @@ static inline uint32_t hash52(uint32_t p) { return p; } -static uint32_t SelectPartition(int32_t seed, int32_t x, int32_t y, int32_t z, - int32_t partitionCount, int32_t smallBlock) { +static u32 SelectPartition(s32 seed, s32 x, s32 y, s32 z, s32 partitionCount, s32 smallBlock) { if (1 == partitionCount) return 0; @@ -1178,34 +1160,34 @@ static uint32_t SelectPartition(int32_t seed, int32_t x, int32_t y, int32_t z, seed += (partitionCount - 1) * 1024; - uint32_t rnum = hash52(static_cast<uint32_t>(seed)); - uint8_t seed1 = static_cast<uint8_t>(rnum & 0xF); - uint8_t seed2 = static_cast<uint8_t>((rnum >> 4) & 0xF); - uint8_t seed3 = static_cast<uint8_t>((rnum >> 8) & 0xF); - uint8_t seed4 = static_cast<uint8_t>((rnum >> 12) & 0xF); - uint8_t seed5 = static_cast<uint8_t>((rnum >> 16) & 0xF); - uint8_t seed6 = static_cast<uint8_t>((rnum >> 20) & 0xF); - uint8_t seed7 = static_cast<uint8_t>((rnum >> 24) & 0xF); - uint8_t seed8 = static_cast<uint8_t>((rnum >> 28) & 0xF); - uint8_t seed9 = static_cast<uint8_t>((rnum >> 18) & 0xF); - uint8_t seed10 = static_cast<uint8_t>((rnum >> 22) & 0xF); - uint8_t seed11 = static_cast<uint8_t>((rnum >> 26) & 0xF); - uint8_t seed12 = static_cast<uint8_t>(((rnum >> 30) | (rnum << 2)) & 0xF); - - seed1 = static_cast<uint8_t>(seed1 * seed1); - seed2 = static_cast<uint8_t>(seed2 * seed2); - seed3 = static_cast<uint8_t>(seed3 * seed3); - seed4 = static_cast<uint8_t>(seed4 * seed4); - seed5 = static_cast<uint8_t>(seed5 * seed5); - seed6 = static_cast<uint8_t>(seed6 * seed6); - seed7 = static_cast<uint8_t>(seed7 * seed7); - seed8 = static_cast<uint8_t>(seed8 * seed8); - seed9 = static_cast<uint8_t>(seed9 * seed9); - seed10 = static_cast<uint8_t>(seed10 * seed10); - seed11 = static_cast<uint8_t>(seed11 * seed11); - seed12 = static_cast<uint8_t>(seed12 * seed12); - - int32_t sh1, sh2, sh3; + u32 rnum = 
hash52(static_cast<u32>(seed)); + u8 seed1 = static_cast<u8>(rnum & 0xF); + u8 seed2 = static_cast<u8>((rnum >> 4) & 0xF); + u8 seed3 = static_cast<u8>((rnum >> 8) & 0xF); + u8 seed4 = static_cast<u8>((rnum >> 12) & 0xF); + u8 seed5 = static_cast<u8>((rnum >> 16) & 0xF); + u8 seed6 = static_cast<u8>((rnum >> 20) & 0xF); + u8 seed7 = static_cast<u8>((rnum >> 24) & 0xF); + u8 seed8 = static_cast<u8>((rnum >> 28) & 0xF); + u8 seed9 = static_cast<u8>((rnum >> 18) & 0xF); + u8 seed10 = static_cast<u8>((rnum >> 22) & 0xF); + u8 seed11 = static_cast<u8>((rnum >> 26) & 0xF); + u8 seed12 = static_cast<u8>(((rnum >> 30) | (rnum << 2)) & 0xF); + + seed1 = static_cast<u8>(seed1 * seed1); + seed2 = static_cast<u8>(seed2 * seed2); + seed3 = static_cast<u8>(seed3 * seed3); + seed4 = static_cast<u8>(seed4 * seed4); + seed5 = static_cast<u8>(seed5 * seed5); + seed6 = static_cast<u8>(seed6 * seed6); + seed7 = static_cast<u8>(seed7 * seed7); + seed8 = static_cast<u8>(seed8 * seed8); + seed9 = static_cast<u8>(seed9 * seed9); + seed10 = static_cast<u8>(seed10 * seed10); + seed11 = static_cast<u8>(seed11 * seed11); + seed12 = static_cast<u8>(seed12 * seed12); + + s32 sh1, sh2, sh3; if (seed & 1) { sh1 = (seed & 2) ? 4 : 5; sh2 = (partitionCount == 3) ? 6 : 5; @@ -1215,23 +1197,23 @@ static uint32_t SelectPartition(int32_t seed, int32_t x, int32_t y, int32_t z, } sh3 = (seed & 0x10) ? 
sh1 : sh2; - seed1 = static_cast<uint8_t>(seed1 >> sh1); - seed2 = static_cast<uint8_t>(seed2 >> sh2); - seed3 = static_cast<uint8_t>(seed3 >> sh1); - seed4 = static_cast<uint8_t>(seed4 >> sh2); - seed5 = static_cast<uint8_t>(seed5 >> sh1); - seed6 = static_cast<uint8_t>(seed6 >> sh2); - seed7 = static_cast<uint8_t>(seed7 >> sh1); - seed8 = static_cast<uint8_t>(seed8 >> sh2); - seed9 = static_cast<uint8_t>(seed9 >> sh3); - seed10 = static_cast<uint8_t>(seed10 >> sh3); - seed11 = static_cast<uint8_t>(seed11 >> sh3); - seed12 = static_cast<uint8_t>(seed12 >> sh3); - - int32_t a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14); - int32_t b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10); - int32_t c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6); - int32_t d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2); + seed1 = static_cast<u8>(seed1 >> sh1); + seed2 = static_cast<u8>(seed2 >> sh2); + seed3 = static_cast<u8>(seed3 >> sh1); + seed4 = static_cast<u8>(seed4 >> sh2); + seed5 = static_cast<u8>(seed5 >> sh1); + seed6 = static_cast<u8>(seed6 >> sh2); + seed7 = static_cast<u8>(seed7 >> sh1); + seed8 = static_cast<u8>(seed8 >> sh2); + seed9 = static_cast<u8>(seed9 >> sh3); + seed10 = static_cast<u8>(seed10 >> sh3); + seed11 = static_cast<u8>(seed11 >> sh3); + seed12 = static_cast<u8>(seed12 >> sh3); + + s32 a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14); + s32 b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10); + s32 c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6); + s32 d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2); a &= 0x3F; b &= 0x3F; @@ -1252,27 +1234,26 @@ static uint32_t SelectPartition(int32_t seed, int32_t x, int32_t y, int32_t z, return 3; } -static inline uint32_t Select2DPartition(int32_t seed, int32_t x, int32_t y, int32_t partitionCount, - int32_t smallBlock) { +static inline u32 Select2DPartition(s32 seed, s32 x, s32 y, s32 partitionCount, s32 smallBlock) { return SelectPartition(seed, x, y, 0, partitionCount, smallBlock); } 
// Section C.2.14 -static void ComputeEndpoints(Pixel& ep1, Pixel& ep2, const uint32_t*& colorValues, - uint32_t colorEndpointMode) { +static void ComputeEndpos32s(Pixel& ep1, Pixel& ep2, const u32*& colorValues, + u32 colorEndpos32Mode) { #define READ_UINT_VALUES(N) \ - uint32_t v[N]; \ - for (uint32_t i = 0; i < N; i++) { \ + u32 v[N]; \ + for (u32 i = 0; i < N; i++) { \ v[i] = *(colorValues++); \ } #define READ_INT_VALUES(N) \ - int32_t v[N]; \ - for (uint32_t i = 0; i < N; i++) { \ - v[i] = static_cast<int32_t>(*(colorValues++)); \ + s32 v[N]; \ + for (u32 i = 0; i < N; i++) { \ + v[i] = static_cast<s32>(*(colorValues++)); \ } - switch (colorEndpointMode) { + switch (colorEndpos32Mode) { case 0: { READ_UINT_VALUES(2) ep1 = Pixel(0xFF, v[0], v[0], v[0]); @@ -1281,8 +1262,8 @@ static void ComputeEndpoints(Pixel& ep1, Pixel& ep2, const uint32_t*& colorValue case 1: { READ_UINT_VALUES(2) - uint32_t L0 = (v[0] >> 2) | (v[1] & 0xC0); - uint32_t L1 = std::max(L0 + (v[1] & 0x3F), 0xFFU); + u32 L0 = (v[0] >> 2) | (v[1] & 0xC0); + u32 L1 = std::max(L0 + (v[1] & 0x3F), 0xFFU); ep1 = Pixel(0xFF, L0, L0, L0); ep2 = Pixel(0xFF, L1, L1, L1); } break; @@ -1371,7 +1352,7 @@ static void ComputeEndpoints(Pixel& ep1, Pixel& ep2, const uint32_t*& colorValue } break; default: - assert(!"Unsupported color endpoint mode (is it HDR?)"); + assert(false && "Unsupported color endpoint mode (is it HDR?)"); break; } @@ -1379,14 +1360,14 @@ static void ComputeEndpoints(Pixel& ep1, Pixel& ep2, const uint32_t*& colorValue #undef READ_INT_VALUES } -static void DecompressBlock(const uint8_t inBuf[16], const uint32_t blockWidth, - const uint32_t blockHeight, uint32_t* outBuf) { +static void DecompressBlock(const u8 inBuf[16], const u32 blockWidth, const u32 blockHeight, + u32* outBuf) { InputBitStream strm(inBuf); TexelWeightParams weightParams = DecodeBlockInfo(strm); // Was there an error? 
if (weightParams.m_bError) { - assert(!"Invalid block mode"); + assert(false && "Invalid block mode"); FillError(outBuf, blockWidth, blockHeight); return; } @@ -1397,63 +1378,63 @@ static void DecompressBlock(const uint8_t inBuf[16], const uint32_t blockWidth, } if (weightParams.m_bVoidExtentHDR) { - assert(!"HDR void extent blocks are unsupported!"); + assert(false && "HDR void extent blocks are unsupported!"); FillError(outBuf, blockWidth, blockHeight); return; } if (weightParams.m_Width > blockWidth) { - assert(!"Texel weight grid width should be smaller than block width"); + assert(false && "Texel weight grid width should be smaller than block width"); FillError(outBuf, blockWidth, blockHeight); return; } if (weightParams.m_Height > blockHeight) { - assert(!"Texel weight grid height should be smaller than block height"); + assert(false && "Texel weight grid height should be smaller than block height"); FillError(outBuf, blockWidth, blockHeight); return; } // Read num partitions - uint32_t nPartitions = strm.ReadBits(2) + 1; + u32 nPartitions = strm.ReadBits<2>() + 1; assert(nPartitions <= 4); if (nPartitions == 4 && weightParams.m_bDualPlane) { - assert(!"Dual plane mode is incompatible with four partition blocks"); + assert(false && "Dual plane mode is incompatible with four partition blocks"); FillError(outBuf, blockWidth, blockHeight); return; } - // Based on the number of partitions, read the color endpoint mode for + // Based on the number of partitions, read the color endpos32 mode for // each partition. - // Determine partitions, partition index, and color endpoint modes - int32_t planeIdx = -1; - uint32_t partitionIndex; - uint32_t colorEndpointMode[4] = {0, 0, 0, 0}; + // Determine partitions, partition index, and color endpos32 modes + s32 planeIdx = -1; + u32 partitionIndex; + u32 colorEndpos32Mode[4] = {0, 0, 0, 0}; // Define color data. 
- uint8_t colorEndpointData[16]; - memset(colorEndpointData, 0, sizeof(colorEndpointData)); - OutputBitStream colorEndpointStream(colorEndpointData, 16 * 8, 0); + u8 colorEndpos32Data[16]; + memset(colorEndpos32Data, 0, sizeof(colorEndpos32Data)); + OutputBitStream colorEndpos32Stream(colorEndpos32Data, 16 * 8, 0); // Read extra config data... - uint32_t baseCEM = 0; + u32 baseCEM = 0; if (nPartitions == 1) { - colorEndpointMode[0] = strm.ReadBits(4); + colorEndpos32Mode[0] = strm.ReadBits<4>(); partitionIndex = 0; } else { - partitionIndex = strm.ReadBits(10); - baseCEM = strm.ReadBits(6); + partitionIndex = strm.ReadBits<10>(); + baseCEM = strm.ReadBits<6>(); } - uint32_t baseMode = (baseCEM & 3); + u32 baseMode = (baseCEM & 3); - // Remaining bits are color endpoint data... - uint32_t nWeightBits = weightParams.GetPackedBitSize(); - int32_t remainingBits = 128 - nWeightBits - strm.GetBitsRead(); + // Remaining bits are color endpos32 data... + u32 nWeightBits = weightParams.GetPackedBitSize(); + s32 remainingBits = 128 - nWeightBits - static_cast<s32>(strm.GetBitsRead()); // Consider extra bits prior to texel data... - uint32_t extraCEMbits = 0; + u32 extraCEMbits = 0; if (baseMode) { switch (nPartitions) { case 2: @@ -1473,18 +1454,18 @@ static void DecompressBlock(const uint8_t inBuf[16], const uint32_t blockWidth, remainingBits -= extraCEMbits; // Do we have a dual plane situation? - uint32_t planeSelectorBits = 0; + u32 planeSelectorBits = 0; if (weightParams.m_bDualPlane) { planeSelectorBits = 2; } remainingBits -= planeSelectorBits; // Read color data... 
- uint32_t colorDataBits = remainingBits; + u32 colorDataBits = remainingBits; while (remainingBits > 0) { - uint32_t nb = std::min(remainingBits, 8); - uint32_t b = strm.ReadBits(nb); - colorEndpointStream.WriteBits(b, nb); + u32 nb = std::min(remainingBits, 8); + u32 b = strm.ReadBits(nb); + colorEndpos32Stream.WriteBits(b, nb); remainingBits -= 8; } @@ -1493,64 +1474,64 @@ static void DecompressBlock(const uint8_t inBuf[16], const uint32_t blockWidth, // Read the rest of the CEM if (baseMode) { - uint32_t extraCEM = strm.ReadBits(extraCEMbits); - uint32_t CEM = (extraCEM << 6) | baseCEM; + u32 extraCEM = strm.ReadBits(extraCEMbits); + u32 CEM = (extraCEM << 6) | baseCEM; CEM >>= 2; bool C[4] = {0}; - for (uint32_t i = 0; i < nPartitions; i++) { + for (u32 i = 0; i < nPartitions; i++) { C[i] = CEM & 1; CEM >>= 1; } - uint8_t M[4] = {0}; - for (uint32_t i = 0; i < nPartitions; i++) { + u8 M[4] = {0}; + for (u32 i = 0; i < nPartitions; i++) { M[i] = CEM & 3; CEM >>= 2; assert(M[i] <= 3); } - for (uint32_t i = 0; i < nPartitions; i++) { - colorEndpointMode[i] = baseMode; + for (u32 i = 0; i < nPartitions; i++) { + colorEndpos32Mode[i] = baseMode; if (!(C[i])) - colorEndpointMode[i] -= 1; - colorEndpointMode[i] <<= 2; - colorEndpointMode[i] |= M[i]; + colorEndpos32Mode[i] -= 1; + colorEndpos32Mode[i] <<= 2; + colorEndpos32Mode[i] |= M[i]; } } else if (nPartitions > 1) { - uint32_t CEM = baseCEM >> 2; - for (uint32_t i = 0; i < nPartitions; i++) { - colorEndpointMode[i] = CEM; + u32 CEM = baseCEM >> 2; + for (u32 i = 0; i < nPartitions; i++) { + colorEndpos32Mode[i] = CEM; } } // Make sure everything up till here is sane. 
- for (uint32_t i = 0; i < nPartitions; i++) { - assert(colorEndpointMode[i] < 16); + for (u32 i = 0; i < nPartitions; i++) { + assert(colorEndpos32Mode[i] < 16); } assert(strm.GetBitsRead() + weightParams.GetPackedBitSize() == 128); // Decode both color data and texel weight data - uint32_t colorValues[32]; // Four values, two endpoints, four maximum paritions - DecodeColorValues(colorValues, colorEndpointData, colorEndpointMode, nPartitions, + u32 colorValues[32]; // Four values, two endpos32s, four maximum paritions + DecodeColorValues(colorValues, colorEndpos32Data, colorEndpos32Mode, nPartitions, colorDataBits); - Pixel endpoints[4][2]; - const uint32_t* colorValuesPtr = colorValues; - for (uint32_t i = 0; i < nPartitions; i++) { - ComputeEndpoints(endpoints[i][0], endpoints[i][1], colorValuesPtr, colorEndpointMode[i]); + Pixel endpos32s[4][2]; + const u32* colorValuesPtr = colorValues; + for (u32 i = 0; i < nPartitions; i++) { + ComputeEndpos32s(endpos32s[i][0], endpos32s[i][1], colorValuesPtr, colorEndpos32Mode[i]); } // Read the texel weight data.. 
- uint8_t texelWeightData[16]; + u8 texelWeightData[16]; memcpy(texelWeightData, inBuf, sizeof(texelWeightData)); // Reverse everything - for (uint32_t i = 0; i < 8; i++) { + for (u32 i = 0; i < 8; i++) { // Taken from http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits #define REVERSE_BYTE(b) (((b)*0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32 - unsigned char a = static_cast<unsigned char>(REVERSE_BYTE(texelWeightData[i])); - unsigned char b = static_cast<unsigned char>(REVERSE_BYTE(texelWeightData[15 - i])); + u8 a = static_cast<u8>(REVERSE_BYTE(texelWeightData[i])); + u8 b = static_cast<u8>(REVERSE_BYTE(texelWeightData[15 - i])); #undef REVERSE_BYTE texelWeightData[i] = b; @@ -1558,50 +1539,51 @@ static void DecompressBlock(const uint8_t inBuf[16], const uint32_t blockWidth, } // Make sure that higher non-texel bits are set to zero - const uint32_t clearByteStart = (weightParams.GetPackedBitSize() >> 3) + 1; + const u32 clearByteStart = (weightParams.GetPackedBitSize() >> 3) + 1; texelWeightData[clearByteStart - 1] = texelWeightData[clearByteStart - 1] & - static_cast<uint8_t>((1 << (weightParams.GetPackedBitSize() % 8)) - 1); + static_cast<u8>((1 << (weightParams.GetPackedBitSize() % 8)) - 1); memset(texelWeightData + clearByteStart, 0, 16 - clearByteStart); std::vector<IntegerEncodedValue> texelWeightValues; + texelWeightValues.reserve(64); + InputBitStream weightStream(texelWeightData); - IntegerEncodedValue::DecodeIntegerSequence(texelWeightValues, weightStream, - weightParams.m_MaxWeight, - weightParams.GetNumWeightValues()); + DecodeIntegerSequence(texelWeightValues, weightStream, weightParams.m_MaxWeight, + weightParams.GetNumWeightValues()); // Blocks can be at most 12x12, so we can have as many as 144 weights - uint32_t weights[2][144]; + u32 weights[2][144]; UnquantizeTexelWeights(weights, texelWeightValues, weightParams, blockWidth, blockHeight); - // Now that we have endpoints and weights, we can interpolate and 
generate + // Now that we have endpos32s and weights, we can s32erpolate and generate // the proper decoding... - for (uint32_t j = 0; j < blockHeight; j++) - for (uint32_t i = 0; i < blockWidth; i++) { - uint32_t partition = Select2DPartition(partitionIndex, i, j, nPartitions, - (blockHeight * blockWidth) < 32); + for (u32 j = 0; j < blockHeight; j++) + for (u32 i = 0; i < blockWidth; i++) { + u32 partition = Select2DPartition(partitionIndex, i, j, nPartitions, + (blockHeight * blockWidth) < 32); assert(partition < nPartitions); Pixel p; - for (uint32_t c = 0; c < 4; c++) { - uint32_t C0 = endpoints[partition][0].Component(c); + for (u32 c = 0; c < 4; c++) { + u32 C0 = endpos32s[partition][0].Component(c); C0 = Replicate(C0, 8, 16); - uint32_t C1 = endpoints[partition][1].Component(c); + u32 C1 = endpos32s[partition][1].Component(c); C1 = Replicate(C1, 8, 16); - uint32_t plane = 0; + u32 plane = 0; if (weightParams.m_bDualPlane && (((planeIdx + 1) & 3) == c)) { plane = 1; } - uint32_t weight = weights[plane][j * blockWidth + i]; - uint32_t C = (C0 * (64 - weight) + C1 * weight + 32) / 64; + u32 weight = weights[plane][j * blockWidth + i]; + u32 C = (C0 * (64 - weight) + C1 * weight + 32) / 64; if (C == 65535) { p.Component(c) = 255; } else { double Cf = static_cast<double>(C); - p.Component(c) = static_cast<uint16_t>(255.0 * (Cf / 65536.0) + 0.5); + p.Component(c) = static_cast<u16>(255.0 * (Cf / 65536.0) + 0.5); } } @@ -1613,26 +1595,26 @@ static void DecompressBlock(const uint8_t inBuf[16], const uint32_t blockWidth, namespace Tegra::Texture::ASTC { -std::vector<uint8_t> Decompress(const uint8_t* data, uint32_t width, uint32_t height, - uint32_t depth, uint32_t block_width, uint32_t block_height) { - uint32_t blockIdx = 0; +std::vector<u8> Decompress(const u8* data, u32 width, u32 height, u32 depth, u32 block_width, + u32 block_height) { + u32 blockIdx = 0; std::size_t depth_offset = 0; - std::vector<uint8_t> outData(height * width * depth * 4); - for (uint32_t 
k = 0; k < depth; k++) { - for (uint32_t j = 0; j < height; j += block_height) { - for (uint32_t i = 0; i < width; i += block_width) { + std::vector<u8> outData(height * width * depth * 4); + for (u32 k = 0; k < depth; k++) { + for (u32 j = 0; j < height; j += block_height) { + for (u32 i = 0; i < width; i += block_width) { - const uint8_t* blockPtr = data + blockIdx * 16; + const u8* blockPtr = data + blockIdx * 16; // Blocks can be at most 12x12 - uint32_t uncompData[144]; + u32 uncompData[144]; ASTCC::DecompressBlock(blockPtr, block_width, block_height, uncompData); - uint32_t decompWidth = std::min(block_width, width - i); - uint32_t decompHeight = std::min(block_height, height - j); + u32 decompWidth = std::min(block_width, width - i); + u32 decompHeight = std::min(block_height, height - j); - uint8_t* outRow = depth_offset + outData.data() + (j * width + i) * 4; - for (uint32_t jj = 0; jj < decompHeight; jj++) { + u8* outRow = depth_offset + outData.data() + (j * width + i) * 4; + for (u32 jj = 0; jj < decompHeight; jj++) { memcpy(outRow + jj * width * 4, uncompData + jj * block_width, decompWidth * 4); } diff --git a/src/video_core/textures/texture.h b/src/video_core/textures/texture.h index 8e82c6748..7edc4abe1 100644 --- a/src/video_core/textures/texture.h +++ b/src/video_core/textures/texture.h @@ -8,6 +8,7 @@ #include "common/assert.h" #include "common/bit_field.h" #include "common/common_types.h" +#include "core/settings.h" namespace Tegra::Texture { @@ -294,6 +295,14 @@ enum class TextureMipmapFilter : u32 { Linear = 3, }; +enum class Anisotropy { + Default, + Filter2x, + Filter4x, + Filter8x, + Filter16x, +}; + struct TSCEntry { union { struct { @@ -328,7 +337,22 @@ struct TSCEntry { }; float GetMaxAnisotropy() const { - return static_cast<float>(1U << max_anisotropy); + const u32 min_value = [] { + switch (static_cast<Anisotropy>(Settings::values.max_anisotropy)) { + default: + case Anisotropy::Default: + return 1U; + case Anisotropy::Filter2x: + 
return 2U; + case Anisotropy::Filter4x: + return 4U; + case Anisotropy::Filter8x: + return 8U; + case Anisotropy::Filter16x: + return 16U; + } + }(); + return static_cast<float>(std::max(1U << max_anisotropy, min_value)); } float GetMinLod() const { |