From 8a3411b417f76db786b1d3cfffbd90926abb20ca Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sun, 27 Mar 2022 05:05:57 +0200 Subject: Engines: Implement Accelerate DMA Texture. --- src/video_core/texture_cache/image_info.cpp | 45 ++++++++++- src/video_core/texture_cache/image_info.h | 2 + src/video_core/texture_cache/texture_cache.h | 69 ++++++++++++++++ src/video_core/texture_cache/texture_cache_base.h | 5 ++ src/video_core/texture_cache/types.h | 1 + src/video_core/texture_cache/util.cpp | 98 ++++++++++++++++++++--- src/video_core/texture_cache/util.h | 10 +++ 7 files changed, 218 insertions(+), 12 deletions(-) (limited to 'src/video_core/texture_cache') diff --git a/src/video_core/texture_cache/image_info.cpp b/src/video_core/texture_cache/image_info.cpp index e9100091e..a1296b574 100644 --- a/src/video_core/texture_cache/image_info.cpp +++ b/src/video_core/texture_cache/image_info.cpp @@ -216,10 +216,51 @@ ImageInfo::ImageInfo(const Tegra::Engines::Fermi2D::Surface& config) noexcept { .height = config.height, .depth = 1, }; - rescaleable = block.depth == 0; - rescaleable &= size.height > 256; + rescaleable = block.depth == 0 && size.height > 256; downscaleable = size.height > 512; } } +static PixelFormat ByteSizeToFormat(u32 bytes_per_pixel) { + switch (bytes_per_pixel) { + case 1: + return PixelFormat::R8_UINT; + case 2: + return PixelFormat::R8G8_UINT; + case 4: + return PixelFormat::A8B8G8R8_UINT; + case 8: + return PixelFormat::R16G16B16A16_UINT; + case 16: + return PixelFormat::R32G32B32A32_UINT; + default: + UNIMPLEMENTED(); + return PixelFormat::Invalid; + } +} + +ImageInfo::ImageInfo(const Tegra::DMA::ImageOperand& config) noexcept { + const u32 bytes_per_pixel = config.bytes_per_pixel; + format = ByteSizeToFormat(bytes_per_pixel); + type = config.params.block_size.depth > 0 ? ImageType::e3D : ImageType::e2D; + num_samples = 1; + block = Extent3D{ + .width = config.params.block_size.width, + .height = config.params.block_size.height, + .depth = config.params.block_size.depth, + }; + size = Extent3D{ + .width = config.params.width, + .height = config.params.height, + .depth = config.params.depth, + }; + tile_width_spacing = 0; + resources.levels = 1; + resources.layers = 1; + layer_stride = CalculateLayerStride(*this); + maybe_unaligned_layer_stride = CalculateLayerSize(*this); + rescaleable = block.depth == 0 && size.height > 256; + downscaleable = size.height > 512; +} + } // namespace VideoCommon diff --git a/src/video_core/texture_cache/image_info.h b/src/video_core/texture_cache/image_info.h index 93755e15e..a12f5b44f 100644 --- a/src/video_core/texture_cache/image_info.h +++ b/src/video_core/texture_cache/image_info.h @@ -5,6 +5,7 @@ #include "video_core/engines/fermi_2d.h" #include "video_core/engines/maxwell_3d.h" +#include "video_core/engines/maxwell_dma.h" #include "video_core/surface.h" #include "video_core/texture_cache/types.h" @@ -19,6 +20,7 @@ struct ImageInfo { explicit ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs, size_t index) noexcept; explicit ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs) noexcept; explicit ImageInfo(const Tegra::Engines::Fermi2D::Surface& config) noexcept; + explicit ImageInfo(const Tegra::DMA::ImageOperand& config) noexcept; PixelFormat format = PixelFormat::Invalid; ImageType type = ImageType::e1D; diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 9dd152fbe..335338434 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -1358,6 +1358,75 @@ std::optional::BlitImages> TextureCache

::GetBlitImag }}; } +template +ImageId TextureCache

::FindDMAImage(const ImageInfo& info, GPUVAddr gpu_addr) { + std::optional cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr); + if (!cpu_addr) { + cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr, CalculateGuestSizeInBytes(info)); + if (!cpu_addr) { + return ImageId{}; + } + } + ImageId image_id{}; + boost::container::small_vector image_ids; + const auto lambda = [&](ImageId existing_image_id, ImageBase& existing_image) { + if (True(existing_image.flags & ImageFlagBits::Remapped)) { + return false; + } + if (info.type == ImageType::Linear || existing_image.info.type == ImageType::Linear) + [[unlikely]] { + const bool strict_size = True(existing_image.flags & ImageFlagBits::Strong); + const ImageInfo& existing = existing_image.info; + if (existing_image.gpu_addr == gpu_addr && existing.type == info.type && + existing.pitch == info.pitch && + IsPitchLinearSameSize(existing, info, strict_size) && + IsViewCompatible(existing.format, info.format, false, true)) { + image_id = existing_image_id; + image_ids.push_back(existing_image_id); + return true; + } + } else if (IsSubCopy(info, existing_image, gpu_addr)) { + image_id = existing_image_id; + image_ids.push_back(existing_image_id); + return true; + } + return false; + }; + ForEachImageInRegion(*cpu_addr, CalculateGuestSizeInBytes(info), lambda); + if (image_ids.size() <= 1) [[likely]] { + return image_id; + } + auto image_ids_compare = [this](ImageId a, ImageId b) { + auto& image_a = slot_images[a]; + auto& image_b = slot_images[b]; + return image_a.modification_tick < image_b.modification_tick; + }; + return *std::ranges::max_element(image_ids, image_ids_compare); +} + +template +std::optional::Image*, std::pair>> +TextureCache

::ObtainImage(const Tegra::DMA::ImageOperand& operand, bool mark_as_modified) { + ImageInfo dst_info(operand); + ImageId dst_id = FindDMAImage(dst_info, operand.address); + if (!dst_id) { + return std::nullopt; + } + auto& image = slot_images[dst_id]; + auto base = image.TryFindBase(operand.address); + if (!base) { + return std::nullopt; + } + if (False(image.flags & ImageFlagBits::GpuModified)) { + // No need to waste time on an image that's synced with guest + return std::nullopt; + } + PrepareImage(dst_id, mark_as_modified, false); + auto& new_image = slot_images[dst_id]; + lru_cache.Touch(new_image.lru_index, frame_tick); + return std::make_pair(&new_image, std::make_pair(base->level, base->layer)); +} + template SamplerId TextureCache

::FindSampler(const TSCEntry& config) { if (std::ranges::all_of(config.raw, [](u64 value) { return value == 0; })) { diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h index 013836933..848a5d9ea 100644 --- a/src/video_core/texture_cache/texture_cache_base.h +++ b/src/video_core/texture_cache/texture_cache_base.h @@ -209,6 +209,9 @@ public: /// Pop asynchronous downloads void PopAsyncFlushes(); + [[nodiscard]] std::optional>> ObtainImage( + const Tegra::DMA::ImageOperand& operand, bool mark_as_modified); + /// Return true when a CPU region is modified from the GPU [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); @@ -300,6 +303,8 @@ private: /// Remove joined images from the cache [[nodiscard]] ImageId JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VAddr cpu_addr); + [[nodiscard]] ImageId FindDMAImage(const ImageInfo& info, GPUVAddr gpu_addr); + /// Return a blit image pair from the given guest blit parameters [[nodiscard]] std::optional GetBlitImages( const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Surface& src, diff --git a/src/video_core/texture_cache/types.h b/src/video_core/texture_cache/types.h index 0453456b4..a0e10643f 100644 --- a/src/video_core/texture_cache/types.h +++ b/src/video_core/texture_cache/types.h @@ -54,6 +54,7 @@ enum class RelaxedOptions : u32 { Format = 1 << 1, Samples = 1 << 2, ForceBrokenViews = 1 << 3, + FormatBpp = 1 << 4, }; DECLARE_ENUM_FLAG_OPERATORS(RelaxedOptions) diff --git a/src/video_core/texture_cache/util.cpp b/src/video_core/texture_cache/util.cpp index 697f86641..de37db684 100644 --- a/src/video_core/texture_cache/util.cpp +++ b/src/video_core/texture_cache/util.cpp @@ -743,6 +743,44 @@ std::vector MakeShrinkImageCopies(const ImageInfo& dst, const ImageIn return copies; } +std::vector MakeReinterpretImageCopies(const ImageInfo& src, u32 up_scale, + u32 down_shift) { + std::vector copies; + copies.reserve(src.resources.levels); + const bool is_3d = src.type == ImageType::e3D; + for (s32 level = 0; level < src.resources.levels; ++level) { + ImageCopy& copy = copies.emplace_back(); + copy.src_subresource = SubresourceLayers{ + .base_level = level, + .base_layer = 0, + .num_layers = src.resources.layers, + }; + copy.dst_subresource = SubresourceLayers{ + .base_level = level, + .base_layer = 0, + .num_layers = src.resources.layers, + }; + copy.src_offset = Offset3D{ + .x = 0, + .y = 0, + .z = 0, + }; + copy.dst_offset = Offset3D{ + .x = 0, + .y = 0, + .z = 0, + }; + const Extent3D mip_size = AdjustMipSize(src.size, level); + copy.extent = AdjustSamplesSize(mip_size, src.num_samples); + if (is_3d) { + copy.extent.depth = src.size.depth; + } + copy.extent.width = std::max((copy.extent.width * up_scale) >> down_shift, 1); + copy.extent.height = std::max((copy.extent.height * up_scale) >> down_shift, 1); + } + return copies; +} + bool IsValidEntry(const Tegra::MemoryManager& gpu_memory, const TICEntry& config) { const GPUVAddr address = config.Address(); if (address == 0) { @@ -999,6 +1037,20 @@ bool IsBlockLinearSizeCompatible(const ImageInfo& lhs, const ImageInfo& rhs, u32 } } +bool IsBlockLinearSizeCompatibleBPPRelaxed(const ImageInfo& lhs, const ImageInfo& rhs, + u32 lhs_level, u32 rhs_level) noexcept { + ASSERT(lhs.type != ImageType::Linear); + ASSERT(rhs.type != ImageType::Linear); + const auto lhs_bpp = BytesPerBlock(lhs.format); + const auto rhs_bpp = BytesPerBlock(rhs.format); + const Extent3D lhs_size = AdjustMipSize(lhs.size, lhs_level); + const Extent3D rhs_size = AdjustMipSize(rhs.size, rhs_level); + return Common::AlignUpLog2(lhs_size.width * lhs_bpp, GOB_SIZE_X_SHIFT) == + Common::AlignUpLog2(rhs_size.width * rhs_bpp, GOB_SIZE_X_SHIFT) && + Common::AlignUpLog2(lhs_size.height, GOB_SIZE_Y_SHIFT) == + Common::AlignUpLog2(rhs_size.height, GOB_SIZE_Y_SHIFT); +} + bool IsPitchLinearSameSize(const ImageInfo& lhs, const ImageInfo& rhs, bool strict_size) noexcept { ASSERT(lhs.type == ImageType::Linear); ASSERT(rhs.type == ImageType::Linear); @@ -1073,7 +1125,8 @@ std::optional FindSubresource(const ImageInfo& candidate, const // Format checking is relaxed, but we still have to check for matching bytes per block. // This avoids creating a view for blits on UE4 titles where formats with different bytes // per block are aliased. - if (BytesPerBlock(existing.format) != BytesPerBlock(candidate.format)) { + if (BytesPerBlock(existing.format) != BytesPerBlock(candidate.format) && + False(options & RelaxedOptions::FormatBpp)) { return std::nullopt; } } else { @@ -1088,10 +1141,8 @@ std::optional FindSubresource(const ImageInfo& candidate, const if (existing.type != candidate.type) { return std::nullopt; } - if (False(options & RelaxedOptions::Samples)) { - if (existing.num_samples != candidate.num_samples) { - return std::nullopt; - } + if (False(options & RelaxedOptions::Samples) && existing.num_samples != candidate.num_samples) { + return std::nullopt; } if (existing.resources.levels < candidate.resources.levels + base->level) { return std::nullopt; @@ -1101,14 +1152,16 @@ std::optional FindSubresource(const ImageInfo& candidate, const if (mip_depth < candidate.size.depth + base->layer) { return std::nullopt; } - } else { - if (existing.resources.layers < candidate.resources.layers + base->layer) { - return std::nullopt; - } + } else if (existing.resources.layers < candidate.resources.layers + base->layer) { + return std::nullopt; } const bool strict_size = False(options & RelaxedOptions::Size); if (!IsBlockLinearSizeCompatible(existing, candidate, base->level, 0, strict_size)) { - return std::nullopt; + if (False(options & RelaxedOptions::FormatBpp)) { + return std::nullopt; + } else if (!IsBlockLinearSizeCompatibleBPPRelaxed(existing, candidate, base->level, 0)) { + return std::nullopt; + } } // TODO: compare block sizes return base; @@ -1120,6 +1173,31 @@ bool IsSubresource(const ImageInfo& candidate, const ImageBase& image, GPUVAddr .has_value(); } +bool IsSubCopy(const ImageInfo& candidate, const ImageBase& image, GPUVAddr candidate_addr) { + const std::optional base = image.TryFindBase(candidate_addr); + if (!base) { + return false; + } + const ImageInfo& existing = image.info; + if (existing.resources.levels < candidate.resources.levels + base->level) { + return false; + } + if (existing.type == ImageType::e3D) { + const u32 mip_depth = std::max(1U, existing.size.depth << base->level); + if (mip_depth < candidate.size.depth + base->layer) { + return false; + } + } else { + if (existing.resources.layers < candidate.resources.layers + base->layer) { + return false; + } + } + if (!IsBlockLinearSizeCompatibleBPPRelaxed(existing, candidate, base->level, 0)) { + return false; + } + return true; +} + void DeduceBlitImages(ImageInfo& dst_info, ImageInfo& src_info, const ImageBase* dst, const ImageBase* src) { const auto original_dst_format = dst_info.format; diff --git a/src/video_core/texture_cache/util.h b/src/video_core/texture_cache/util.h index d103db8ae..84aa6880d 100644 --- a/src/video_core/texture_cache/util.h +++ b/src/video_core/texture_cache/util.h @@ -56,6 +56,10 @@ struct OverlapResult { SubresourceBase base, u32 up_scale = 1, u32 down_shift = 0); +[[nodiscard]] std::vector MakeReinterpretImageCopies(const ImageInfo& src, + u32 up_scale = 1, + u32 down_shift = 0); + [[nodiscard]] bool IsValidEntry(const Tegra::MemoryManager& gpu_memory, const TICEntry& config); [[nodiscard]] std::vector UnswizzleImage(Tegra::MemoryManager& gpu_memory, @@ -88,6 +92,9 @@ void SwizzleImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, const Ima [[nodiscard]] bool IsPitchLinearSameSize(const ImageInfo& lhs, const ImageInfo& rhs, bool strict_size) noexcept; +[[nodiscard]] bool IsBlockLinearSizeCompatibleBPPRelaxed(const ImageInfo& lhs, const ImageInfo& rhs, + u32 lhs_level, u32 rhs_level) noexcept; + [[nodiscard]] std::optional ResolveOverlap(const ImageInfo& new_info, GPUVAddr gpu_addr, VAddr cpu_addr, const ImageBase& overlap, @@ -106,6 +113,9 @@ void SwizzleImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, const Ima GPUVAddr candidate_addr, RelaxedOptions options, bool broken_views, bool native_bgr); +[[nodiscard]] bool IsSubCopy(const ImageInfo& candidate, const ImageBase& image, + GPUVAddr candidate_addr); + void DeduceBlitImages(ImageInfo& dst_info, ImageInfo& src_info, const ImageBase* dst, const ImageBase* src); -- cgit v1.2.3