7 files changed, 79 insertions, 12 deletions
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 9e480dc39..eff6abd55 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -389,6 +389,13 @@ public:
                 ReverseSubtract = 3,
                 Min = 4,
                 Max = 5,
+
+                // OpenGL enum encodings of the same equations, used by Nouveau and some games.
+                AddGL = 0x8006,             // GL_FUNC_ADD
+                SubtractGL = 0x800a,        // GL_FUNC_SUBTRACT
+                ReverseSubtractGL = 0x800b, // GL_FUNC_REVERSE_SUBTRACT
+                MinGL = 0x8007,             // GL_MIN
+                MaxGL = 0x8008              // GL_MAX
             };
 
             enum class Factor : u32 {
diff --git a/src/video_core/macro_interpreter.cpp b/src/video_core/macro_interpreter.cpp
index 335a8d407..2b0dea5cd 100644
--- a/src/video_core/macro_interpreter.cpp
+++ b/src/video_core/macro_interpreter.cpp
@@ -35,6 +35,7 @@ void MacroInterpreter::Reset() {
     // The next parameter index starts at 1, because $r1 already has the value of the first
     // parameter.
     next_parameter_index = 1;
+    carry_flag = false;
 }
 
 bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
@@ -135,14 +136,28 @@ MacroInterpreter::Opcode MacroInterpreter::GetOpcode(u32 offset) const {
     return {macro_memory[offset + pc / sizeof(u32)]};
 }
 
-u32 MacroInterpreter::GetALUResult(ALUOperation operation, u32 src_a, u32 src_b) const {
+u32 MacroInterpreter::GetALUResult(ALUOperation operation, u32 src_a, u32 src_b) {
     switch (operation) {
-    case ALUOperation::Add:
-        return src_a + src_b;
-    // TODO(Subv): Implement AddWithCarry
-    case ALUOperation::Subtract:
-        return src_a - src_b;
-    // TODO(Subv): Implement SubtractWithBorrow
+    case ALUOperation::Add: {
+        const u64 result{static_cast<u64>(src_a) + src_b};
+        carry_flag = result > 0xffffffff; // set when the sum does not fit in 32 bits
+        return static_cast<u32>(result);
+    }
+    case ALUOperation::AddWithCarry: {
+        const u64 result{static_cast<u64>(src_a) + src_b + (carry_flag ? 1ULL : 0ULL)};
+        carry_flag = result > 0xffffffff; // carry-out replaces the carry-in consumed above
+        return static_cast<u32>(result);
+    }
+    case ALUOperation::Subtract: {
+        const u64 result{static_cast<u64>(src_a) - src_b};
+        carry_flag = result < 0x100000000; // set when no borrow occurred (src_a >= src_b)
+        return static_cast<u32>(result);
+    }
+    case ALUOperation::SubtractWithBorrow: {
+        const u64 result{static_cast<u64>(src_a) - src_b - (carry_flag ? 0ULL : 1ULL)};
+        carry_flag = result < 0x100000000; // set when no borrow; a clear flag borrows 1 above
+        return static_cast<u32>(result);
+    }
     case ALUOperation::Xor:
         return src_a ^ src_b;
     case ALUOperation::Or:
diff --git a/src/video_core/macro_interpreter.h b/src/video_core/macro_interpreter.h
index 62d1ce289..cde360288 100644
--- a/src/video_core/macro_interpreter.h
+++ b/src/video_core/macro_interpreter.h
@@ -117,7 +117,7 @@ private:
     bool Step(u32 offset, bool is_delay_slot);
 
     /// Calculates the result of an ALU operation. src_a OP src_b;
-    u32 GetALUResult(ALUOperation operation, u32 src_a, u32 src_b) const;
+    u32 GetALUResult(ALUOperation operation, u32 src_a, u32 src_b);
 
     /// Performs the result operation on the input result and stores it in the specified register
     /// (if necessary).
@@ -165,5 +165,7 @@ private:
     std::vector<u32> parameters;
     /// Index of the next parameter that will be fetched by the 'parm' instruction.
     u32 next_parameter_index = 0;
+
+    bool carry_flag{};
 };
 } // namespace Tegra
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
index 9ca82c06c..b994e89dd 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -1275,6 +1275,31 @@ Surface RasterizerCacheOpenGL::GetUncachedSurface(const SurfaceParams& params) {
     return surface;
 }
 
+void RasterizerCacheOpenGL::FastLayeredCopySurface(const Surface& src_surface,
+                                                   const Surface& dst_surface) {
+    const auto& init_params{src_surface->GetSurfaceParams()};
+    const auto& dst_params{dst_surface->GetSurfaceParams()};
+    VAddr address = init_params.addr; // start of the current layer; advanced once per layer
+    const std::size_t layer_size = dst_params.LayerMemorySize();
+    for (u32 layer = 0; layer < dst_params.depth; layer++) {
+        for (u32 mipmap = 0; mipmap < dst_params.max_mip_level; mipmap++) {
+            const VAddr sub_address = address + dst_params.GetMipmapLevelOffset(mipmap);
+            const Surface& copy = TryGet(sub_address);
+            if (!copy) // no surface returned for this address: leave this (layer, mipmap) as is
+                continue;
+            const auto& src_params{copy->GetSurfaceParams()};
+            const u32 width{std::min(src_params.width, dst_params.MipWidth(mipmap))};
+            const u32 height{std::min(src_params.height, dst_params.MipHeight(mipmap))};
+            // Copy level 0 of that surface into slice `layer` of dst level `mipmap`, depth 1.
+            glCopyImageSubData(copy->Texture().handle, SurfaceTargetToGL(src_params.target), 0, 0,
+                               0, 0, dst_surface->Texture().handle,
+                               SurfaceTargetToGL(dst_params.target), mipmap, 0, 0, layer, width,
+                               height, 1);
+        }
+        address += layer_size;
+    }
+}
+
 void RasterizerCacheOpenGL::FermiCopySurface(
     const Tegra::Engines::Fermi2D::Regs::Surface& src_config,
     const Tegra::Engines::Fermi2D::Regs::Surface& dst_config) {
@@ -1340,11 +1365,13 @@ Surface RasterizerCacheOpenGL::RecreateSurface(const Surface& old_surface,
             CopySurface(old_surface, new_surface, copy_pbo.handle);
         }
         break;
-    case SurfaceTarget::TextureCubemap:
     case SurfaceTarget::Texture3D:
+        AccurateCopySurface(old_surface, new_surface);
+        break;
+    case SurfaceTarget::TextureCubemap:
     case SurfaceTarget::Texture2DArray:
     case SurfaceTarget::TextureCubeArray:
-        AccurateCopySurface(old_surface, new_surface);
+        FastLayeredCopySurface(old_surface, new_surface);
         break;
     default:
         LOG_CRITICAL(Render_OpenGL, "Unimplemented surface target={}",
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
index 494f6b903..9ac79c5a4 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
@@ -350,6 +350,7 @@ private:
 
     /// Performs a slow but accurate surface copy, flushing to RAM and reinterpreting the data
     void AccurateCopySurface(const Surface& src_surface, const Surface& dst_surface);
+    void FastLayeredCopySurface(const Surface& src_surface, const Surface& dst_surface);
 
     /// The surface reserve is a "backup" cache, this is where we put unique surfaces that have
     /// previously been used. This is to prevent surfaces from being constantly created and
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index f4fa6c74a..dd406b132 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -2903,6 +2903,8 @@ private:
                 UNIMPLEMENTED_IF_MSG(instr.txq.UsesMiscMode(Tegra::Shader::TextureMiscMode::NODEP),
                                      "NODEP is not implemented");
 
+                ++shader.scope;
+                shader.AddLine('{');
                 // TODO: the new commits on the texture refactor, change the way samplers work.
                 // Sadly, not all texture instructions specify the type of texture their sampler
                 // uses. This must be fixed at a later instance.
@@ -2910,8 +2912,14 @@ private:
                     GetSampler(instr.sampler, Tegra::Shader::TextureType::Texture2D, false, false);
                 switch (instr.txq.query_type) {
                 case Tegra::Shader::TextureQueryType::Dimension: {
-                    const std::string texture = "textureQueryLevels(" + sampler + ')';
-                    regs.SetRegisterToInteger(instr.gpr0, true, 0, texture, 1, 1);
+                    const std::string texture = "textureSize(" + sampler + ", " +
+                                                regs.GetRegisterAsInteger(instr.gpr8) + ')';
+                    const std::string mip_level = "textureQueryLevels(" + sampler + ')';
+                    shader.AddLine("ivec2 sizes = " + texture + ';');
+                    regs.SetRegisterToInteger(instr.gpr0, true, 0, "sizes.x", 1, 1);
+                    regs.SetRegisterToInteger(instr.gpr0.Value() + 1, true, 0, "sizes.y", 1, 1);
+                    regs.SetRegisterToInteger(instr.gpr0.Value() + 2, true, 0, "0", 1, 1);
+                    regs.SetRegisterToInteger(instr.gpr0.Value() + 3, true, 0, mip_level, 1, 1);
                     break;
                 }
                 default: {
@@ -2919,6 +2927,8 @@ private:
                                       static_cast<u32>(instr.txq.query_type.Value()));
                 }
                 }
+                --shader.scope;
+                shader.AddLine('}');
                 break;
             }
             case OpCode::Id::TMML: {
diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index 065b3929c..a8833c06e 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -218,14 +218,19 @@ inline GLenum DepthCompareFunc(Tegra::Texture::DepthCompareFunc func) {
 inline GLenum BlendEquation(Maxwell::Blend::Equation equation) {
     switch (equation) {
     case Maxwell::Blend::Equation::Add:
+    case Maxwell::Blend::Equation::AddGL:
         return GL_FUNC_ADD;
     case Maxwell::Blend::Equation::Subtract:
+    case Maxwell::Blend::Equation::SubtractGL:
         return GL_FUNC_SUBTRACT;
     case Maxwell::Blend::Equation::ReverseSubtract:
+    case Maxwell::Blend::Equation::ReverseSubtractGL:
         return GL_FUNC_REVERSE_SUBTRACT;
     case Maxwell::Blend::Equation::Min:
+    case Maxwell::Blend::Equation::MinGL:
         return GL_MIN;
     case Maxwell::Blend::Equation::Max:
+    case Maxwell::Blend::Equation::MaxGL:
         return GL_MAX;
     }
     LOG_ERROR(Render_OpenGL, "Unimplemented blend equation={}", static_cast<u32>(equation));