9 files changed, 49 insertions, 31 deletions
diff --git a/.travis-build.sh b/.travis-build.sh
index 3310bacc5..df6e236b6 100755
--- a/.travis-build.sh
+++ b/.travis-build.sh
@@ -12,7 +12,7 @@ fi
 # Only run clang-format on Linux because we don't have 4.0 on OS X images
 if [ "$TRAVIS_OS_NAME" = "linux" ]; then
     # Default clang-format points to default 3.5 version one
-    CLANG_FORMAT=clang-format-4.0
+    CLANG_FORMAT=clang-format-3.9
     $CLANG_FORMAT --version
 
     if [ "$TRAVIS_EVENT_TYPE" = "pull_request" ]; then
diff --git a/.travis-deps.sh b/.travis-deps.sh
index 9fd21cc57..1404fe19f 100755
--- a/.travis-deps.sh
+++ b/.travis-deps.sh
@@ -27,6 +27,13 @@ if [ "$TRAVIS_OS_NAME" = "linux" -o -z "$TRAVIS_OS_NAME" ]; then
         echo "Using cached SDL2"
     fi
 
+    export DEBIAN_FRONTEND=noninteractive
+    # NOTE: The signing key is fetched over plain HTTP, so adding it does not really authenticate the repo
+    curl http://apt.llvm.org/llvm-snapshot.gpg.key | sudo -E apt-key add -
+    sudo -E add-apt-repository "deb http://apt.llvm.org/trusty/ llvm-toolchain-trusty-3.9 main"
+    sudo -E apt-get -yq update
+    sudo -E apt-get -yq install clang-format-3.9
+
 elif [ "$TRAVIS_OS_NAME" = "osx" ]; then
     brew update
     brew install qt5 sdl2 dylibbundler
diff --git a/.travis.yml b/.travis.yml
index a9e7aadd2..cc34e039c 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -3,7 +3,7 @@ language: cpp
 matrix:
   include:
     - os: linux
-      sudo: true
+      sudo: required
       dist: trusty
     - os: osx
       sudo: false
@@ -17,7 +17,6 @@ addons:
   apt:
     sources:
       - ubuntu-toolchain-r-test
-      - llvm-toolchain-precise
     packages:
       - gcc-6
       - g++-6
@@ -25,7 +24,6 @@ addons:
       - libqt5opengl5-dev
       - xorg-dev
       - lib32stdc++6 # For CMake
-      - clang-format-4.0
 
 cache:
   directories:
diff --git a/src/core/hle/service/gsp_gpu.cpp b/src/core/hle/service/gsp_gpu.cpp
index 710e0e485..78cb761be 100644
--- a/src/core/hle/service/gsp_gpu.cpp
+++ b/src/core/hle/service/gsp_gpu.cpp
@@ -346,7 +346,7 @@ static void SetAxiConfigQoSMode(Service::Interface* self) {
 
     cmd_buff[1] = RESULT_SUCCESS.raw; // No error
 
-    LOG_WARNING(Service_GSP, "(STUBBED) called mode=0x%08X", mode);
+    LOG_DEBUG(Service_GSP, "(STUBBED) called mode=0x%08X", mode);
 }
 
 /**
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 99bd59a69..b2db609ec 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -40,7 +40,7 @@ namespace Pica {
 //       field offset. Otherwise, the compiler will fail to compile this code.
 #define PICA_REG_INDEX_WORKAROUND(field_name, backup_workaround_index)                             \
     ((typename std::enable_if<backup_workaround_index == PICA_REG_INDEX(field_name),               \
-                              size_t>::type) PICA_REG_INDEX(field_name))
+                              size_t>::type)PICA_REG_INDEX(field_name))
 #endif // _MSC_VER
 
 struct Regs {
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 1b734aaa5..3f2255e06 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -715,7 +715,11 @@ bool RasterizerOpenGL::AccelerateDisplayTransfer(const GPU::Regs::DisplayTransfe
 
     CachedSurface src_params;
     src_params.addr = config.GetPhysicalInputAddress();
-    src_params.width = config.output_width;
+    // It's important to use the correct source input width to properly skip over parts of the input
+    // image which will be cropped from the output but still affect the stride of the input image.
+    src_params.width = config.input_width;
+    // Using the output's height is fine because we don't read or skip over the remaining part of
+    // the image, and it allows for smaller texture cache lookup rectangles.
     src_params.height = config.output_height;
     src_params.is_tiled = !config.input_linear;
     src_params.pixel_format = CachedSurface::PixelFormatFromGPUPixelFormat(config.input_format);
@@ -736,6 +740,11 @@ bool RasterizerOpenGL::AccelerateDisplayTransfer(const GPU::Regs::DisplayTransfe
         return false;
     }
 
+    // Adjust the source rectangle to take into account parts of the input lines being cropped
+    if (config.input_width > config.output_width) {
+        src_rect.right -= (config.input_width - config.output_width) * src_surface->res_scale_width;
+    }
+
     // Require destination surface to have same resolution scale as source to preserve scaling
     dst_params.res_scale_width = src_surface->res_scale_width;
     dst_params.res_scale_height = src_surface->res_scale_height;
@@ -938,7 +947,7 @@ bool RasterizerOpenGL::AccelerateDisplay(const GPU::Regs::FramebufferConfig& con
     src_params.addr = framebuffer_addr;
     src_params.width = config.width;
     src_params.height = config.height;
-    src_params.stride = pixel_stride;
+    src_params.pixel_stride = pixel_stride;
     src_params.is_tiled = false;
     src_params.pixel_format = CachedSurface::PixelFormatFromGPUPixelFormat(config.color_format);
 
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
index 5cbad9b43..61f6e767f 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -158,24 +158,21 @@ bool RasterizerCacheOpenGL::BlitTextures(GLuint src_tex, GLuint dst_tex,
         buffers = GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT;
     }
 
-    if (OpenGLState::CheckFBStatus(GL_READ_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) {
-        return false;
-    }
+    bool can_blit = OpenGLState::CheckFBStatus(GL_READ_FRAMEBUFFER) == GL_FRAMEBUFFER_COMPLETE &&
+                    OpenGLState::CheckFBStatus(GL_DRAW_FRAMEBUFFER) == GL_FRAMEBUFFER_COMPLETE;
 
-    if (OpenGLState::CheckFBStatus(GL_DRAW_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) {
-        return false;
+    if (can_blit) {
+        glBlitFramebuffer(src_rect.left, src_rect.top, src_rect.right, src_rect.bottom,
+                          dst_rect.left, dst_rect.top, dst_rect.right, dst_rect.bottom, buffers,
+                          buffers == GL_COLOR_BUFFER_BIT ? GL_LINEAR : GL_NEAREST);
     }
 
-    glBlitFramebuffer(src_rect.left, src_rect.top, src_rect.right, src_rect.bottom, dst_rect.left,
-                      dst_rect.top, dst_rect.right, dst_rect.bottom, buffers,
-                      buffers == GL_COLOR_BUFFER_BIT ? GL_LINEAR : GL_NEAREST);
-
     // Restore previous framebuffer bindings
     cur_state.draw.read_framebuffer = old_fbs[0];
     cur_state.draw.draw_framebuffer = old_fbs[1];
     cur_state.Apply();
 
-    return true;
+    return can_blit;
 }
 
 bool RasterizerCacheOpenGL::TryBlitSurfaces(CachedSurface* src_surface,
@@ -291,6 +288,9 @@ CachedSurface* RasterizerCacheOpenGL::GetSurface(const CachedSurface& params, bo
 
     MICROPROFILE_SCOPE(OpenGL_SurfaceUpload);
 
+    // Stride only applies to linear images.
+    ASSERT(params.pixel_stride == 0 || !params.is_tiled);
+
     std::shared_ptr<CachedSurface> new_surface = std::make_shared<CachedSurface>();
 
     new_surface->addr = params.addr;
@@ -299,7 +299,7 @@ CachedSurface* RasterizerCacheOpenGL::GetSurface(const CachedSurface& params, bo
     new_surface->texture.Create();
     new_surface->width = params.width;
     new_surface->height = params.height;
-    new_surface->stride = params.stride;
+    new_surface->pixel_stride = params.pixel_stride;
     new_surface->res_scale_width = params.res_scale_width;
     new_surface->res_scale_height = params.res_scale_height;
 
@@ -325,14 +325,15 @@ CachedSurface* RasterizerCacheOpenGL::GetSurface(const CachedSurface& params, bo
         cur_state.Apply();
         glActiveTexture(GL_TEXTURE0);
 
-        glPixelStorei(GL_UNPACK_ROW_LENGTH, (GLint)new_surface->stride);
         if (!new_surface->is_tiled) {
             // TODO: Ensure this will always be a color format, not a depth or other format
             ASSERT((size_t)new_surface->pixel_format < fb_format_tuples.size());
             const FormatTuple& tuple = fb_format_tuples[(unsigned int)params.pixel_format];
 
+            glPixelStorei(GL_UNPACK_ROW_LENGTH, (GLint)new_surface->pixel_stride);
             glTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format, params.width, params.height, 0,
                          tuple.format, tuple.type, texture_src_data);
+            glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
         } else {
             SurfaceType type = CachedSurface::GetFormatType(new_surface->pixel_format);
             if (type != SurfaceType::Depth && type != SurfaceType::DepthStencil) {
@@ -391,7 +392,6 @@ CachedSurface* RasterizerCacheOpenGL::GetSurface(const CachedSurface& params, bo
                              0, tuple.format, tuple.type, temp_fb_depth_buffer.data());
             }
         }
-        glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
 
         // If not 1x scale, blit 1x texture to a new scaled texture and replace texture in surface
         if (new_surface->res_scale_width != 1.f || new_surface->res_scale_height != 1.f) {
@@ -701,13 +701,14 @@ void RasterizerCacheOpenGL::FlushSurface(CachedSurface* surface) {
     cur_state.Apply();
     glActiveTexture(GL_TEXTURE0);
 
-    glPixelStorei(GL_PACK_ROW_LENGTH, (GLint)surface->stride);
     if (!surface->is_tiled) {
         // TODO: Ensure this will always be a color format, not a depth or other format
         ASSERT((size_t)surface->pixel_format < fb_format_tuples.size());
         const FormatTuple& tuple = fb_format_tuples[(unsigned int)surface->pixel_format];
 
+        glPixelStorei(GL_PACK_ROW_LENGTH, (GLint)surface->pixel_stride);
         glGetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type, dst_buffer);
+        glPixelStorei(GL_PACK_ROW_LENGTH, 0);
     } else {
         SurfaceType type = CachedSurface::GetFormatType(surface->pixel_format);
         if (type != SurfaceType::Depth && type != SurfaceType::DepthStencil) {
@@ -750,7 +751,6 @@ void RasterizerCacheOpenGL::FlushSurface(CachedSurface* surface) {
                              false);
         }
     }
-    glPixelStorei(GL_PACK_ROW_LENGTH, 0);
 
     surface->dirty = false;
 
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
index 849530d86..32abfbaf5 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
@@ -171,7 +171,8 @@ struct CachedSurface {
     OGLTexture texture;
     u32 width;
     u32 height;
-    u32 stride = 0;
+    /// Stride between lines, in pixels. Only valid for images in linear format.
+    u32 pixel_stride = 0;
     float res_scale_width = 1.f;
     float res_scale_height = 1.f;
 
diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp
index 211c703ab..c96110bb2 100644
--- a/src/video_core/shader/shader_jit_x64.cpp
+++ b/src/video_core/shader/shader_jit_x64.cpp
@@ -102,11 +102,11 @@ static const X64Reg SETUP = R9;
 /// The two 32-bit VS address offset registers set by the MOVA instruction
 static const X64Reg ADDROFFS_REG_0 = R10;
 static const X64Reg ADDROFFS_REG_1 = R11;
-/// VS loop count register
+/// VS loop count register (Multiplied by 16)
 static const X64Reg LOOPCOUNT_REG = R12;
 /// Current VS loop iteration number (we could probably use LOOPCOUNT_REG, but this quicker)
 static const X64Reg LOOPCOUNT = RSI;
-/// Number to increment LOOPCOUNT_REG by on each loop iteration
+/// Number to increment LOOPCOUNT_REG by on each loop iteration (Multiplied by 16)
 static const X64Reg LOOPINC = RDI;
 /// Result of the previous CMP instruction for the X-component comparison
 static const X64Reg COND0 = R13;
@@ -491,7 +491,7 @@ void JitShader::Compile_FLR(Instruction instr) {
     if (Common::GetCPUCaps().sse4_1) {
         ROUNDFLOORPS(SRC1, R(SRC1));
     } else {
-        CVTPS2DQ(SRC1, R(SRC1));
+        CVTTPS2DQ(SRC1, R(SRC1));
         CVTDQ2PS(SRC1, R(SRC1));
     }
 
@@ -718,15 +718,18 @@ void JitShader::Compile_LOOP(Instruction instr) {
 
     looping = true;
 
+    // This decodes the fields from the integer uniform at index instr.flow_control.int_uniform_id.
+    // The Y (LOOPCOUNT_REG) and Z (LOOPINC) components are kept multiplied by 16 (left shifted by
+    // 4 bits) so they can be used as offsets into the 16-byte vector registers later.
     int offset =
         ShaderSetup::UniformOffset(RegisterType::IntUniform, instr.flow_control.int_uniform_id);
     MOV(32, R(LOOPCOUNT), MDisp(SETUP, offset));
     MOV(32, R(LOOPCOUNT_REG), R(LOOPCOUNT));
-    SHR(32, R(LOOPCOUNT_REG), Imm8(8));
-    AND(32, R(LOOPCOUNT_REG), Imm32(0xff)); // Y-component is the start
+    SHR(32, R(LOOPCOUNT_REG), Imm8(4));
+    AND(32, R(LOOPCOUNT_REG), Imm32(0xFF0)); // Y-component is the start
     MOV(32, R(LOOPINC), R(LOOPCOUNT));
-    SHR(32, R(LOOPINC), Imm8(16));
-    MOVZX(32, 8, LOOPINC, R(LOOPINC));     // Z-component is the incrementer
+    SHR(32, R(LOOPINC), Imm8(12));
+    AND(32, R(LOOPINC), Imm32(0xFF0));     // Z-component is the incrementer
     MOVZX(32, 8, LOOPCOUNT, R(LOOPCOUNT)); // X-component is iteration count
     ADD(32, R(LOOPCOUNT), Imm8(1));        // Iteration count is X-component + 1