summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/audio_core/CMakeLists.txt 7
-rw-r--r-- src/audio_core/audio_core.h 2
-rw-r--r-- src/audio_core/hle/common.h 9
-rw-r--r-- src/audio_core/hle/dsp.h 12
-rw-r--r-- src/audio_core/interpolate.cpp 85
-rw-r--r-- src/audio_core/interpolate.h 41
-rw-r--r-- src/citra_qt/bootmanager.cpp 2
-rw-r--r-- src/citra_qt/debugger/profiler.cpp 39
-rw-r--r-- src/citra_qt/debugger/profiler.h 3
-rw-r--r-- src/citra_qt/main.cpp 9
-rw-r--r-- src/common/CMakeLists.txt 1
-rw-r--r-- src/common/microprofile.h 4
-rw-r--r-- src/common/microprofileui.h 3
-rw-r--r-- src/common/profiler.cpp 82
-rw-r--r-- src/common/profiler.h 152
-rw-r--r-- src/common/profiler_reporting.h 27
-rw-r--r-- src/core/arm/dyncom/arm_dyncom_interpreter.cpp 7
-rw-r--r-- src/core/hle/service/gsp_gpu.cpp 1
-rw-r--r-- src/core/hle/svc.cpp 4
-rw-r--r-- src/video_core/CMakeLists.txt 2
-rw-r--r-- src/video_core/command_processor.cpp 133
-rw-r--r-- src/video_core/debug_utils/debug_utils.h 30
-rw-r--r-- src/video_core/rasterizer.cpp 3
-rw-r--r-- src/video_core/renderer_opengl/gl_rasterizer.cpp 1
-rw-r--r-- src/video_core/shader/shader.cpp 3
-rw-r--r-- src/video_core/shader/shader.h 2
-rw-r--r-- src/video_core/vertex_loader.cpp 140
-rw-r--r-- src/video_core/vertex_loader.h 28
28 files changed, 391 insertions, 441 deletions
diff --git a/src/audio_core/CMakeLists.txt b/src/audio_core/CMakeLists.txt
index 869da5e83..a965af291 100644
--- a/src/audio_core/CMakeLists.txt
+++ b/src/audio_core/CMakeLists.txt
@@ -4,6 +4,7 @@ set(SRCS
hle/dsp.cpp
hle/filter.cpp
hle/pipe.cpp
+ interpolate.cpp
)
set(HEADERS
@@ -13,9 +14,13 @@ set(HEADERS
hle/dsp.h
hle/filter.h
hle/pipe.h
+ interpolate.h
sink.h
)
+include_directories(../../externals/soundtouch/include)
+
create_directory_groups(${SRCS} ${HEADERS})
-add_library(audio_core STATIC ${SRCS} ${HEADERS})
\ No newline at end of file
+add_library(audio_core STATIC ${SRCS} ${HEADERS})
+target_link_libraries(audio_core SoundTouch)
diff --git a/src/audio_core/audio_core.h b/src/audio_core/audio_core.h
index 64c330914..b349895ea 100644
--- a/src/audio_core/audio_core.h
+++ b/src/audio_core/audio_core.h
@@ -10,8 +10,6 @@ class VMManager;
namespace AudioCore {
-constexpr int num_sources = 24;
-constexpr int samples_per_frame = 160; ///< Samples per audio frame at native sample rate
constexpr int native_sample_rate = 32728; ///< 32kHz
/// Initialise Audio Core
diff --git a/src/audio_core/hle/common.h b/src/audio_core/hle/common.h
index 37d441eb2..7910f42ae 100644
--- a/src/audio_core/hle/common.h
+++ b/src/audio_core/hle/common.h
@@ -7,18 +7,19 @@
#include <algorithm>
#include <array>
-#include "audio_core/audio_core.h"
-
#include "common/common_types.h"
namespace DSP {
namespace HLE {
+constexpr int num_sources = 24;
+constexpr int samples_per_frame = 160; ///< Samples per audio frame at native sample rate
+
/// The final output to the speakers is stereo. Preprocessing output in Source is also stereo.
-using StereoFrame16 = std::array<std::array<s16, 2>, AudioCore::samples_per_frame>;
+using StereoFrame16 = std::array<std::array<s16, 2>, samples_per_frame>;
/// The DSP is quadraphonic internally.
-using QuadFrame32 = std::array<std::array<s32, 4>, AudioCore::samples_per_frame>;
+using QuadFrame32 = std::array<std::array<s32, 4>, samples_per_frame>;
/**
* This performs the filter operation defined by FilterT::ProcessSample on the frame in-place.
diff --git a/src/audio_core/hle/dsp.h b/src/audio_core/hle/dsp.h
index f94ec9467..f0f125284 100644
--- a/src/audio_core/hle/dsp.h
+++ b/src/audio_core/hle/dsp.h
@@ -8,7 +8,7 @@
#include <cstddef>
#include <type_traits>
-#include "audio_core/audio_core.h"
+#include "audio_core/hle/common.h"
#include "common/bit_field.h"
#include "common/common_funcs.h"
@@ -305,7 +305,7 @@ struct SourceConfiguration {
u16_le buffer_id;
};
- Configuration config[AudioCore::num_sources];
+ Configuration config[num_sources];
};
ASSERT_DSP_STRUCT(SourceConfiguration::Configuration, 192);
ASSERT_DSP_STRUCT(SourceConfiguration::Configuration::Buffer, 20);
@@ -320,7 +320,7 @@ struct SourceStatus {
INSERT_PADDING_DSPWORDS(1);
};
- Status status[AudioCore::num_sources];
+ Status status[num_sources];
};
ASSERT_DSP_STRUCT(SourceStatus::Status, 12);
@@ -413,7 +413,7 @@ ASSERT_DSP_STRUCT(DspConfiguration::ReverbEffect, 52);
struct AdpcmCoefficients {
/// Coefficients are signed fixed point with 11 fractional bits.
/// Each source has 16 coefficients associated with it.
- s16_le coeff[AudioCore::num_sources][16];
+ s16_le coeff[num_sources][16];
};
ASSERT_DSP_STRUCT(AdpcmCoefficients, 768);
@@ -427,7 +427,7 @@ ASSERT_DSP_STRUCT(DspStatus, 32);
/// Final mixed output in PCM16 stereo format, what you hear out of the speakers.
/// When the application writes to this region it has no effect.
struct FinalMixSamples {
- s16_le pcm16[2 * AudioCore::samples_per_frame];
+ s16_le pcm16[2 * samples_per_frame];
};
ASSERT_DSP_STRUCT(FinalMixSamples, 640);
@@ -437,7 +437,7 @@ ASSERT_DSP_STRUCT(FinalMixSamples, 640);
/// Values that exceed s16 range will be clipped by the DSP after further processing.
struct IntermediateMixSamples {
struct Samples {
- s32_le pcm32[4][AudioCore::samples_per_frame]; ///< Little-endian as opposed to DSP middle-endian.
+ s32_le pcm32[4][samples_per_frame]; ///< Little-endian as opposed to DSP middle-endian.
};
Samples mix1;
diff --git a/src/audio_core/interpolate.cpp b/src/audio_core/interpolate.cpp
new file mode 100644
index 000000000..fcd3aa066
--- /dev/null
+++ b/src/audio_core/interpolate.cpp
@@ -0,0 +1,85 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "audio_core/interpolate.h"
+
+#include "common/assert.h"
+#include "common/math_util.h"
+
+namespace AudioInterp {
+
+// Calculations are done in fixed point with 24 fractional bits.
+// (This is not verified. This was chosen for minimal error.)
+constexpr u64 scale_factor = 1 << 24;
+constexpr u64 scale_mask = scale_factor - 1;
+
+/// Here we step over the input in steps of rate_multiplier, until we consume all of the input.
+/// Three adjacent samples are passed to fn each step.
+template <typename Function>
+static StereoBuffer16 StepOverSamples(State& state, const StereoBuffer16& input, float rate_multiplier, Function fn) {
+ ASSERT(rate_multiplier > 0);
+
+ if (input.size() < 2)
+ return {};
+
+ StereoBuffer16 output;
+ output.reserve(static_cast<size_t>(input.size() / rate_multiplier));
+
+ u64 step_size = static_cast<u64>(rate_multiplier * scale_factor);
+
+ u64 fposition = 0;
+ const u64 max_fposition = input.size() * scale_factor;
+
+ while (fposition < 1 * scale_factor) {
+ u64 fraction = fposition & scale_mask;
+
+ output.push_back(fn(fraction, state.xn2, state.xn1, input[0]));
+
+ fposition += step_size;
+ }
+
+ while (fposition < 2 * scale_factor) {
+ u64 fraction = fposition & scale_mask;
+
+ output.push_back(fn(fraction, state.xn1, input[0], input[1]));
+
+ fposition += step_size;
+ }
+
+ while (fposition < max_fposition) {
+ u64 fraction = fposition & scale_mask;
+
+ size_t index = static_cast<size_t>(fposition / scale_factor);
+ output.push_back(fn(fraction, input[index - 2], input[index - 1], input[index]));
+
+ fposition += step_size;
+ }
+
+ state.xn2 = input[input.size() - 2];
+ state.xn1 = input[input.size() - 1];
+
+ return output;
+}
+
+StereoBuffer16 None(State& state, const StereoBuffer16& input, float rate_multiplier) {
+ return StepOverSamples(state, input, rate_multiplier, [](u64 fraction, const auto& x0, const auto& x1, const auto& x2) {
+ return x0;
+ });
+}
+
+StereoBuffer16 Linear(State& state, const StereoBuffer16& input, float rate_multiplier) {
+ // Note on accuracy: Some values that this produces are +/- 1 from the actual firmware.
+ return StepOverSamples(state, input, rate_multiplier, [](u64 fraction, const auto& x0, const auto& x1, const auto& x2) {
+ // This is a saturated subtraction. (Verified by black-box fuzzing.)
+ s64 delta0 = MathUtil::Clamp<s64>(x1[0] - x0[0], -32768, 32767);
+ s64 delta1 = MathUtil::Clamp<s64>(x1[1] - x0[1], -32768, 32767);
+
+ return std::array<s16, 2> {
+ static_cast<s16>(x0[0] + fraction * delta0 / scale_factor),
+ static_cast<s16>(x0[1] + fraction * delta1 / scale_factor)
+ };
+ });
+}
+
+} // namespace AudioInterp
diff --git a/src/audio_core/interpolate.h b/src/audio_core/interpolate.h
new file mode 100644
index 000000000..a4c0a453d
--- /dev/null
+++ b/src/audio_core/interpolate.h
@@ -0,0 +1,41 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <vector>
+
+#include "common/common_types.h"
+
+namespace AudioInterp {
+
+/// A variable length buffer of signed PCM16 stereo samples.
+using StereoBuffer16 = std::vector<std::array<s16, 2>>;
+
+struct State {
+ // Two historical samples.
+ std::array<s16, 2> xn1 = {}; ///< x[n-1]
+ std::array<s16, 2> xn2 = {}; ///< x[n-2]
+};
+
+/**
+ * No interpolation. This is equivalent to a zero-order hold. There is a two-sample predelay.
+ * @param input Input buffer.
+ * @param rate_multiplier Stretch factor. Must be a positive non-zero value.
+ * rate_multiplier > 1.0 performs decimation and rate_multiplier < 1.0 performs upsampling.
+ * @return The resampled audio buffer.
+ */
+StereoBuffer16 None(State& state, const StereoBuffer16& input, float rate_multiplier);
+
+/**
+ * Linear interpolation. This is equivalent to a first-order hold. There is a two-sample predelay.
+ * @param input Input buffer.
+ * @param rate_multiplier Stretch factor. Must be a positive non-zero value.
+ * rate_multiplier > 1.0 performs decimation and rate_multiplier < 1.0 performs upsampling.
+ * @return The resampled audio buffer.
+ */
+StereoBuffer16 Linear(State& state, const StereoBuffer16& input, float rate_multiplier);
+
+} // namespace AudioInterp
diff --git a/src/citra_qt/bootmanager.cpp b/src/citra_qt/bootmanager.cpp
index 8e60b9cad..01b81c11c 100644
--- a/src/citra_qt/bootmanager.cpp
+++ b/src/citra_qt/bootmanager.cpp
@@ -71,7 +71,9 @@ void EmuThread::run() {
// Shutdown the core emulation
System::Shutdown();
+#if MICROPROFILE_ENABLED
MicroProfileOnThreadExit();
+#endif
render_window->moveContext();
}
diff --git a/src/citra_qt/debugger/profiler.cpp b/src/citra_qt/debugger/profiler.cpp
index 4f6ba0e1f..7bb010f77 100644
--- a/src/citra_qt/debugger/profiler.cpp
+++ b/src/citra_qt/debugger/profiler.cpp
@@ -9,13 +9,16 @@
#include "citra_qt/debugger/profiler.h"
#include "citra_qt/util/util.h"
+#include "common/common_types.h"
#include "common/microprofile.h"
#include "common/profiler_reporting.h"
// Include the implementation of the UI in this file. This isn't in microprofile.cpp because the
// non-Qt frontends don't need it (and don't implement the UI drawing hooks either).
+#if MICROPROFILE_ENABLED
#define MICROPROFILEUI_IMPL 1
#include "common/microprofileui.h"
+#endif
using namespace Common::Profiling;
@@ -34,21 +37,9 @@ static QVariant GetDataForColumn(int col, const AggregatedDuration& duration)
}
}
-static const TimingCategoryInfo* GetCategoryInfo(int id)
-{
- const auto& categories = GetProfilingManager().GetTimingCategoriesInfo();
- if ((size_t)id >= categories.size()) {
- return nullptr;
- } else {
- return &categories[id];
- }
-}
-
ProfilerModel::ProfilerModel(QObject* parent) : QAbstractItemModel(parent)
{
updateProfilingInfo();
- const auto& categories = GetProfilingManager().GetTimingCategoriesInfo();
- results.time_per_category.resize(categories.size());
}
QVariant ProfilerModel::headerData(int section, Qt::Orientation orientation, int role) const
@@ -85,7 +76,7 @@ int ProfilerModel::rowCount(const QModelIndex& parent) const
if (parent.isValid()) {
return 0;
} else {
- return static_cast<int>(results.time_per_category.size() + 2);
+ return 2;
}
}
@@ -104,17 +95,6 @@ QVariant ProfilerModel::data(const QModelIndex& index, int role) const
} else {
return GetDataForColumn(index.column(), results.interframe_time);
}
- } else {
- if (index.column() == 0) {
- const TimingCategoryInfo* info = GetCategoryInfo(index.row() - 2);
- return info != nullptr ? QString(info->name) : QVariant();
- } else {
- if (index.row() - 2 < (int)results.time_per_category.size()) {
- return GetDataForColumn(index.column(), results.time_per_category[index.row() - 2]);
- } else {
- return QVariant();
- }
- }
}
}
@@ -148,6 +128,8 @@ void ProfilerWidget::setProfilingInfoUpdateEnabled(bool enable)
}
}
+#if MICROPROFILE_ENABLED
+
class MicroProfileWidget : public QWidget {
public:
MicroProfileWidget(QWidget* parent = nullptr);
@@ -171,6 +153,8 @@ private:
QTimer update_timer;
};
+#endif
+
MicroProfileDialog::MicroProfileDialog(QWidget* parent)
: QWidget(parent, Qt::Dialog)
{
@@ -180,6 +164,8 @@ MicroProfileDialog::MicroProfileDialog(QWidget* parent)
// Remove the "?" button from the titlebar and enable the maximize button
setWindowFlags(windowFlags() & ~Qt::WindowContextHelpButtonHint | Qt::WindowMaximizeButtonHint);
+#if MICROPROFILE_ENABLED
+
MicroProfileWidget* widget = new MicroProfileWidget(this);
QLayout* layout = new QVBoxLayout(this);
@@ -191,6 +177,7 @@ MicroProfileDialog::MicroProfileDialog(QWidget* parent)
setFocusProxy(widget);
widget->setFocusPolicy(Qt::StrongFocus);
widget->setFocus();
+#endif
}
QAction* MicroProfileDialog::toggleViewAction() {
@@ -218,6 +205,9 @@ void MicroProfileDialog::hideEvent(QHideEvent* ev) {
QWidget::hideEvent(ev);
}
+
+#if MICROPROFILE_ENABLED
+
/// There's no way to pass a user pointer to MicroProfile, so this variable is used to make the
/// QPainter available inside the drawing callbacks.
static QPainter* mp_painter = nullptr;
@@ -337,3 +327,4 @@ void MicroProfileDrawLine2D(u32 vertices_length, float* vertices, u32 hex_color)
mp_painter->drawPolyline(point_buf.data(), vertices_length);
point_buf.clear();
}
+#endif
diff --git a/src/citra_qt/debugger/profiler.h b/src/citra_qt/debugger/profiler.h
index 036054740..3b38ed8ec 100644
--- a/src/citra_qt/debugger/profiler.h
+++ b/src/citra_qt/debugger/profiler.h
@@ -7,8 +7,10 @@
#include <QAbstractItemModel>
#include <QDockWidget>
#include <QTimer>
+
#include "ui_profiler.h"
+#include "common/microprofile.h"
#include "common/profiler_reporting.h"
class ProfilerModel : public QAbstractItemModel
@@ -49,6 +51,7 @@ private:
QTimer update_timer;
};
+
class MicroProfileDialog : public QWidget {
Q_OBJECT
diff --git a/src/citra_qt/main.cpp b/src/citra_qt/main.cpp
index 2ca1e51f6..f1ab29755 100644
--- a/src/citra_qt/main.cpp
+++ b/src/citra_qt/main.cpp
@@ -69,8 +69,10 @@ GMainWindow::GMainWindow() : config(new Config()), emu_thread(nullptr)
addDockWidget(Qt::BottomDockWidgetArea, profilerWidget);
profilerWidget->hide();
+#if MICROPROFILE_ENABLED
microProfileDialog = new MicroProfileDialog(this);
microProfileDialog->hide();
+#endif
disasmWidget = new DisassemblerWidget(this, emu_thread.get());
addDockWidget(Qt::BottomDockWidgetArea, disasmWidget);
@@ -110,7 +112,9 @@ GMainWindow::GMainWindow() : config(new Config()), emu_thread(nullptr)
QMenu* debug_menu = ui.menu_View->addMenu(tr("Debugging"));
debug_menu->addAction(profilerWidget->toggleViewAction());
+#if MICROPROFILE_ENABLED
debug_menu->addAction(microProfileDialog->toggleViewAction());
+#endif
debug_menu->addAction(disasmWidget->toggleViewAction());
debug_menu->addAction(registersWidget->toggleViewAction());
debug_menu->addAction(callstackWidget->toggleViewAction());
@@ -136,8 +140,10 @@ GMainWindow::GMainWindow() : config(new Config()), emu_thread(nullptr)
restoreGeometry(UISettings::values.geometry);
restoreState(UISettings::values.state);
render_window->restoreGeometry(UISettings::values.renderwindow_geometry);
+#if MICROPROFILE_ENABLED
microProfileDialog->restoreGeometry(UISettings::values.microprofile_geometry);
microProfileDialog->setVisible(UISettings::values.microprofile_visible);
+#endif
game_list->LoadInterfaceLayout();
@@ -511,9 +517,10 @@ void GMainWindow::closeEvent(QCloseEvent* event) {
UISettings::values.geometry = saveGeometry();
UISettings::values.state = saveState();
UISettings::values.renderwindow_geometry = render_window->saveGeometry();
+#if MICROPROFILE_ENABLED
UISettings::values.microprofile_geometry = microProfileDialog->saveGeometry();
UISettings::values.microprofile_visible = microProfileDialog->isVisible();
-
+#endif
UISettings::values.single_window_mode = ui.action_Single_Window_Mode->isChecked();
UISettings::values.display_titlebar = ui.actionDisplay_widget_title_bars->isChecked();
UISettings::values.first_start = false;
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index c839ce173..aa6eee2a3 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -47,7 +47,6 @@ set(HEADERS
microprofile.h
microprofileui.h
platform.h
- profiler.h
profiler_reporting.h
scm_rev.h
scope_exit.h
diff --git a/src/common/microprofile.h b/src/common/microprofile.h
index d3b6cb97c..ef312c6e1 100644
--- a/src/common/microprofile.h
+++ b/src/common/microprofile.h
@@ -4,6 +4,10 @@
#pragma once
+// Uncomment this to disable microprofile. This will get you cleaner profiles when using
+// external sampling profilers like "Very Sleepy", and will improve performance somewhat.
+// #define MICROPROFILE_ENABLED 0
+
// Customized Citra settings.
// This file wraps the MicroProfile header so that these are consistent everywhere.
#define MICROPROFILE_WEBSERVER 0
diff --git a/src/common/microprofileui.h b/src/common/microprofileui.h
index 97c369bd9..41abe6b75 100644
--- a/src/common/microprofileui.h
+++ b/src/common/microprofileui.h
@@ -13,4 +13,7 @@
#define MICROPROFILE_HELP_ALT "Right-Click"
#define MICROPROFILE_HELP_MOD "Ctrl"
+// This isn't included by microprofileui.h :(
+#include <cstdlib> // For std::abs
+
#include <microprofileui.h>
diff --git a/src/common/profiler.cpp b/src/common/profiler.cpp
index 7792edd2f..49eb3f40c 100644
--- a/src/common/profiler.cpp
+++ b/src/common/profiler.cpp
@@ -7,71 +7,16 @@
#include <vector>
#include "common/assert.h"
-#include "common/profiler.h"
#include "common/profiler_reporting.h"
#include "common/synchronized_wrapper.h"
-#if defined(_MSC_VER) && _MSC_VER <= 1800 // MSVC 2013.
- #define WIN32_LEAN_AND_MEAN
- #include <Windows.h> // For QueryPerformanceCounter/Frequency
-#endif
-
namespace Common {
namespace Profiling {
-#if ENABLE_PROFILING
-thread_local Timer* Timer::current_timer = nullptr;
-#endif
-
-#if defined(_MSC_VER) && _MSC_VER <= 1800 // MSVC 2013
-QPCClock::time_point QPCClock::now() {
- static LARGE_INTEGER freq;
- // Use this dummy local static to ensure this gets initialized once.
- static BOOL dummy = QueryPerformanceFrequency(&freq);
-
- LARGE_INTEGER ticks;
- QueryPerformanceCounter(&ticks);
-
- // This is prone to overflow when multiplying, which is why I'm using micro instead of nano. The
- // correct way to approach this would be to just return ticks as a time_point and then subtract
- // and do this conversion when creating a duration from two time_points, however, as far as I
- // could tell the C++ requirements for these types are incompatible with this approach.
- return time_point(duration(ticks.QuadPart * std::micro::den / freq.QuadPart));
-}
-#endif
-
-TimingCategory::TimingCategory(const char* name, TimingCategory* parent)
- : accumulated_duration(0) {
-
- ProfilingManager& manager = GetProfilingManager();
- category_id = manager.RegisterTimingCategory(this, name);
- if (parent != nullptr)
- manager.SetTimingCategoryParent(category_id, parent->category_id);
-}
-
ProfilingManager::ProfilingManager()
: last_frame_end(Clock::now()), this_frame_start(Clock::now()) {
}
-unsigned int ProfilingManager::RegisterTimingCategory(TimingCategory* category, const char* name) {
- TimingCategoryInfo info;
- info.category = category;
- info.name = name;
- info.parent = TimingCategoryInfo::NO_PARENT;
-
- unsigned int id = (unsigned int)timing_categories.size();
- timing_categories.push_back(std::move(info));
-
- return id;
-}
-
-void ProfilingManager::SetTimingCategoryParent(unsigned int category, unsigned int parent) {
- ASSERT(category < timing_categories.size());
- ASSERT(parent < timing_categories.size());
-
- timing_categories[category].parent = parent;
-}
-
void ProfilingManager::BeginFrame() {
this_frame_start = Clock::now();
}
@@ -82,11 +27,6 @@ void ProfilingManager::FinishFrame() {
results.interframe_time = now - last_frame_end;
results.frame_time = now - this_frame_start;
- results.time_per_category.resize(timing_categories.size());
- for (size_t i = 0; i < timing_categories.size(); ++i) {
- results.time_per_category[i] = timing_categories[i].category->GetAccumulatedTime();
- }
-
last_frame_end = now;
}
@@ -100,26 +40,9 @@ void TimingResultsAggregator::Clear() {
window_size = cursor = 0;
}
-void TimingResultsAggregator::SetNumberOfCategories(size_t n) {
- size_t old_size = times_per_category.size();
- if (n == old_size)
- return;
-
- times_per_category.resize(n);
-
- for (size_t i = old_size; i < n; ++i) {
- times_per_category[i].resize(max_window_size, Duration::zero());
- }
-}
-
void TimingResultsAggregator::AddFrame(const ProfilingFrameResult& frame_result) {
- SetNumberOfCategories(frame_result.time_per_category.size());
-
interframe_times[cursor] = frame_result.interframe_time;
frame_times[cursor] = frame_result.frame_time;
- for (size_t i = 0; i < frame_result.time_per_category.size(); ++i) {
- times_per_category[i][cursor] = frame_result.time_per_category[i];
- }
++cursor;
if (cursor == max_window_size)
@@ -162,11 +85,6 @@ AggregatedFrameResult TimingResultsAggregator::GetAggregatedResults() const {
result.fps = 0.0f;
}
- result.time_per_category.resize(times_per_category.size());
- for (size_t i = 0; i < times_per_category.size(); ++i) {
- result.time_per_category[i] = AggregateField(times_per_category[i], window_size);
- }
-
return result;
}
diff --git a/src/common/profiler.h b/src/common/profiler.h
deleted file mode 100644
index 3e967b4bc..000000000
--- a/src/common/profiler.h
+++ /dev/null
@@ -1,152 +0,0 @@
-// Copyright 2015 Citra Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#pragma once
-
-#include <atomic>
-#include <chrono>
-
-#include "common/assert.h"
-#include "common/thread.h"
-
-namespace Common {
-namespace Profiling {
-
-// If this is defined to 0, it turns all Timers into no-ops.
-#ifndef ENABLE_PROFILING
-#define ENABLE_PROFILING 1
-#endif
-
-#if defined(_MSC_VER) && _MSC_VER <= 1800 // MSVC 2013
-// MSVC up to 2013 doesn't use QueryPerformanceCounter for high_resolution_clock, so it has bad
-// precision. We manually implement a clock based on QPC to get good results.
-
-struct QPCClock {
- using duration = std::chrono::microseconds;
- using time_point = std::chrono::time_point<QPCClock>;
- using rep = duration::rep;
- using period = duration::period;
- static const bool is_steady = false;
-
- static time_point now();
-};
-
-using Clock = QPCClock;
-#else
-using Clock = std::chrono::high_resolution_clock;
-#endif
-
-using Duration = Clock::duration;
-
-/**
- * Represents a timing category that measured time can be accounted towards. Should be declared as a
- * global variable and passed to Timers.
- */
-class TimingCategory final {
-public:
- TimingCategory(const char* name, TimingCategory* parent = nullptr);
-
- unsigned int GetCategoryId() const {
- return category_id;
- }
-
- /// Adds some time to this category. Can safely be called from multiple threads at the same time.
- void AddTime(Duration amount) {
- std::atomic_fetch_add_explicit(
- &accumulated_duration, amount.count(),
- std::memory_order_relaxed);
- }
-
- /**
- * Atomically retrieves the accumulated measured time for this category and resets the counter
- * to zero. Can be safely called concurrently with AddTime.
- */
- Duration GetAccumulatedTime() {
- return Duration(std::atomic_exchange_explicit(
- &accumulated_duration, (Duration::rep)0,
- std::memory_order_relaxed));
- }
-
-private:
- unsigned int category_id;
- std::atomic<Duration::rep> accumulated_duration;
-};
-
-/**
- * Measures time elapsed between a call to Start and a call to Stop and attributes it to the given
- * TimingCategory. Start/Stop can be called multiple times on the same timer, but each call must be
- * appropriately paired.
- *
- * When a Timer is started, it automatically pauses a previously running timer on the same thread,
- * which is resumed when it is stopped. As such, no special action needs to be taken to avoid
- * double-accounting of time on two categories.
- */
-class Timer {
-public:
- Timer(TimingCategory& category) : category(category) {
- }
-
- void Start() {
-#if ENABLE_PROFILING
- ASSERT(!running);
- previous_timer = current_timer;
- current_timer = this;
- if (previous_timer != nullptr)
- previous_timer->StopTiming();
-
- StartTiming();
-#endif
- }
-
- void Stop() {
-#if ENABLE_PROFILING
- ASSERT(running);
- StopTiming();
-
- if (previous_timer != nullptr)
- previous_timer->StartTiming();
- current_timer = previous_timer;
-#endif
- }
-
-private:
-#if ENABLE_PROFILING
- void StartTiming() {
- start = Clock::now();
- running = true;
- }
-
- void StopTiming() {
- auto duration = Clock::now() - start;
- running = false;
- category.AddTime(std::chrono::duration_cast<Duration>(duration));
- }
-
- Clock::time_point start;
- bool running = false;
-
- Timer* previous_timer;
- static thread_local Timer* current_timer;
-#endif
-
- TimingCategory& category;
-};
-
-/**
- * A Timer that automatically starts timing when created and stops at the end of the scope. Should
- * be used in the majority of cases.
- */
-class ScopeTimer : public Timer {
-public:
- ScopeTimer(TimingCategory& category) : Timer(category) {
- Start();
- }
-
- ~ScopeTimer() {
- Stop();
- }
-};
-
-} // namespace Profiling
-} // namespace Common
diff --git a/src/common/profiler_reporting.h b/src/common/profiler_reporting.h
index df98e05b7..fa1ac883f 100644
--- a/src/common/profiler_reporting.h
+++ b/src/common/profiler_reporting.h
@@ -4,22 +4,17 @@
#pragma once
+#include <chrono>
#include <cstddef>
#include <vector>
-#include "common/profiler.h"
#include "common/synchronized_wrapper.h"
namespace Common {
namespace Profiling {
-struct TimingCategoryInfo {
- static const unsigned int NO_PARENT = -1;
-
- TimingCategory* category;
- const char* name;
- unsigned int parent;
-};
+using Clock = std::chrono::high_resolution_clock;
+using Duration = Clock::duration;
struct ProfilingFrameResult {
/// Time since the last delivered frame
@@ -27,22 +22,12 @@ struct ProfilingFrameResult {
/// Time spent processing a frame, excluding VSync
Duration frame_time;
-
- /// Total amount of time spent inside each category in this frame. Indexed by the category id
- std::vector<Duration> time_per_category;
};
class ProfilingManager final {
public:
ProfilingManager();
- unsigned int RegisterTimingCategory(TimingCategory* category, const char* name);
- void SetTimingCategoryParent(unsigned int category, unsigned int parent);
-
- const std::vector<TimingCategoryInfo>& GetTimingCategoriesInfo() const {
- return timing_categories;
- }
-
/// This should be called after swapping screen buffers.
void BeginFrame();
/// This should be called before swapping screen buffers.
@@ -54,7 +39,6 @@ public:
}
private:
- std::vector<TimingCategoryInfo> timing_categories;
Clock::time_point last_frame_end;
Clock::time_point this_frame_start;
@@ -73,9 +57,6 @@ struct AggregatedFrameResult {
AggregatedDuration frame_time;
float fps;
-
- /// Total amount of time spent inside each category in this frame. Indexed by the category id
- std::vector<AggregatedDuration> time_per_category;
};
class TimingResultsAggregator final {
@@ -83,7 +64,6 @@ public:
TimingResultsAggregator(size_t window_size);
void Clear();
- void SetNumberOfCategories(size_t n);
void AddFrame(const ProfilingFrameResult& frame_result);
@@ -95,7 +75,6 @@ public:
std::vector<Duration> interframe_times;
std::vector<Duration> frame_times;
- std::vector<std::vector<Duration>> times_per_category;
};
ProfilingManager& GetProfilingManager();
diff --git a/src/core/arm/dyncom/arm_dyncom_interpreter.cpp b/src/core/arm/dyncom/arm_dyncom_interpreter.cpp
index 647784208..8d4b26815 100644
--- a/src/core/arm/dyncom/arm_dyncom_interpreter.cpp
+++ b/src/core/arm/dyncom/arm_dyncom_interpreter.cpp
@@ -10,7 +10,6 @@
#include "common/common_types.h"
#include "common/logging/log.h"
#include "common/microprofile.h"
-#include "common/profiler.h"
#include "core/memory.h"
#include "core/hle/svc.h"
@@ -25,9 +24,6 @@
#include "core/gdbstub/gdbstub.h"
-Common::Profiling::TimingCategory profile_execute("DynCom::Execute");
-Common::Profiling::TimingCategory profile_decode("DynCom::Decode");
-
enum {
COND = (1 << 0),
NON_BRANCH = (1 << 1),
@@ -3496,7 +3492,6 @@ static unsigned int InterpreterTranslateInstruction(const ARMul_State* cpu, cons
}
static int InterpreterTranslateBlock(ARMul_State* cpu, int& bb_start, u32 addr) {
- Common::Profiling::ScopeTimer timer_decode(profile_decode);
MICROPROFILE_SCOPE(DynCom_Decode);
// Decode instruction, get index
@@ -3530,7 +3525,6 @@ static int InterpreterTranslateBlock(ARMul_State* cpu, int& bb_start, u32 addr)
}
static int InterpreterTranslateSingle(ARMul_State* cpu, int& bb_start, u32 addr) {
- Common::Profiling::ScopeTimer timer_decode(profile_decode);
MICROPROFILE_SCOPE(DynCom_Decode);
ARM_INST_PTR inst_base = nullptr;
@@ -3565,7 +3559,6 @@ static int clz(unsigned int x) {
MICROPROFILE_DEFINE(DynCom_Execute, "DynCom", "Execute", MP_RGB(255, 0, 0));
unsigned InterpreterMainLoop(ARMul_State* cpu) {
- Common::Profiling::ScopeTimer timer_execute(profile_execute);
MICROPROFILE_SCOPE(DynCom_Execute);
GDBStub::BreakpointAddress breakpoint_data;
diff --git a/src/core/hle/service/gsp_gpu.cpp b/src/core/hle/service/gsp_gpu.cpp
index 211fcf599..233592d7f 100644
--- a/src/core/hle/service/gsp_gpu.cpp
+++ b/src/core/hle/service/gsp_gpu.cpp
@@ -4,7 +4,6 @@
#include "common/bit_field.h"
#include "common/microprofile.h"
-#include "common/profiler.h"
#include "core/memory.h"
#include "core/hle/kernel/event.h"
diff --git a/src/core/hle/svc.cpp b/src/core/hle/svc.cpp
index ae54afb1c..a9a1a3244 100644
--- a/src/core/hle/svc.cpp
+++ b/src/core/hle/svc.cpp
@@ -6,7 +6,6 @@
#include "common/logging/log.h"
#include "common/microprofile.h"
-#include "common/profiler.h"
#include "common/string_util.h"
#include "common/symbols.h"
@@ -1031,8 +1030,6 @@ static const FunctionDef SVC_Table[] = {
{0x7D, HLE::Wrap<QueryProcessMemory>, "QueryProcessMemory"},
};
-Common::Profiling::TimingCategory profiler_svc("SVC Calls");
-
static const FunctionDef* GetSVCInfo(u32 func_num) {
if (func_num >= ARRAY_SIZE(SVC_Table)) {
LOG_ERROR(Kernel_SVC, "unknown svc=0x%02X", func_num);
@@ -1044,7 +1041,6 @@ static const FunctionDef* GetSVCInfo(u32 func_num) {
MICROPROFILE_DEFINE(Kernel_SVC, "Kernel", "SVC", MP_RGB(70, 200, 70));
void CallSVC(u32 immediate) {
- Common::Profiling::ScopeTimer timer_svc(profiler_svc);
MICROPROFILE_SCOPE(Kernel_SVC);
const FunctionDef* info = GetSVCInfo(immediate);
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 76cfd4f7d..de4082b1f 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -16,6 +16,7 @@ set(SRCS
shader/shader_interpreter.cpp
swrasterizer.cpp
utils.cpp
+ vertex_loader.cpp
video_core.cpp
)
@@ -43,6 +44,7 @@ set(HEADERS
shader/shader_interpreter.h
swrasterizer.h
utils.h
+ vertex_loader.h
video_core.h
)
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 3abe79c09..58883e374 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -7,7 +7,6 @@
#include "common/alignment.h"
#include "common/microprofile.h"
-#include "common/profiler.h"
#include "core/settings.h"
#include "core/hle/service/gsp_gpu.h"
@@ -22,6 +21,7 @@
#include "video_core/video_core.h"
#include "video_core/debug_utils/debug_utils.h"
#include "video_core/shader/shader_interpreter.h"
+#include "video_core/vertex_loader.h"
namespace Pica {
@@ -35,8 +35,6 @@ static int default_attr_counter = 0;
static u32 default_attr_write_buffer[3];
-Common::Profiling::TimingCategory category_drawing("Drawing");
-
// Expand a 4-bit mask to 4-byte mask, e.g. 0b0101 -> 0x00FF00FF
static const u32 expand_bits_to_bytes[] = {
0x00000000, 0x000000ff, 0x0000ff00, 0x0000ffff,
@@ -186,60 +184,19 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
case PICA_REG_INDEX(trigger_draw):
case PICA_REG_INDEX(trigger_draw_indexed):
{
- Common::Profiling::ScopeTimer scope_timer(category_drawing);
MICROPROFILE_SCOPE(GPU_Drawing);
#if PICA_LOG_TEV
DebugUtils::DumpTevStageConfig(regs.GetTevStages());
#endif
-
if (g_debug_context)
g_debug_context->OnEvent(DebugContext::Event::IncomingPrimitiveBatch, nullptr);
- const auto& attribute_config = regs.vertex_attributes;
- const u32 base_address = attribute_config.GetPhysicalBaseAddress();
-
- // Information about internal vertex attributes
- u32 vertex_attribute_sources[16];
- boost::fill(vertex_attribute_sources, 0xdeadbeef);
- u32 vertex_attribute_strides[16] = {};
- Regs::VertexAttributeFormat vertex_attribute_formats[16] = {};
-
- u32 vertex_attribute_elements[16] = {};
- u32 vertex_attribute_element_size[16] = {};
-
- // Setup attribute data from loaders
- for (int loader = 0; loader < 12; ++loader) {
- const auto& loader_config = attribute_config.attribute_loaders[loader];
-
- u32 offset = 0;
-
- // TODO: What happens if a loader overwrites a previous one's data?
- for (unsigned component = 0; component < loader_config.component_count; ++component) {
- if (component >= 12) {
- LOG_ERROR(HW_GPU, "Overflow in the vertex attribute loader %u trying to load component %u", loader, component);
- continue;
- }
-
- u32 attribute_index = loader_config.GetComponent(component);
- if (attribute_index < 12) {
- int element_size = attribute_config.GetElementSizeInBytes(attribute_index);
- offset = Common::AlignUp(offset, element_size);
- vertex_attribute_sources[attribute_index] = base_address + loader_config.data_offset + offset;
- vertex_attribute_strides[attribute_index] = static_cast<u32>(loader_config.byte_count);
- vertex_attribute_formats[attribute_index] = attribute_config.GetFormat(attribute_index);
- vertex_attribute_elements[attribute_index] = attribute_config.GetNumElements(attribute_index);
- vertex_attribute_element_size[attribute_index] = element_size;
- offset += attribute_config.GetStride(attribute_index);
- } else if (attribute_index < 16) {
- // Attribute ids 12, 13, 14 and 15 signify 4, 8, 12 and 16-byte paddings, respectively
- offset = Common::AlignUp(offset, 4);
- offset += (attribute_index - 11) * 4;
- } else {
- UNREACHABLE(); // This is truly unreachable due to the number of bits for each component
- }
- }
- }
+ // Processes information about internal vertex attributes to figure out how a vertex is loaded.
+ // Later, these can be compiled and cached.
+ VertexLoader loader;
+ const u32 base_address = regs.vertex_attributes.GetPhysicalBaseAddress();
+ loader.Setup(regs);
// Load vertices
bool is_indexed = (id == PICA_REG_INDEX(trigger_draw_indexed));
@@ -263,32 +220,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
}
}
- class {
- /// Combine overlapping and close ranges
- void SimplifyRanges() {
- for (auto it = ranges.begin(); it != ranges.end(); ++it) {
- // NOTE: We add 32 to the range end address to make sure "close" ranges are combined, too
- auto it2 = std::next(it);
- while (it2 != ranges.end() && it->first + it->second + 32 >= it2->first) {
- it->second = std::max(it->second, it2->first + it2->second - it->first);
- it2 = ranges.erase(it2);
- }
- }
- }
-
- public:
- /// Record a particular memory access in the list
- void AddAccess(u32 paddr, u32 size) {
- // Create new range or extend existing one
- ranges[paddr] = std::max(ranges[paddr], size);
-
- // Simplify ranges...
- SimplifyRanges();
- }
-
- /// Map of accessed ranges (mapping start address to range size)
- std::map<u32, u32> ranges;
- } memory_accesses;
+ DebugUtils::MemoryAccessTracker memory_accesses;
// Simple circular-replacement vertex cache
// The size has been tuned for optimal balance between hit-rate and the cost of lookup
@@ -332,60 +264,13 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
if (!vertex_cache_hit) {
// Initialize data for the current vertex
Shader::InputVertex input;
-
- for (int i = 0; i < attribute_config.GetNumTotalAttributes(); ++i) {
- if (vertex_attribute_elements[i] != 0) {
- // Default attribute values set if array elements have < 4 components. This
- // is *not* carried over from the default attribute settings even if they're
- // enabled for this attribute.
- static const float24 zero = float24::FromFloat32(0.0f);
- static const float24 one = float24::FromFloat32(1.0f);
- input.attr[i] = Math::Vec4<float24>(zero, zero, zero, one);
-
- // Load per-vertex data from the loader arrays
- for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) {
- u32 source_addr = vertex_attribute_sources[i] + vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i];
- const u8* srcdata = Memory::GetPhysicalPointer(source_addr);
-
- if (g_debug_context && Pica::g_debug_context->recorder) {
- memory_accesses.AddAccess(source_addr,
- (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::FLOAT) ? 4
- : (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::SHORT) ? 2 : 1);
- }
-
- const float srcval =
- (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::BYTE) ? *reinterpret_cast<const s8*>(srcdata) :
- (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::UBYTE) ? *reinterpret_cast<const u8*>(srcdata) :
- (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::SHORT) ? *reinterpret_cast<const s16*>(srcdata) :
- *reinterpret_cast<const float*>(srcdata);
-
- input.attr[i][comp] = float24::FromFloat32(srcval);
- LOG_TRACE(HW_GPU, "Loaded component %x of attribute %x for vertex %x (index %x) from 0x%08x + 0x%08x + 0x%04x: %f",
- comp, i, vertex, index,
- attribute_config.GetPhysicalBaseAddress(),
- vertex_attribute_sources[i] - base_address,
- vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i],
- input.attr[i][comp].ToFloat32());
- }
- } else if (attribute_config.IsDefaultAttribute(i)) {
- // Load the default attribute if we're configured to do so
- input.attr[i] = g_state.vs.default_attributes[i];
- LOG_TRACE(HW_GPU, "Loaded default attribute %x for vertex %x (index %x): (%f, %f, %f, %f)",
- i, vertex, index,
- input.attr[i][0].ToFloat32(), input.attr[i][1].ToFloat32(),
- input.attr[i][2].ToFloat32(), input.attr[i][3].ToFloat32());
- } else {
- // TODO(yuriks): In this case, no data gets loaded and the vertex
- // remains with the last value it had. This isn't currently maintained
- // as global state, however, and so won't work in Citra yet.
- }
- }
+ loader.LoadVertex(base_address, index, vertex, input, memory_accesses);
if (g_debug_context)
g_debug_context->OnEvent(DebugContext::Event::VertexLoaded, (void*)&input);
// Send to vertex shader
- output = Shader::Run(shader_unit, input, attribute_config.GetNumTotalAttributes());
+ output = Shader::Run(shader_unit, input, loader.GetNumTotalAttributes());
if (is_indexed) {
vertex_cache[vertex_cache_pos] = output;
diff --git a/src/video_core/debug_utils/debug_utils.h b/src/video_core/debug_utils/debug_utils.h
index 56f9bd958..dd0828cee 100644
--- a/src/video_core/debug_utils/debug_utils.h
+++ b/src/video_core/debug_utils/debug_utils.h
@@ -216,6 +216,36 @@ void DumpTexture(const Pica::Regs::TextureConfig& texture_config, u8* data);
void DumpTevStageConfig(const std::array<Pica::Regs::TevStageConfig,6>& stages);
+/**
+ * Used in the vertex loader to merge access records. TODO: Investigate if actually useful.
+ */
+class MemoryAccessTracker {
+ /// Combine overlapping and close ranges
+ void SimplifyRanges() {
+ for (auto it = ranges.begin(); it != ranges.end(); ++it) {
+ // NOTE: We add 32 to the range end address to make sure "close" ranges are combined, too
+ auto it2 = std::next(it);
+ while (it2 != ranges.end() && it->first + it->second + 32 >= it2->first) {
+ it->second = std::max(it->second, it2->first + it2->second - it->first);
+ it2 = ranges.erase(it2);
+ }
+ }
+ }
+
+public:
+ /// Record a particular memory access in the list
+ void AddAccess(u32 paddr, u32 size) {
+ // Create new range or extend existing one
+ ranges[paddr] = std::max(ranges[paddr], size);
+
+ // Simplify ranges...
+ SimplifyRanges();
+ }
+
+ /// Map of accessed ranges (mapping start address to range size)
+ std::map<u32, u32> ranges;
+};
+
} // namespace
} // namespace
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index 0434ad05a..9cf77b1f2 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -9,7 +9,6 @@
#include "common/common_types.h"
#include "common/math_util.h"
#include "common/microprofile.h"
-#include "common/profiler.h"
#include "core/memory.h"
#include "core/hw/gpu.h"
@@ -287,7 +286,6 @@ static int SignedArea (const Math::Vec2<Fix12P4>& vtx1,
return Math::Cross(vec1, vec2).z;
};
-static Common::Profiling::TimingCategory rasterization_category("Rasterization");
MICROPROFILE_DEFINE(GPU_Rasterization, "GPU", "Rasterization", MP_RGB(50, 50, 240));
/**
@@ -300,7 +298,6 @@ static void ProcessTriangleInternal(const Shader::OutputVertex& v0,
bool reversed = false)
{
const auto& regs = g_state.regs;
- Common::Profiling::ScopeTimer timer(rasterization_category);
MICROPROFILE_SCOPE(GPU_Rasterization);
// vertex positions in rasterizer coordinates
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 30187d4cf..a8c775c80 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -11,7 +11,6 @@
#include "common/file_util.h"
#include "common/math_util.h"
#include "common/microprofile.h"
-#include "common/profiler.h"
#include "core/memory.h"
#include "core/settings.h"
diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp
index 75301accd..043e99190 100644
--- a/src/video_core/shader/shader.cpp
+++ b/src/video_core/shader/shader.cpp
@@ -9,7 +9,6 @@
#include "common/hash.h"
#include "common/microprofile.h"
-#include "common/profiler.h"
#include "video_core/debug_utils/debug_utils.h"
#include "video_core/pica.h"
@@ -57,13 +56,11 @@ void Shutdown() {
#endif // ARCHITECTURE_x86_64
}
-static Common::Profiling::TimingCategory shader_category("Vertex Shader");
MICROPROFILE_DEFINE(GPU_VertexShader, "GPU", "Vertex Shader", MP_RGB(50, 50, 240));
OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attributes) {
auto& config = g_state.regs.vs;
- Common::Profiling::ScopeTimer timer(shader_category);
MICROPROFILE_SCOPE(GPU_VertexShader);
state.program_counter = config.main_offset;
diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h
index 9c5bd97bd..9ce9344d2 100644
--- a/src/video_core/shader/shader.h
+++ b/src/video_core/shader/shader.h
@@ -25,7 +25,7 @@ namespace Pica {
namespace Shader {
struct InputVertex {
- Math::Vec4<float24> attr[16];
+ alignas(16) Math::Vec4<float24> attr[16];
};
struct OutputVertex {
diff --git a/src/video_core/vertex_loader.cpp b/src/video_core/vertex_loader.cpp
new file mode 100644
index 000000000..8a3d91896
--- /dev/null
+++ b/src/video_core/vertex_loader.cpp
@@ -0,0 +1,140 @@
+#include <cmath>
+#include <string>
+
+#include "boost/range/algorithm/fill.hpp"
+
+#include "common/assert.h"
+#include "common/alignment.h"
+#include "common/bit_field.h"
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+#include "common/logging/log.h"
+
+#include "core/memory.h"
+
+#include "video_core/debug_utils/debug_utils.h"
+#include "video_core/pica.h"
+#include "video_core/pica_state.h"
+#include "video_core/pica_types.h"
+#include "video_core/vertex_loader.h"
+
+namespace Pica {
+
+void VertexLoader::Setup(const Pica::Regs& regs) {
+ const auto& attribute_config = regs.vertex_attributes;
+ num_total_attributes = attribute_config.GetNumTotalAttributes();
+
+ boost::fill(vertex_attribute_sources, 0xdeadbeef);
+
+ for (int i = 0; i < 16; i++) {
+ vertex_attribute_is_default[i] = attribute_config.IsDefaultAttribute(i);
+ }
+
+ // Setup attribute data from loaders
+ for (int loader = 0; loader < 12; ++loader) {
+ const auto& loader_config = attribute_config.attribute_loaders[loader];
+
+ u32 offset = 0;
+
+ // TODO: What happens if a loader overwrites a previous one's data?
+ for (unsigned component = 0; component < loader_config.component_count; ++component) {
+ if (component >= 12) {
+ LOG_ERROR(HW_GPU, "Overflow in the vertex attribute loader %u trying to load component %u", loader, component);
+ continue;
+ }
+
+ u32 attribute_index = loader_config.GetComponent(component);
+ if (attribute_index < 12) {
+ offset = Common::AlignUp(offset, attribute_config.GetElementSizeInBytes(attribute_index));
+ vertex_attribute_sources[attribute_index] = loader_config.data_offset + offset;
+ vertex_attribute_strides[attribute_index] = static_cast<u32>(loader_config.byte_count);
+ vertex_attribute_formats[attribute_index] = attribute_config.GetFormat(attribute_index);
+ vertex_attribute_elements[attribute_index] = attribute_config.GetNumElements(attribute_index);
+ offset += attribute_config.GetStride(attribute_index);
+ } else if (attribute_index < 16) {
+ // Attribute ids 12, 13, 14 and 15 signify 4, 8, 12 and 16-byte paddings, respectively
+ offset = Common::AlignUp(offset, 4);
+ offset += (attribute_index - 11) * 4;
+ } else {
+ UNREACHABLE(); // This is truly unreachable due to the number of bits for each component
+ }
+ }
+ }
+}
+
+void VertexLoader::LoadVertex(u32 base_address, int index, int vertex, Shader::InputVertex& input, DebugUtils::MemoryAccessTracker& memory_accesses) {
+ for (int i = 0; i < num_total_attributes; ++i) {
+ if (vertex_attribute_elements[i] != 0) {
+ // Load per-vertex data from the loader arrays
+ u32 source_addr = base_address + vertex_attribute_sources[i] + vertex_attribute_strides[i] * vertex;
+
+ if (g_debug_context && Pica::g_debug_context->recorder) {
+ memory_accesses.AddAccess(source_addr, vertex_attribute_elements[i] * (
+ (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::FLOAT) ? 4
+ : (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::SHORT) ? 2 : 1));
+ }
+
+ switch (vertex_attribute_formats[i]) {
+ case Regs::VertexAttributeFormat::BYTE:
+ {
+ const s8* srcdata = reinterpret_cast<const s8*>(Memory::GetPhysicalPointer(source_addr));
+ for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) {
+ input.attr[i][comp] = float24::FromFloat32(srcdata[comp]);
+ }
+ break;
+ }
+ case Regs::VertexAttributeFormat::UBYTE:
+ {
+ const u8* srcdata = reinterpret_cast<const u8*>(Memory::GetPhysicalPointer(source_addr));
+ for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) {
+ input.attr[i][comp] = float24::FromFloat32(srcdata[comp]);
+ }
+ break;
+ }
+ case Regs::VertexAttributeFormat::SHORT:
+ {
+ const s16* srcdata = reinterpret_cast<const s16*>(Memory::GetPhysicalPointer(source_addr));
+ for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) {
+ input.attr[i][comp] = float24::FromFloat32(srcdata[comp]);
+ }
+ break;
+ }
+ case Regs::VertexAttributeFormat::FLOAT:
+ {
+ const float* srcdata = reinterpret_cast<const float*>(Memory::GetPhysicalPointer(source_addr));
+ for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) {
+ input.attr[i][comp] = float24::FromFloat32(srcdata[comp]);
+ }
+ break;
+ }
+ }
+
+ // Default attribute values set if array elements have < 4 components. This
+ // is *not* carried over from the default attribute settings even if they're
+ // enabled for this attribute.
+ for (unsigned int comp = vertex_attribute_elements[i]; comp < 4; ++comp) {
+ input.attr[i][comp] = comp == 3 ? float24::FromFloat32(1.0f) : float24::FromFloat32(0.0f);
+ }
+
+ LOG_TRACE(HW_GPU, "Loaded %d components of attribute %x for vertex %x (index %x) from 0x%08x + 0x%08x + 0x%04x: %f %f %f %f",
+ vertex_attribute_elements[i], i, vertex, index,
+ base_address,
+ vertex_attribute_sources[i],
+ vertex_attribute_strides[i] * vertex,
+ input.attr[i][0].ToFloat32(), input.attr[i][1].ToFloat32(), input.attr[i][2].ToFloat32(), input.attr[i][3].ToFloat32());
+ } else if (vertex_attribute_is_default[i]) {
+ // Load the default attribute if we're configured to do so
+ input.attr[i] = g_state.vs.default_attributes[i];
+ LOG_TRACE(HW_GPU, "Loaded default attribute %x for vertex %x (index %x): (%f, %f, %f, %f)",
+ i, vertex, index,
+ input.attr[i][0].ToFloat32(), input.attr[i][1].ToFloat32(),
+ input.attr[i][2].ToFloat32(), input.attr[i][3].ToFloat32());
+ } else {
+ // TODO(yuriks): In this case, no data gets loaded and the vertex
+ // remains with the last value it had. This isn't currently maintained
+ // as global state, however, and so won't work in Citra yet.
+ }
+ }
+}
+
+} // namespace Pica \ No newline at end of file
diff --git a/src/video_core/vertex_loader.h b/src/video_core/vertex_loader.h
new file mode 100644
index 000000000..ff42d1596
--- /dev/null
+++ b/src/video_core/vertex_loader.h
@@ -0,0 +1,28 @@
+#pragma once
+
+#include <iterator>
+#include <algorithm>
+
+#include "video_core/pica.h"
+#include "video_core/shader/shader.h"
+#include "video_core/debug_utils/debug_utils.h"
+
+namespace Pica {
+
+class VertexLoader {
+public:
+ void Setup(const Pica::Regs& regs);
+ void LoadVertex(u32 base_address, int index, int vertex, Shader::InputVertex& input, DebugUtils::MemoryAccessTracker& memory_accesses);
+
+ int GetNumTotalAttributes() const { return num_total_attributes; }
+
+private:
+ u32 vertex_attribute_sources[16];
+ u32 vertex_attribute_strides[16] = {};
+ Regs::VertexAttributeFormat vertex_attribute_formats[16] = {};
+ u32 vertex_attribute_elements[16] = {};
+ bool vertex_attribute_is_default[16];
+ int num_total_attributes;
+};
+
+} // namespace Pica