Optimize DrawHalfTransparentBlendedRectTo

`DrawHalfTransparentBlendedRectTo` takes up a significant chunk of time
when rendering store and quest dialogs.

Optimize the function to read 2 pixels at a time and write 4 pixels at a
time.
This commit is contained in:
Gleb Mazovetskiy 2021-11-07 01:11:52 +00:00
commit b7beb2fbe9
3 changed files with 87 additions and 7 deletions

View file

@ -12,6 +12,8 @@
*/
#include <array>
#include <cassert>
#include <cstdint>
#include "engine/render/common_impl.h"
#include "lighting.h"
@ -21,19 +23,71 @@
namespace devilution {
namespace {
void DrawHalfTransparentBlendedRectTo(const Surface &out, int sx, int sy, int width, int height)
// Expects everything to be 4-byte aligned.
void DrawHalfTransparentAligned32BlendedRectTo(const Surface &out, unsigned sx, unsigned sy, unsigned width, unsigned height)
{
BYTE *pix = out.at(sx, sy);
assert(out.pitch() % 4 == 0);
for (int row = 0; row < height; row++) {
for (int col = 0; col < width; col++) {
*pix = paletteTransparencyLookup[0][*pix];
pix++;
auto *pix = reinterpret_cast<uint32_t *>(out.at(static_cast<int>(sx), static_cast<int>(sy)));
assert(reinterpret_cast<intptr_t>(pix) % 4 == 0);
const uint16_t *lookupTable = paletteTransparencyLookupBlack16;
const unsigned skipX = (out.pitch() - width) / 4;
width /= 4;
while (height-- > 0) {
for (unsigned i = 0; i < width; ++i, ++pix) {
const uint32_t v = *pix;
#if SDL_BYTEORDER == SDL_LIL_ENDIAN
*pix = lookupTable[v & 0xFFFF] | (lookupTable[(v >> 16) & 0xFFFF] << 16);
#else
*pix = lookupTable[(v >> 16) & 0xFFFF] | (lookupTable[v & 0xFFFF] << 16);
#endif
}
pix += out.pitch() - width;
pix += skipX;
}
}
void DrawHalfTransparentUnalignedBlendedRectTo(const Surface &out, unsigned sx, unsigned sy, unsigned width, unsigned height)
{
uint8_t *pix = out.at(static_cast<int>(sx), static_cast<int>(sy));
const uint8_t *lookupTable = paletteTransparencyLookup[0];
const unsigned skipX = out.pitch() - width;
for (unsigned y = 0; y < height; ++y) {
for (unsigned x = 0; x < width; ++x, ++pix) {
*pix = lookupTable[*pix];
}
pix += skipX;
}
}
void DrawHalfTransparentBlendedRectTo(const Surface &out, unsigned sx, unsigned sy, unsigned width, unsigned height)
{
// All SDL surfaces are 4-byte aligned and divisible by 4.
// However, our coordinates and widths may not be.
// First, draw the leading unaligned part.
if (sx % 4 != 0) {
const unsigned w = 4 - sx % 4;
DrawHalfTransparentUnalignedBlendedRectTo(out, sx, sy, w, height);
sx += w;
width -= w;
}
if (static_cast<int>(sx + width) == out.w()) {
// The pitch is 4-byte aligned, so we can simply extend the width to the pitch.
width = out.pitch() - sx;
} else if (width % 4 != 0) {
// Draw the trailing unaligned part.
const unsigned w = width % 4;
DrawHalfTransparentUnalignedBlendedRectTo(out, sx + (width / 4) * 4, sy, w, height);
width -= w;
}
// Now everything is divisible by 4. Draw the aligned part.
DrawHalfTransparentAligned32BlendedRectTo(out, sx, sy, width, height);
}
void DrawHalfTransparentStippledRectTo(const Surface &out, int sx, int sy, int width, int height)
{
BYTE *pix = out.at(sx, sy);