speed optimisations, fix for restoreColorLossy, code cleanup (#4895)
- speed optimization in color_add, PS fast_color_add and blur functions - applying more bit and shift manipulation tricks to squeeze out just a bit more speed on color manipulation functions. - Optimization on blur is based on work by @blazoncek - Renamed PS fast_color_add() to fast_color_scaleAdd()
This commit is contained in:
@@ -145,7 +145,7 @@ void WS2812FX::setUpMatrix() {
|
||||
#ifndef WLED_DISABLE_2D
|
||||
// pixel is clipped if it falls outside clipping range
|
||||
// if clipping start > stop the clipping range is inverted
|
||||
bool IRAM_ATTR_YN Segment::isPixelXYClipped(int x, int y) const {
|
||||
bool Segment::isPixelXYClipped(int x, int y) const {
|
||||
if (blendingStyle != BLEND_STYLE_FADE && isInTransition() && _clipStart != _clipStop) {
|
||||
const bool invertX = _clipStart > _clipStop;
|
||||
const bool invertY = _clipStartY > _clipStopY;
|
||||
@@ -185,7 +185,7 @@ bool IRAM_ATTR_YN Segment::isPixelXYClipped(int x, int y) const {
|
||||
void IRAM_ATTR_YN Segment::setPixelColorXY(int x, int y, uint32_t col) const
|
||||
{
|
||||
if (!isActive()) return; // not active
|
||||
if (x >= (int)vWidth() || y >= (int)vHeight() || x < 0 || y < 0) return; // if pixel would fall out of virtual segment just exit
|
||||
if ((unsigned)x >= vWidth() || (unsigned)y >= vHeight()) return; // if pixel would fall out of virtual segment just exit
|
||||
setPixelColorXYRaw(x, y, col);
|
||||
}
|
||||
|
||||
@@ -235,7 +235,7 @@ void Segment::setPixelColorXY(float x, float y, uint32_t col, bool aa) const
|
||||
// returns RGBW values of pixel
|
||||
uint32_t IRAM_ATTR_YN Segment::getPixelColorXY(int x, int y) const {
|
||||
if (!isActive()) return 0; // not active
|
||||
if (x >= (int)vWidth() || y >= (int)vHeight() || x<0 || y<0) return 0; // if pixel would fall out of virtual segment just exit
|
||||
if ((unsigned)x >= vWidth() || (unsigned)y >= vHeight()) return 0; // if pixel would fall out of virtual segment just exit
|
||||
return getPixelColorXYRaw(x,y);
|
||||
}
|
||||
|
||||
@@ -245,52 +245,42 @@ void Segment::blur2D(uint8_t blur_x, uint8_t blur_y, bool smear) const {
|
||||
const unsigned cols = vWidth();
|
||||
const unsigned rows = vHeight();
|
||||
const auto XY = [&](unsigned x, unsigned y){ return x + y*cols; };
|
||||
uint32_t lastnew; // not necessary to initialize lastnew and last, as both will be initialized by the first loop iteration
|
||||
uint32_t last;
|
||||
if (blur_x) {
|
||||
const uint8_t keepx = smear ? 255 : 255 - blur_x;
|
||||
const uint8_t seepx = blur_x >> 1;
|
||||
for (unsigned row = 0; row < rows; row++) { // blur rows (x direction)
|
||||
uint32_t carryover = BLACK;
|
||||
uint32_t curnew = BLACK;
|
||||
for (unsigned x = 0; x < cols; x++) {
|
||||
uint32_t cur = getPixelColorRaw(XY(x, row));
|
||||
uint32_t part = color_fade(cur, seepx);
|
||||
curnew = color_fade(cur, keepx);
|
||||
if (x > 0) {
|
||||
if (carryover) curnew = color_add(curnew, carryover);
|
||||
uint32_t prev = color_add(lastnew, part);
|
||||
// optimization: only set pixel if color has changed
|
||||
if (last != prev) setPixelColorRaw(XY(x - 1, row), prev);
|
||||
} else setPixelColorRaw(XY(x, row), curnew); // first pixel
|
||||
lastnew = curnew;
|
||||
last = cur; // save original value for comparison on next iteration
|
||||
// handle first pixel in row to avoid conditional in loop (faster)
|
||||
uint32_t cur = getPixelColorRaw(XY(0, row));
|
||||
uint32_t carryover = fast_color_scale(cur, seepx);
|
||||
setPixelColorRaw(XY(0, row), fast_color_scale(cur, keepx));
|
||||
for (unsigned x = 1; x < cols; x++) {
|
||||
cur = getPixelColorRaw(XY(x, row));
|
||||
uint32_t part = fast_color_scale(cur, seepx);
|
||||
cur = fast_color_scale(cur, keepx);
|
||||
cur = color_add(cur, carryover);
|
||||
setPixelColorRaw(XY(x - 1, row), color_add(getPixelColorRaw(XY(x-1, row)), part)); // previous pixel
|
||||
setPixelColorRaw(XY(x, row), cur); // current pixel
|
||||
carryover = part;
|
||||
}
|
||||
setPixelColorRaw(XY(cols-1, row), curnew); // set last pixel
|
||||
}
|
||||
}
|
||||
if (blur_y) {
|
||||
const uint8_t keepy = smear ? 255 : 255 - blur_y;
|
||||
const uint8_t seepy = blur_y >> 1;
|
||||
for (unsigned col = 0; col < cols; col++) {
|
||||
uint32_t carryover = BLACK;
|
||||
uint32_t curnew = BLACK;
|
||||
for (unsigned y = 0; y < rows; y++) {
|
||||
uint32_t cur = getPixelColorRaw(XY(col, y));
|
||||
uint32_t part = color_fade(cur, seepy);
|
||||
curnew = color_fade(cur, keepy);
|
||||
if (y > 0) {
|
||||
if (carryover) curnew = color_add(curnew, carryover);
|
||||
uint32_t prev = color_add(lastnew, part);
|
||||
// optimization: only set pixel if color has changed
|
||||
if (last != prev) setPixelColorRaw(XY(col, y - 1), prev);
|
||||
} else setPixelColorRaw(XY(col, y), curnew); // first pixel
|
||||
lastnew = curnew;
|
||||
last = cur; //save original value for comparison on next iteration
|
||||
// handle first pixel in column
|
||||
uint32_t cur = getPixelColorRaw(XY(col, 0));
|
||||
uint32_t carryover = fast_color_scale(cur, seepy);
|
||||
setPixelColorRaw(XY(col, 0), fast_color_scale(cur, keepy));
|
||||
for (unsigned y = 1; y < rows; y++) {
|
||||
cur = getPixelColorRaw(XY(col, y));
|
||||
uint32_t part = fast_color_scale(cur, seepy);
|
||||
cur = fast_color_scale(cur, keepy);
|
||||
cur = color_add(cur, carryover);
|
||||
setPixelColorRaw(XY(col, y - 1), color_add(getPixelColorRaw(XY(col, y-1)), part)); // previous pixel
|
||||
setPixelColorRaw(XY(col, y), cur); // current pixel
|
||||
carryover = part;
|
||||
}
|
||||
setPixelColorRaw(XY(col, rows - 1), curnew);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -682,7 +682,7 @@ uint16_t Segment::maxMappingLength() const {
|
||||
#endif
|
||||
// pixel is clipped if it falls outside clipping range
|
||||
// if clipping start > stop the clipping range is inverted
|
||||
bool IRAM_ATTR_YN Segment::isPixelClipped(int i) const {
|
||||
bool Segment::isPixelClipped(int i) const {
|
||||
if (blendingStyle != BLEND_STYLE_FADE && isInTransition() && _clipStart != _clipStop) {
|
||||
bool invert = _clipStart > _clipStop; // ineverted start & stop
|
||||
int start = invert ? _clipStop : _clipStart;
|
||||
@@ -700,7 +700,7 @@ bool IRAM_ATTR_YN Segment::isPixelClipped(int i) const {
|
||||
return false;
|
||||
}
|
||||
|
||||
void IRAM_ATTR_YN Segment::setPixelColor(int i, uint32_t col) const
|
||||
void WLED_O2_ATTR Segment::setPixelColor(int i, uint32_t col) const
|
||||
{
|
||||
if (!isActive() || i < 0) return; // not active or invalid index
|
||||
#ifndef WLED_DISABLE_2D
|
||||
@@ -913,7 +913,7 @@ void Segment::setPixelColor(float i, uint32_t col, bool aa) const
|
||||
}
|
||||
#endif
|
||||
|
||||
uint32_t IRAM_ATTR_YN Segment::getPixelColor(int i) const
|
||||
uint32_t WLED_O2_ATTR Segment::getPixelColor(int i) const
|
||||
{
|
||||
if (!isActive() || i < 0) return 0; // not active or invalid index
|
||||
|
||||
@@ -1052,7 +1052,7 @@ void Segment::fadeToSecondaryBy(uint8_t fadeBy) const {
|
||||
void Segment::fadeToBlackBy(uint8_t fadeBy) const {
|
||||
if (!isActive() || fadeBy == 0) return; // optimization - no scaling to apply
|
||||
const size_t rlength = rawLength(); // calculate only once
|
||||
for (unsigned i = 0; i < rlength; i++) setPixelColorRaw(i, color_fade(getPixelColorRaw(i), 255-fadeBy));
|
||||
for (unsigned i = 0; i < rlength; i++) setPixelColorRaw(i, fast_color_scale(getPixelColorRaw(i), 255-fadeBy));
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1072,25 +1072,19 @@ void Segment::blur(uint8_t blur_amount, bool smear) const {
|
||||
uint8_t keep = smear ? 255 : 255 - blur_amount;
|
||||
uint8_t seep = blur_amount >> 1;
|
||||
unsigned vlength = vLength();
|
||||
uint32_t carryover = BLACK;
|
||||
uint32_t lastnew; // not necessary to initialize lastnew and last, as both will be initialized by the first loop iteration
|
||||
uint32_t last;
|
||||
uint32_t curnew = BLACK;
|
||||
for (unsigned i = 0; i < vlength; i++) {
|
||||
uint32_t cur = getPixelColorRaw(i);
|
||||
uint32_t part = color_fade(cur, seep);
|
||||
curnew = color_fade(cur, keep);
|
||||
if (i > 0) {
|
||||
if (carryover) curnew = color_add(curnew, carryover);
|
||||
uint32_t prev = color_add(lastnew, part);
|
||||
// optimization: only set pixel if color has changed
|
||||
if (last != prev) setPixelColorRaw(i - 1, prev);
|
||||
} else setPixelColorRaw(i, curnew); // first pixel
|
||||
lastnew = curnew;
|
||||
last = cur; // save original value for comparison on next iteration
|
||||
// handle first pixel to avoid conditional in loop (faster)
|
||||
uint32_t cur = getPixelColorRaw(0);
|
||||
uint32_t carryover = fast_color_scale(cur, seep);
|
||||
setPixelColorRaw(0, fast_color_scale(cur, keep));
|
||||
for (unsigned i = 1; i < vlength; i++) {
|
||||
cur = getPixelColorRaw(i);
|
||||
uint32_t part = fast_color_scale(cur, seep);
|
||||
cur = fast_color_scale(cur, keep);
|
||||
cur = color_add(cur, carryover);
|
||||
setPixelColorRaw(i - 1, color_add(getPixelColorRaw(i - 1), part)); // previous pixel
|
||||
setPixelColorRaw(i, cur); // current pixel
|
||||
carryover = part;
|
||||
}
|
||||
setPixelColorRaw(vlength - 1, curnew);
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -17,8 +17,7 @@
|
||||
// local shared functions (used both in 1D and 2D system)
|
||||
static int32_t calcForce_dv(const int8_t force, uint8_t &counter);
|
||||
static bool checkBoundsAndWrap(int32_t &position, const int32_t max, const int32_t particleradius, const bool wrap); // returns false if out of bounds by more than particleradius
|
||||
static uint32_t fast_color_add(CRGBW c1, const CRGBW c2, uint8_t scale = 255); // fast and accurate color adding with scaling (scales c2 before adding)
|
||||
static uint32_t fast_color_scale(CRGBW c, const uint8_t scale); // fast scaling function using 32bit variable and pointer. note: keep 'scale' within 0-255
|
||||
static uint32_t fast_color_scaleAdd(const uint32_t c1, const uint32_t c2, uint8_t scale = 255); // fast and accurate color adding with scaling (scales c2 before adding)
|
||||
#endif
|
||||
|
||||
#ifndef WLED_DISABLE_PARTICLESYSTEM2D
|
||||
@@ -625,7 +624,7 @@ void ParticleSystem2D::render() {
|
||||
}
|
||||
|
||||
// calculate pixel positions and brightness distribution and render the particle to local buffer or global buffer
|
||||
__attribute__((optimize("O2"))) void ParticleSystem2D::renderParticle(const uint32_t particleindex, const uint8_t brightness, const CRGBW& color, const bool wrapX, const bool wrapY) {
|
||||
void WLED_O2_ATTR ParticleSystem2D::renderParticle(const uint32_t particleindex, const uint8_t brightness, const CRGBW& color, const bool wrapX, const bool wrapY) {
|
||||
uint32_t size = particlesize;
|
||||
if (advPartProps && advPartProps[particleindex].size > 0) // use advanced size properties (0 means use global size including single pixel rendering)
|
||||
size = advPartProps[particleindex].size;
|
||||
@@ -635,7 +634,7 @@ __attribute__((optimize("O2"))) void ParticleSystem2D::renderParticle(const uint
|
||||
uint32_t y = particles[particleindex].y >> PS_P_RADIUS_SHIFT;
|
||||
if (x <= (uint32_t)maxXpixel && y <= (uint32_t)maxYpixel) {
|
||||
uint32_t index = x + (maxYpixel - y) * (maxXpixel + 1); // flip y coordinate (0,0 is bottom left in PS but top left in framebuffer)
|
||||
framebuffer[index] = fast_color_add(framebuffer[index], color, brightness);
|
||||
framebuffer[index] = fast_color_scaleAdd(framebuffer[index], color, brightness);
|
||||
}
|
||||
return;
|
||||
}
|
||||
@@ -687,10 +686,10 @@ __attribute__((optimize("O2"))) void ParticleSystem2D::renderParticle(const uint
|
||||
memset(renderbuffer, 0, sizeof(renderbuffer)); // clear buffer
|
||||
//particle size to pixels: < 64 is 4x4, < 128 is 6x6, < 192 is 8x8, bigger is 10x10
|
||||
//first, render the pixel to the center of the renderbuffer, then apply 2D blurring
|
||||
renderbuffer[4 + (4 * 10)] = fast_color_add(renderbuffer[4 + (4 * 10)], color, pxlbrightness[0]); // order is: bottom left, bottom right, top right, top left
|
||||
renderbuffer[5 + (4 * 10)] = fast_color_add(renderbuffer[5 + (4 * 10)], color, pxlbrightness[1]);
|
||||
renderbuffer[5 + (5 * 10)] = fast_color_add(renderbuffer[5 + (5 * 10)], color, pxlbrightness[2]);
|
||||
renderbuffer[4 + (5 * 10)] = fast_color_add(renderbuffer[4 + (5 * 10)], color, pxlbrightness[3]);
|
||||
renderbuffer[4 + (4 * 10)] = fast_color_scaleAdd(renderbuffer[4 + (4 * 10)], color, pxlbrightness[0]); // order is: bottom left, bottom right, top right, top left
|
||||
renderbuffer[5 + (4 * 10)] = fast_color_scaleAdd(renderbuffer[5 + (4 * 10)], color, pxlbrightness[1]);
|
||||
renderbuffer[5 + (5 * 10)] = fast_color_scaleAdd(renderbuffer[5 + (5 * 10)], color, pxlbrightness[2]);
|
||||
renderbuffer[4 + (5 * 10)] = fast_color_scaleAdd(renderbuffer[4 + (5 * 10)], color, pxlbrightness[3]);
|
||||
uint32_t rendersize = 2; // initialize render size, minimum is 4x4 pixels, it is incremented int he loop below to start with 4
|
||||
uint32_t offset = 4; // offset to zero coordinate to write/read data in renderbuffer (actually needs to be 3, is decremented in the loop below)
|
||||
uint32_t maxsize = advPartProps[particleindex].size;
|
||||
@@ -748,7 +747,7 @@ __attribute__((optimize("O2"))) void ParticleSystem2D::renderParticle(const uint
|
||||
continue;
|
||||
}
|
||||
uint32_t idx = xfb + (maxYpixel - yfb) * (maxXpixel + 1); // flip y coordinate (0,0 is bottom left in PS but top left in framebuffer)
|
||||
framebuffer[idx] = fast_color_add(framebuffer[idx], renderbuffer[xrb + yrb * 10]);
|
||||
framebuffer[idx] = fast_color_scaleAdd(framebuffer[idx], renderbuffer[xrb + yrb * 10]);
|
||||
}
|
||||
}
|
||||
} else { // standard rendering (2x2 pixels)
|
||||
@@ -785,7 +784,7 @@ __attribute__((optimize("O2"))) void ParticleSystem2D::renderParticle(const uint
|
||||
for (uint32_t i = 0; i < 4; i++) {
|
||||
if (pixelvalid[i]) {
|
||||
uint32_t idx = pixco[i].x + (maxYpixel - pixco[i].y) * (maxXpixel + 1); // flip y coordinate (0,0 is bottom left in PS but top left in framebuffer)
|
||||
framebuffer[idx] = fast_color_add(framebuffer[idx], color, pxlbrightness[i]); // order is: bottom left, bottom right, top right, top left
|
||||
framebuffer[idx] = fast_color_scaleAdd(framebuffer[idx], color, pxlbrightness[i]); // order is: bottom left, bottom right, top right, top left
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -857,7 +856,7 @@ void ParticleSystem2D::handleCollisions() {
|
||||
|
||||
// handle a collision if close proximity is detected, i.e. dx and/or dy smaller than 2*PS_P_RADIUS
|
||||
// takes two pointers to the particles to collide and the particle hardness (softer means more energy lost in collision, 255 means full hard)
|
||||
__attribute__((optimize("O2"))) void ParticleSystem2D::collideParticles(PSparticle &particle1, PSparticle &particle2, int32_t dx, int32_t dy, const uint32_t collDistSq) {
|
||||
void WLED_O2_ATTR ParticleSystem2D::collideParticles(PSparticle &particle1, PSparticle &particle2, int32_t dx, int32_t dy, const uint32_t collDistSq) {
|
||||
int32_t distanceSquared = dx * dx + dy * dy;
|
||||
// Calculate relative velocity note: could zero check but that does not improve overall speed but deminish it as that is rarely the case and pushing is still required
|
||||
int32_t relativeVx = (int32_t)particle2.vx - (int32_t)particle1.vx;
|
||||
@@ -1028,9 +1027,8 @@ void blur2D(uint32_t *colorbuffer, uint32_t xsize, uint32_t ysize, uint32_t xblu
|
||||
for (uint32_t x = xstart; x < xstart + xsize; x++) {
|
||||
seeppart = fast_color_scale(colorbuffer[indexXY], seep); // scale it and seep to neighbours
|
||||
if (x > 0) {
|
||||
colorbuffer[indexXY - 1] = fast_color_add(colorbuffer[indexXY - 1], seeppart);
|
||||
if (carryover.color32) // note: check adds overhead but is faster on average
|
||||
colorbuffer[indexXY] = fast_color_add(colorbuffer[indexXY], carryover);
|
||||
colorbuffer[indexXY - 1] = fast_color_scaleAdd(colorbuffer[indexXY - 1], seeppart);
|
||||
colorbuffer[indexXY] = fast_color_scaleAdd(colorbuffer[indexXY], carryover);
|
||||
}
|
||||
carryover = seeppart;
|
||||
indexXY++; // next pixel in x direction
|
||||
@@ -1049,9 +1047,8 @@ void blur2D(uint32_t *colorbuffer, uint32_t xsize, uint32_t ysize, uint32_t xblu
|
||||
for (uint32_t y = ystart; y < ystart + ysize; y++) {
|
||||
seeppart = fast_color_scale(colorbuffer[indexXY], seep); // scale it and seep to neighbours
|
||||
if (y > 0) {
|
||||
colorbuffer[indexXY - width] = fast_color_add(colorbuffer[indexXY - width], seeppart);
|
||||
if (carryover.color32) // note: check adds overhead but is faster on average
|
||||
colorbuffer[indexXY] = fast_color_add(colorbuffer[indexXY], carryover);
|
||||
colorbuffer[indexXY - width] = fast_color_scaleAdd(colorbuffer[indexXY - width], seeppart);
|
||||
colorbuffer[indexXY] = fast_color_scaleAdd(colorbuffer[indexXY], carryover);
|
||||
}
|
||||
carryover = seeppart;
|
||||
indexXY += width; // next pixel in y direction
|
||||
@@ -1470,7 +1467,7 @@ void ParticleSystem1D::render() {
|
||||
CRGBW bg_color = SEGCOLOR(1);
|
||||
if (bg_color > 0) { //if not black
|
||||
for (int32_t i = 0; i <= maxXpixel; i++) {
|
||||
framebuffer[i] = fast_color_add(framebuffer[i], bg_color);
|
||||
framebuffer[i] = fast_color_scaleAdd(framebuffer[i], bg_color);
|
||||
}
|
||||
}
|
||||
#ifndef WLED_DISABLE_2D
|
||||
@@ -1485,7 +1482,7 @@ void ParticleSystem1D::render() {
|
||||
}
|
||||
|
||||
// calculate pixel positions and brightness distribution and render the particle to local buffer or global buffer
|
||||
__attribute__((optimize("O2"))) void ParticleSystem1D::renderParticle(const uint32_t particleindex, const uint8_t brightness, const CRGBW &color, const bool wrap) {
|
||||
void WLED_O2_ATTR ParticleSystem1D::renderParticle(const uint32_t particleindex, const uint8_t brightness, const CRGBW &color, const bool wrap) {
|
||||
uint32_t size = particlesize;
|
||||
if (advPartProps) // use advanced size properties (1D system has no large size global rendering TODO: add large global rendering?)
|
||||
size = advPartProps[particleindex].size;
|
||||
@@ -1493,7 +1490,7 @@ __attribute__((optimize("O2"))) void ParticleSystem1D::renderParticle(const uint
|
||||
if (size == 0) { //single pixel particle, can be out of bounds as oob checking is made for 2-pixel particles (and updating it uses more code)
|
||||
uint32_t x = particles[particleindex].x >> PS_P_RADIUS_SHIFT_1D;
|
||||
if (x <= (uint32_t)maxXpixel) { //by making x unsigned there is no need to check < 0 as it will overflow
|
||||
framebuffer[x] = fast_color_add(framebuffer[x], color, brightness);
|
||||
framebuffer[x] = fast_color_scaleAdd(framebuffer[x], color, brightness);
|
||||
}
|
||||
return;
|
||||
}
|
||||
@@ -1530,8 +1527,8 @@ __attribute__((optimize("O2"))) void ParticleSystem1D::renderParticle(const uint
|
||||
//render particle to a bigger size
|
||||
//particle size to pixels: 2 - 63 is 4 pixels, < 128 is 6pixels, < 192 is 8 pixels, bigger is 10 pixels
|
||||
//first, render the pixel to the center of the renderbuffer, then apply 1D blurring
|
||||
renderbuffer[4] = fast_color_add(renderbuffer[4], color, pxlbrightness[0]);
|
||||
renderbuffer[5] = fast_color_add(renderbuffer[5], color, pxlbrightness[1]);
|
||||
renderbuffer[4] = fast_color_scaleAdd(renderbuffer[4], color, pxlbrightness[0]);
|
||||
renderbuffer[5] = fast_color_scaleAdd(renderbuffer[5], color, pxlbrightness[1]);
|
||||
uint32_t rendersize = 2; // initialize render size, minimum is 4 pixels, it is incremented int he loop below to start with 4
|
||||
uint32_t offset = 4; // offset to zero coordinate to write/read data in renderbuffer (actually needs to be 3, is decremented in the loop below)
|
||||
uint32_t blurpasses = size/64 + 1; // number of blur passes depends on size, four passes max
|
||||
@@ -1565,7 +1562,7 @@ __attribute__((optimize("O2"))) void ParticleSystem1D::renderParticle(const uint
|
||||
#ifdef ESP8266 // no local buffer on ESP8266
|
||||
SEGMENT.addPixelColor(xfb, renderbuffer[xrb], true);
|
||||
#else
|
||||
framebuffer[xfb] = fast_color_add(framebuffer[xfb], renderbuffer[xrb]);
|
||||
framebuffer[xfb] = fast_color_scaleAdd(framebuffer[xfb], renderbuffer[xrb]);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
@@ -1585,7 +1582,7 @@ __attribute__((optimize("O2"))) void ParticleSystem1D::renderParticle(const uint
|
||||
}
|
||||
for (uint32_t i = 0; i < 2; i++) {
|
||||
if (pxlisinframe[i]) {
|
||||
framebuffer[pixco[i]] = fast_color_add(framebuffer[pixco[i]], color, pxlbrightness[i]);
|
||||
framebuffer[pixco[i]] = fast_color_scaleAdd(framebuffer[pixco[i]], color, pxlbrightness[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1648,7 +1645,7 @@ void ParticleSystem1D::handleCollisions() {
|
||||
}
|
||||
// handle a collision if close proximity is detected, i.e. dx and/or dy smaller than 2*PS_P_RADIUS
|
||||
// takes two pointers to the particles to collide and the particle hardness (softer means more energy lost in collision, 255 means full hard)
|
||||
__attribute__((optimize("O2"))) void ParticleSystem1D::collideParticles(PSparticle1D &particle1, const PSparticleFlags1D &particle1flags, PSparticle1D &particle2, const PSparticleFlags1D &particle2flags, const int32_t dx, const uint32_t dx_abs, const uint32_t collisiondistance) {
|
||||
void WLED_O2_ATTR ParticleSystem1D::collideParticles(PSparticle1D &particle1, const PSparticleFlags1D &particle1flags, PSparticle1D &particle2, const PSparticleFlags1D &particle2flags, const int32_t dx, const uint32_t dx_abs, const uint32_t collisiondistance) {
|
||||
int32_t dv = particle2.vx - particle1.vx;
|
||||
int32_t dotProduct = (dx * dv); // is always negative if moving towards each other
|
||||
|
||||
@@ -1837,9 +1834,8 @@ void blur1D(uint32_t *colorbuffer, uint32_t size, uint32_t blur, uint32_t start)
|
||||
for (uint32_t x = start; x < start + size; x++) {
|
||||
seeppart = fast_color_scale(colorbuffer[x], seep); // scale it and seep to neighbours
|
||||
if (x > 0) {
|
||||
colorbuffer[x-1] = fast_color_add(colorbuffer[x-1], seeppart);
|
||||
if (carryover.color32) // note: check adds overhead but is faster on average
|
||||
colorbuffer[x] = fast_color_add(colorbuffer[x], carryover); // is black on first pass
|
||||
colorbuffer[x-1] = fast_color_scaleAdd(colorbuffer[x-1], seeppart);
|
||||
colorbuffer[x] = fast_color_scaleAdd(colorbuffer[x], carryover); // is black on first pass
|
||||
}
|
||||
carryover = seeppart;
|
||||
}
|
||||
@@ -1888,36 +1884,34 @@ static bool checkBoundsAndWrap(int32_t &position, const int32_t max, const int32
|
||||
return true; // particle is in bounds
|
||||
}
|
||||
|
||||
// this is a fast version for CRGBW color adding ignoring white channel (PS does not handle white) including scaling of second color
|
||||
// this is a fast version for RGB color adding ignoring white channel (PS does not handle white) including scaling of second color
|
||||
// note: function is mainly used to add scaled colors, so checking if one color is black is slower
|
||||
// note2: returning CRGBW value is slightly slower as the return value gets written to uint32_t framebuffer
|
||||
__attribute__((optimize("O2"))) static uint32_t fast_color_add(CRGBW c1, const CRGBW c2, const uint8_t scale) {
|
||||
uint32_t r, g, b;
|
||||
r = c1.r + ((c2.r * scale) >> 8);
|
||||
g = c1.g + ((c2.g * scale) >> 8);
|
||||
b = c1.b + ((c2.b * scale) >> 8);
|
||||
static uint32_t fast_color_scaleAdd(const uint32_t c1, const uint32_t c2, const uint8_t scale) {
|
||||
constexpr uint32_t MASK_RB = 0x00FF00FF; // red and blue mask
|
||||
constexpr uint32_t MASK_G = 0x0000FF00; // green mask
|
||||
|
||||
// note: this chained comparison is the fastest method for max of 3 values (faster than std:max() or using xor)
|
||||
uint32_t max = (r > g) ? ((r > b) ? r : b) : ((g > b) ? g : b);
|
||||
if (max <= 255) {
|
||||
c1.r = r; // save result to c1
|
||||
c1.g = g;
|
||||
c1.b = b;
|
||||
} else {
|
||||
uint32_t newscale = (255U << 16) / max;
|
||||
c1.r = (r * newscale) >> 16;
|
||||
c1.g = (g * newscale) >> 16;
|
||||
c1.b = (b * newscale) >> 16;
|
||||
}
|
||||
return c1.color32;
|
||||
}
|
||||
uint32_t rb = c2 & MASK_RB; // 0x00RR00BB
|
||||
uint32_t g = c2 & MASK_G; // 0x0000GG00
|
||||
// scale second color
|
||||
rb = ((rb * scale) >> 8) & MASK_RB;
|
||||
g = ((g * scale) >> 8) & MASK_G;
|
||||
// add colors
|
||||
rb = (c1 & MASK_RB) + rb;
|
||||
g = ((c1 & MASK_G) + g);
|
||||
|
||||
// fast CRGBW color scaling ignoring white channel (PS does not handle white)
|
||||
__attribute__((optimize("O2"))) static uint32_t fast_color_scale(CRGBW c, const uint8_t scale) {
|
||||
c.r = ((c.r * scale) >> 8);
|
||||
c.g = ((c.g * scale) >> 8);
|
||||
c.b = ((c.b * scale) >> 8);
|
||||
return c.color32;
|
||||
// check for overflow by looking at the 9th bit of each channel
|
||||
if ((rb | (g >> 8)) & 0x01000100) {
|
||||
// find max among the three 16-bit values
|
||||
g = g >> 8; // shift to get 0x000000GG
|
||||
uint32_t max_val = (rb >> 16); // red
|
||||
max_val = ((rb & 0xFFFF) > max_val) ? rb & 0xFFFF : max_val; // blue
|
||||
max_val = (g > max_val) ? g : max_val; // green
|
||||
// scale down to avoid saturation
|
||||
uint32_t scale_factor = (255 << 8) / max_val;
|
||||
rb = ((rb * scale_factor) >> 8) & MASK_RB;
|
||||
g = (g * scale_factor) & MASK_G;
|
||||
}
|
||||
return rb | g;
|
||||
}
|
||||
|
||||
#endif // !(defined(WLED_DISABLE_PARTICLESYSTEM2D) && defined(WLED_DISABLE_PARTICLESYSTEM1D))
|
||||
|
||||
@@ -233,6 +233,7 @@ void BusDigital::estimateCurrent() {
|
||||
|
||||
void BusDigital::applyBriLimit(uint8_t newBri) {
|
||||
// a newBri of 0 means calculate per-bus brightness limit
|
||||
_NPBbri = 255; // reset, intermediate value is set below, final value is calculated in bus::show()
|
||||
if (newBri == 0) {
|
||||
if (_milliAmpsLimit == 0 || _milliAmpsTotal == 0) return; // ABL not used for this bus
|
||||
newBri = 255;
|
||||
@@ -250,6 +251,7 @@ void BusDigital::applyBriLimit(uint8_t newBri) {
|
||||
}
|
||||
|
||||
if (newBri < 255) {
|
||||
_NPBbri = newBri; // store value so it can be updated in show() (must be updated even if ABL is not used)
|
||||
uint8_t cctWW = 0, cctCW = 0;
|
||||
unsigned hwLen = _len;
|
||||
if (_type == TYPE_WS2812_1CH_X3) hwLen = NUM_ICS_WS2812_1CH_3X(_len); // only needs a third of "RGB" LEDs for NeoPixelBus
|
||||
@@ -267,6 +269,7 @@ void BusDigital::applyBriLimit(uint8_t newBri) {
|
||||
|
||||
void BusDigital::show() {
|
||||
if (!_valid) return;
|
||||
_NPBbri = (_NPBbri * _bri) / 255; // total applied brightness for use in restoreColorLossy (see applyBriLimit())
|
||||
PolyBus::show(_busPtr, _iType, _skip); // faster if buffer consistency is not important (no skipped LEDs)
|
||||
}
|
||||
|
||||
@@ -329,7 +332,7 @@ uint32_t IRAM_ATTR BusDigital::getPixelColor(unsigned pix) const {
|
||||
if (_reversed) pix = _len - pix -1;
|
||||
pix += _skip;
|
||||
const uint8_t co = _colorOrderMap.getPixelColorOrder(pix+_start, _colorOrder);
|
||||
uint32_t c = restoreColorLossy(PolyBus::getPixelColor(_busPtr, _iType, (_type==TYPE_WS2812_1CH_X3) ? IC_INDEX_WS2812_1CH_3X(pix) : pix, co),_bri);
|
||||
uint32_t c = restoreColorLossy(PolyBus::getPixelColor(_busPtr, _iType, (_type==TYPE_WS2812_1CH_X3) ? IC_INDEX_WS2812_1CH_3X(pix) : pix, co),_NPBbri);
|
||||
if (_type == TYPE_WS2812_1CH_X3) { // map to correct IC, each controls 3 LEDs
|
||||
uint8_t r = R(c);
|
||||
uint8_t g = _reversed ? B(c) : G(c); // should G and B be switched if _reversed?
|
||||
|
||||
@@ -112,6 +112,7 @@ class Bus {
|
||||
Bus(uint8_t type, uint16_t start, uint8_t aw, uint16_t len = 1, bool reversed = false, bool refresh = false)
|
||||
: _type(type)
|
||||
, _bri(255)
|
||||
, _NPBbri(255)
|
||||
, _start(start)
|
||||
, _len(std::max(len,(uint16_t)1))
|
||||
, _reversed(reversed)
|
||||
@@ -210,7 +211,9 @@ class Bus {
|
||||
|
||||
protected:
|
||||
uint8_t _type;
|
||||
uint8_t _bri;
|
||||
uint8_t _bri; // bus brightness
|
||||
uint8_t _NPBbri; // total brightness applied to colors in NPB buffer (_bri + ABL)
|
||||
uint8_t _autoWhiteMode; // global Auto White Calculation override
|
||||
uint16_t _start;
|
||||
uint16_t _len;
|
||||
//struct { //using bitfield struct adds abour 250 bytes to binary size
|
||||
@@ -221,8 +224,6 @@ class Bus {
|
||||
bool _hasWhite;// : 1;
|
||||
bool _hasCCT;// : 1;
|
||||
//} __attribute__ ((packed));
|
||||
uint8_t _autoWhiteMode;
|
||||
// global Auto White Calculation override
|
||||
static uint8_t _gAWM;
|
||||
// _cct has the following meanings (see calculateCCT() & BusManager::setSegmentCCT()):
|
||||
// -1 means to extract approximate CCT value in K from RGB (in calcualteCCT())
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
* color blend function, based on FastLED blend function
|
||||
* the calculation for each color is: result = (A*(amountOfA) + A + B*(amountOfB) + B) / 256 with amountOfA = 255 - amountOfB
|
||||
*/
|
||||
uint32_t IRAM_ATTR color_blend(uint32_t color1, uint32_t color2, uint8_t blend) {
|
||||
uint32_t WLED_O2_ATTR IRAM_ATTR color_blend(uint32_t color1, uint32_t color2, uint8_t blend) {
|
||||
// min / max blend checking is omitted: calls with 0 or 255 are rare, checking lowers overall performance
|
||||
const uint32_t TWO_CHANNEL_MASK = 0x00FF00FF; // mask for R and B channels or W and G if negated (poorman's SIMD; https://github.com/wled/WLED/pull/4568#discussion_r1986587221)
|
||||
uint32_t rb1 = color1 & TWO_CHANNEL_MASK; // extract R & B channels from color1
|
||||
@@ -25,39 +25,37 @@ uint32_t IRAM_ATTR color_blend(uint32_t color1, uint32_t color2, uint8_t blend)
|
||||
* original idea: https://github.com/wled-dev/WLED/pull/2465 by https://github.com/Proto-molecule
|
||||
* speed optimisations by @dedehai
|
||||
*/
|
||||
uint32_t color_add(uint32_t c1, uint32_t c2, bool preserveCR)
|
||||
uint32_t WLED_O2_ATTR color_add(uint32_t c1, uint32_t c2, bool preserveCR) //1212558 | 1212598 | 1212576 | 1212530
|
||||
{
|
||||
if (c1 == BLACK) return c2;
|
||||
if (c2 == BLACK) return c1;
|
||||
const uint32_t TWO_CHANNEL_MASK = 0x00FF00FF; // mask for R and B channels or W and G if negated
|
||||
uint32_t rb = ( c1 & TWO_CHANNEL_MASK) + ( c2 & TWO_CHANNEL_MASK); // mask and add two colors at once
|
||||
uint32_t wg = ((c1>>8) & TWO_CHANNEL_MASK) + ((c2>>8) & TWO_CHANNEL_MASK);
|
||||
uint32_t r = rb >> 16; // extract single color values
|
||||
uint32_t b = rb & 0xFFFF;
|
||||
uint32_t w = wg >> 16;
|
||||
uint32_t g = wg & 0xFFFF;
|
||||
|
||||
if (preserveCR) { // preserve color ratios
|
||||
uint32_t max = std::max(r,g); // check for overflow note
|
||||
max = std::max(max,b);
|
||||
max = std::max(max,w);
|
||||
//unsigned max = r; // check for overflow note
|
||||
//max = g > max ? g : max;
|
||||
//max = b > max ? b : max;
|
||||
//max = w > max ? w : max;
|
||||
if (max > 255) {
|
||||
uint32_t overflow = (rb | wg) & 0x01000100; // detect overflow by checking 9th bit
|
||||
if (overflow) {
|
||||
uint32_t r = rb >> 16; // extract single color values
|
||||
uint32_t b = rb & 0xFFFF;
|
||||
uint32_t w = wg >> 16;
|
||||
uint32_t g = wg & 0xFFFF;
|
||||
uint32_t max = std::max(r,g);
|
||||
max = std::max(max,b);
|
||||
max = std::max(max,w);
|
||||
const uint32_t scale = (uint32_t(255)<<8) / max; // division of two 8bit (shifted) values does not work -> use bit shifts and multiplaction instead
|
||||
rb = ((rb * scale) >> 8) & TWO_CHANNEL_MASK;
|
||||
wg = (wg * scale) & ~TWO_CHANNEL_MASK;
|
||||
} else wg <<= 8; //shift white and green back to correct position
|
||||
return rb | wg;
|
||||
} else {
|
||||
r = r > 255 ? 255 : r;
|
||||
g = g > 255 ? 255 : g;
|
||||
b = b > 255 ? 255 : b;
|
||||
w = w > 255 ? 255 : w;
|
||||
return RGBW32(r,g,b,w);
|
||||
// branchless per-channel saturation to 255 (extract 9th bit, subtract 1 if it is set, mask with 0xFF, input is 0xFF+0xFF=0x1EF max)
|
||||
// example with overflow: input: 0x01EF01EF -> (0x0100100 - 0x00010001) = 0x00FF00FF -> input|0x00FF00FF = 0x00FF00FF (saturate)
|
||||
// example without overflow: input: 0x007F007F -> (0x00000000 - 0x00000000) = 0x00000000 -> input|0x00000000 = input (no change)
|
||||
rb |= ((rb & 0x01000100) - ((rb >> 8) & 0x00010001)) & 0x00FF00FF;
|
||||
wg |= ((wg & 0x01000100) - ((wg >> 8) & 0x00010001)) & 0x00FF00FF;
|
||||
wg <<= 8; // restore WG position
|
||||
}
|
||||
return rb | wg;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -92,15 +90,15 @@ uint32_t IRAM_ATTR color_fade(uint32_t c1, uint8_t amount, bool video) {
|
||||
note: inputs are 32bit to speed up the function, useful input value ranges are 0-255
|
||||
*/
|
||||
uint32_t adjust_color(uint32_t rgb, uint32_t hueShift, uint32_t lighten, uint32_t brighten) {
|
||||
if (rgb == 0 | hueShift + lighten + brighten == 0) return rgb; // black or no change
|
||||
CHSV32 hsv;
|
||||
rgb2hsv(rgb, hsv); //convert to HSV
|
||||
hsv.h += (hueShift << 8); // shift hue (hue is 16 bits)
|
||||
hsv.s = max((int32_t)0, (int32_t)hsv.s - (int32_t)lighten); // desaturate
|
||||
hsv.v = min((uint32_t)255, (uint32_t)hsv.v + brighten); // increase brightness
|
||||
uint32_t rgb_adjusted;
|
||||
hsv2rgb(hsv, rgb_adjusted); // convert back to RGB TODO: make this into 16 bit conversion
|
||||
return rgb_adjusted;
|
||||
if (rgb == 0 || hueShift + lighten + brighten == 0) return rgb; // black or no change
|
||||
CHSV32 hsv;
|
||||
rgb2hsv(rgb, hsv); //convert to HSV
|
||||
hsv.h += (hueShift << 8); // shift hue (hue is 16 bits)
|
||||
hsv.s = max((int32_t)0, (int32_t)hsv.s - (int32_t)lighten); // desaturate
|
||||
hsv.v = min((uint32_t)255, (uint32_t)hsv.v + brighten); // increase brightness
|
||||
uint32_t rgb_adjusted;
|
||||
hsv2rgb(hsv, rgb_adjusted); // convert back to RGB TODO: make this into 16 bit conversion
|
||||
return rgb_adjusted;
|
||||
}
|
||||
|
||||
// 1:1 replacement of fastled function optimized for ESP, slightly faster, more accurate and uses less flash (~ -200bytes)
|
||||
@@ -597,13 +595,13 @@ void NeoGammaWLEDMethod::calcGammaTable(float gamma)
|
||||
gammaT_inv[0] = 0;
|
||||
}
|
||||
|
||||
uint8_t IRAM_ATTR_YN NeoGammaWLEDMethod::Correct(uint8_t value)
|
||||
uint8_t NeoGammaWLEDMethod::Correct(uint8_t value)
|
||||
{
|
||||
if (!gammaCorrectCol) return value;
|
||||
return gammaT[value];
|
||||
}
|
||||
|
||||
uint32_t IRAM_ATTR_YN NeoGammaWLEDMethod::inverseGamma32(uint32_t color)
|
||||
uint32_t NeoGammaWLEDMethod::inverseGamma32(uint32_t color)
|
||||
{
|
||||
if (!gammaCorrectCol) return color;
|
||||
uint8_t w = W(color);
|
||||
|
||||
@@ -117,6 +117,7 @@ class NeoGammaWLEDMethod {
|
||||
[[gnu::hot, gnu::pure]] uint32_t color_blend(uint32_t c1, uint32_t c2 , uint8_t blend);
|
||||
inline uint32_t color_blend16(uint32_t c1, uint32_t c2, uint16_t b) { return color_blend(c1, c2, b >> 8); };
|
||||
[[gnu::hot, gnu::pure]] uint32_t color_add(uint32_t, uint32_t, bool preserveCR = false);
|
||||
[[gnu::hot, gnu::pure]] uint32_t color_fade(uint32_t c1, uint8_t amount, bool video = false);
|
||||
[[gnu::hot, gnu::pure]] uint32_t adjust_color(uint32_t rgb, uint32_t hueShift, uint32_t lighten, uint32_t brighten);
|
||||
[[gnu::hot, gnu::pure]] uint32_t ColorFromPaletteWLED(const CRGBPalette16 &pal, unsigned index, uint8_t brightness = (uint8_t)255U, TBlendType blendType = LINEARBLEND);
|
||||
CRGBPalette16 generateHarmonicRandomPalette(const CRGBPalette16 &basepalette);
|
||||
@@ -139,7 +140,13 @@ uint32_t colorBalanceFromKelvin(uint16_t kelvin, uint32_t rgb);
|
||||
uint16_t approximateKelvinFromRGB(uint32_t rgb);
|
||||
void setRandomColor(byte* rgb);
|
||||
|
||||
[[gnu::hot, gnu::pure]] uint32_t color_fade(uint32_t c1, uint8_t amount, bool video = false);
|
||||
// fast scaling function for colors, performs color*scale/256 for all four channels, speed over accuracy
|
||||
// note: inlining uses less code than actual function calls
|
||||
static inline uint32_t fast_color_scale(const uint32_t c, const uint8_t scale) {
|
||||
uint32_t rb = (((c & 0x00FF00FF) * scale) >> 8) & 0x00FF00FF;
|
||||
uint32_t wg = (((c>>8) & 0x00FF00FF) * scale) & ~0x00FF00FF;
|
||||
return rb | wg;
|
||||
}
|
||||
|
||||
// palettes
|
||||
extern const TProgmemRGBPalette16* const fastledPalettes[];
|
||||
|
||||
@@ -682,4 +682,6 @@ static_assert(WLED_MAX_BUSSES <= 32, "WLED_MAX_BUSSES exceeds hard limit");
|
||||
#define IRAM_ATTR_YN IRAM_ATTR
|
||||
#endif
|
||||
|
||||
#define WLED_O2_ATTR __attribute__((optimize("O2")))
|
||||
|
||||
#endif
|
||||
|
||||
@@ -45,13 +45,13 @@
|
||||
}
|
||||
function bLimits(b,v,p,m,l,o=5,d=2,a=6) {
|
||||
maxB = b; // maxB - max physical (analog + digital) buses: 32 - ESP32, 14 - S3/S2, 6 - C3, 4 - 8266
|
||||
maxD = d; // maxD - max digital channels (can be changed if using ESP32 parallel I2S): 16 - ESP32, 12 - S3/S2, 2 - C3, 3 - 8266
|
||||
maxA = a; // maxA - max analog channels: 16 - ESP32, 8 - S3/S2, 6 - C3, 5 - 8266
|
||||
maxV = v; // maxV - min virtual buses: 6 - ESP32/S3, 4 - S2/C3, 3 - ESP8266 (only used to distinguish S2/S3)
|
||||
maxPB = p; // maxPB - max LEDs per bus
|
||||
maxM = m; // maxM - max LED memory
|
||||
maxL = l; // maxL - max LEDs (will serve to determine ESP >1664 == ESP32)
|
||||
maxCO = o; // maxCO - max Color Order mappings
|
||||
maxD = d; // maxD - max digital channels (can be changed if using ESP32 parallel I2S): 16 - ESP32, 12 - S3/S2, 2 - C3, 3 - 8266
|
||||
maxA = a; // maxA - max analog channels: 16 - ESP32, 8 - S3/S2, 6 - C3, 5 - 8266
|
||||
}
|
||||
function is8266() { return maxA == 5 && maxD == 3; } // NOTE: see const.h
|
||||
function is32() { return maxA == 16 && maxD == 16; } // NOTE: see const.h
|
||||
|
||||
Reference in New Issue
Block a user