speed optimisations, fix for restoreColorLossy, code cleanup (#4895)

- speed optimization in color_add, PS fast_color_add and blur functions - applying more bit and shift manipulation tricks to squeeze out just a bit more speed on color manipulation functions. - Optimization on blur is based on work by @blazoncek - Renamed PS fast_color_add() to fast_color_scaleAdd()
2025-09-23 20:15:42 +02:00
parent c6b4c77387
commit 8b3975752c
9 changed files with 138 additions and 149 deletions
--- a/wled00/FX_2Dfcn.cpp
+++ b/wled00/FX_2Dfcn.cpp
@@ -145,7 +145,7 @@ void WS2812FX::setUpMatrix() {
 #ifndef WLED_DISABLE_2D
 // pixel is clipped if it falls outside clipping range
 // if clipping start > stop the clipping range is inverted
-bool IRAM_ATTR_YN Segment::isPixelXYClipped(int x, int y) const {
+bool Segment::isPixelXYClipped(int x, int y) const {
  if (blendingStyle != BLEND_STYLE_FADE && isInTransition() && _clipStart != _clipStop) {
    const bool invertX = _clipStart  > _clipStop;
    const bool invertY = _clipStartY > _clipStopY;
@@ -185,7 +185,7 @@ bool IRAM_ATTR_YN Segment::isPixelXYClipped(int x, int y) const {
 void IRAM_ATTR_YN Segment::setPixelColorXY(int x, int y, uint32_t col) const
 {
  if (!isActive()) return; // not active
-  if (x >= (int)vWidth() || y >= (int)vHeight() || x < 0 || y < 0) return;  // if pixel would fall out of virtual segment just exit
+  if ((unsigned)x >= vWidth() || (unsigned)y >= vHeight()) return;  // if pixel would fall out of virtual segment just exit
  setPixelColorXYRaw(x, y, col);
 }

@@ -235,7 +235,7 @@ void Segment::setPixelColorXY(float x, float y, uint32_t col, bool aa) const
 // returns RGBW values of pixel
 uint32_t IRAM_ATTR_YN Segment::getPixelColorXY(int x, int y) const {
  if (!isActive()) return 0; // not active
-  if (x >= (int)vWidth() || y >= (int)vHeight() || x<0 || y<0) return 0;  // if pixel would fall out of virtual segment just exit
+  if ((unsigned)x >= vWidth() || (unsigned)y >= vHeight()) return 0;  // if pixel would fall out of virtual segment just exit
  return getPixelColorXYRaw(x,y);
 }

@@ -245,52 +245,42 @@ void Segment::blur2D(uint8_t blur_x, uint8_t blur_y, bool smear) const {
  const unsigned cols = vWidth();
  const unsigned rows = vHeight();
  const auto XY = [&](unsigned x, unsigned y){ return x + y*cols; };
-  uint32_t lastnew; // not necessary to initialize lastnew and last, as both will be initialized by the first loop iteration
-  uint32_t last;
  if (blur_x) {
    const uint8_t keepx = smear ? 255 : 255 - blur_x;
    const uint8_t seepx = blur_x >> 1;
    for (unsigned row = 0; row < rows; row++) { // blur rows (x direction)
-      uint32_t carryover = BLACK;
-      uint32_t curnew = BLACK;
-      for (unsigned x = 0; x < cols; x++) {
-        uint32_t cur = getPixelColorRaw(XY(x, row));
-        uint32_t part = color_fade(cur, seepx);
-        curnew = color_fade(cur, keepx);
-        if (x > 0) {
-          if (carryover) curnew = color_add(curnew, carryover);
-          uint32_t prev = color_add(lastnew, part);
-          // optimization: only set pixel if color has changed
-          if (last != prev) setPixelColorRaw(XY(x - 1, row), prev);
-        } else setPixelColorRaw(XY(x, row), curnew); // first pixel
-        lastnew = curnew;
-        last = cur; // save original value for comparison on next iteration
+      // handle first pixel in row to avoid conditional in loop (faster)
+      uint32_t cur = getPixelColorRaw(XY(0, row));
+      uint32_t carryover = fast_color_scale(cur, seepx);
+      setPixelColorRaw(XY(0, row), fast_color_scale(cur, keepx));
+      for (unsigned x = 1; x < cols; x++) {
+         cur = getPixelColorRaw(XY(x, row));
+        uint32_t part = fast_color_scale(cur, seepx);
+        cur = fast_color_scale(cur, keepx);
+        cur = color_add(cur, carryover);
+        setPixelColorRaw(XY(x - 1, row), color_add(getPixelColorRaw(XY(x-1, row)), part)); // previous pixel
+        setPixelColorRaw(XY(x, row), cur); // current pixel
        carryover = part;
      }
-      setPixelColorRaw(XY(cols-1, row), curnew); // set last pixel
    }
  }
  if (blur_y) {
    const uint8_t keepy = smear ? 255 : 255 - blur_y;
    const uint8_t seepy = blur_y >> 1;
    for (unsigned col = 0; col < cols; col++) {
-      uint32_t carryover = BLACK;
-      uint32_t curnew = BLACK;
-      for (unsigned y = 0; y < rows; y++) {
-        uint32_t cur = getPixelColorRaw(XY(col, y));
-        uint32_t part = color_fade(cur, seepy);
-        curnew = color_fade(cur, keepy);
-        if (y > 0) {
-          if (carryover) curnew = color_add(curnew, carryover);
-          uint32_t prev = color_add(lastnew, part);
-          // optimization: only set pixel if color has changed
-          if (last != prev) setPixelColorRaw(XY(col, y - 1), prev);
-        } else setPixelColorRaw(XY(col, y), curnew); // first pixel
-        lastnew = curnew;
-        last = cur; //save original value for comparison on next iteration
+      // handle first pixel in column
+      uint32_t cur = getPixelColorRaw(XY(col, 0));
+      uint32_t carryover = fast_color_scale(cur, seepy);
+      setPixelColorRaw(XY(col, 0), fast_color_scale(cur, keepy));
+      for (unsigned y = 1; y < rows; y++) {
+        cur = getPixelColorRaw(XY(col, y));
+        uint32_t part = fast_color_scale(cur, seepy);
+        cur = fast_color_scale(cur, keepy);
+        cur = color_add(cur, carryover);
+        setPixelColorRaw(XY(col, y - 1), color_add(getPixelColorRaw(XY(col, y-1)), part)); // previous pixel
+        setPixelColorRaw(XY(col, y), cur); // current pixel
        carryover = part;
      }
-      setPixelColorRaw(XY(col, rows - 1), curnew);
    }
  }
 }
--- a/wled00/FX_fcn.cpp
+++ b/wled00/FX_fcn.cpp
@@ -682,7 +682,7 @@ uint16_t Segment::maxMappingLength() const {
 #endif
 // pixel is clipped if it falls outside clipping range
 // if clipping start > stop the clipping range is inverted
-bool IRAM_ATTR_YN Segment::isPixelClipped(int i) const {
+bool Segment::isPixelClipped(int i) const {
  if (blendingStyle != BLEND_STYLE_FADE && isInTransition() && _clipStart != _clipStop) {
    bool invert = _clipStart > _clipStop;  // ineverted start & stop
    int start = invert ? _clipStop : _clipStart;
@@ -700,7 +700,7 @@ bool IRAM_ATTR_YN Segment::isPixelClipped(int i) const {
  return false;
 }

-void IRAM_ATTR_YN Segment::setPixelColor(int i, uint32_t col) const
+void WLED_O2_ATTR Segment::setPixelColor(int i, uint32_t col) const
 {
  if (!isActive() || i < 0) return; // not active or invalid index
 #ifndef WLED_DISABLE_2D
@@ -913,7 +913,7 @@ void Segment::setPixelColor(float i, uint32_t col, bool aa) const
 }
 #endif

-uint32_t IRAM_ATTR_YN Segment::getPixelColor(int i) const
+uint32_t WLED_O2_ATTR Segment::getPixelColor(int i) const
 {
  if (!isActive() || i < 0) return 0; // not active or invalid index

@@ -1052,7 +1052,7 @@ void Segment::fadeToSecondaryBy(uint8_t fadeBy) const {
 void Segment::fadeToBlackBy(uint8_t fadeBy) const {
  if (!isActive() || fadeBy == 0) return;   // optimization - no scaling to apply
  const size_t rlength = rawLength();  // calculate only once
-  for (unsigned i = 0; i < rlength; i++) setPixelColorRaw(i, color_fade(getPixelColorRaw(i), 255-fadeBy));
+  for (unsigned i = 0; i < rlength; i++) setPixelColorRaw(i, fast_color_scale(getPixelColorRaw(i), 255-fadeBy));
 }

 /*
@@ -1072,25 +1072,19 @@ void Segment::blur(uint8_t blur_amount, bool smear) const {
  uint8_t keep = smear ? 255 : 255 - blur_amount;
  uint8_t seep = blur_amount >> 1;
  unsigned vlength = vLength();
-  uint32_t carryover = BLACK;
-  uint32_t lastnew; // not necessary to initialize lastnew and last, as both will be initialized by the first loop iteration
-  uint32_t last;
-  uint32_t curnew = BLACK;
-  for (unsigned i = 0; i < vlength; i++) {
-    uint32_t cur = getPixelColorRaw(i);
-    uint32_t part = color_fade(cur, seep);
-    curnew = color_fade(cur, keep);
-    if (i > 0) {
-      if (carryover) curnew = color_add(curnew, carryover);
-      uint32_t prev = color_add(lastnew, part);
-      // optimization: only set pixel if color has changed
-      if (last != prev) setPixelColorRaw(i - 1, prev);
-    } else setPixelColorRaw(i, curnew); // first pixel
-    lastnew = curnew;
-    last = cur; // save original value for comparison on next iteration
+  // handle first pixel to avoid conditional in loop (faster)
+  uint32_t cur = getPixelColorRaw(0);
+  uint32_t carryover = fast_color_scale(cur, seep);
+  setPixelColorRaw(0, fast_color_scale(cur, keep));
+  for (unsigned i = 1; i < vlength; i++) {
+    cur = getPixelColorRaw(i);
+    uint32_t part = fast_color_scale(cur, seep);
+    cur = fast_color_scale(cur, keep);
+    cur = color_add(cur, carryover);
+    setPixelColorRaw(i - 1, color_add(getPixelColorRaw(i - 1), part)); // previous pixel
+    setPixelColorRaw(i, cur); // current pixel
    carryover = part;
  }
-  setPixelColorRaw(vlength - 1, curnew);
 }

 /*
--- a/wled00/FXparticleSystem.cpp
+++ b/wled00/FXparticleSystem.cpp
@@ -17,8 +17,7 @@
 // local shared functions (used both in 1D and 2D system)
 static int32_t calcForce_dv(const int8_t force, uint8_t &counter);
 static bool checkBoundsAndWrap(int32_t &position, const int32_t max, const int32_t particleradius, const bool wrap); // returns false if out of bounds by more than particleradius
-static uint32_t fast_color_add(CRGBW c1, const CRGBW c2, uint8_t scale = 255); // fast and accurate color adding with scaling (scales c2 before adding)
-static uint32_t fast_color_scale(CRGBW c, const uint8_t scale); // fast scaling function using 32bit variable and pointer. note: keep 'scale' within 0-255
+static uint32_t fast_color_scaleAdd(const uint32_t c1, const uint32_t c2, uint8_t scale = 255); // fast and accurate color adding with scaling (scales c2 before adding)
 #endif

 #ifndef WLED_DISABLE_PARTICLESYSTEM2D
@@ -625,7 +624,7 @@ void ParticleSystem2D::render() {
 }

 // calculate pixel positions and brightness distribution and render the particle to local buffer or global buffer
-__attribute__((optimize("O2"))) void ParticleSystem2D::renderParticle(const uint32_t particleindex, const uint8_t brightness, const CRGBW& color, const bool wrapX, const bool wrapY) {
+void WLED_O2_ATTR ParticleSystem2D::renderParticle(const uint32_t particleindex, const uint8_t brightness, const CRGBW& color, const bool wrapX, const bool wrapY) {
  uint32_t size = particlesize;
  if (advPartProps && advPartProps[particleindex].size > 0) // use advanced size properties (0 means use global size including single pixel rendering)
    size = advPartProps[particleindex].size;
@@ -635,7 +634,7 @@ __attribute__((optimize("O2"))) void ParticleSystem2D::renderParticle(const uint
    uint32_t y = particles[particleindex].y >> PS_P_RADIUS_SHIFT;
    if (x <= (uint32_t)maxXpixel && y <= (uint32_t)maxYpixel) {
      uint32_t index = x + (maxYpixel - y) * (maxXpixel + 1); // flip y coordinate (0,0 is bottom left in PS but top left in framebuffer)
-      framebuffer[index] = fast_color_add(framebuffer[index], color, brightness);
+      framebuffer[index] = fast_color_scaleAdd(framebuffer[index], color, brightness);
    }
    return;
  }
@@ -687,10 +686,10 @@ __attribute__((optimize("O2"))) void ParticleSystem2D::renderParticle(const uint
    memset(renderbuffer, 0, sizeof(renderbuffer)); // clear buffer
    //particle size to pixels: < 64 is 4x4, < 128 is 6x6, < 192 is 8x8, bigger is 10x10
    //first, render the pixel to the center of the renderbuffer, then apply 2D blurring
-    renderbuffer[4 + (4 * 10)] = fast_color_add(renderbuffer[4 + (4 * 10)], color, pxlbrightness[0]); // order is: bottom left, bottom right, top right, top left
-    renderbuffer[5 + (4 * 10)] = fast_color_add(renderbuffer[5 + (4 * 10)], color, pxlbrightness[1]);
-    renderbuffer[5 + (5 * 10)] = fast_color_add(renderbuffer[5 + (5 * 10)], color, pxlbrightness[2]);
-    renderbuffer[4 + (5 * 10)] = fast_color_add(renderbuffer[4 + (5 * 10)], color, pxlbrightness[3]);
+    renderbuffer[4 + (4 * 10)] = fast_color_scaleAdd(renderbuffer[4 + (4 * 10)], color, pxlbrightness[0]); // order is: bottom left, bottom right, top right, top left
+    renderbuffer[5 + (4 * 10)] = fast_color_scaleAdd(renderbuffer[5 + (4 * 10)], color, pxlbrightness[1]);
+    renderbuffer[5 + (5 * 10)] = fast_color_scaleAdd(renderbuffer[5 + (5 * 10)], color, pxlbrightness[2]);
+    renderbuffer[4 + (5 * 10)] = fast_color_scaleAdd(renderbuffer[4 + (5 * 10)], color, pxlbrightness[3]);
    uint32_t rendersize = 2; // initialize render size, minimum is 4x4 pixels, it is incremented int he loop below to start with 4
    uint32_t offset = 4; // offset to zero coordinate to write/read data in renderbuffer (actually needs to be 3, is decremented in the loop below)
    uint32_t maxsize = advPartProps[particleindex].size;
@@ -748,7 +747,7 @@ __attribute__((optimize("O2"))) void ParticleSystem2D::renderParticle(const uint
          continue;
        }
        uint32_t idx = xfb + (maxYpixel - yfb) * (maxXpixel + 1); // flip y coordinate (0,0 is bottom left in PS but top left in framebuffer)
-        framebuffer[idx] = fast_color_add(framebuffer[idx], renderbuffer[xrb + yrb * 10]);
+        framebuffer[idx] = fast_color_scaleAdd(framebuffer[idx], renderbuffer[xrb + yrb * 10]);
      }
    }
    } else { // standard rendering (2x2 pixels)
@@ -785,7 +784,7 @@ __attribute__((optimize("O2"))) void ParticleSystem2D::renderParticle(const uint
    for (uint32_t i = 0; i < 4; i++) {
      if (pixelvalid[i]) {
        uint32_t idx = pixco[i].x + (maxYpixel - pixco[i].y) * (maxXpixel + 1); // flip y coordinate (0,0 is bottom left in PS but top left in framebuffer)
-        framebuffer[idx] = fast_color_add(framebuffer[idx], color, pxlbrightness[i]); // order is: bottom left, bottom right, top right, top left
+        framebuffer[idx] = fast_color_scaleAdd(framebuffer[idx], color, pxlbrightness[i]); // order is: bottom left, bottom right, top right, top left
      }
    }
  }
@@ -857,7 +856,7 @@ void ParticleSystem2D::handleCollisions() {

 // handle a collision if close proximity is detected, i.e. dx and/or dy smaller than 2*PS_P_RADIUS
 // takes two pointers to the particles to collide and the particle hardness (softer means more energy lost in collision, 255 means full hard)
-__attribute__((optimize("O2"))) void ParticleSystem2D::collideParticles(PSparticle &particle1, PSparticle &particle2, int32_t dx, int32_t dy, const uint32_t collDistSq) {
+void WLED_O2_ATTR ParticleSystem2D::collideParticles(PSparticle &particle1, PSparticle &particle2, int32_t dx, int32_t dy, const uint32_t collDistSq) {
  int32_t distanceSquared = dx * dx + dy * dy;
  // Calculate relative velocity note: could zero check but that does not improve overall speed but deminish it as that is rarely the case and pushing is still required
  int32_t relativeVx = (int32_t)particle2.vx - (int32_t)particle1.vx;
@@ -1028,9 +1027,8 @@ void blur2D(uint32_t *colorbuffer, uint32_t xsize, uint32_t ysize, uint32_t xblu
    for (uint32_t x = xstart; x < xstart + xsize; x++) {
      seeppart = fast_color_scale(colorbuffer[indexXY], seep); // scale it and seep to neighbours
      if (x > 0) {
-        colorbuffer[indexXY - 1] = fast_color_add(colorbuffer[indexXY - 1], seeppart);
-        if (carryover.color32) // note: check adds overhead but is faster on average
-          colorbuffer[indexXY] = fast_color_add(colorbuffer[indexXY], carryover);
+        colorbuffer[indexXY - 1] = fast_color_scaleAdd(colorbuffer[indexXY - 1], seeppart);
+        colorbuffer[indexXY]     = fast_color_scaleAdd(colorbuffer[indexXY], carryover);
      }
      carryover = seeppart;
      indexXY++; // next pixel in x direction
@@ -1049,9 +1047,8 @@ void blur2D(uint32_t *colorbuffer, uint32_t xsize, uint32_t ysize, uint32_t xblu
    for (uint32_t y = ystart; y < ystart + ysize; y++) {
      seeppart = fast_color_scale(colorbuffer[indexXY], seep); // scale it and seep to neighbours
      if (y > 0) {
-        colorbuffer[indexXY - width] = fast_color_add(colorbuffer[indexXY - width], seeppart);
-        if (carryover.color32) // note: check adds overhead but is faster on average
-          colorbuffer[indexXY] = fast_color_add(colorbuffer[indexXY], carryover);
+        colorbuffer[indexXY - width] = fast_color_scaleAdd(colorbuffer[indexXY - width], seeppart);
+        colorbuffer[indexXY]         = fast_color_scaleAdd(colorbuffer[indexXY], carryover);
      }
      carryover = seeppart;
      indexXY += width; // next pixel in y direction
@@ -1470,7 +1467,7 @@ void ParticleSystem1D::render() {
  CRGBW bg_color = SEGCOLOR(1);
  if (bg_color > 0) { //if not black
    for (int32_t i = 0; i <= maxXpixel; i++) {
-      framebuffer[i] = fast_color_add(framebuffer[i], bg_color);
+      framebuffer[i] = fast_color_scaleAdd(framebuffer[i], bg_color);
    }
  }
 #ifndef WLED_DISABLE_2D
@@ -1485,7 +1482,7 @@ void ParticleSystem1D::render() {
 }

 // calculate pixel positions and brightness distribution and render the particle to local buffer or global buffer
-__attribute__((optimize("O2"))) void ParticleSystem1D::renderParticle(const uint32_t particleindex, const uint8_t brightness, const CRGBW &color, const bool wrap) {
+void WLED_O2_ATTR ParticleSystem1D::renderParticle(const uint32_t particleindex, const uint8_t brightness, const CRGBW &color, const bool wrap) {
  uint32_t size = particlesize;
  if (advPartProps) // use advanced size properties (1D system has no large size global rendering TODO: add large global rendering?)
    size = advPartProps[particleindex].size;
@@ -1493,7 +1490,7 @@ __attribute__((optimize("O2"))) void ParticleSystem1D::renderParticle(const uint
  if (size == 0) { //single pixel particle, can be out of bounds as oob checking is made for 2-pixel particles (and updating it uses more code)
    uint32_t x =  particles[particleindex].x >> PS_P_RADIUS_SHIFT_1D;
    if (x <= (uint32_t)maxXpixel) { //by making x unsigned there is no need to check < 0 as it will overflow
-      framebuffer[x] = fast_color_add(framebuffer[x], color, brightness);
+      framebuffer[x] = fast_color_scaleAdd(framebuffer[x], color, brightness);
    }
    return;
  }
@@ -1530,8 +1527,8 @@ __attribute__((optimize("O2"))) void ParticleSystem1D::renderParticle(const uint
    //render particle to a bigger size
    //particle size to pixels: 2 - 63 is 4 pixels, < 128 is 6pixels, < 192 is 8 pixels, bigger is 10 pixels
    //first, render the pixel to the center of the renderbuffer, then apply 1D blurring
-    renderbuffer[4] = fast_color_add(renderbuffer[4], color, pxlbrightness[0]);
-    renderbuffer[5] = fast_color_add(renderbuffer[5], color, pxlbrightness[1]);
+    renderbuffer[4] = fast_color_scaleAdd(renderbuffer[4], color, pxlbrightness[0]);
+    renderbuffer[5] = fast_color_scaleAdd(renderbuffer[5], color, pxlbrightness[1]);
    uint32_t rendersize = 2; // initialize render size, minimum is 4 pixels, it is incremented int he loop below to start with 4
    uint32_t offset = 4; // offset to zero coordinate to write/read data in renderbuffer (actually needs to be 3, is decremented in the loop below)
    uint32_t blurpasses = size/64 + 1; // number of blur passes depends on size, four passes max
@@ -1565,7 +1562,7 @@ __attribute__((optimize("O2"))) void ParticleSystem1D::renderParticle(const uint
      #ifdef ESP8266 // no local buffer on ESP8266
      SEGMENT.addPixelColor(xfb, renderbuffer[xrb], true);
      #else
-      framebuffer[xfb] = fast_color_add(framebuffer[xfb], renderbuffer[xrb]);
+      framebuffer[xfb] = fast_color_scaleAdd(framebuffer[xfb], renderbuffer[xrb]);
      #endif
    }
  }
@@ -1585,7 +1582,7 @@ __attribute__((optimize("O2"))) void ParticleSystem1D::renderParticle(const uint
    }
    for (uint32_t i = 0; i < 2; i++) {
      if (pxlisinframe[i]) {
-        framebuffer[pixco[i]] = fast_color_add(framebuffer[pixco[i]], color, pxlbrightness[i]);
+        framebuffer[pixco[i]] = fast_color_scaleAdd(framebuffer[pixco[i]], color, pxlbrightness[i]);
      }
    }
  }
@@ -1648,7 +1645,7 @@ void ParticleSystem1D::handleCollisions() {
 }
 // handle a collision if close proximity is detected, i.e. dx and/or dy smaller than 2*PS_P_RADIUS
 // takes two pointers to the particles to collide and the particle hardness (softer means more energy lost in collision, 255 means full hard)
-__attribute__((optimize("O2"))) void ParticleSystem1D::collideParticles(PSparticle1D &particle1, const PSparticleFlags1D &particle1flags, PSparticle1D &particle2, const PSparticleFlags1D &particle2flags, const int32_t dx, const uint32_t dx_abs, const uint32_t collisiondistance) {
+void WLED_O2_ATTR ParticleSystem1D::collideParticles(PSparticle1D &particle1, const PSparticleFlags1D &particle1flags, PSparticle1D &particle2, const PSparticleFlags1D &particle2flags, const int32_t dx, const uint32_t dx_abs, const uint32_t collisiondistance) {
  int32_t dv = particle2.vx - particle1.vx;
  int32_t dotProduct = (dx * dv); // is always negative if moving towards each other

@@ -1837,9 +1834,8 @@ void blur1D(uint32_t *colorbuffer, uint32_t size, uint32_t blur, uint32_t start)
  for (uint32_t x = start; x < start + size; x++) {
    seeppart = fast_color_scale(colorbuffer[x], seep); // scale it and seep to neighbours
    if (x > 0) {
-      colorbuffer[x-1] = fast_color_add(colorbuffer[x-1], seeppart);
-      if (carryover.color32) // note: check adds overhead but is faster on average
-        colorbuffer[x] = fast_color_add(colorbuffer[x], carryover); // is black on first pass
+      colorbuffer[x-1] = fast_color_scaleAdd(colorbuffer[x-1], seeppart);
+      colorbuffer[x]   = fast_color_scaleAdd(colorbuffer[x], carryover); // is black on first pass
    }
    carryover = seeppart;
  }
@@ -1888,36 +1884,34 @@ static bool checkBoundsAndWrap(int32_t &position, const int32_t max, const int32
  return true; // particle is in bounds
 }

-// this is a fast version for CRGBW color adding ignoring white channel (PS does not handle white) including scaling of second color
+// this is a fast version for RGB color adding ignoring white channel (PS does not handle white) including scaling of second color
 // note: function is mainly used to add scaled colors, so checking if one color is black is slower
-// note2: returning CRGBW value is slightly slower as the return value gets written to uint32_t framebuffer
- __attribute__((optimize("O2"))) static uint32_t fast_color_add(CRGBW c1, const CRGBW c2, const uint8_t scale) {
-  uint32_t r, g, b;
-  r = c1.r + ((c2.r * scale) >> 8);
-  g = c1.g + ((c2.g * scale) >> 8);
-  b = c1.b + ((c2.b * scale) >> 8);
+static uint32_t fast_color_scaleAdd(const uint32_t c1, const uint32_t c2, const uint8_t scale) {
+    constexpr uint32_t MASK_RB = 0x00FF00FF;  // red and blue mask
+    constexpr uint32_t MASK_G  = 0x0000FF00;  // green mask

-  // note: this chained comparison is the fastest method for max of 3 values (faster than std:max() or using xor)
-  uint32_t max = (r > g) ? ((r > b) ? r : b) : ((g > b) ? g : b);
-  if (max <= 255) {
-    c1.r = r; // save result to c1
-    c1.g = g;
-    c1.b = b;
-  } else {
-    uint32_t newscale = (255U << 16) / max;
-    c1.r = (r * newscale) >> 16;
-    c1.g = (g * newscale) >> 16;
-    c1.b = (b * newscale) >> 16;
-  }
-  return c1.color32;
-}
+    uint32_t rb = c2 & MASK_RB; // 0x00RR00BB
+    uint32_t g  = c2 & MASK_G;  // 0x0000GG00
+    // scale second color
+    rb = ((rb * scale) >> 8) & MASK_RB;
+    g  = ((g  * scale) >> 8) & MASK_G;
+    // add colors
+    rb = (c1 & MASK_RB) + rb;
+    g = ((c1 & MASK_G)  + g);

-// fast CRGBW color scaling ignoring white channel (PS does not handle white)
- __attribute__((optimize("O2"))) static uint32_t fast_color_scale(CRGBW c, const uint8_t scale) {
-  c.r = ((c.r * scale) >> 8);
-  c.g = ((c.g * scale) >> 8);
-  c.b = ((c.b * scale) >> 8);
-  return c.color32;
+    // check for overflow by looking at the 9th bit of each channel
+    if ((rb | (g >> 8)) & 0x01000100) {
+        // find max among the three 16-bit values
+        g = g >> 8; // shift to get 0x000000GG
+        uint32_t max_val = (rb >> 16); // red
+        max_val = ((rb & 0xFFFF) > max_val) ? rb & 0xFFFF : max_val;  // blue
+        max_val = (g > max_val) ? g : max_val; // green
+        // scale down to avoid saturation
+        uint32_t scale_factor = (255 << 8) / max_val;
+        rb = ((rb * scale_factor) >> 8) & MASK_RB;
+        g = (g * scale_factor) & MASK_G;
+    }
+    return rb | g;
 }

 #endif  // !(defined(WLED_DISABLE_PARTICLESYSTEM2D) && defined(WLED_DISABLE_PARTICLESYSTEM1D))
--- a/wled00/bus_manager.cpp
+++ b/wled00/bus_manager.cpp
@@ -233,6 +233,7 @@ void BusDigital::estimateCurrent() {

 void BusDigital::applyBriLimit(uint8_t newBri) {
  // a newBri of 0 means calculate per-bus brightness limit
+  _NPBbri = 255; // reset, intermediate value is set below, final value is calculated in bus::show()
  if (newBri == 0) {
    if (_milliAmpsLimit == 0 || _milliAmpsTotal == 0) return; // ABL not used for this bus
    newBri = 255;
@@ -250,6 +251,7 @@ void BusDigital::applyBriLimit(uint8_t newBri) {
  }

  if (newBri < 255) {
+    _NPBbri = newBri; // store value so it can be updated in show() (must be updated even if ABL is not used)
    uint8_t cctWW = 0, cctCW = 0;
    unsigned hwLen = _len;
    if (_type == TYPE_WS2812_1CH_X3) hwLen = NUM_ICS_WS2812_1CH_3X(_len); // only needs a third of "RGB" LEDs for NeoPixelBus
@@ -267,6 +269,7 @@ void BusDigital::applyBriLimit(uint8_t newBri) {

 void BusDigital::show() {
  if (!_valid) return;
+  _NPBbri = (_NPBbri * _bri) / 255;      // total applied brightness for use in restoreColorLossy (see applyBriLimit())
  PolyBus::show(_busPtr, _iType, _skip); // faster if buffer consistency is not important (no skipped LEDs)
 }

@@ -329,7 +332,7 @@ uint32_t IRAM_ATTR BusDigital::getPixelColor(unsigned pix) const {
  if (_reversed) pix = _len - pix -1;
  pix += _skip;
  const uint8_t co = _colorOrderMap.getPixelColorOrder(pix+_start, _colorOrder);
-  uint32_t c = restoreColorLossy(PolyBus::getPixelColor(_busPtr, _iType, (_type==TYPE_WS2812_1CH_X3) ? IC_INDEX_WS2812_1CH_3X(pix) : pix, co),_bri);
+  uint32_t c = restoreColorLossy(PolyBus::getPixelColor(_busPtr, _iType, (_type==TYPE_WS2812_1CH_X3) ? IC_INDEX_WS2812_1CH_3X(pix) : pix, co),_NPBbri);
  if (_type == TYPE_WS2812_1CH_X3) { // map to correct IC, each controls 3 LEDs
    uint8_t r = R(c);
    uint8_t g = _reversed ? B(c) : G(c); // should G and B be switched if _reversed?
--- a/wled00/bus_manager.h
+++ b/wled00/bus_manager.h
@@ -112,6 +112,7 @@ class Bus {
    Bus(uint8_t type, uint16_t start, uint8_t aw, uint16_t len = 1, bool reversed = false, bool refresh = false)
    : _type(type)
    , _bri(255)
+    , _NPBbri(255)
    , _start(start)
    , _len(std::max(len,(uint16_t)1))
    , _reversed(reversed)
@@ -210,7 +211,9 @@ class Bus {

  protected:
    uint8_t  _type;
-    uint8_t  _bri;
+    uint8_t  _bri;    // bus brightness
+    uint8_t  _NPBbri; // total brightness applied to colors in NPB buffer (_bri + ABL)
+    uint8_t  _autoWhiteMode; // global Auto White Calculation override
    uint16_t _start;
    uint16_t _len;
    //struct { //using bitfield struct adds abour 250 bytes to binary size
@@ -221,8 +224,6 @@ class Bus {
      bool _hasWhite;//     : 1;
      bool _hasCCT;//       : 1;
    //} __attribute__ ((packed));
-    uint8_t  _autoWhiteMode;
-    // global Auto White Calculation override
    static uint8_t _gAWM;
    // _cct has the following meanings (see calculateCCT() & BusManager::setSegmentCCT()):
    //    -1 means to extract approximate CCT value in K from RGB (in calcualteCCT())
--- a/wled00/colors.cpp
+++ b/wled00/colors.cpp
@@ -8,7 +8,7 @@
 * color blend function, based on FastLED blend function
 * the calculation for each color is: result = (A*(amountOfA) + A + B*(amountOfB) + B) / 256 with amountOfA = 255 - amountOfB
 */
-uint32_t IRAM_ATTR color_blend(uint32_t color1, uint32_t color2, uint8_t blend) {
+uint32_t WLED_O2_ATTR IRAM_ATTR color_blend(uint32_t color1, uint32_t color2, uint8_t blend) {
  // min / max blend checking is omitted: calls with 0 or 255 are rare, checking lowers overall performance
  const uint32_t TWO_CHANNEL_MASK = 0x00FF00FF;     // mask for R and B channels or W and G if negated (poorman's SIMD; https://github.com/wled/WLED/pull/4568#discussion_r1986587221)
  uint32_t rb1 =  color1       & TWO_CHANNEL_MASK;  // extract R & B channels from color1
@@ -25,39 +25,37 @@ uint32_t IRAM_ATTR color_blend(uint32_t color1, uint32_t color2, uint8_t blend)
 * original idea: https://github.com/wled-dev/WLED/pull/2465 by https://github.com/Proto-molecule
 * speed optimisations by @dedehai
 */
-uint32_t color_add(uint32_t c1, uint32_t c2, bool preserveCR)
+uint32_t WLED_O2_ATTR color_add(uint32_t c1, uint32_t c2, bool preserveCR) //1212558 | 1212598 | 1212576 | 1212530
 {
  if (c1 == BLACK) return c2;
  if (c2 == BLACK) return c1;
  const uint32_t TWO_CHANNEL_MASK = 0x00FF00FF; // mask for R and B channels or W and G if negated
  uint32_t rb = ( c1     & TWO_CHANNEL_MASK) + ( c2     & TWO_CHANNEL_MASK); // mask and add two colors at once
  uint32_t wg = ((c1>>8) & TWO_CHANNEL_MASK) + ((c2>>8) & TWO_CHANNEL_MASK);
-  uint32_t r = rb >> 16; // extract single color values
-  uint32_t b = rb & 0xFFFF;
-  uint32_t w = wg >> 16;
-  uint32_t g = wg & 0xFFFF;

  if (preserveCR) { // preserve color ratios
-    uint32_t max = std::max(r,g); // check for overflow note
-    max = std::max(max,b);
-    max = std::max(max,w);
-    //unsigned max = r; // check for overflow note
-    //max = g > max ? g : max;
-    //max = b > max ? b : max;
-    //max = w > max ? w : max;
-    if (max > 255) {
+    uint32_t overflow = (rb | wg) & 0x01000100; // detect overflow by checking 9th bit
+    if (overflow) {
+      uint32_t r = rb >> 16; // extract single color values
+      uint32_t b = rb & 0xFFFF;
+      uint32_t w = wg >> 16;
+      uint32_t g = wg & 0xFFFF;
+      uint32_t max = std::max(r,g);
+      max = std::max(max,b);
+      max = std::max(max,w);
      const uint32_t scale = (uint32_t(255)<<8) / max; // division of two 8bit (shifted) values does not work -> use bit shifts and multiplaction instead
      rb = ((rb * scale) >> 8) &  TWO_CHANNEL_MASK;
      wg =  (wg * scale)       & ~TWO_CHANNEL_MASK;
    } else wg <<= 8; //shift white and green back to correct position
-    return rb | wg;
  } else {
-    r = r > 255 ? 255 : r;
-    g = g > 255 ? 255 : g;
-    b = b > 255 ? 255 : b;
-    w = w > 255 ? 255 : w;
-    return RGBW32(r,g,b,w);
+    // branchless per-channel saturation to 255 (extract 9th bit, subtract 1 if it is set, mask with 0xFF, input is 0xFF+0xFF=0x1EF max)
+    // example with overflow: input: 0x01EF01EF -> (0x0100100 - 0x00010001) = 0x00FF00FF -> input|0x00FF00FF = 0x00FF00FF (saturate)
+    // example without overflow: input: 0x007F007F -> (0x00000000 - 0x00000000) = 0x00000000 -> input|0x00000000 = input  (no change)
+    rb |= ((rb & 0x01000100) - ((rb >> 8) & 0x00010001)) & 0x00FF00FF;
+    wg |= ((wg & 0x01000100) - ((wg >> 8) & 0x00010001)) & 0x00FF00FF;
+    wg <<= 8; // restore WG position
  }
+  return rb | wg;
 }

 /*
@@ -92,15 +90,15 @@ uint32_t IRAM_ATTR color_fade(uint32_t c1, uint8_t amount, bool video) {
   note: inputs are 32bit to speed up the function, useful input value ranges are 0-255
 */
 uint32_t adjust_color(uint32_t rgb, uint32_t hueShift, uint32_t lighten, uint32_t brighten) {
-    if (rgb == 0 | hueShift + lighten + brighten == 0) return rgb; // black or no change
-    CHSV32 hsv;
-    rgb2hsv(rgb, hsv); //convert to HSV
-    hsv.h += (hueShift << 8); // shift hue (hue is 16 bits)
-    hsv.s =  max((int32_t)0, (int32_t)hsv.s - (int32_t)lighten); // desaturate
-    hsv.v =  min((uint32_t)255, (uint32_t)hsv.v + brighten); // increase brightness
-    uint32_t rgb_adjusted;
-    hsv2rgb(hsv, rgb_adjusted); // convert back to RGB TODO: make this into 16 bit conversion
-    return rgb_adjusted;
+  if (rgb == 0 || hueShift + lighten + brighten == 0) return rgb; // black or no change
+  CHSV32 hsv;
+  rgb2hsv(rgb, hsv); //convert to HSV
+  hsv.h += (hueShift << 8); // shift hue (hue is 16 bits)
+  hsv.s =  max((int32_t)0, (int32_t)hsv.s - (int32_t)lighten); // desaturate
+  hsv.v =  min((uint32_t)255, (uint32_t)hsv.v + brighten); // increase brightness
+  uint32_t rgb_adjusted;
+  hsv2rgb(hsv, rgb_adjusted); // convert back to RGB TODO: make this into 16 bit conversion
+  return rgb_adjusted;
 }

 // 1:1 replacement of fastled function optimized for ESP, slightly faster, more accurate and uses less flash (~ -200bytes)
@@ -597,13 +595,13 @@ void NeoGammaWLEDMethod::calcGammaTable(float gamma)
  gammaT_inv[0] = 0;
 }

-uint8_t IRAM_ATTR_YN NeoGammaWLEDMethod::Correct(uint8_t value)
+uint8_t NeoGammaWLEDMethod::Correct(uint8_t value)
 {
  if (!gammaCorrectCol) return value;
  return gammaT[value];
 }

-uint32_t IRAM_ATTR_YN NeoGammaWLEDMethod::inverseGamma32(uint32_t color)
+uint32_t NeoGammaWLEDMethod::inverseGamma32(uint32_t color)
 {
  if (!gammaCorrectCol) return color;
  uint8_t w = W(color);
--- a/wled00/colors.h
+++ b/wled00/colors.h
@@ -117,6 +117,7 @@ class NeoGammaWLEDMethod {
 [[gnu::hot, gnu::pure]] uint32_t color_blend(uint32_t c1, uint32_t c2 , uint8_t blend);
 inline uint32_t color_blend16(uint32_t c1, uint32_t c2, uint16_t b) { return color_blend(c1, c2, b >> 8); };
 [[gnu::hot, gnu::pure]] uint32_t color_add(uint32_t, uint32_t, bool preserveCR = false);
+[[gnu::hot, gnu::pure]] uint32_t color_fade(uint32_t c1, uint8_t amount, bool video = false);
 [[gnu::hot, gnu::pure]] uint32_t adjust_color(uint32_t rgb, uint32_t hueShift, uint32_t lighten, uint32_t brighten);
 [[gnu::hot, gnu::pure]] uint32_t ColorFromPaletteWLED(const CRGBPalette16 &pal, unsigned index, uint8_t brightness = (uint8_t)255U, TBlendType blendType = LINEARBLEND);
 CRGBPalette16 generateHarmonicRandomPalette(const CRGBPalette16 &basepalette);
@@ -139,7 +140,13 @@ uint32_t colorBalanceFromKelvin(uint16_t kelvin, uint32_t rgb);
 uint16_t approximateKelvinFromRGB(uint32_t rgb);
 void setRandomColor(byte* rgb);

-[[gnu::hot, gnu::pure]] uint32_t color_fade(uint32_t c1, uint8_t amount, bool video = false);
+// fast scaling function for colors, performs color*scale/256 for all four channels, speed over accuracy
+// note: inlining uses less code than actual function calls
+static inline uint32_t fast_color_scale(const uint32_t c, const uint8_t scale) {
+  uint32_t rb = (((c     & 0x00FF00FF) * scale) >> 8) &  0x00FF00FF;
+  uint32_t wg = (((c>>8) & 0x00FF00FF) * scale)       & ~0x00FF00FF;
+  return rb | wg;
+}

 // palettes
 extern const TProgmemRGBPalette16* const fastledPalettes[];
--- a/wled00/const.h
+++ b/wled00/const.h
@@ -682,4 +682,6 @@ static_assert(WLED_MAX_BUSSES <= 32, "WLED_MAX_BUSSES exceeds hard limit");
  #define IRAM_ATTR_YN IRAM_ATTR
 #endif

+#define WLED_O2_ATTR __attribute__((optimize("O2")))
+
 #endif
--- a/wled00/data/settings_leds.htm
+++ b/wled00/data/settings_leds.htm
@@ -45,13 +45,13 @@
 		}
 		function bLimits(b,v,p,m,l,o=5,d=2,a=6) {
 			maxB  = b; // maxB - max physical (analog + digital) buses: 32 - ESP32, 14 - S3/S2, 6 - C3, 4 - 8266
-			maxD  = d; // maxD - max digital channels (can be changed if using ESP32 parallel I2S): 16 - ESP32, 12 - S3/S2, 2 - C3, 3 - 8266
-			maxA  = a; // maxA - max analog channels: 16 - ESP32, 8 - S3/S2, 6 - C3, 5 - 8266
 			maxV  = v; // maxV - min virtual buses: 6 - ESP32/S3, 4 - S2/C3, 3 - ESP8266 (only used to distinguish S2/S3)
 			maxPB = p; // maxPB - max LEDs per bus
 			maxM  = m; // maxM - max LED memory
 			maxL  = l; // maxL - max LEDs (will serve to determine ESP >1664 == ESP32)
 			maxCO = o; // maxCO - max Color Order mappings
+			maxD  = d; // maxD - max digital channels (can be changed if using ESP32 parallel I2S): 16 - ESP32, 12 - S3/S2, 2 - C3, 3 - 8266
+			maxA  = a; // maxA - max analog channels: 16 - ESP32, 8 - S3/S2, 6 - C3, 5 - 8266
 		}
 		function is8266() { return maxA ==  5 && maxD ==  3; } // NOTE: see const.h
 		function is32()   { return maxA == 16 && maxD == 16; } // NOTE: see const.h