lots of tweaks, updated 1D rendering and collisions

- bugfix in mass-based 2D collisions
- added improved and faster large size rendering to 1D system
- added per-particle size rendering to 1D system
- improved and simplified collision handling in 1D system
- removed local blurring functions in PS as they are not needed anymore for particle rendering
- adapted FX to work with the new rendering
- fixed outdated AR handling in PS FX
- fixed infinite loop if not enough memory
- updated PS Hourglass drop interval to simpler math: speed / 10 = time in seconds and improved particle handling
- reduced speed in PS Pinball to fix collision slip-through
- PS Box now auto-adjusts number of particles based on matrix size and particle size
- added safety check to 2D particle rendering to not crash if something goes wrong with out-of-bounds particle rendering
- improved binning for particle collisions: don't use binning for small number of particles (faster)
- Some code cleanup
2025-12-13 19:05:21 +01:00
parent a421cfeabe
commit 19bc3c513a
3 changed files with 422 additions and 444 deletions
--- a/wled00/FXparticleSystem.cpp
+++ b/wled00/FXparticleSystem.cpp
@@ -88,7 +88,7 @@ void ParticleSystem2D::updateFire(const uint8_t intensity,const bool renderonly)

 // set percentage of used particles as uint8_t i.e 127 means 50% for example
 void ParticleSystem2D::setUsedParticles(uint8_t percentage) {
-  usedParticles = (numParticles * ((int)percentage+1)) >> 8; // number of particles to use (percentage is 0-255, 255 = 100%)
+  usedParticles = max((uint32_t)1, (numParticles * ((int)percentage+1)) >> 8); // number of particles to use (percentage is 0-255, 255 = 100%)
  PSPRINT(" SetUsedpaticles: allocated particles: ");
  PSPRINT(numParticles);
  PSPRINT(" ,used particles: ");
@@ -214,7 +214,7 @@ void ParticleSystem2D::flameEmit(const PSsource &emitter) {
 // angle = 0 means in positive x-direction (i.e. to the right)
 int32_t ParticleSystem2D::angleEmit(PSsource &emitter, const uint16_t angle, const int32_t speed) {
  emitter.vx = ((int32_t)cos16_t(angle) * speed) / (int32_t)32600; // cos16_t() and sin16_t() return signed 16bit, division should be 32767 but 32600 gives slightly better rounding
-  emitter.vy = ((int32_t)sin16_t(angle) * speed) / (int32_t)32600; // note: cannot use bit shifts as bit shifting is asymmetrical for positive and negative numbers and this needs to be accurate!
+  emitter.vy = ((int32_t)sin16_t(angle) * speed) / (int32_t)32600; // note: cannot use bit shifts as bit shifting is asymmetrical (1>>1=0 / -1>>1=-1) and this needs to be accurate!
  return sprayEmit(emitter);
 }

@@ -236,8 +236,11 @@ void ParticleSystem2D::particleMoveUpdate(PSparticle &part, PSparticleFlags &par
    partFlags.outofbounds = false; // reset out of bounds (in case particle was created outside the matrix and is now moving into view) note: moving this to checks below adds code and is not faster

    if (perParticleSize && advancedproperties != nullptr) { // using individual particle size
-      renderradius = PS_P_HALFRADIUS - 1 + advancedproperties->size;
-      particleHardRadius = PS_P_MINHARDRADIUS + ((advancedproperties->size * 52) >> 6); // use 1 pixel + 80% of size for hard radius (slight overlap with boarders so they do not "float")
+      renderradius = PS_P_HALFRADIUS - 1 + advancedproperties->size; // note: single pixel particles should be zero but OOB checks in rendering function handle this
+      if (advancedproperties->size > 0)
+        particleHardRadius = PS_P_MINHARDRADIUS + ((advancedproperties->size * 52) >> 6); // use 1 pixel + 80% of size for hard radius (slight overlap with boarders so they do not "float")
+      else // single pixel particles use half the collision distance for walls
+        particleHardRadius = PS_P_MINHARDRADIUS >> 1;
    }
    // note: if wall collisions are enabled, bounce them before they reach the edge, it looks much nicer if the particle does not go half out of view
    if (options->bounceY) {
@@ -446,7 +449,7 @@ void ParticleSystem2D::applyForce(const int8_t xforce, const int8_t yforce) {
 // force is in 3.4 fixed point notation so force=16 means apply v+1 each frame (useful force range is +/- 127)
 void ParticleSystem2D::applyAngleForce(PSparticle &part, const int8_t force, const uint16_t angle, uint8_t &counter) {
  int8_t xforce = ((int32_t)force * cos16_t(angle)) / 32767; // force is +/- 127
-  int8_t yforce = ((int32_t)force * sin16_t(angle)) / 32767; // note: cannot use bit shifts as bit shifting is asymmetrical for positive and negative numbers and this needs to be accurate!
+  int8_t yforce = ((int32_t)force * sin16_t(angle)) / 32767; // note: cannot use bit shifts as bit shifting is asymmetrical (1>>1=0 / -1>>1=-1) and this needs to be accurate!
  applyForce(part, xforce, yforce, counter);
 }

@@ -460,7 +463,7 @@ void ParticleSystem2D::applyAngleForce(const uint32_t particleindex, const int8_
 // angle is from 0-65535 (=0-360deg) angle = 0 means in positive x-direction (i.e. to the right)
 void ParticleSystem2D::applyAngleForce(const int8_t force, const uint16_t angle) {
  int8_t xforce = ((int32_t)force * cos16_t(angle)) / 32767; // force is +/- 127
-  int8_t yforce = ((int32_t)force * sin16_t(angle)) / 32767; // note: cannot use bit shifts as bit shifting is asymmetrical for positive and negative numbers and this needs to be accurate!
+  int8_t yforce = ((int32_t)force * sin16_t(angle)) / 32767; // note: cannot use bit shifts as bit shifting is asymmetrical (1>>1=0 / -1>>1=-1) and this needs to be accurate!
  applyForce(xforce, yforce);
 }

@@ -543,7 +546,7 @@ void ParticleSystem2D::pointAttractor(const uint32_t particleindex, PSparticle &

  int32_t force = ((int32_t)strength << 16) / distanceSquared;
  int8_t xforce = (force * dx) / 1024; // scale to a lower value, found by experimenting
-  int8_t yforce = (force * dy) / 1024; // note: cannot use bit shifts as bit shifting is asymmetrical for positive and negative numbers and this needs to be accurate!
+  int8_t yforce = (force * dy) / 1024; // note: cannot use bit shifts as bit shifting is asymmetrical (1>>1=0 / -1>>1=-1) and this needs to be accurate!
  applyForce(particleindex, xforce, yforce);
 }

@@ -602,109 +605,16 @@ void ParticleSystem2D::render() {

  // apply 2D blur to rendered frame
  if (smearBlur) {
-    blur2D(framebuffer, maxXpixel + 1, maxYpixel + 1, smearBlur, smearBlur);
+    SEGMENT.blur2D(smearBlur, smearBlur, true);
  }
 }

-// render particle as ellipse/circle with linear brightness falloff and sub-pixel precision
-void WLED_O2_ATTR ParticleSystem2D::renderParticleEllipse(const uint32_t particleindex, const uint8_t brightness, const CRGBW& color, const bool wrapX, const bool wrapY) {
-  uint32_t size = particlesize;
-  if (perParticleSize && advPartProps != nullptr) // individual particle size
-    size = advPartProps[particleindex].size;
-
-  // particle position with sub-pixel precision
-  int32_t x_subcenter = particles[particleindex].x;
-  int32_t y_subcenter = particles[particleindex].y;
-
-  // example: for x = 128, a paticle is exacly between pixel 1 and 2, with a radius of 2 pixels, we draw pixels 0-3
-  // integer center jumps when x = 127 -> pixel 1 goes to x = 128 -> pixel 2
-  // when calculating the dx, we need to take this into account: at x = 128 the x offset is 1, the pixel center is at pixel 2:
-  // for pixel 1, dx = 1 * PS_P_RADIUS - 128 = -64 but the center of the pixel is actually only -32 from the particle center so need to add half a radius:
-  // dx = pixel_x * PS_P_RADIUS - x_subcenter + PS_P_HALFRADIUS
-
-  // sub-pixel offset (0-63)
-  int32_t x_offset = x_subcenter & (PS_P_RADIUS - 1); // same as modulo PS_P_RADIUS but faster
-  int32_t y_offset = y_subcenter & (PS_P_RADIUS - 1);
-  // integer pixel position, this is rounded down
-  int32_t x_center = (x_subcenter) >> PS_P_RADIUS_SHIFT;
-  int32_t y_center = (y_subcenter) >> PS_P_RADIUS_SHIFT;
-
-  // ellipse radii in pixels
-  uint32_t xsize = size;
-  uint32_t ysize = size;
-  if (advPartSize != nullptr && advPartSize[particleindex].asymmetry > 0) {
-    getParticleXYsize(&advPartProps[particleindex], &advPartSize[particleindex], xsize, ysize);
-  }
-
-  int32_t rx_subpixel = xsize+65; // size = 1 means radius of just over 1 pixel
-  int32_t ry_subpixel = ysize+65; // size = 255 is radius of 5, so add 65 -> 65+255=320, 320>>6=5 pixels
-
-  // rendering bounding box in pixels
-  int32_t rx_pixels = (rx_subpixel >> PS_P_RADIUS_SHIFT);
-  int32_t ry_pixels = (ry_subpixel >> PS_P_RADIUS_SHIFT);
-
-  int32_t x_min = x_center - rx_pixels;
-  int32_t x_max = x_center + rx_pixels;
-  int32_t y_min = y_center - ry_pixels;
-  int32_t y_max = y_center + ry_pixels;
-
-  // cache for speed
-  uint32_t matrixX = maxXpixel + 1;
-  uint32_t matrixY = maxYpixel + 1;
-  uint32_t rx_sq = rx_subpixel * rx_subpixel;
-  uint32_t ry_sq = ry_subpixel * ry_subpixel;
-
-  // iterate over bounding box and render each pixel
-  for (int32_t py = y_min; py <= y_max; py++) {
-    for (int32_t px = x_min; px <= x_max; px++) {
-      // distance from particle center, explanation see above
-      int32_t dx_subpixel = (px << PS_P_RADIUS_SHIFT) - x_subcenter + PS_P_HALFRADIUS;
-      int32_t dy_subpixel = (py << PS_P_RADIUS_SHIFT) - y_subcenter + PS_P_HALFRADIUS;
-
-      // calculate brightness based on squared distance to ellipse center
-      uint8_t pixel_brightness = calculateEllipseBrightness(dx_subpixel, dy_subpixel, rx_sq, ry_sq, brightness);
-
-      if (pixel_brightness == 0) continue; // Skip fully transparent pixels
-
-      // apply inverse gamma correction if needed, if this is skipped, particles flicker due to changing total brightness
-      if (gammaCorrectCol) {
-        pixel_brightness = gamma8inv(pixel_brightness); // invert brigthess so brightness distribution is linear after gamma correction
-      }
-
-      // Handle wrapping and bounds
-      int32_t render_x = px;
-      int32_t render_y = py;
-
-      // Check bounds and apply wrapping
-      if (render_x < 0) {
-        if (!wrapX) continue;
-        render_x += matrixX;
-      } else if (render_x > maxXpixel) {
-        if (!wrapX) continue;
-        render_x -= matrixX;
-      }
-
-      if (render_y < 0) {
-        if (!wrapY) continue;
-        render_y += matrixY;
-      } else if (render_y > maxYpixel) {
-        if (!wrapY) continue;
-        render_y -= matrixY;
-      }
-      // Render pixel
-      uint32_t idx = render_x + (maxYpixel - render_y) * matrixX; // flip y coordinate (0,0 is bottom left in PS but top left in framebuffer)
-      framebuffer[idx] = fast_color_scaleAdd(framebuffer[idx], color, pixel_brightness);
-    }
-  }
-}
-
-
 // calculate pixel positions and brightness distribution and render the particle to local buffer or global buffer
 void WLED_O2_ATTR ParticleSystem2D::renderParticle(const uint32_t particleindex, const uint8_t brightness, const CRGBW& color, const bool wrapX, const bool wrapY) {
  uint32_t size = particlesize;

  if (perParticleSize && advPartProps != nullptr) // use advanced size properties
-    size = advPartProps[particleindex].size;
+    size = 1 + advPartProps[particleindex].size; // add 1 to avoid single pixel size particles (collisions do not support it)

  if (size == 0) { // single pixel rendering
    uint32_t x = particles[particleindex].x >> PS_P_RADIUS_SHIFT;
@@ -717,7 +627,7 @@ void WLED_O2_ATTR ParticleSystem2D::renderParticle(const uint32_t particleindex,
  }

  if (size > 1) { // size > 1: render as ellipse
-    renderParticleEllipse(particleindex, brightness, color, wrapX, wrapY); // larger size rendering
+    renderLargeParticle(size, particleindex, brightness, color, wrapX, wrapY); // larger size rendering
    return;
  }

@@ -760,19 +670,19 @@ void WLED_O2_ATTR ParticleSystem2D::renderParticle(const uint32_t particleindex,
  // - apply inverse gamma correction to brightness values
  // - gamma is applied again in show() -> the resulting brightness distribution is linear but gamma corrected in total
  if (gammaCorrectCol) {
-    pxlbrightness[0] = gamma8inv(pxlbrightness[0]); // use look-up-table for invers gamma
-    pxlbrightness[1] = gamma8inv(pxlbrightness[1]);
-    pxlbrightness[2] = gamma8inv(pxlbrightness[2]);
-    pxlbrightness[3] = gamma8inv(pxlbrightness[3]);
+    for (uint32_t i = 0; i < 4; i++) {
+      pxlbrightness[i] = gamma8inv(pxlbrightness[i]); // use look-up-table for invers gamma
+    }
  }

  // standard rendering (2x2 pixels)
  // check for out of frame pixels and wrap them if required: x,y is bottom left pixel coordinate of the particle
-  if (x < 0) { // left pixels out of frame
+  if (pixco[0].x < 0) { // left pixels out of frame
    if (wrapX) { // wrap x to the other side if required
      pixco[0].x = pixco[3].x = maxXpixel;
    } else {
      pixelvalid[0] = pixelvalid[3] = false; // out of bounds
+      if (pixco[0].x < -1) return; // both left pixels out of bounds, no need to continue (safety check)
    }
  }
  else if (pixco[1].x > (int32_t)maxXpixel) { // right pixels, only has to be checked if left pixel is in frame
@@ -780,14 +690,16 @@ void WLED_O2_ATTR ParticleSystem2D::renderParticle(const uint32_t particleindex,
      pixco[1].x = pixco[2].x = 0;
    } else {
      pixelvalid[1] = pixelvalid[2] = false; // out of bounds
+      if (pixco[0].x > (int32_t)maxXpixel) return; // both pixels out of bounds, no need to continue (safety check)
    }
  }

-  if (y < 0) { // bottom pixels out of frame
+  if (pixco[0].y < 0) { // bottom pixels out of frame
    if (wrapY) { // wrap y to the other side if required
      pixco[0].y = pixco[1].y = maxYpixel;
    } else {
      pixelvalid[0] = pixelvalid[1] = false; // out of bounds
+      if (pixco[0].y < -1) return; // both bottom pixels out of bounds, no need to continue (safety check)
    }
  }
  else if (pixco[2].y > maxYpixel) { // top pixels
@@ -795,6 +707,7 @@ void WLED_O2_ATTR ParticleSystem2D::renderParticle(const uint32_t particleindex,
      pixco[2].y = pixco[3].y = 0;
    } else {
      pixelvalid[2] = pixelvalid[3] = false; // out of bounds
+      if (pixco[2].y > (int32_t)maxYpixel + 1) return; // both top pixels out of bounds, no need to continue (safety check)
    }
  }
  for (uint32_t i = 0; i < 4; i++) {
@@ -805,32 +718,123 @@ void WLED_O2_ATTR ParticleSystem2D::renderParticle(const uint32_t particleindex,
  }
 }

+// render particle as ellipse/circle with linear brightness falloff and sub-pixel precision
+void WLED_O2_ATTR ParticleSystem2D::renderLargeParticle(const uint32_t size, const uint32_t particleindex, const uint8_t brightness, const CRGBW& color, const bool wrapX, const bool wrapY) {
+  // particle position with sub-pixel precision
+  int32_t x_subcenter = particles[particleindex].x;
+  int32_t y_subcenter = particles[particleindex].y;
+
+  // example: for x = 128, a particle is exactly between pixel 1 and 2, with a radius of 2 pixels, we draw pixels 0-3
+  // integer center jumps when x = 127 -> pixel 1 goes to x = 128 -> pixel 2
+  // when calculating the dx, we need to take this into account: at x = 128 the x offset is 1, the pixel center is at pixel 2:
+  // for pixel 1, dx = 1 * PS_P_RADIUS - 128 = -64 but the center of the pixel is actually only -32 from the particle center so need to add half a radius:
+  // dx = pixel_x * PS_P_RADIUS - x_subcenter + PS_P_HALFRADIUS
+
+  // sub-pixel offset (0-63)
+  int32_t x_offset = x_subcenter & (PS_P_RADIUS - 1); // same as modulo PS_P_RADIUS but faster
+  int32_t y_offset = y_subcenter & (PS_P_RADIUS - 1);
+  // integer pixel position, this is rounded down
+  int32_t x_center = (x_subcenter) >> PS_P_RADIUS_SHIFT;
+  int32_t y_center = (y_subcenter) >> PS_P_RADIUS_SHIFT;
+
+  // ellipse radii in pixels
+  uint32_t xsize = size;
+  uint32_t ysize = size;
+  if (advPartSize != nullptr && advPartSize[particleindex].asymmetry > 0) {
+    getParticleXYsize(&advPartProps[particleindex], &advPartSize[particleindex], xsize, ysize);
+  }
+
+  int32_t rx_subpixel = xsize + PS_P_RADIUS + 1; // size = 1 means radius of just over 1 pixel, + PS_P_RADIUS (+1 to account for bit-shift loss)
+  int32_t ry_subpixel = ysize + PS_P_RADIUS + 1; // size = 255 is radius of 5, so add 65 -> 65+255=320, 320>>6=5 pixels
+
+  // rendering bounding box in pixels
+  int32_t rx_pixels = (rx_subpixel >> PS_P_RADIUS_SHIFT);
+  int32_t ry_pixels = (ry_subpixel >> PS_P_RADIUS_SHIFT);
+
+  int32_t x_min = x_center - rx_pixels; // note: the "+1" extension needed for 1D is not required for 2D, it is smooth as-is
+  int32_t x_max = x_center + rx_pixels;
+  int32_t y_min = y_center - ry_pixels;
+  int32_t y_max = y_center + ry_pixels;
+
+  // cache for speed
+  uint32_t matrixX = maxXpixel + 1;
+  uint32_t matrixY = maxYpixel + 1;
+  uint32_t rx_sq = rx_subpixel * rx_subpixel;
+  uint32_t ry_sq = ry_subpixel * ry_subpixel;
+
+  // iterate over bounding box and render each pixel
+  for (int32_t py = y_min; py <= y_max; py++) {
+    for (int32_t px = x_min; px <= x_max; px++) {
+      // Check bounds and apply wrapping
+      int32_t render_x = px;
+      int32_t render_y = py;
+      if (render_x < 0) {
+        if (!wrapX) continue;
+        render_x += matrixX;
+      } else if (render_x > maxXpixel) {
+        if (!wrapX) continue;
+        render_x -= matrixX;
+      }
+
+      if (render_y < 0) {
+        if (!wrapY) continue;
+        render_y += matrixY;
+      } else if (render_y > maxYpixel) {
+        if (!wrapY) continue;
+        render_y -= matrixY;
+      }
+
+      // distance from particle center, explanation see above
+      int32_t dx_subpixel = (px << PS_P_RADIUS_SHIFT) - x_subcenter + PS_P_HALFRADIUS;
+      int32_t dy_subpixel = (py << PS_P_RADIUS_SHIFT) - y_subcenter + PS_P_HALFRADIUS;
+
+      // calculate brightness based on squared distance to ellipse center
+      uint8_t pixel_brightness = calculateEllipseBrightness(dx_subpixel, dy_subpixel, rx_sq, ry_sq, brightness);
+
+      if (pixel_brightness == 0) continue; // skip black pixels
+
+      // apply inverse gamma correction if needed, if this is skipped, particles flicker due to changing total brightness
+      if (gammaCorrectCol) {
+        pixel_brightness = gamma8inv(pixel_brightness); // invert brigthess so brightness distribution is linear after gamma correction
+      }
+
+      // Render pixel
+      uint32_t idx = render_x + (maxYpixel - render_y) * matrixX; // flip y coordinate (0,0 is bottom left in PS but top left in framebuffer)
+      framebuffer[idx] = fast_color_scaleAdd(framebuffer[idx], color, pixel_brightness);
+    }
+  }
+}
+
 // detect collisions in an array of particles and handle them
 // uses binning by dividing the frame into slices in x direction which is efficient if using gravity in y direction (but less efficient for FX that use forces in x direction)
 // for code simplicity, no y slicing is done, making very tall matrix configurations less efficient
 // note: also tested adding y slicing, it gives diminishing returns, some FX even get slower. FX not using gravity would benefit with a 10% FPS improvement
 void ParticleSystem2D::handleCollisions() {
-  if (perParticleSize && advPartProps != nullptr)
-    particleHardRadius = 255; // max radius for collision detection if using per-particle size TODO: could optimize by fetching max size from advPartProps
-
  uint32_t collDistSq = particleHardRadius << 1; // distance is double the radius note: particleHardRadius is updated when setting global particle size
  collDistSq = collDistSq * collDistSq; // square it for faster comparison (square is one operation)
  // note: partices are binned in x-axis, assumption is that no more than half of the particles are in the same bin
  // if they are, collisionStartIdx is increased so each particle collides at least every second frame (which still gives decent collisions)
-  constexpr int BIN_WIDTH = 6 * PS_P_RADIUS; // width of a bin in sub-pixels
+  int binWidth = 6 * PS_P_RADIUS; // width of a bin in sub-pixels
  int32_t overlap = particleHardRadius << 1; // overlap bins to include edge particles to neighbouring bins
+  if (perParticleSize && advPartProps != nullptr)
+    overlap = 512; // max overlap for collision detection if using per-particle size, enough to catch all particles even at max speed
+
  uint32_t maxBinParticles = max((uint32_t)50, (usedParticles + 1) / 2); // assume no more than half of the particles are in the same bin, do not bin small amounts of particles
-  uint32_t numBins = (maxX + (BIN_WIDTH - 1)) / BIN_WIDTH; // number of bins in x direction
+  uint32_t numBins = (maxX + (binWidth - 1)) / binWidth; // number of bins in x direction
+  if (usedParticles < maxBinParticles) {
+    numBins = 1; // use single bin for small number of particles
+    binWidth = maxX + 1;
+  }
  uint16_t binIndices[maxBinParticles]; // creat array on stack for indices, 2kB max for 1024 particles (ESP32_MAXPARTICLES/2)
  uint32_t binParticleCount; // number of particles in the current bin
-  uint16_t nextFrameStartIdx = hw_random16(usedParticles); // index of the first particle in the next frame (set to fixed value if bin overflow)
+  uint32_t nextFrameStartIdx = hw_random16(usedParticles); // index of the first particle in the next frame (set to fixed value if bin overflow)
  uint32_t pidx = collisionStartIdx; //start index in case a bin is full, process remaining particles next frame

  // fill the binIndices array for this bin
  for (uint32_t bin = 0; bin < numBins; bin++) {
    binParticleCount = 0; // reset for this bin
-    int32_t binStart = bin * BIN_WIDTH - overlap; // note: first bin will extend to negative, but that is ok as out of bounds particles are ignored
-    int32_t binEnd = binStart + BIN_WIDTH + overlap; // note: last bin can be out of bounds, see above;
+    int32_t binStart = bin * binWidth - overlap; // note: first bin will extend to negative, but that is ok as out of bounds particles are ignored
+    int32_t binEnd = binStart + binWidth + overlap; // note: last bin can be out of bounds, see above;

    // fill the binIndices array for this bin
    for (uint32_t i = 0; i < usedParticles; i++) {
@@ -849,8 +853,8 @@ void ParticleSystem2D::handleCollisions() {
      if (pidx >= usedParticles) pidx = 0; // wrap around
    }

-    uint32_t massratio1 = 0; // 0 means dont use mass ratio (equal mass)
-    uint32_t massratio2 = 0;
+    int32_t massratio1 = 0; // 0 means dont use mass ratio (equal mass)
+    int32_t massratio2 = 0; // TODO: if implementing "fixed" particles, set to 1 (fixed) and 255 (movable)
    for (uint32_t i = 0; i < binParticleCount; i++) { // go though all 'higher number' particles in this bin and see if any of those are in close proximity and if they are, make them collide
      uint32_t idx_i = binIndices[i];
      for (uint32_t j = i + 1; j < binParticleCount; j++) { // check against higher number particles
@@ -859,12 +863,15 @@ void ParticleSystem2D::handleCollisions() {
          collDistSq = (PS_P_MINHARDRADIUS << 1) + ((((uint32_t)advPartProps[idx_i].size + (uint32_t)advPartProps[idx_j].size) * 52) >> 6); // collision distance, use 80% of size for tighter stacking (slight overlap)
          collDistSq = collDistSq * collDistSq; // square it for faster comparison
          // calculate mass ratio for collision response
-          uint32_t mass1 = 1 + ((uint32_t)advPartProps[idx_i].size * advPartProps[idx_i].size); // +1 to avoid division by zero
-          uint32_t mass2 = ((uint32_t)advPartProps[idx_j].size * advPartProps[idx_j].size);
+          uint32_t mass1 = PS_P_RADIUS + advPartProps[idx_i].size;
+          uint32_t mass2 = PS_P_RADIUS + advPartProps[idx_j].size;
+          mass1 = mass1 * mass1; // mass proportional to area
+          mass2 = mass2 * mass2;
          uint32_t totalmass = mass1 + mass2;
          massratio1 = (mass2 << 8) / totalmass; // massratio 1 depends on mass of particle 2, i.e. if 2 is heavier -> higher velocity impact on 1
          massratio2 = (mass1 << 8) / totalmass;
        }
+        // note: using the same logic as in 1D is much slower though it would be more accurate but it is not really needed in 2D
        int32_t dx = (particles[idx_j].x + particles[idx_j].vx) - (particles[idx_i].x + particles[idx_i].vx); // distance with lookahead
        if (dx * dx < collDistSq) { // check x direction, if close, check y direction (squaring is faster than abs() or dual compare)
          int32_t dy = (particles[idx_j].y + particles[idx_j].vy)  - (particles[idx_i].y + particles[idx_i].vy); // distance with lookahead
@@ -879,7 +886,7 @@ void ParticleSystem2D::handleCollisions() {

 // handle a collision if close proximity is detected, i.e. dx and/or dy smaller than 2*PS_P_RADIUS
 // takes two pointers to the particles to collide and the particle hardness (softer means more energy lost in collision, 255 means full hard)
-void WLED_O2_ATTR ParticleSystem2D::collideParticles(PSparticle &particle1, PSparticle &particle2, int32_t dx, int32_t dy, const uint32_t collDistSq, uint32_t massratio1, uint32_t massratio2) {
+void WLED_O2_ATTR ParticleSystem2D::collideParticles(PSparticle &particle1, PSparticle &particle2, int32_t dx, int32_t dy, const uint32_t collDistSq, int32_t massratio1, int32_t massratio2) {
  int32_t distanceSquared = dx * dx + dy * dy;
  // Calculate relative velocity note: could zero check but that does not improve overall speed but deminish it as that is rarely the case and pushing is still required
  int32_t relativeVx = (int32_t)particle2.vx - (int32_t)particle1.vx;
@@ -909,24 +916,29 @@ void WLED_O2_ATTR ParticleSystem2D::collideParticles(PSparticle &particle1, PSpa
  if (dotProduct < 0) {// particles are moving towards each other
    // integer math is much faster than using floats (float divisions are slow on all ESPs)
    // overflow check: dx/dy are 7bit, relativV are 8bit -> dotproduct is 15bit, dotproduct/distsquared ist 8b, multiplied by collisionhardness of 8bit. so a 16bit shift is ok, make it 15 to be sure no overflows happen
-    // note: cannot use right shifts as bit shifting in right direction is asymmetrical for positive and negative numbers and this needs to be accurate! the trick is: only shift positive numers
+    // note: cannot use right shifts as bit shifting in right direction is asymmetrical (1>>1=0 / -1>>1=-1) and this needs to be accurate! the trick is: only shift positive numbers
    // Calculate new velocities after collision
    int32_t surfacehardness = max(collisionHardness, (int32_t)PS_P_MINSURFACEHARDNESS >> 1); // if particles are soft, the impulse must stay above a limit or collisions slip through at higher speeds, 170 seems to be a good value
    int32_t impulse = (((((-dotProduct) << 15) / distanceSquared) * surfacehardness) >> 8); // note: inverting before bitshift corrects for asymmetry in right-shifts (is slightly faster)

    #if defined(CONFIG_IDF_TARGET_ESP32C3) || defined(ESP8266) // use bitshifts with rounding instead of division (2x faster)
-    int32_t ximpulse = (impulse * dx + ((dx >> 31) & 32767)) >> 15; // note: extracting sign bit and adding rounding value to correct for asymmetry in right shifts
-    int32_t yimpulse = (impulse * dy + ((dy >> 31) & 32767)) >> 15;
+    int32_t ximpulse = (impulse * dx + ((dx >> 31) & 0x7FFF)) >> 15; // note: extracting sign bit and adding rounding value to correct for asymmetry in right shifts
+    int32_t yimpulse = (impulse * dy + ((dy >> 31) & 0x7FFF)) >> 15;
    #else
    int32_t ximpulse = (impulse * dx) / 32767;
    int32_t yimpulse = (impulse * dy) / 32767;
    #endif
    // if particles are not the same size, use a mass ratio. mass ratio is set to 0 if particles are the same size
    if (massratio1) {
-      particle1.vx -= (ximpulse * massratio1) >> 7; // mass ratio is in fixed point 8bit, multiply by two to account for the fact that we distribute the impulse to both particles
-      particle1.vy -= (yimpulse * massratio1) >> 7;
-      particle2.vx += (ximpulse * massratio2) >> 7;
-      particle2.vy += (yimpulse * massratio2) >> 7;
+      int32_t vx1 = (int32_t)particle1.vx - ((ximpulse * massratio1) >> 7); // mass ratio is in fixed point 8bit, multiply by two to account for the fact that we distribute the impulse to both particles
+      int32_t vy1 = (int32_t)particle1.vy - ((yimpulse * massratio1) >> 7);
+      int32_t vx2 = (int32_t)particle2.vx + ((ximpulse * massratio2) >> 7);
+      int32_t vy2 = (int32_t)particle2.vy + ((yimpulse * massratio2) >> 7);
+      // limit speeds to max speed (required if a lot of impulse is transferred from a large to a small particle)
+      particle1.vx = limitSpeed(vx1);
+      particle1.vy = limitSpeed(vy1);
+      particle2.vx = limitSpeed(vx2);
+      particle2.vy = limitSpeed(vy2);
    }
    else {
      particle1.vx -= ximpulse; // note: impulse is inverted, so subtracting it
@@ -951,11 +963,11 @@ void WLED_O2_ATTR ParticleSystem2D::collideParticles(PSparticle &particle1, PSpa
    }

    // particles have volume, push particles apart if they are too close
-    // tried lots of configurations, it works best if not moved but given a little velocity, it tends to oscillate less this way
+    // tried lots of configurations, it works best if given a little velocity, it tends to oscillate less this way
    // when hard pushing by offsetting position, they sink into each other under gravity
    // a problem with giving velocity is, that on harder collisions, this adds up as it is not dampened enough, so add friction in the FX if required
    if (distanceSquared < collDistSq && dotProduct > -250) { // too close and also slow, push them apart
-      int32_t notsorandom = dotProduct & 0x01; //dotprouct LSB should be somewhat random, so no need to calculate a random number
+      bool fairlyrandom = dotProduct & 0x01; //dotprouct LSB should be somewhat random, so no need to calculate a random number
      int32_t pushamount = 1 + ((250 + dotProduct) >> 6); // the closer dotproduct is to zero, the closer the particles are
      int32_t push = 0;
      if (dx < 0)  // particle 1 is on the right
@@ -963,7 +975,7 @@ void WLED_O2_ATTR ParticleSystem2D::collideParticles(PSparticle &particle1, PSpa
      else if (dx > 0)
        push = -pushamount;
      else { // on the same x coordinate, shift it a little so they do not stack
-        if (notsorandom)
+        if (fairlyrandom)
          particle1.x++; // move it so pile collapses
        else
          particle1.x--;
@@ -975,7 +987,7 @@ void WLED_O2_ATTR ParticleSystem2D::collideParticles(PSparticle &particle1, PSpa
      else if (dy > 0)
        push = -pushamount;
      else { // dy==0
-        if (notsorandom)
+        if (fairlyrandom)
          particle1.y++; // move it so pile collapses
        else
          particle1.y--;
@@ -1037,56 +1049,6 @@ void ParticleSystem2D::updatePSpointers(bool isadvanced, bool sizecontrol) {

 }

-// blur a matrix in x and y direction, blur can be asymmetric in x and y
-// for speed, 1D array and 32bit variables are used, make sure to limit them to 8bit (0-255) or result is undefined
-// to blur a subset of the buffer, change the xsize/ysize and set xstart/ystart to the desired starting coordinates (default start is 0/0)
-// subset blurring only works on 10x10 buffer (single particle rendering), if other sizes are needed, buffer width must be passed as parameter
-void blur2D(uint32_t *colorbuffer, uint32_t xsize, uint32_t ysize, uint32_t xblur, uint32_t yblur, uint32_t xstart, uint32_t ystart, bool isparticle) {
-  CRGBW seeppart, carryover;
-  uint32_t seep = xblur >> 1;
-  uint32_t width = xsize; // width of the buffer, used to calculate the index of the pixel
-
-  if (isparticle) { //first and last row are always black in first pass of particle rendering
-    ystart++;
-    ysize--;
-    width = 10; // buffer size is 10x10
-  }
-
-  for (uint32_t y = ystart; y < ystart + ysize; y++) {
-    carryover =  BLACK;
-    uint32_t indexXY = xstart + y * width;
-    for (uint32_t x = xstart; x < xstart + xsize; x++) {
-      seeppart = fast_color_scale(colorbuffer[indexXY], seep); // scale it and seep to neighbours
-      if (x > 0) {
-        colorbuffer[indexXY - 1] = fast_color_scaleAdd(colorbuffer[indexXY - 1], seeppart);
-        colorbuffer[indexXY]     = fast_color_scaleAdd(colorbuffer[indexXY], carryover);
-      }
-      carryover = seeppart;
-      indexXY++; // next pixel in x direction
-    }
-  }
-
-  if (isparticle) { // first and last row are now smeared
-    ystart--;
-    ysize++;
-  }
-
-  seep = yblur >> 1;
-  for (uint32_t x = xstart; x < xstart + xsize; x++) {
-    carryover = BLACK;
-    uint32_t indexXY = x + ystart * width;
-    for (uint32_t y = ystart; y < ystart + ysize; y++) {
-      seeppart = fast_color_scale(colorbuffer[indexXY], seep); // scale it and seep to neighbours
-      if (y > 0) {
-        colorbuffer[indexXY - width] = fast_color_scaleAdd(colorbuffer[indexXY - width], seeppart);
-        colorbuffer[indexXY]         = fast_color_scaleAdd(colorbuffer[indexXY], carryover);
-      }
-      carryover = seeppart;
-      indexXY += width; // next pixel in y direction
-    }
-  }
-}
-
 //non class functions to use for initialization
 uint32_t calculateNumberOfParticles2D(uint32_t const pixels, const bool isadvanced, const bool sizecontrol) {
  uint32_t numberofParticles = pixels;  // 1 particle per pixel (for example 512 particles on 32x16)
@@ -1142,7 +1104,7 @@ bool initParticleSystem2D(ParticleSystem2D *&PartSys, uint32_t requestedsources,
  PSPRINTLN(" request numparticles:" + String(numparticles));
  uint32_t numsources = calculateNumberOfSources2D(pixels, requestedsources);
  bool allocsuccess = false;
-  while(numparticles >= 4) { // make sure we have at least 4 particles or quit
+  while(numparticles >= 5) { // make sure we have at least 5 particles or quit
    if (allocateParticleSystemMemory2D(numparticles, numsources, advanced, sizecontrol, additionalbytes)) {
      PSPRINTLN(F("PS 2D alloc succeeded"));
      allocsuccess = true;
@@ -1205,8 +1167,11 @@ void ParticleSystem1D::update(void) {
    applyGravity();

  // handle collisions (can push particles, must be done before updating particles or they can render out of bounds, causing a crash if using local buffer for speed)
-  if (particlesettings.useCollisions)
+  if (particlesettings.useCollisions) {
    handleCollisions();
+    if (perParticleSize)
+      handleCollisions(); // second pass for per particle size (as impulse transfer can recoil at high speed, this improves "slip through" issues for small particles but is expensive)
+  }

  //move all particles
  for (uint32_t i = 0; i < usedParticles; i++) {
@@ -1214,7 +1179,7 @@ void ParticleSystem1D::update(void) {
  }

  if (particlesettings.colorByPosition) {
-    uint32_t scale = (255 << 16) / maxX;  // speed improvement: multiplication is faster than division
+    uint32_t scale = (255 << 16) / maxX;
    for (uint32_t i = 0; i < usedParticles; i++) {
      particles[i].hue = (scale * particles[i].x) >> 16; // note: x is > 0 if not out of bounds
    }
@@ -1225,7 +1190,7 @@ void ParticleSystem1D::update(void) {

 // set percentage of used particles as uint8_t i.e 127 means 50% for example
 void ParticleSystem1D::setUsedParticles(const uint8_t percentage) {
-  usedParticles = (numParticles * ((int)percentage+1)) >> 8; // number of particles to use (percentage is 0-255, 255 = 100%)
+  usedParticles =  max((uint32_t)1, (numParticles * ((int)percentage+1)) >> 8); // number of particles to use (percentage is 0-255, 255 = 100%)
  PSPRINT(" SetUsedpaticles: allocated particles: ");
  PSPRINT(numParticles);
  PSPRINT(" ,used particles: ");
@@ -1269,10 +1234,16 @@ void ParticleSystem1D::setSmearBlur(const uint8_t bluramount) {
  smearBlur = bluramount;
 }

-// render size, 0 = 1 pixel, 1 = 2 pixel (interpolated), bigger sizes require adanced properties
+// set the global render size: 0 = 1 pixel, 1 = 2 pixels (interpolated), larger values grow up to 255 = 18 pixel diameter; also updates the hard radius to match and switches per-particle sizing off
 void ParticleSystem1D::setParticleSize(const uint8_t size) {
-  particlesize = size > 0 ? 1 : 0; // TODO: add support for global sizes? see note above (motion blur)
-  particleHardRadius = PS_P_MINHARDRADIUS_1D >> (!particlesize); // 2 pixel sized particles or single pixel sized particles
+  particlesize = size; // the full 0-255 range is stored
+  particleHardRadius = PS_P_MINHARDRADIUS_1D; // ~1 pixel, used as is for size 1 (2 pixel particles)
+  perParticleSize = false; // disable per particle size control if global size is set
+  if (particlesize > 1) {
+    particleHardRadius = PS_P_MINHARDRADIUS_1D + ((particlesize * 52) >> 6); // use 1 pixel + ~80% of size (52/64) for hard radius (slight overlap with borders so they do not "float" and nicer stacking)
+  }
+  else if (particlesize == 0)
+    particleHardRadius = particleHardRadius >> 1; // single pixel particles have half the radius (i.e. 1/2 pixel)
 }

 // enable/disable gravity, optionally, set the force (force=8 is default) can be -127 to +127, 0 is disable
@@ -1328,16 +1299,16 @@ void ParticleSystem1D::particleMoveUpdate(PSparticle1D &part, PSparticleFlags1D
    if (options->colorByAge)
      part.hue = min(part.ttl, (uint16_t)255); // set color to ttl

-    int32_t renderradius = PS_P_HALFRADIUS_1D; // used to check out of bounds, default for 2 pixel rendering
+    int32_t renderradius = PS_P_HALFRADIUS_1D - 1 + particlesize; // used to check out of bounds, grows with the global particle size (equals PS_P_HALFRADIUS_1D for the default 2 pixel rendering, i.e. particlesize = 1)
    int32_t newX = part.x + (int32_t)part.vx;
    partFlags.outofbounds = false; // reset out of bounds (in case particle was created outside the matrix and is now moving into view)

-    if (advancedproperties) { // using individual particle size?
+    if (perParticleSize && advancedproperties != nullptr) { // using individual particle size? (only if per-particle sizing is enabled and advanced properties exist)
+      renderradius = PS_P_HALFRADIUS_1D - 1 + advancedproperties->size; // same formula as the global size above, using the 1D constant (not the 2D PS_P_HALFRADIUS); note: for single pixel particles, it should be zero, but it does not matter as out of bounds checking is done in rendering function
      if (advancedproperties->size > 1)
-        particleHardRadius = PS_P_MINHARDRADIUS_1D + (advancedproperties->size >> 1);
+        particleHardRadius = PS_P_MINHARDRADIUS_1D + ((advancedproperties->size * 52) >> 6); // use 1 pixel + ~80% of size (52/64) for hard radius (slight overlap with borders so they do not "float" and nicer stacking)
      else // single pixel particles use half the collision distance for walls
        particleHardRadius = PS_P_MINHARDRADIUS_1D >> 1;
-      renderradius = particleHardRadius; // note: for single pixel particles, it should be zero, but it does not matter as out of bounds checking is done in rendering function
    }

    // if wall collisions are enabled, bounce them before they reach the edge, it looks much nicer if the particle is not half out of view
@@ -1493,7 +1464,7 @@ void ParticleSystem1D::render() {
  }
  // apply smear-blur to rendered frame
  if (smearBlur) {
-    blur1D(framebuffer, maxXpixel + 1, smearBlur, 0);
+    SEGMENT.blur(smearBlur, true);
  }

  // add background color
@@ -1517,8 +1488,8 @@ void ParticleSystem1D::render() {
 // calculate pixel positions and brightness distribution and render the particle to local buffer or global buffer
 void WLED_O2_ATTR ParticleSystem1D::renderParticle(const uint32_t particleindex, const uint8_t brightness, const CRGBW &color, const bool wrap) {
  uint32_t size = particlesize;
-  if (advPartProps != nullptr) // use advanced size properties (1D system has no large size global rendering TODO: add large global rendering?)
-    size = advPartProps[particleindex].size;
+  if (perParticleSize && advPartProps != nullptr) // use advanced size properties
+    size = 1 + advPartProps[particleindex].size; // add 1 to avoid single pixel size particles (collisions do not support it)

  if (size == 0) { //single pixel particle, can be out of bounds as oob checking is made for 2-pixel particles (and updating it uses more code)
    uint32_t x =  particles[particleindex].x >> PS_P_RADIUS_SHIFT_1D;
@@ -1528,6 +1499,12 @@ void WLED_O2_ATTR ParticleSystem1D::renderParticle(const uint32_t particleindex,
    return;
  }
  //render larger particles
+  if (size > 1) { // size > 1: render as gradient line
+    renderLargeParticle(size, particleindex, brightness, color, wrap); // larger size rendering
+    return;
+  }
+
+  // standard rendering (2 pixels per particle)
  bool pxlisinframe[2] = {true, true};
  int32_t pxlbrightness[2];
  int32_t pixco[2]; // physical pixel coordinates of the two pixels representing a particle
@@ -1548,99 +1525,110 @@ void WLED_O2_ATTR ParticleSystem1D::renderParticle(const uint32_t particleindex,
  // adjust brightness such that distribution is linear after gamma correction:
  // - scale brigthness with gamma correction (done in render())
  // - apply inverse gamma correction to brightness values
-  // - gamma is applied again in show() -> the resulting brightness distribution is linear but gamma corrected in total
+  // - gamma is applied again in show() -> the resulting brightness distribution is linear but gamma corrected in total -> fixes brightness fluctuations
  if (gammaCorrectCol) {
    pxlbrightness[0] = gamma8inv(pxlbrightness[0]); // use look-up-table for invers gamma
    pxlbrightness[1] = gamma8inv(pxlbrightness[1]);
  }
-  // check if particle has advanced size properties and buffer is available
-  if (advPartProps != nullptr && advPartProps[particleindex].size > 1) {
-    uint32_t renderbuffer[10]; // 10 pixel buffer
-    memset(renderbuffer, 0, sizeof(renderbuffer)); // clear buffer
-    //render particle to a bigger size
-    //particle size to pixels: 2 - 63 is 4 pixels, < 128 is 6pixels, < 192 is 8 pixels, bigger is 10 pixels
-    //first, render the pixel to the center of the renderbuffer, then apply 1D blurring
-    renderbuffer[4] = fast_color_scaleAdd(renderbuffer[4], color, pxlbrightness[0]);
-    renderbuffer[5] = fast_color_scaleAdd(renderbuffer[5], color, pxlbrightness[1]);
-    uint32_t rendersize = 2; // initialize render size, minimum is 4 pixels, it is incremented int he loop below to start with 4
-    uint32_t offset = 4; // offset to zero coordinate to write/read data in renderbuffer (actually needs to be 3, is decremented in the loop below)
-    uint32_t blurpasses = size/64 + 1; // number of blur passes depends on size, four passes max
-    uint32_t bitshift = 0;
-    for (uint32_t i = 0; i < blurpasses; i++) {
-      if (i == 2) //for the last two passes, use higher amount of blur (results in a nicer brightness gradient with soft edges)
-        bitshift = 1;
-      rendersize += 2;
-      offset--;
-      blur1D(renderbuffer, rendersize, size << bitshift, offset);
-      size = size > 64 ? size - 64 : 0;
-    }

-    // calculate origin coordinates to render the particle to in the framebuffer
-    uint32_t xfb_orig = x - (rendersize>>1) + 1 - offset; //note: using uint is fine
-    uint32_t xfb; // coordinates in frame buffer to write to note: by making this uint, only overflow has to be checked
-
-    // transfer particle renderbuffer to framebuffer
-    for (uint32_t xrb = offset; xrb < rendersize+offset; xrb++) {
-      xfb = xfb_orig + xrb;
-      if (xfb > (uint32_t)maxXpixel) {
-        if (wrap) { // wrap x to the other side if required
-          if (xfb > (uint32_t)maxXpixel << 1) // xfb is "negative"
-            xfb = (maxXpixel + 1) + (int32_t)xfb; // this always overflows to within bounds
-          else
-            xfb = xfb % (maxXpixel + 1); // note: without the above "negative" check, this works only for powers of 2
-        }
-        else
-          continue;
-      }
-      #ifdef ESP8266 // no local buffer on ESP8266
-      SEGMENT.addPixelColor(xfb, renderbuffer[xrb], true);
-      #else
-      framebuffer[xfb] = fast_color_scaleAdd(framebuffer[xfb], renderbuffer[xrb]);
-      #endif
+  // check if any pixels are out of frame
+  if (pixco[0] < 0) { // left pixels out of frame
+    if (wrap) // wrap x to the other side if required
+      pixco[0] = maxXpixel;
+    else {
+      pxlisinframe[0] = false; // pixel is out of matrix boundaries, do not render
+      if (pixco[0] < -1)
+        return; // both pixels out of frame (safety check)
    }
  }
-  else { // standard rendering (2 pixels per particle)
-    // check if any pixels are out of frame
-    if (x < 0) { // left pixels out of frame
-      if (wrap) // wrap x to the other side if required
-        pixco[0] = maxXpixel;
-      else
-        pxlisinframe[0] = false; // pixel is out of matrix boundaries, do not render
-    }
-    else if (pixco[1] > (int32_t)maxXpixel) { // right pixel, only has to be checkt if left pixel did not overflow
-      if (wrap) // wrap y to the other side if required
-        pixco[1] = 0;
-      else
-        pxlisinframe[1] = false;
-    }
-    for (uint32_t i = 0; i < 2; i++) {
-      if (pxlisinframe[i]) {
-        framebuffer[pixco[i]] = fast_color_scaleAdd(framebuffer[pixco[i]], color, pxlbrightness[i]);
-      }
+  else if (pixco[1] > (int32_t)maxXpixel) { // right pixel, only has to be checkt if left pixel did not overflow
+    if (wrap) // wrap y to the other side if required
+      pixco[1] = 0;
+    else {
+      pxlisinframe[1] = false;
+      if (pixco[0] > (int32_t)maxXpixel)
+        return; // both pixels out of frame (safety check)
    }
  }
+  for (uint32_t i = 0; i < 2; i++) {
+    if (pxlisinframe[i]) {
+      framebuffer[pixco[i]] = fast_color_scaleAdd(framebuffer[pixco[i]], color, pxlbrightness[i]);
+    }
+  }
+}

+// render particle as a line with linear brightness falloff and sub-pixel precision, size is in 0-255 (1-9 pixel radius)
+// brightness is the peak brightness at the particle center, color is scaled by the per-pixel brightness
+// and added on top of the existing framebuffer content
+// wrap = true wraps out-of-frame pixels around to the other side, wrap = false skips them
+void WLED_O2_ATTR ParticleSystem1D::renderLargeParticle(const uint32_t size, const uint32_t particleindex, const uint8_t brightness, const CRGBW& color, const bool wrap) {
+  const int32_t x_subcenter = particles[particleindex].x; // particle position in sub-pixel space
+  const int32_t x_center = x_subcenter >> PS_P_RADIUS_SHIFT_1D; // integer pixel position, this is rounded down
+
+  // particle radius in sub-pixels, size = 1 means radius of just over 1 pixel
+  const int32_t r_subpixel = size + PS_P_RADIUS_1D + 1; // size = 255 is radius of 9, so add 33 -> 33+255=288, 288>>5=9 pixels (i.e. the +1 is needed to correct for bitshift losses)
+  const int32_t r_pixels = r_subpixel >> PS_P_RADIUS_SHIFT_1D; // radius in whole pixels
+  const int32_t rx_sq = r_subpixel * r_subpixel; // squared radius, loop invariant so it is calculated only once
+
+  // rendering bounding box in pixels
+  const int32_t x_min = x_center - r_pixels - 1; // extend by one for much smoother movement
+  const int32_t x_max = x_center + r_pixels + 1;
+
+  // cache for speed
+  const int32_t matrixX = maxXpixel + 1; // number of pixels in the frame
+
+  // iterate over bounding box and render each pixel
+  for (int32_t px = x_min; px <= x_max; px++) {
+    // check bounds and apply wrapping
+    int32_t render_x = px;
+    if (render_x < 0 || render_x >= matrixX) {
+      if (!wrap) continue; // skip out of frame pixels
+      // the bounding box (about 21 pixels at maximum size) can be wider than the frame: adding or subtracting
+      // matrixX once can then still leave the index outside the framebuffer, so wrap with modulo instead
+      render_x %= matrixX;
+      if (render_x < 0) render_x += matrixX; // the remainder of a negative value is negative or zero in C++
+    }
+    // squared distance from particle center
+    int32_t dx_sq = ((px << PS_P_RADIUS_SHIFT_1D) - x_subcenter + PS_P_HALFRADIUS_1D); // explanation see 2D version
+    dx_sq = dx_sq * dx_sq;
+    const uint32_t dist_sq = (dx_sq << 8) / rx_sq; // normalized squared distance in fixed point (0-256)
+
+    // calculate brightness based on distance from particle center with linear falloff
+    const uint8_t pixel_brightness = dist_sq >= 256 ? 0 : ((256 - dist_sq) * brightness) >> 8;
+    //if (pixel_brightness == 0) continue; // skip black pixels note: very few pixels will be black, skipping this is usually faster
+
+    // render pixel
+    framebuffer[render_x] = fast_color_scaleAdd(framebuffer[render_x], color, pixel_brightness);
+  }
 }

 // detect collisions in an array of particles and handle them
 void ParticleSystem1D::handleCollisions() {
-  uint32_t collisiondistance = particleHardRadius << 1;
+  uint32_t collisiondistance = particleHardRadius << 1; // twice the radius is min distance between colliding particles
+  uint32_t checkDistSq = max(2 * PS_P_MAXSPEED, (int)collisiondistance);
+  if (perParticleSize && advPartProps != nullptr) // using individual particle size
+    checkDistSq = max(2 * PS_P_MAXSPEED, (512 * 52) >> 6); // max possible collision distance that catches all collisons
+  checkDistSq = checkDistSq * checkDistSq; // square it for distance comparison (faster than abs() )
  // note: partices are binned by position, assumption is that no more than half of the particles are in the same bin
  // if they are, collisionStartIdx is increased so each particle collides at least every second frame (which still gives decent collisions)
-  constexpr int BIN_WIDTH = 32 * PS_P_RADIUS_1D; // width of each bin, a compromise between speed and accuracy (larger bins are faster but collapse more)
-  int32_t overlap = particleHardRadius << 1; // overlap bins to include edge particles to neighbouring bins
-  if (advPartProps != nullptr) //may be using individual particle size
-    overlap += 256; // add 2 * max radius (approximately)
+  int binWidth = 64 * PS_P_RADIUS_1D; // width of each bin, a compromise between speed and accuracy
+  int32_t overlap = collisiondistance + (2 * PS_P_MAXSPEED); // overlap bins to include edge particles to neighbouring bins (+ look-ahead of speed)
+  if (perParticleSize && advPartProps != nullptr) //may be using individual particle size
+    overlap = 512; // 2 * max radius, enough to catch all collisions even at full speed
  uint32_t maxBinParticles = max((uint32_t)50, (usedParticles + 1) / 4); // do not bin small amounts, limit max to 1/4 of particles
-  uint32_t numBins = (maxX + (BIN_WIDTH - 1)) / BIN_WIDTH; // calculate number of bins
+  uint32_t numBins = (maxX + (binWidth - 1)) / binWidth; // calculate number of bins
+  if (usedParticles < maxBinParticles) {
+    numBins = 1; // use single bin for small number of particles
+    binWidth = maxX + 1;
+  }
  uint16_t binIndices[maxBinParticles]; // array to store indices of particles in a bin
  uint32_t binParticleCount; // number of particles in the current bin
-  uint16_t nextFrameStartIdx = hw_random16(usedParticles); // index of the first particle in the next frame (set to fixed value if bin overflow)
+  uint32_t nextFrameStartIdx = hw_random16(usedParticles); // index of the first particle in the next frame (set to fixed value if bin overflow)
  uint32_t pidx = collisionStartIdx; //start index in case a bin is full, process remaining particles next frame
  for (uint32_t bin = 0; bin < numBins; bin++) {
    binParticleCount = 0; // reset for this bin
-    int32_t binStart = bin * BIN_WIDTH - overlap; // note: first bin will extend to negative, but that is ok as out of bounds particles are ignored
-    int32_t binEnd = binStart + BIN_WIDTH + overlap; // note: last bin can be out of bounds, see above
+    int32_t binStart = bin * binWidth - overlap; // note: first bin will extend to negative, but that is ok as out of bounds particles are ignored
+    int32_t binEnd = binStart + binWidth + overlap; // note: last bin can be out of bounds, see above

    // fill the binIndices array for this bin
    for (uint32_t i = 0; i < usedParticles; i++) {
@@ -1663,87 +1651,104 @@ void ParticleSystem1D::handleCollisions() {
      uint32_t idx_i = binIndices[i];
      for (uint32_t j = i + 1; j < binParticleCount; j++) { // check against higher number particles
        uint32_t idx_j = binIndices[j];
-        if (advPartProps != nullptr) { // use advanced size properties
-          collisiondistance = (PS_P_MINHARDRADIUS_1D << particlesize) + ((advPartProps[idx_i].size + advPartProps[idx_j].size) >> 1);
-        }
-        int32_t dx = (particles[idx_j].x + particles[idx_j].vx) - (particles[idx_i].x + particles[idx_i].vx); // distance between particles with lookahead
-        uint32_t dx_abs = abs(dx);
-        if (dx_abs <= collisiondistance) { // collide if close
-          collideParticles(particles[idx_i], particleFlags[idx_i], particles[idx_j], particleFlags[idx_j], dx, dx_abs, collisiondistance);
+        int32_t dx = particles[idx_j].x - particles[idx_i].x; // distance between particles
+        uint32_t dx_sq = dx * dx; // square distance (faster than abs() and works the same)
+        if (dx_sq <= checkDistSq) { // possible collision imminent, check properly note: this is slower than using direct speed look-ahead (like in 2D) but more accurate and fast enough for 1D
+          collideParticles(idx_i, idx_j, dx, collisiondistance); // handle the collision
        }
      }
    }
  }
  collisionStartIdx = nextFrameStartIdx; // set the start index for the next frame
 }
-// handle a collision if close proximity is detected, i.e. dx and/or dy smaller than 2*PS_P_RADIUS
-// takes two pointers to the particles to collide and the particle hardness (softer means more energy lost in collision, 255 means full hard)
-void WLED_O2_ATTR ParticleSystem1D::collideParticles(PSparticle1D &particle1, const PSparticleFlags1D &particle1flags, PSparticle1D &particle2, const PSparticleFlags1D &particle2flags, const int32_t dx, const uint32_t dx_abs, const uint32_t collisiondistance) {
-  int32_t dv = particle2.vx - particle1.vx;
+// handle a possible collision of particles partIdx1 and partIdx2: dx is x2 - x1, collisiondistance is the minimum center distance (recalculated here when per-particle size is used); they collide if |dx| is smaller than collisiondistance + speed look-ahead, overlapping particles are pushed apart
+void WLED_O2_ATTR ParticleSystem1D::collideParticles(uint32_t partIdx1, uint32_t partIdx2, int32_t dx, uint32_t collisiondistance) {
+  int32_t massratio1 = 0; // 0 means don't use mass ratio (equal mass)
+  int32_t massratio2 = 0;
+  if (perParticleSize && advPartProps != nullptr) { // use advanced size properties, calculate collision distance and mass ratio
+    collisiondistance = (PS_P_MINHARDRADIUS_1D * 2) + ((((uint32_t)advPartProps[partIdx1].size + (uint32_t)advPartProps[partIdx2].size) * 52) >> 6); // collision distance, use ~80% (52/64) of size for tighter stacking (slight overlap)
+    // calculate mass ratio for collision response, mass is derived from the particle size
+    uint32_t mass1 = PS_P_RADIUS_1D + advPartProps[partIdx1].size;
+    uint32_t mass2 = PS_P_RADIUS_1D + advPartProps[partIdx2].size;
+    uint32_t totalmass = mass1 + mass2 - 2; // -2 to account for rounding
+    massratio1 = (mass2 << 8) / totalmass; // massratio 1 depends on mass of particle 2, i.e. if 2 is heavier -> higher velocity impact on 1
+    massratio2 = (mass1 << 8) / totalmass;
+  }
+  int32_t dv = (int)particles[partIdx2].vx - (int)particles[partIdx1].vx; // relative velocity, calculated in int to avoid overflow of the narrower vx type
+  int32_t absdv = abs(dv);
   int32_t dotProduct = (dx * dv); // is always negative if moving towards each other
+  uint32_t dx_abs = abs(dx);

   if (dotProduct < 0) { // particles are moving towards each other
-    uint32_t surfacehardness = max(collisionHardness, (int32_t)PS_P_MINSURFACEHARDNESS_1D); // if particles are soft, the impulse must stay above a limit or collisions slip through
-    // Calculate new velocities after collision  note: not using dot product like in 2D as impulse is purely speed depnedent
-    #if defined(CONFIG_IDF_TARGET_ESP32C3) || defined(ESP8266) // use bitshifts with rounding instead of division (2x faster)
-    int32_t impulse = ((dv * surfacehardness) + ((dv >> 31) & 0xFF)) >> 8; // note: (v>>31) & 0xFF)) extracts the sign and adds 255 if negative for correct rounding using shifts
-    #else // division is faster on ESP32, S2 and S3
-    int32_t impulse = (dv * surfacehardness) / 255;
-    #endif
-    particle1.vx += impulse;
-    particle2.vx -= impulse;
-
-    // if one of the particles is fixed, transfer the impulse back so it bounces
-    if (particle1flags.fixed)
-      particle2.vx = -particle1.vx;
-    else if (particle2flags.fixed)
-      particle1.vx = -particle2.vx;
-
-    if (collisionHardness < PS_P_MINSURFACEHARDNESS_1D && (SEGMENT.call & 0x07) == 0) { // if particles are soft, they become 'sticky' i.e. apply some friction
-      const uint32_t coeff = collisionHardness + (250 - PS_P_MINSURFACEHARDNESS_1D);
+    uint32_t lookaheadDistance = collisiondistance + absdv; // add look-ahead: if reaching collisiondistance in this frame, collide
+    if (dx_abs <= lookaheadDistance) {
+      // if one of the particles is fixed, invert the other particle's velocity and multiply by hardness, also set its position to the edge of the fixed particle
+      if (particleFlags[partIdx1].fixed) {
+        particles[partIdx2].vx = -(particles[partIdx2].vx * collisionHardness) / 255;
+        particles[partIdx2].x = particles[partIdx1].x + (dx < 0 ? -collisiondistance : collisiondistance); // dx < 0 means particle2.x < particle1.x, so particle 2 is placed on the lower side of particle 1
+        return;
+      }
+      else if (particleFlags[partIdx2].fixed) {
+        particles[partIdx1].vx = -(particles[partIdx1].vx * collisionHardness) / 255;
+        particles[partIdx1].x = particles[partIdx2].x + (dx < 0 ? collisiondistance : -collisiondistance); // dx < 0 means particle1.x > particle2.x, so particle 1 is placed on the upper side of particle 2
+        return;
+      }
+      int32_t surfacehardness = max(collisionHardness, (int32_t)PS_P_MINSURFACEHARDNESS_1D); // if particles are soft, the impulse must stay above a limit or collisions slip through
+      // Calculate new velocities after collision  note: not using dot product like in 2D as impulse is purely speed dependent
       #if defined(CONFIG_IDF_TARGET_ESP32C3) || defined(ESP8266) // use bitshifts with rounding instead of division (2x faster)
-      particle1.vx = ((int32_t)particle1.vx * coeff + (((int32_t)particle1.vx >> 31) & 0xFF)) >> 8; // note: (v>>31) & 0xFF)) extracts the sign and adds 255 if negative for correct rounding using shifts
-      particle2.vx = ((int32_t)particle2.vx * coeff + (((int32_t)particle2.vx >> 31) & 0xFF)) >> 8;
+      int32_t impulse = (dv * surfacehardness + ((dv >> 31) & 0xFF)) >> 8; // note: (v>>31) & 0xFF)) extracts the sign and adds 255 if negative for correct rounding using shifts
       #else // division is faster on ESP32, S2 and S3
-      particle1.vx = ((int32_t)particle1.vx * coeff) / 255;
-      particle2.vx = ((int32_t)particle2.vx * coeff) / 255;
+      int32_t impulse = (dv * surfacehardness) / 255;
       #endif
+
+      // if particles are not the same size, use a mass ratio. mass ratio is set to 0 if per-particle size is not used
+      if (massratio1) {
+        int vx1 = (int)particles[partIdx1].vx + ((impulse * massratio1) >> 7); // mass ratio is in fixed point 8bit, equal masses give a ratio of about 128 so the shift by 7 then yields roughly the full impulse
+        int vx2 = (int)particles[partIdx2].vx - ((impulse * massratio2) >> 7);
+        // limit speeds to max speed (required as a lot of impulse can be transferred from a large to a small particle)
+        particles[partIdx1].vx = limitSpeed(vx1);
+        particles[partIdx2].vx = limitSpeed(vx2);
+      }
+      else {
+        particles[partIdx1].vx += impulse;
+        particles[partIdx2].vx -= impulse;
+      }
+
+      if (collisionHardness < PS_P_MINSURFACEHARDNESS_1D && (SEGMENT.call & 0x07) == 0) { // if particles are soft, they become 'sticky' i.e. apply some friction (only on every 8th call)
+        const uint32_t coeff = collisionHardness + (250 - PS_P_MINSURFACEHARDNESS_1D);
+        #if defined(CONFIG_IDF_TARGET_ESP32C3) || defined(ESP8266) // use bitshifts with rounding instead of division (2x faster)
+        particles[partIdx1].vx = ((int32_t)particles[partIdx1].vx * coeff + (((int32_t)particles[partIdx1].vx >> 31) & 0xFF)) >> 8; // note: (v>>31) & 0xFF)) extracts the sign and adds 255 if negative for correct rounding using shifts
+        particles[partIdx2].vx = ((int32_t)particles[partIdx2].vx * coeff + (((int32_t)particles[partIdx2].vx >> 31) & 0xFF)) >> 8;
+        #else // division is faster on ESP32, S2 and S3
+        particles[partIdx1].vx = ((int32_t)particles[partIdx1].vx * coeff) / 255;
+        particles[partIdx2].vx = ((int32_t)particles[partIdx2].vx * coeff) / 255;
+        #endif
+      }
+    } else {
+      return; // not close enough yet
     }
   }
+  // particles have volume, push particles apart if they are too close
+  // note: like in 2D, pushing by a distance makes softer piles collapse, giving particles speed prevents that and looks nicer

-  if (dx_abs < (collisiondistance - 8) && abs(dv) < 5) { // overlapping and moving slowly
-    // particles have volume, push particles apart if they are too close
-    // behaviour is different than in 2D, we need pixel accurate stacking here, push the top particle
-    // note: like in 2D, pushing by a distance makes softer piles collapse, giving particles speed prevents that and looks nicer
-    int32_t pushamount = 1;
-    if (dx < 0)  // particle2.x < particle1.x
+  if (dx_abs < collisiondistance) { // too close, force push particles so they don't collapse
+    int32_t pushamount = 1 + ((collisiondistance - dx_abs) >> 3); // push by eighth of deviation (plus 1 to push at least a little), note: pushing too much leads to pass-throughs and more flickering
+    int32_t addspeed = 1;
+    if (dx < 0) {  // particle2.x < particle1.x, invert the direction of push and added speed
       pushamount = -pushamount;
-    particle1.vx -= pushamount;
-    particle2.vx += pushamount;
-
-    if (dx_abs < collisiondistance >> 1) { // too close, force push particles so they dont collapse
-      pushamount = 1 + ((collisiondistance - dx_abs) >> 3); // note: push amount found by experimentation
-
-      if (particle1.x < (maxX >> 1)) { // lower half, push particle with larger x in positive direction
-        if (dx < 0 && !particle1flags.fixed) {  // particle2.x < particle1.x  -> push particle 1
-          particle1.vx++;// += pushamount;
-          particle1.x += pushamount;
-        }
-        else if (!particle2flags.fixed) { // particle1.x < particle2.x  -> push particle 2
-          particle2.vx++;// += pushamount;
-          particle2.x += pushamount;
-        }
-      }
-      else { // upper half, push particle with smaller x
-        if (dx < 0 && !particle2flags.fixed) {  // particle2.x < particle1.x  -> push particle 2
-          particle2.vx--;// -= pushamount;
-          particle2.x -= pushamount;
-        }
-        else if (!particle1flags.fixed) { // particle1.x < particle2.x  -> push particle 1
-          particle1.vx--;// -= pushamount;
-          particle1.x -= pushamount;
-        }
-      }
+      addspeed = -addspeed;
+    }
+    if (absdv < 4) { // low relative speed, add speed to help with the pushing (less collapsing piles)
+      particles[partIdx1].vx -= addspeed;
+      particles[partIdx2].vx += addspeed;
+    }
+    // push only one particle to avoid oscillations, NOTE(review): unlike the removed code, the fixed flags are not checked in this branch - confirm fixed particles cannot be moved here
+    bool fairlyrandom = dotProduct & 0x01; // LSB of the dot product is used as a cheap, fairly random bit to pick the particle to push
+    if (fairlyrandom) {
+      particles[partIdx1].x -= pushamount;
+    }
+    else {
+      particles[partIdx2].x += pushamount;
     }
   }
 }
@@ -1855,24 +1860,6 @@ bool initParticleSystem1D(ParticleSystem1D *&PartSys, const uint32_t requestedso
  PartSys = new (SEGENV.data) ParticleSystem1D(SEGMENT.virtualLength(), numparticles, numsources, advanced); // particle system constructor
  return true;
 }
-
-// blur a 1D buffer, sub-size blurring can be done using start and size
-// for speed, 32bit variables are used, make sure to limit them to 8bit (0-255) or result is undefined
-// to blur a subset of the buffer, change the size and set start to the desired starting coordinates
-void blur1D(uint32_t *colorbuffer, uint32_t size, uint32_t blur, uint32_t start)
-{
-  CRGBW seeppart, carryover;
-  uint32_t seep = blur >> 1;
-  carryover =  BLACK;
-  for (uint32_t x = start; x < start + size; x++) {
-    seeppart = fast_color_scale(colorbuffer[x], seep); // scale it and seep to neighbours
-    if (x > 0) {
-      colorbuffer[x-1] = fast_color_scaleAdd(colorbuffer[x-1], seeppart);
-      colorbuffer[x]   = fast_color_scaleAdd(colorbuffer[x], carryover); // is black on first pass
-    }
-    carryover = seeppart;
-  }
-}
 #endif // WLED_DISABLE_PARTICLESYSTEM1D

 #if !(defined(WLED_DISABLE_PARTICLESYSTEM2D) && defined(WLED_DISABLE_PARTICLESYSTEM1D)) // not both disabled