parallelize mesh fixup (#1148)

pca006132 · web-flow · commit f178cd1d732a · 2025-02-21T08:21:47.000+08:00
* parallelize mesh fixup

* avoid quadratic behavior

* documentation

* allocation optimization

* format

* fix gcc warning

* reduce temporary allocations in std::function

* rename

* format

* split DedupeEdges into its own function

* avoid undefined behavior
diff --git a/src/edge_op.cpp b/src/edge_op.cpp
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <unordered_set>
+
 #include "./impl.h"
 #include "./parallel.h"
 
@@ -214,77 +216,7 @@ void Manifold::Impl::CleanupTopology() {
   // In the case of a very bad triangulation, it is possible to create pinched
   // verts. They must be removed before edge collapse.
   SplitPinchedVerts();
-
-  Vec<int> entries;
-  FlagStore s;
-  while (1) {
-    ZoneScopedN("DedupeEdge");
-
-    const size_t nbEdges = halfedge_.size();
-    size_t numFlagged = 0;
-
-#if MANIFOLD_PAR == 1
-    if (nbEdges > 1e5) {
-      // Note that this is slightly different from the single thread version
-      // because we store all indices instead of just indices of forward
-      // halfedges. Backward halfedges are placed at the end by modifying the
-      // comparison function.
-      entries.resize_nofill(nbEdges);
-      sequence(entries.begin(), entries.end());
-      stable_sort(entries.begin(), entries.end(), [&](int a, int b) {
-        const auto& self = halfedge_[a];
-        const auto& other = halfedge_[b];
-        // place all backward edges at the end
-        if (!self.IsForward()) return false;
-        if (!other.IsForward()) return true;
-        // dictionary order based on start and end vertices
-        return self.startVert == other.startVert
-                   ? self.endVert < other.endVert
-                   : self.startVert < other.startVert;
-      });
-      entries.resize(nbEdges / 2);
-    } else
-#endif
-    {
-      entries.clear(true);
-      entries.reserve(nbEdges / 2);
-      for (size_t i = 0; i < nbEdges; ++i) {
-        if (halfedge_[i].IsForward()) {
-          entries.push_back(i);
-        }
-      }
-      stable_sort(entries.begin(), entries.end(), [&](int a, int b) {
-        const auto& self = halfedge_[a];
-        const auto& other = halfedge_[b];
-        // dictionary order based on start and end vertices
-        return self.startVert == other.startVert
-                   ? self.endVert < other.endVert
-                   : self.startVert < other.startVert;
-      });
-    }
-
-    s.run(
-        entries.size() - 1,
-        [&](int i) {
-          const int h0 = entries[i];
-          const int h1 = entries[i + 1];
-          return (halfedge_[h0].startVert == halfedge_[h1].startVert &&
-                  halfedge_[h0].endVert == halfedge_[h1].endVert);
-        },
-        [&](int i) {
-          DedupeEdge(entries[i]);
-          numFlagged++;
-        });
-
-    if (numFlagged == 0) break;
-
-#ifdef MANIFOLD_DEBUG
-    if (ManifoldParams().verbose) {
-      std::cout << "found " << numFlagged << " duplicate edges to split"
-                << std::endl;
-    }
-#endif
-  }
+  DedupeEdges();
 }
 
 /**
@@ -782,22 +714,194 @@ void Manifold::Impl::RecursiveEdgeSwap(const int edge, int& tag,
 
 void Manifold::Impl::SplitPinchedVerts() {
   ZoneScoped;
-  std::vector<bool> vertProcessed(NumVert(), false);
-  std::vector<bool> halfedgeProcessed(halfedge_.size(), false);
-  for (size_t i = 0; i < halfedge_.size(); ++i) {
-    if (halfedgeProcessed[i]) continue;
-    int vert = halfedge_[i].startVert;
-    if (vertProcessed[vert]) {
-      vertPos_.push_back(vertPos_[vert]);
-      vert = NumVert() - 1;
-    } else {
-      vertProcessed[vert] = true;
+
+  auto nbEdges = halfedge_.size();
+#if MANIFOLD_PAR == 1
+  if (nbEdges > 1e4) {
+    std::mutex mutex;
+    std::vector<size_t> pinched;
+    // This parallelized version is non-trivial so we can't reuse the code
+    //
+    // The idea here is to identify cycles of halfedges that can be iterated
+    // through using ForVert. Pinched verts are vertices where there are
+    // multiple cycles associated with the vertex. Each cycle is identified with
+    // the largest halfedge index within the cycle, and when there are multiple
+    // cycles associated with the same starting vertex but with different ids,
+    // it means we have a pinched vertex. This check is done by using a single
+    // atomic cas operation, the expected case is either invalid id (the vertex
+    // was not processed) or with the same id.
+    //
+    // The local store is to store the processed halfedges, so to avoid
+    // repetitive processing. Note that it only approximates the processed
+    // halfedges because it is thread local. This is why we need a vector to
+    // deduplicate the probematic halfedges we found.
+    std::vector<std::atomic<size_t>> largestEdge(NumVert());
+    for_each(ExecutionPolicy::Par, countAt(0), countAt(NumVert()),
+             [&largestEdge](size_t i) {
+               largestEdge[i].store(std::numeric_limits<size_t>::max());
+             });
+    tbb::combinable<std::vector<bool>> store(
+        [nbEdges]() { return std::vector<bool>(nbEdges, false); });
+    tbb::parallel_for(
+        tbb::blocked_range<size_t>(0, nbEdges),
+        [&store, &mutex, &pinched, &largestEdge, this](const auto& r) {
+          auto& local = store.local();
+          std::vector<size_t> pinchedLocal;
+          for (auto i = r.begin(); i < r.end(); ++i) {
+            if (local[i]) continue;
+            local[i] = true;
+            const int vert = halfedge_[i].startVert;
+            size_t largest = i;
+            ForVert(i, [&local, &largest](int current) {
+              local[current] = true;
+              largest = std::max(largest, static_cast<size_t>(current));
+            });
+            size_t expected = std::numeric_limits<size_t>::max();
+            if (!largestEdge[vert].compare_exchange_strong(expected, largest) &&
+                expected != largest) {
+              // we know that there is another loop...
+              pinchedLocal.push_back(largest);
+            }
+          }
+          if (!pinchedLocal.empty()) {
+            std::lock_guard<std::mutex> lock(mutex);
+            pinched.insert(pinched.end(), pinchedLocal.begin(),
+                           pinchedLocal.end());
+          }
+        });
+
+    std::vector<bool> halfedgeProcessed(nbEdges, false);
+    for (size_t i : pinched) {
+      if (halfedgeProcessed[i]) continue;
+      vertPos_.push_back(vertPos_[halfedge_[i].startVert]);
+      const int vert = NumVert() - 1;
+      ForVert(i, [this, vert, &halfedgeProcessed](int current) {
+        halfedgeProcessed[current] = true;
+        halfedge_[current].startVert = vert;
+        halfedge_[halfedge_[current].pairedHalfedge].endVert = vert;
+      });
     }
-    ForVert(i, [this, &halfedgeProcessed, vert](int current) {
-      halfedgeProcessed[current] = true;
-      halfedge_[current].startVert = vert;
-      halfedge_[halfedge_[current].pairedHalfedge].endVert = vert;
-    });
+  } else
+#endif
+  {
+    std::vector<bool> vertProcessed(NumVert(), false);
+    std::vector<bool> halfedgeProcessed(nbEdges, false);
+    for (size_t i = 0; i < nbEdges; ++i) {
+      if (halfedgeProcessed[i]) continue;
+      int vert = halfedge_[i].startVert;
+      if (vertProcessed[vert]) {
+        vertPos_.push_back(vertPos_[vert]);
+        vert = NumVert() - 1;
+      } else {
+        vertProcessed[vert] = true;
+      }
+      ForVert(i, [this, &halfedgeProcessed, vert](int current) {
+        halfedgeProcessed[current] = true;
+        halfedge_[current].startVert = vert;
+        halfedge_[halfedge_[current].pairedHalfedge].endVert = vert;
+      });
+    }
+  }
+}
+
+void Manifold::Impl::DedupeEdges() {
+  while (1) {
+    ZoneScopedN("DedupeEdge");
+
+    const size_t nbEdges = halfedge_.size();
+    std::vector<size_t> duplicates;
+    auto localLoop = [&](size_t start, size_t end, std::vector<bool>& local,
+                         std::vector<size_t>& results) {
+      // Iterate over all halfedges that start with the same vertex, and check
+      // for halfedges with the same ending vertex.
+      // Note: we use Vec and linear search when the number of neighbor is
+      // small because unordered_set requires allocations and is expensive.
+      // We switch to unordered_set when the number of neighbor is
+      // larger to avoid making things quadratic.
+      //
+      // The local store is to store the processed halfedges, so to avoid
+      // repetitive processing. Note that it only approximates the processed
+      // halfedges because it is thread local. This is why we need a vector to
+      // deduplicate the probematic halfedges we found.
+      Vec<int> endVerts;
+      std::unordered_set<int> endVertSet;
+      for (auto i = start; i < end; ++i) {
+        if (local[i] || halfedge_[i].startVert == -1 ||
+            halfedge_[i].endVert == -1)
+          continue;
+        local[i] = true;
+        // we want to keep the allocation
+        endVerts.clear(false);
+        endVertSet.clear();
+        ForVert(
+            i, [&local, &endVerts, &endVertSet, &results, this](int current) {
+              local[current] = true;
+              if (halfedge_[current].startVert == -1 ||
+                  halfedge_[current].endVert == -1) {
+                return;
+              }
+              if (endVertSet.empty()) {
+                if (std::find(endVerts.begin(), endVerts.end(),
+                              halfedge_[current].endVert) != endVerts.end()) {
+                  results.push_back(current);
+                } else {
+                  endVerts.push_back(halfedge_[current].endVert);
+                  // switch to hashset for vertices with many neighbors
+                  if (endVerts.size() > 32) {
+                    endVertSet.insert(endVerts.begin(), endVerts.end());
+                    endVerts.clear(false);
+                  }
+                }
+              } else {
+                if (!endVertSet.insert(halfedge_[current].endVert).second) {
+                  results.push_back(current);
+                }
+              }
+            });
+      }
+    };
+#if MANIFOLD_PAR == 1
+    if (nbEdges > 1e4) {
+      std::mutex mutex;
+      tbb::combinable<std::vector<bool>> store(
+          [nbEdges]() { return std::vector<bool>(nbEdges, false); });
+      tbb::parallel_for(
+          tbb::blocked_range<size_t>(0, nbEdges),
+          [&store, &mutex, &duplicates, this, &localLoop](const auto& r) {
+            auto& local = store.local();
+            std::vector<size_t> duplicatesLocal;
+            localLoop(r.begin(), r.end(), local, duplicatesLocal);
+            if (!duplicatesLocal.empty()) {
+              std::lock_guard<std::mutex> lock(mutex);
+              duplicates.insert(duplicates.end(), duplicatesLocal.begin(),
+                                duplicatesLocal.end());
+            }
+          });
+    } else
+#endif
+    {
+      std::vector<bool> local(nbEdges, false);
+      localLoop(0, nbEdges, local, duplicates);
+    }
+
+    size_t numFlagged = 0;
+    // there may be duplicates
+    std::vector<bool> handled(nbEdges, false);
+    for (size_t i : duplicates) {
+      if (handled[i]) continue;
+      DedupeEdge(i);
+      numFlagged++;
+      handled[i] = true;
+    }
+
+    if (numFlagged == 0) break;
+
+#ifdef MANIFOLD_DEBUG
+    if (ManifoldParams().verbose) {
+      std::cout << "found " << numFlagged << " duplicate edges to split"
+                << std::endl;
+    }
+#endif
   }
 }
 }  // namespace manifold
diff --git a/src/impl.cpp b/src/impl.cpp
@@ -223,14 +223,17 @@ void Manifold::Impl::CreateFaces() {
   stable_sort(triPriority.begin(), triPriority.end(),
               [](auto a, auto b) { return a.area2 > b.area2; });
 
+  Vec<int> interiorHalfedges;
   for (const auto tp : triPriority) {
     if (meshRelation_.triRef[tp.tri].faceID >= 0) continue;
 
     meshRelation_.triRef[tp.tri].faceID = tp.tri;
     const vec3 base = vertPos_[halfedge_[3 * tp.tri].startVert];
     const vec3 normal = faceNormal_[tp.tri];
-    std::vector<int> interiorHalfedges = {3 * tp.tri, 3 * tp.tri + 1,
-                                          3 * tp.tri + 2};
+    interiorHalfedges.resize(3);
+    interiorHalfedges[0] = 3 * tp.tri;
+    interiorHalfedges[1] = 3 * tp.tri + 1;
+    interiorHalfedges[2] = 3 * tp.tri + 2;
     while (!interiorHalfedges.empty()) {
       const int h =
           NextHalfedge(halfedge_[interiorHalfedges.back()].pairedHalfedge);
diff --git a/src/impl.h b/src/impl.h
@@ -188,6 +188,8 @@ struct Manifold::Impl {
 
     Vec<ivec3> triVerts;
     triVerts.reserve(numTri);
+    if (triRef.size() > 0) meshRelation_.triRef.reserve(numTri);
+    if (numProp > 0) meshRelation_.triProperties.reserve(numTri);
     for (size_t i = 0; i < numTri; ++i) {
       ivec3 tri;
       for (const size_t j : {0, 1, 2}) {
@@ -244,7 +246,8 @@ struct Manifold::Impl {
     meshRelation_.originalID = -1;
   }
 
-  inline void ForVert(int halfedge, std::function<void(int halfedge)> func) {
+  template <typename F>
+  inline void ForVert(int halfedge, F func) {
     int current = halfedge;
     do {
       current = NextHalfedge(halfedge_[current].pairedHalfedge);
@@ -341,6 +344,7 @@ struct Manifold::Impl {
   void FormLoop(int current, int end);
   void CollapseTri(const ivec3& triEdge);
   void SplitPinchedVerts();
+  void DedupeEdges();
 
   // subdivision.cpp
   int GetNeighbor(int tri) const;
diff --git a/src/vec.h b/src/vec.h
@@ -176,7 +176,7 @@ class Vec : public VecView<T> {
   }
 
   void resize(size_t newSize, T val = T()) {
-    bool shrink = this->size_ > 2 * newSize;
+    bool shrink = this->size_ > 2 * newSize && this->size_ > 16;
     reserve(newSize);
     if (this->size_ < newSize) {
       fill(autoPolicy(newSize - this->size_), this->ptr_ + this->size_,
@@ -187,13 +187,13 @@ class Vec : public VecView<T> {
   }
 
   void resize_nofill(size_t newSize) {
-    bool shrink = this->size_ > 2 * newSize;
+    bool shrink = this->size_ > 2 * newSize && this->size_ > 16;
     reserve(newSize);
     this->size_ = newSize;
     if (shrink) shrink_to_fit();
   }
 
-  void pop_back() { resize(this->size_ - 1); }
+  void pop_back() { resize_nofill(this->size_ - 1); }
 
   void clear(bool shrink = true) {
     this->size_ = 0;