Skip to content

Commit 5ed7787

Browse files
authored
Fixed crash on large input datasets.
1 parent 692aac2 commit 5ed7787

9 files changed

+176
-177
lines changed

src/console.cpp

+2
Original file line number | Diff line number | Diff line change
@@ -102,6 +102,8 @@ bool Console::parse(int argc, char** argv) {
102102
findOption(args, PARAM_LEIDEN_BETA, leidenParams.beta);
103103
findOption(args, PARAM_LEIDEN_ITERATIONS, leidenParams.numIterations);
104104

105+
verbose = findSwitch(args, FLAG_VERBOSE);
106+
105107
if (args.size() == 2) {
106108
distancesFile = args[0];
107109
output = args[1];

src/console.h

+20-4
Original file line number | Diff line number | Diff line change
@@ -52,9 +52,11 @@ class Console {
5252
const std::string PARAM_LEIDEN_BETA{"--leiden-beta"};
5353
const std::string PARAM_LEIDEN_ITERATIONS{"--leiden-iterations"};
5454

55-
56-
Algo str2algo(const std::string& str)
57-
{
55+
const std::string FLAG_VERBOSE{ "-v" };
56+
57+
public:
58+
static Algo str2algo(const std::string& str)
59+
{
5860
if (str == "single") { return Algo::SingleLinkage; }
5961
else if (str == "complete") { return Algo::CompleteLinkage; }
6062
else if (str == "uclust") { return Algo::UClust; }
@@ -65,6 +67,18 @@ class Console {
6567
else { throw std::runtime_error("Unkown clustering algorithm"); }
6668
}
6769

70+
static std::string algo2str(Algo algo) {
71+
switch (algo) {
72+
case Algo::SingleLinkage: return "single";
73+
case Algo::CompleteLinkage: return "complete";
74+
case Algo::UClust: return "uclust";
75+
case Algo::SetCover: return "set-cover";
76+
case Algo::Leiden: return "leiden";
77+
case Algo::CdHit: return "cd-hit";
78+
default: throw std::runtime_error("Unkown clustering algorithm");
79+
}
80+
}
81+
6882

6983
public:
7084

@@ -85,7 +99,9 @@ class Console {
8599
bool outputCSV{ false };
86100

87101
LeidenParams leidenParams;
88-
102+
103+
bool verbose{ false };
104+
89105
void printUsage() const;
90106
bool parse(int argc, char** argv);
91107

src/distances.cpp

+78-117
Original file line number | Diff line number | Diff line change
@@ -108,9 +108,8 @@ size_t SparseMatrixNamed::load(
108108
namesBuffer = new char[1LL << 30]; // 1 GB buffer for names
109109
char* raw_ptr = namesBuffer;
110110

111-
std::vector<int> counts;
112-
std::vector<dist_t> tmp_dists;
113-
tmp_dists.reserve(128LL << 20); // for 128M distances
111+
// assume space for 8M objects
112+
distances.reserve(8LL << 20);
114113

115114
bool continueReading = true;
116115
char* place = buf;
@@ -207,7 +206,6 @@ size_t SparseMatrixNamed::load(
207206
// if name not mapped to numerical ids
208207
if (it->second.first == -1) {
209208
ids2names.push_back(it->first);
210-
counts.push_back(0);
211209
it->second.first = ids2names.size() - 1;
212210
}
213211
}
@@ -220,10 +218,24 @@ size_t SparseMatrixNamed::load(
220218
continue;
221219
}
222220

223-
++counts[i];
224-
++counts[j];
221+
if (distances.size() <= j) {
222+
distances.resize(j + 1);
223+
}
224+
225+
auto& Di = distances[i];
226+
auto& Dj = distances[j];
227+
228+
// extend capacity by factor 1.5 with 16 as an initial state
229+
if (Di.capacity() == Di.size()) {
230+
Di.reserve(Di.capacity() == 0 ? 16 : size_t(Di.capacity() * 1.5));
231+
}
232+
233+
if (Dj.capacity() == Dj.size()) {
234+
Dj.reserve(Dj.capacity() == 0 ? 16 : size_t(Dj.capacity() * 1.5));
235+
}
225236

226-
tmp_dists.emplace_back(i, j, d);
237+
Di.emplace_back(i, j, d);
238+
Dj.emplace_back(j, i, d);
227239
}
228240

229241
// copy remaining part after consuming all the lines
@@ -236,65 +248,20 @@ size_t SparseMatrixNamed::load(
236248
}
237249
}
238250

251+
// if necessary, sort distances in rows according to the second id
252+
n_elements = 0;
239253

240-
distances.resize(tmp_dists.size() * 2);
241-
rows.resize(counts.size());
242-
int cumulated = 0;
243-
for (size_t i = 0; i < rows.size(); ++i) {
244-
rows[i] = distances.data() + cumulated;
245-
cumulated += counts[i];
246-
}
247-
248-
struct row_info {
249-
int n_filled{ 0 };
250-
int last_id{ -1 };
251-
};
252-
253-
std::vector <row_info> rows_info(rows.size());
254-
255-
// second pass - put distances in the final structure
256-
for (const dist_t& dist : tmp_dists) {
257-
258-
uint32_t i = dist.u.s.lo;
259-
uint32_t j = dist.u.s.hi;
260-
double d = dist.d;
261-
262-
rows[i][rows_info[i].n_filled] = dist;
263-
++rows_info[i].n_filled;
264-
rows_info[i].last_id = j;
265-
266-
rows[j][rows_info[j].n_filled] = dist_t{ j,i,d };
267-
++rows_info[j].n_filled;
268-
rows_info[j].last_id = i;
269-
}
270-
271-
auto end = distances.data() + distances.size();
272-
rows.push_back(end);
273-
274-
// if necessary, sort distances in rows according to the second id
275-
dist_t* curBegin = rows[0];
276-
277-
for (size_t i = 0; i < rows.size() - 1; ++i) {
278-
std::sort(curBegin, rows[i + 1], [](const dist_t& a, const dist_t& b) { return a.u.ids < b.u.ids; });
279-
auto newEnd = std::unique(curBegin, rows[i + 1], [](const dist_t& a, const dist_t& b) { return a.u.ids == b.u.ids; });
254+
for (auto& row : distances) {
255+
std::sort(row.begin(), row.end(), [](const dist_t& a, const dist_t& b) { return (a.u.ids == b.u.ids) ? (a.d < b.d) : (a.u.ids < b.u.ids); });
256+
auto newEnd = std::unique(row.begin(), row.end(), [](const dist_t& a, const dist_t& b) { return a.u.ids == b.u.ids; });
280257

281-
if (rows[i] != curBegin) {
282-
newEnd = std::copy(curBegin, newEnd, rows[i]);
283-
}
284-
285-
curBegin = rows[i + 1];
286-
rows[i + 1] = newEnd;
287-
}
258+
row.erase(newEnd, row.end());
288259

289-
size_t newSize = rows.back() - rows.front();
290-
distances.erase(distances.begin() + newSize, distances.end());
260+
n_elements += row.size();
261+
}
291262

292263
delete[] buf;
293264

294-
// debug stuff
295-
//std::ofstream dbg("debug.log");
296-
//print(dbg);
297-
298265
return n_total_distances;
299266
}
300267

@@ -438,15 +405,11 @@ size_t SparseMatrixNumbered::load(
438405
auto is_sep = [](char c) {return c == ',' || c == '\t' || c == '\r' || c == '\t'; };
439406
auto is_newline = [](char c) {return c == '\r' || c == '\n'; };
440407

441-
std::vector<int> counts;
442-
443-
counts.reserve(8LL << 20); // assume space for 8M objects
408+
// assume space for 8M objects
409+
distances.reserve(8LL << 20);
444410
global2local.reserve(8LL << 20);
445411
local2global.reserve(8LL << 20);
446412

447-
std::vector<dist_t> tmp_dists;
448-
tmp_dists.reserve(128LL << 20); // for 128M distances
449-
450413
bool continueReading = true;
451414
char* place = buf;
452415

@@ -553,15 +516,24 @@ size_t SparseMatrixNumbered::load(
553516
continue;
554517
}
555518

556-
// resize counts vector
557-
if (j + 1 > counts.size()) {
558-
counts.resize(j + 1);
519+
if (distances.size() <= j) {
520+
distances.resize(j + 1);
521+
}
522+
523+
auto& Di = distances[i];
524+
auto& Dj = distances[j];
525+
526+
// extend capacity by factor 1.5 with 16 as an initial state
527+
if (Di.capacity() == Di.size()) {
528+
Di.reserve(Di.capacity() == 0 ? 16 : size_t(Di.capacity() * 1.5));
559529
}
560530

561-
++counts[i];
562-
++counts[j];
531+
if (Dj.capacity() == Dj.size()) {
532+
Dj.reserve(Dj.capacity() == 0 ? 16 : size_t(Dj.capacity() * 1.5));
533+
}
563534

564-
tmp_dists.emplace_back(i, j, d);
535+
Di.emplace_back(i, j, d);
536+
Dj.emplace_back(j, i, d);
565537
}
566538

567539
// copy remaining part after consuming all the lines
@@ -574,61 +546,50 @@ size_t SparseMatrixNumbered::load(
574546
}
575547
}
576548

549+
// if necessary, sort distances in rows according to the second id
550+
n_elements = 0;
577551

578-
distances.resize(tmp_dists.size() * 2);
579-
rows.resize(counts.size());
580-
int cumulated = 0;
581-
for (size_t i = 0; i < rows.size(); ++i) {
582-
rows[i] = distances.data() + cumulated;
583-
cumulated += counts[i];
584-
}
585-
586-
struct row_info {
587-
int n_filled{ 0 };
588-
int last_id{ -1 };
589-
};
590-
591-
std::vector <row_info> rows_info(rows.size());
592-
593-
// second pass - put distances in the final structure
594-
for (const dist_t& dist : tmp_dists) {
595-
596-
uint32_t i = dist.u.s.lo;
597-
uint32_t j = dist.u.s.hi;
598-
double d = dist.d;
599-
600-
rows[i][rows_info[i].n_filled] = dist;
601-
++rows_info[i].n_filled;
602-
rows_info[i].last_id = j;
552+
for (auto& row : distances) {
553+
std::sort(row.begin(), row.end(), [](const dist_t& a, const dist_t& b) { return (a.u.ids == b.u.ids) ? (a.d < b.d) : (a.u.ids < b.u.ids); });
554+
auto newEnd = std::unique(row.begin(), row.end(), [](const dist_t& a, const dist_t& b) { return a.u.ids == b.u.ids; });
603555

604-
rows[j][rows_info[j].n_filled] = dist_t{ j,i,d };
605-
++rows_info[j].n_filled;
606-
rows_info[j].last_id = i;
556+
row.erase(newEnd, row.end());
557+
n_elements += row.size();
607558
}
608559

609-
auto end = distances.data() + distances.size();
610-
rows.push_back(end);
560+
delete[] buf;
611561

612-
// if necessary, sort distances in rows according to the second id
613-
dist_t* curBegin = rows[0];
562+
// Print distance histogram in the verbose mode
563+
if (Log::getInstance(Log::LEVEL_VERBOSE).isEnabled()) {
614564

615-
for (size_t i = 0; i < rows.size() - 1; ++i) {
616-
std::sort(curBegin, rows[i + 1], [](const dist_t& a, const dist_t& b) { return (a.u.ids == b.u.ids) ? (a.d < b.d) : (a.u.ids < b.u.ids); });
617-
auto newEnd = std::unique(curBegin, rows[i + 1], [](const dist_t& a, const dist_t& b) { return a.u.ids == b.u.ids; });
565+
std::vector<double> histo_bounds{ 0 };
566+
double width = 0.001;
618567

619-
if (rows[i] != curBegin) {
620-
newEnd = std::copy(curBegin, newEnd, rows[i]);
568+
while (histo_bounds.back() < 0.05)
569+
{
570+
histo_bounds.push_back(histo_bounds.back() + width);
571+
}
572+
histo_bounds.push_back(std::numeric_limits<double>::max());
573+
std::vector<int> histo(histo_bounds.size());
574+
575+
for (auto& row : distances) {
576+
for (const auto& e : row) {
577+
for (size_t i = 0; i < histo_bounds.size(); ++i) {
578+
if (e.d < histo_bounds[i]) {
579+
++histo[i];
580+
break;
581+
}
582+
}
583+
}
621584
}
622585

623-
curBegin = rows[i + 1];
624-
rows[i + 1] = newEnd;
586+
LOG_VERBOSE << endl << "Distance histogram" << endl;
587+
for (size_t i = 0; i < histo_bounds.size(); ++i) {
588+
LOG_VERBOSE << " d < " << histo_bounds[i] << ": " << histo[i] << endl;
589+
}
590+
LOG_VERBOSE << endl;
625591
}
626592

627-
size_t newSize = rows.back() - rows.front();
628-
distances.erase(distances.begin() + newSize, distances.end());
629-
630-
delete[] buf;
631-
632593
return n_total_distances;
633594
}
634595

@@ -760,7 +721,7 @@ void SparseMatrixNamed::print(std::ostream& out) {
760721
for (auto name : names) {
761722

762723
int i = names2ids[name].first;
763-
std::vector<dist_t> row(rows[i], rows[i + 1]);
724+
std::vector<dist_t>& row = distances[i];
764725

765726
std::sort(row.begin(), row.end(), [this](const dist_t& a, const dist_t& b) {
766727
return strcmp(ids2names[a.u.s.hi], ids2names[b.u.s.hi]) < 0;

0 commit comments

Comments
 (0)