@@ -108,9 +108,8 @@ size_t SparseMatrixNamed::load(
108
108
namesBuffer = new char [1LL << 30 ]; // 1 GB buffer for names
109
109
char * raw_ptr = namesBuffer;
110
110
111
- std::vector<int > counts;
112
- std::vector<dist_t > tmp_dists;
113
- tmp_dists.reserve (128LL << 20 ); // for 128M distances
111
+ // assume space for 8M objects
112
+ distances.reserve (8LL << 20 );
114
113
115
114
bool continueReading = true ;
116
115
char * place = buf;
@@ -207,7 +206,6 @@ size_t SparseMatrixNamed::load(
207
206
// if name not mapped to numerical ids
208
207
if (it->second .first == -1 ) {
209
208
ids2names.push_back (it->first );
210
- counts.push_back (0 );
211
209
it->second .first = ids2names.size () - 1 ;
212
210
}
213
211
}
@@ -220,10 +218,24 @@ size_t SparseMatrixNamed::load(
220
218
continue ;
221
219
}
222
220
223
- ++counts[i];
224
- ++counts[j];
221
+ if (distances.size () <= j) {
222
+ distances.resize (j + 1 );
223
+ }
224
+
225
+ auto & Di = distances[i];
226
+ auto & Dj = distances[j];
227
+
228
+ // extend capacity by factor 1.5 with 16 as an initial state
229
+ if (Di.capacity () == Di.size ()) {
230
+ Di.reserve (Di.capacity () == 0 ? 16 : size_t (Di.capacity () * 1.5 ));
231
+ }
232
+
233
+ if (Dj.capacity () == Dj.size ()) {
234
+ Dj.reserve (Dj.capacity () == 0 ? 16 : size_t (Dj.capacity () * 1.5 ));
235
+ }
225
236
226
- tmp_dists.emplace_back (i, j, d);
237
+ Di.emplace_back (i, j, d);
238
+ Dj.emplace_back (j, i, d);
227
239
}
228
240
229
241
// copy remaining part after consuming all the lines
@@ -236,65 +248,20 @@ size_t SparseMatrixNamed::load(
236
248
}
237
249
}
238
250
251
+ // if necessary, sort distances in rows according to the second id
252
+ n_elements = 0 ;
239
253
240
- distances.resize (tmp_dists.size () * 2 );
241
- rows.resize (counts.size ());
242
- int cumulated = 0 ;
243
- for (size_t i = 0 ; i < rows.size (); ++i) {
244
- rows[i] = distances.data () + cumulated;
245
- cumulated += counts[i];
246
- }
247
-
248
- struct row_info {
249
- int n_filled{ 0 };
250
- int last_id{ -1 };
251
- };
252
-
253
- std::vector <row_info> rows_info (rows.size ());
254
-
255
- // second pass - put distances in the final structure
256
- for (const dist_t & dist : tmp_dists) {
257
-
258
- uint32_t i = dist.u .s .lo ;
259
- uint32_t j = dist.u .s .hi ;
260
- double d = dist.d ;
261
-
262
- rows[i][rows_info[i].n_filled ] = dist;
263
- ++rows_info[i].n_filled ;
264
- rows_info[i].last_id = j;
265
-
266
- rows[j][rows_info[j].n_filled ] = dist_t { j,i,d };
267
- ++rows_info[j].n_filled ;
268
- rows_info[j].last_id = i;
269
- }
270
-
271
- auto end = distances.data () + distances.size ();
272
- rows.push_back (end);
273
-
274
- // if neccessary, sort distances in rows according to the second id
275
- dist_t * curBegin = rows[0 ];
276
-
277
- for (size_t i = 0 ; i < rows.size () - 1 ; ++i) {
278
- std::sort (curBegin, rows[i + 1 ], [](const dist_t & a, const dist_t & b) { return a.u .ids < b.u .ids ; });
279
- auto newEnd = std::unique (curBegin, rows[i + 1 ], [](const dist_t & a, const dist_t & b) { return a.u .ids == b.u .ids ; });
254
+ for (auto & row : distances) {
255
+ std::sort (row.begin (), row.end (), [](const dist_t & a, const dist_t & b) { return (a.u .ids == b.u .ids ) ? (a.d < b.d ) : (a.u .ids < b.u .ids ); });
256
+ auto newEnd = std::unique (row.begin (), row.end (), [](const dist_t & a, const dist_t & b) { return a.u .ids == b.u .ids ; });
280
257
281
- if (rows[i] != curBegin) {
282
- newEnd = std::copy (curBegin, newEnd, rows[i]);
283
- }
284
-
285
- curBegin = rows[i + 1 ];
286
- rows[i + 1 ] = newEnd;
287
- }
258
+ row.erase (newEnd, row.end ());
288
259
289
- size_t newSize = rows. back () - rows. front ();
290
- distances. erase (distances. begin () + newSize, distances. end ());
260
+ n_elements += row. size ();
261
+ }
291
262
292
263
delete[] buf;
293
264
294
- // debug stuff
295
- // std::ofstream dbg("debug.log");
296
- // print(dbg);
297
-
298
265
return n_total_distances;
299
266
}
300
267
@@ -438,15 +405,11 @@ size_t SparseMatrixNumbered::load(
438
405
auto is_sep = [](char c) {return c == ' ,' || c == ' \t ' || c == ' \r ' || c == ' \t ' ; };
439
406
auto is_newline = [](char c) {return c == ' \r ' || c == ' \n ' ; };
440
407
441
- std::vector<int > counts;
442
-
443
- counts.reserve (8LL << 20 ); // assume space for 8M objects
408
+ // assume space for 8M objects
409
+ distances.reserve (8LL << 20 );
444
410
global2local.reserve (8LL << 20 );
445
411
local2global.reserve (8LL << 20 );
446
412
447
- std::vector<dist_t > tmp_dists;
448
- tmp_dists.reserve (128LL << 20 ); // for 128M distances
449
-
450
413
bool continueReading = true ;
451
414
char * place = buf;
452
415
@@ -553,15 +516,24 @@ size_t SparseMatrixNumbered::load(
553
516
continue ;
554
517
}
555
518
556
- // resize counts vector
557
- if (j + 1 > counts.size ()) {
558
- counts.resize (j + 1 );
519
+ if (distances.size () <= j) {
520
+ distances.resize (j + 1 );
521
+ }
522
+
523
+ auto & Di = distances[i];
524
+ auto & Dj = distances[j];
525
+
526
+ // extend capacity by factor 1.5 with 16 as an initial state
527
+ if (Di.capacity () == Di.size ()) {
528
+ Di.reserve (Di.capacity () == 0 ? 16 : size_t (Di.capacity () * 1.5 ));
559
529
}
560
530
561
- ++counts[i];
562
- ++counts[j];
531
+ if (Dj.capacity () == Dj.size ()) {
532
+ Dj.reserve (Dj.capacity () == 0 ? 16 : size_t (Dj.capacity () * 1.5 ));
533
+ }
563
534
564
- tmp_dists.emplace_back (i, j, d);
535
+ Di.emplace_back (i, j, d);
536
+ Dj.emplace_back (j, i, d);
565
537
}
566
538
567
539
// copy remaining part after consuming all the lines
@@ -574,61 +546,50 @@ size_t SparseMatrixNumbered::load(
574
546
}
575
547
}
576
548
549
+ // if necessary, sort distances in rows according to the second id
550
+ n_elements = 0 ;
577
551
578
- distances.resize (tmp_dists.size () * 2 );
579
- rows.resize (counts.size ());
580
- int cumulated = 0 ;
581
- for (size_t i = 0 ; i < rows.size (); ++i) {
582
- rows[i] = distances.data () + cumulated;
583
- cumulated += counts[i];
584
- }
585
-
586
- struct row_info {
587
- int n_filled{ 0 };
588
- int last_id{ -1 };
589
- };
590
-
591
- std::vector <row_info> rows_info (rows.size ());
592
-
593
- // second pass - put distances in the final structure
594
- for (const dist_t & dist : tmp_dists) {
595
-
596
- uint32_t i = dist.u .s .lo ;
597
- uint32_t j = dist.u .s .hi ;
598
- double d = dist.d ;
599
-
600
- rows[i][rows_info[i].n_filled ] = dist;
601
- ++rows_info[i].n_filled ;
602
- rows_info[i].last_id = j;
552
+ for (auto & row : distances) {
553
+ std::sort (row.begin (), row.end (), [](const dist_t & a, const dist_t & b) { return (a.u .ids == b.u .ids ) ? (a.d < b.d ) : (a.u .ids < b.u .ids ); });
554
+ auto newEnd = std::unique (row.begin (), row.end (), [](const dist_t & a, const dist_t & b) { return a.u .ids == b.u .ids ; });
603
555
604
- rows[j][rows_info[j].n_filled ] = dist_t { j,i,d };
605
- ++rows_info[j].n_filled ;
606
- rows_info[j].last_id = i;
556
+ row.erase (newEnd, row.end ());
557
+ n_elements += row.size ();
607
558
}
608
559
609
- auto end = distances.data () + distances.size ();
610
- rows.push_back (end);
560
+ delete[] buf;
611
561
612
- // if neccessary, sort distances in rows according to the second id
613
- dist_t * curBegin = rows[ 0 ];
562
+ // Print distance histogram in the verbose mode
563
+ if ( Log::getInstance (Log::LEVEL_VERBOSE). isEnabled ()) {
614
564
615
- for (size_t i = 0 ; i < rows.size () - 1 ; ++i) {
616
- std::sort (curBegin, rows[i + 1 ], [](const dist_t & a, const dist_t & b) { return (a.u .ids == b.u .ids ) ? (a.d < b.d ) : (a.u .ids < b.u .ids ); });
617
- auto newEnd = std::unique (curBegin, rows[i + 1 ], [](const dist_t & a, const dist_t & b) { return a.u .ids == b.u .ids ; });
565
+ std::vector<double > histo_bounds{ 0 };
566
+ double width = 0.001 ;
618
567
619
- if (rows[i] != curBegin) {
620
- newEnd = std::copy (curBegin, newEnd, rows[i]);
568
+ while (histo_bounds.back () < 0.05 )
569
+ {
570
+ histo_bounds.push_back (histo_bounds.back () + width);
571
+ }
572
+ histo_bounds.push_back (std::numeric_limits<double >::max ());
573
+ std::vector<int > histo (histo_bounds.size ());
574
+
575
+ for (auto & row : distances) {
576
+ for (const auto & e : row) {
577
+ for (size_t i = 0 ; i < histo_bounds.size (); ++i) {
578
+ if (e.d < histo_bounds[i]) {
579
+ ++histo[i];
580
+ break ;
581
+ }
582
+ }
583
+ }
621
584
}
622
585
623
- curBegin = rows[i + 1 ];
624
- rows[i + 1 ] = newEnd;
586
+ LOG_VERBOSE << endl << " Distance histogram" << endl;
587
+ for (size_t i = 0 ; i < histo_bounds.size (); ++i) {
588
+ LOG_VERBOSE << " d < " << histo_bounds[i] << " : " << histo[i] << endl;
589
+ }
590
+ LOG_VERBOSE << endl;
625
591
}
626
592
627
- size_t newSize = rows.back () - rows.front ();
628
- distances.erase (distances.begin () + newSize, distances.end ());
629
-
630
- delete[] buf;
631
-
632
593
return n_total_distances;
633
594
}
634
595
@@ -760,7 +721,7 @@ void SparseMatrixNamed::print(std::ostream& out) {
760
721
for (auto name : names) {
761
722
762
723
int i = names2ids[name].first ;
763
- std::vector<dist_t > row (rows[i], rows[i + 1 ]) ;
724
+ std::vector<dist_t >& row = distances[i] ;
764
725
765
726
std::sort (row.begin (), row.end (), [this ](const dist_t & a, const dist_t & b) {
766
727
return strcmp (ids2names[a.u .s .hi ], ids2names[b.u .s .hi ]) < 0 ;
0 commit comments