Skip to content

Commit

Permalink
ft: ARSN-388 implement GapSet (caching of listing gaps)
Browse files Browse the repository at this point in the history
The GapSet class is intended for caching listing "gaps", which are
contiguous streaks of deleted objects in buckets, although the
semantics can allow for other uses in the future.

The end goal is to drastically increase the performance of listings on
V0 buckets when a lot of delete markers are present, as a temporary
solution until buckets are migrated to V1 format.
  • Loading branch information
jonathan-gramain committed Jan 27, 2024
1 parent 918c2c5 commit 0066a29
Show file tree
Hide file tree
Showing 5 changed files with 1,156 additions and 0 deletions.
1 change: 1 addition & 0 deletions index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ export const algorithms = {
DelimiterTools: require('./lib/algos/list/tools'),
},
cache: {
GapSet: require('./lib/algos/cache/GapSet'),
LRUCache: require('./lib/algos/cache/LRUCache'),
},
stream: {
Expand Down
352 changes: 352 additions & 0 deletions lib/algos/cache/GapSet.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,352 @@
import assert from 'assert';
import SortedSet from 'collections/sorted-set';

import errors from '../../errors';

/**
 * A cached listing gap: a range of keys delimited by 'firstKey' and
 * 'lastKey', both bounds being included in the range.
 */
export type GapSetEntry = {
/** first key of the gap; GapSet orders and identifies its entries by this key */
firstKey: string,
/** last key of the gap; a chained gap may start with this same key */
lastKey: string,
/** unitless weight of the gap, compared by GapSet against its 'maxWeight' threshold */
weight: number,
};

/**
 * Specialized data structure to support caching of listing "gaps",
 * i.e. ranges of keys that can be skipped over during listing
 * (because they only contain delete markers as latest versions)
 *
 * Gaps are kept in a sorted set that orders and identifies entries by
 * their 'firstKey' only. Two gaps are said to be "chained" when the
 * 'lastKey' of one equals the 'firstKey' of the next: chained gaps are
 * stored separately (to bound the weight of each entry) but are
 * reported as a single contiguous gap by lookupGap().
 */
export default class GapSet implements Iterable<GapSetEntry> {
    _gaps: SortedSet;
    _maxWeight: number;

    /**
     * @constructor
     * @param {number} maxWeight - weight threshold for each cached
     *   gap (unitless). Triggers splitting gaps when reached
     */
    constructor(maxWeight: number) {
        this._gaps = new SortedSet(
            [],
            // two entries are the same entry when they start with the same key
            (left, right) => left.firstKey === right.firstKey,
            // entries are ordered by their first key
            (left, right) => (
                left.firstKey < right.firstKey ? -1 :
                    left.firstKey > right.firstKey ? 1 : 0
            )
        );
        this._maxWeight = maxWeight;
    }

    /**
     * Create a GapSet from an array of gap entries (used in tests)
     *
     * @param {GapSetEntry[]} gaps - gap entries to insert as-is in the set
     * @param {number} maxWeight - weight threshold for each cached gap
     * @return {GapSet} - a new GapSet containing the given entries
     */
    static createFromArray(gaps: GapSetEntry[], maxWeight: number): GapSet {
        const gapSet = new GapSet(maxWeight);
        gapSet._gaps.addEach(gaps);
        return gapSet;
    }

    /**
     * Helper function for setGap() that initializes a new gap or
     * retrieves an existing one depending on boundaries and weight
     *
     * @param {string} firstKey - first key of the gap
     * @param {string} lastKey - last key of the gap
     * @param {number} weight - total weight between 'firstKey' and 'lastKey'
     * @return {object} - { gap: GapSetEntry, weightToMerge: number }
     * - gap: created or existing gap entry, to be possibly extended and merged
     * - weightToMerge: how much weight there is remaining to merge with the returned gap
     */
    _lookupOrCreateGap(firstKey: string, lastKey: string, weight: number): {
        gap: GapSetEntry,
        weightToMerge: number,
    } {
        let weightToMerge;
        let gap = this._gaps.findGreatestLessThanOrEqual({ firstKey })?.value;
        if (gap && gap.lastKey < firstKey) {
            // closest gap found is not overlapping, ignore it
            gap = null;
        }
        if (gap) {
            if (gap.lastKey >= lastKey) {
                // return fully overlapping gap already cached
                return { gap, weightToMerge: 0 };
            }
            if (gap.lastKey === firstKey) {
                // new gap is chained with last gap: merge the full weight value
                weightToMerge = weight;
            } else if (gap.firstKey === firstKey) {
                // last gap is fully contained in the new gap: remove the
                // last gap's weight from the weight to merge
                weightToMerge = Math.max(weight - gap.weight, 0);
            } else {
                // last gap is not fully contained nor chained: default to returning
                // the new weight (heuristic)
                weightToMerge = weight;
            }
            if (weight > this._maxWeight) {
                // if the new gap's weight exceeds the max, split the existing gap found in
                // two chained gaps, by using the same key as a link (this is important
                // to ensure that lookupGap() can return them as a single contiguous gap)
                gap = {
                    firstKey: gap.lastKey,
                    // split gap is temporarily single-key, will be extended by setGap()
                    lastKey: gap.lastKey,
                    // split gap starts empty: setGap() adds the merged weight to it
                    weight: 0,
                };
                // there may be an existing gap starting with 'lastKey': delete it first
                this._gaps.delete(gap);
                // then add the new split gap to be extended thereafter
                this._gaps.add(gap);
                return { gap, weightToMerge };
            }
        } else {
            // create a new single-key gap that will be extended to 'lastKey' in setGap()
            // during the merge process
            gap = {
                firstKey,
                lastKey: firstKey,
                weight: 0,
            };
            this._gaps.add(gap);
            weightToMerge = weight;
        }
        return { gap, weightToMerge };
    }

    /**
     * Record a gap between two keys, associated with a weight to limit
     * individual gap sizes in the cache.
     *
     * The function handles splitting and merging existing gaps to
     * maintain an optimal weight of cache entries.
     *
     * @param {string} firstKey - first key of the gap
     * @param {string} lastKey - last key of the gap, must be greater
     *   than or equal to 'firstKey' (asserted)
     * @param {number} weight - total weight between 'firstKey' and 'lastKey'
     * @return {GapSetEntry} - a copy of the existing or new gap entry
     */
    setGap(firstKey: string, lastKey: string, weight: number): GapSetEntry {
        assert(lastKey >= firstKey);

        const lookup = this._lookupOrCreateGap(firstKey, lastKey, weight);
        // 'curGap' is the entry stored in the set: it is extended in place below
        const { gap: curGap, weightToMerge } = lookup;
        let findNextGap;
        if (curGap.firstKey === curGap.lastKey) {
            // single-key gaps are never chained: find the first gap strictly after 'curGap'
            // Note: using the 'OrEqual' method is not possible as it would yield 'curGap'
            findNextGap = this._gaps.findLeastGreaterThan.bind(this._gaps);
        } else {
            // gaps with more than one key may be chained (g1.lastKey == g2.firstKey) and we
            // need the 'OrEqual' method to find those chained gaps
            findNextGap = this._gaps.findLeastGreaterThanOrEqual.bind(this._gaps);
        }
        let mergedWeightSum = 0;
        // loop over existing mergeable gaps straddled by 'gap.lastKey' -> 'lastKey'
        while (lastKey > curGap.lastKey) {
            const nextGap = findNextGap({ firstKey: curGap.lastKey })?.value;
            // if no more gap or if the next gap starts beyond 'lastKey', stop merging
            if (!nextGap || nextGap.firstKey > lastKey) {
                // extend the existing gap
                curGap.lastKey = lastKey;
                break;
            }
            if (nextGap.firstKey === lastKey) {
                // in this particular case the next gap is chained
                // after the new gap, hence we add the full amount of
                // the weight to merge as it doesn't overlap with the
                // next gap
                mergedWeightSum += weightToMerge;
            }
            // merge 'nextGap' into 'curGap'
            curGap.lastKey = nextGap.lastKey;
            this._gaps.delete(nextGap);
            // keep track of the sum of weights for the merged gaps
            mergedWeightSum += nextGap.weight;
            // after the first iteration, always use the 'OrEqual' method to find
            // chained ranges
            findNextGap = this._gaps.findLeastGreaterThanOrEqual.bind(this._gaps);
        }
        // The new gap weight is set accurately whenever the new gap is found to be
        // either chained to other gaps without overlap, or fully overlapping with
        // existing gaps. When it is not the case, a heuristic is used that keeps the
        // maximum between the new gap weight and the sum of overlapping gap weights,
        // which has the property of retaining the highest relative weight.
        curGap.weight += Math.max(weightToMerge, mergedWeightSum);
        // return a copy so that callers cannot alter the cached entry
        return Object.assign({}, curGap);
    }

    /**
     * Remove gaps that overlap with a given set of keys. Used to
     * invalidate gaps when keys are inserted or deleted.
     *
     * @param {string[]} overlappingKeys - remove gaps that overlap
     *   with any of this set of keys
     * @return {number} - how many gaps were removed
     */
    removeOverlappingGaps(overlappingKeys: string[]): number {
        // Optimize the common case where a committed batch consists of all keys being
        // close to each other, thus most of the time:
        // - there is no gap to remove
        // - or all keys belong to the same gap to remove
        //
        // The optimization consists of looping until there is no remaining key to check,
        // and at each iteration:
        // - pick the min and max key (via a linear iteration on remaining keys)
        // - find the closest gap before the max key
        //   -> remove the gap if it overlaps any key
        // - return if there is no more key that may overlap any other range (the min key
        //   allows for a quick check for the common case)
        //   -> otherwise, loop back with the remaining keys that may overlap with other ranges

        let remainingKeys = overlappingKeys;
        let nRemoved = 0;
        while (remainingKeys.length > 0) {
            let minKey = remainingKeys[0];
            let maxKey = remainingKeys[0];
            for (const key of remainingKeys) {
                if (key < minKey) {
                    minKey = key;
                } else if (key > maxKey) {
                    maxKey = key;
                }
            }
            let closestGap = this._gaps.findGreatestLessThanOrEqual({ firstKey: maxKey })?.value;
            // consecutive gaps may overlap by a unique key, in which
            // case remove that gap and lookup the previous one
            if (closestGap?.firstKey === maxKey) {
                this._gaps.delete(closestGap);
                nRemoved += 1;
                closestGap = this._gaps.findGreatestLessThan({ firstKey: maxKey })?.value;
            }
            if (!closestGap || closestGap.lastKey < minKey) {
                // we're done because no more gap overlaps any of the remaining keys
                return nRemoved;
            }
            const { firstKey, lastKey } = closestGap;
            if (firstKey < minKey && lastKey >= maxKey) {
                // we're done because the gap covers all keys, just remove it and return
                // (in case firstKey === minKey there may be an extra chained gap to remove)
                this._gaps.delete(closestGap);
                return nRemoved + 1;
            }
            // build the new remaining keys array with only keys that may still cover an
            // existing gap left of the closest gap, and on the way, remove the closest
            // gap if it covers any of the keys in the range [gap.firstKey, maxKey]
            const newRemainingKeys: string[] = [];
            for (const key of remainingKeys) {
                if (closestGap && key >= firstKey && key <= lastKey) {
                    this._gaps.delete(closestGap);
                    // reset to make sure the gap is removed and counted only once
                    closestGap = null;
                    nRemoved += 1;
                }
                if (key <= firstKey) {
                    newRemainingKeys.push(key);
                }
            }
            // replace the remaining keys array before looping back
            remainingKeys = newRemainingKeys;
        }
        return nRemoved;
    }

    /**
     * Internal helper to coalesce multiple chained gaps into a single gap.
     *
     * It is only used to construct lookupGap() return values and
     * doesn't modify the GapSet.
     *
     * NOTE: The function may take a noticeable amount of time and CPU
     * to execute if a large number of chained gaps have to be
     * coalesced. It yields to the nodejs event loop every 100 lookups
     * to avoid blocking it during a long execution.
     *
     * @param {GapSetEntry} firstGap - first gap of the chain to coalesce with
     *   the next ones in the chain
     * @return {Promise<GapSetEntry>} - a new coalesced entry, as a Promise
     */
    _coalesceGapChain(firstGap: GapSetEntry): Promise<GapSetEntry> {
        return new Promise(resolve => {
            const coalescedGap: GapSetEntry = Object.assign({}, firstGap);
            const coalesceGapChainIteration = (): void => {
                // efficiency trade-off: 100 iterations of log(N) complexity lookups should
                // not block the event loop for too long
                for (let opCounter = 0; opCounter < 100; ++opCounter) {
                    const chainedGap = this._gaps.find({ firstKey: coalescedGap.lastKey })?.value;
                    if (!chainedGap || chainedGap.lastKey <= coalescedGap.lastKey) {
                        // chain is complete: either no gap starts with the current last
                        // key, or the matching gap is a single-key gap (possibly
                        // 'firstGap' itself) that does not extend the chain. Without the
                        // second check such a gap would be matched again forever.
                        resolve(coalescedGap);
                        return;
                    }
                    coalescedGap.lastKey = chainedGap.lastKey;
                    coalescedGap.weight += chainedGap.weight;
                }
                // yield to the event loop before continuing the process of coalescing
                // the gap chain: setImmediate() lets pending I/O callbacks run first,
                // whereas process.nextTick() callbacks run before the event loop resumes
                setImmediate(coalesceGapChainIteration);
            };
            coalesceGapChainIteration();
        });
    }

    /**
     * Lookup the next gap that overlaps with [minKey, maxKey]. Internally chained
     * gaps are coalesced in the response into a single contiguous large gap.
     *
     * @param {string} minKey - minimum key overlapping with the returned gap
     * @param {string} maxKey - maximum key overlapping with the returned gap
     * @return {Promise<GapSetEntry | null>} - result of the lookup if a gap
     *   was found, null otherwise, as a Promise
     */
    async lookupGap(minKey: string, maxKey: string): Promise<GapSetEntry | null> {
        let firstGap: GapSetEntry | null = null;
        // first candidate: the closest gap starting at or before 'minKey', if it
        // extends up to 'minKey' or beyond
        const minGap = this._gaps.findGreatestLessThanOrEqual({ firstKey: minKey })?.value;
        if (minGap && minGap.lastKey >= minKey) {
            firstGap = minGap;
        } else {
            // second candidate: the first gap starting after 'minKey', if it starts
            // no later than 'maxKey'
            const maxGap = this._gaps.findLeastGreaterThan({ firstKey: minKey })?.value;
            if (maxGap && maxGap.firstKey <= maxKey) {
                firstGap = maxGap;
            }
        }
        if (!firstGap) {
            return null;
        }
        return this._coalesceGapChain(firstGap);
    }

    /**
     * @return {number} - the weight threshold passed to the constructor
     */
    get maxWeight(): number {
        return this._maxWeight;
    }

    /**
     * @return {number} - the number of gap entries stored in the set
     *   (chained gaps are counted individually)
     */
    get size(): number {
        return this._gaps.length;
    }

    /**
     * Iterate over each gap of the set, ordered by first key
     *
     * @return {Iterator<GapSetEntry>} - an iterator over all gaps
     * Example:
     *     for (const gap of myGapSet) { ... }
     */
    [Symbol.iterator](): Iterator<GapSetEntry> {
        return this._gaps.iterate();
    }

    /**
     * Return an array containing all gaps, ordered by first key
     *
     * NOTE: there is a toArray() method in the SortedSet implementation
     * but it does not scale well and overflows the stack quickly. This is
     * why we provide an implementation based on an iterator.
     *
     * @return {GapSetEntry[]} - an array containing all gaps
     */
    toArray(): GapSetEntry[] {
        return [...this];
    }
}
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
"base-x": "3.0.8",
"base62": "2.0.1",
"bson": "4.0.0",
"collections": "^5.1.13",
"debug": "~2.6.9",
"diskusage": "^1.1.1",
"fcntl": "github:scality/node-fcntl#0.2.2",
Expand Down
Loading

0 comments on commit 0066a29

Please sign in to comment.