Skip to content

Commit b2b336f

Browse files
committed
fix(swingset): workaround XS garbage collection bugs
DO NOT MERGE INTO MAIN BRANCH. Workaround for #6588. During transcript replay, handle divergent syscalls which retrieve stable Virtual Collection metadata. These can happen at different times than recorded in the transcript because of bugs in the XS engine that make GC sensitive to reloading from heap snapshots.
1 parent 2c812d2 commit b2b336f

File tree

3 files changed

+121
-10
lines changed

3 files changed

+121
-10
lines changed

packages/SwingSet/src/kernel/vat-loader/manager-helper.js

+5-3
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ import { makeTranscriptManager } from './transcript.js';
103103
* @param {KernelSlog} kernelSlog
104104
* @param {(vso: VatSyscallObject) => VatSyscallResult} vatSyscallHandler
105105
* @param {boolean} workerCanBlock
106-
* @param {(vatID: any, originalSyscall: any, newSyscall: any) => Error | undefined} [compareSyscalls]
106+
* @param {(vatID: any, originalSyscall: any, newSyscall: any) => import('./transcript.js').CompareSyscallsResult} [compareSyscalls]
107107
* @param {boolean} [useTranscript]
108108
* @returns {ManagerKit}
109109
*/
@@ -247,7 +247,9 @@ function makeManagerKit(
247247
// but if the puppy deviates one inch from previous twitches, explode
248248
kernelSlog.syscall(vatID, undefined, vso);
249249
const vres = transcriptManager.simulateSyscall(vso);
250-
return vres;
250+
if (vres) {
251+
return vres;
252+
}
251253
}
252254

253255
const vres = vatSyscallHandler(vso);
@@ -256,7 +258,7 @@ function makeManagerKit(
256258
if (successFlag === 'ok' && data && !workerCanBlock) {
257259
console.log(`warning: syscall returns data, but worker cannot get it`);
258260
}
259-
if (transcriptManager) {
261+
if (transcriptManager && !transcriptManager.inReplay()) {
260262
transcriptManager.addSyscall(vso, vres);
261263
}
262264
return vres;

packages/SwingSet/src/kernel/vat-loader/manager-subprocess-xsnap.js

+2-1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import { assert, details as X, q } from '@agoric/assert';
33
import { ExitCode } from '@agoric/xsnap/api.js';
44
import { makeManagerKit } from './manager-helper.js';
5+
import { requireIdenticalExceptStableVCSyscalls } from './transcript.js';
56

67
import {
78
insistVatSyscallObject,
@@ -55,7 +56,7 @@ export function makeXsSubprocessFactory({
5556
const {
5657
name: vatName,
5758
metered,
58-
compareSyscalls,
59+
compareSyscalls = requireIdenticalExceptStableVCSyscalls,
5960
useTranscript,
6061
sourcedConsole,
6162
} = managerOptions;

packages/SwingSet/src/kernel/vat-loader/transcript.js

+114-6
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,21 @@
11
import djson from '../../lib/djson.js';
22

3+
// Indicate that a syscall is missing from the transcript but is safe to
4+
// perform during replay
5+
const missingSyscall = Symbol('missing transcript syscall');
6+
7+
// Indicate that a syscall is recorded in the transcript but can be safely
8+
// ignored / skipped during replay.
9+
const extraSyscall = Symbol('extra transcript syscall');
10+
11+
/** @typedef {typeof missingSyscall | typeof extraSyscall | Error | undefined} CompareSyscallsResult */
12+
13+
/**
14+
* @param {any} vatID
15+
* @param {object} originalSyscall
16+
* @param {object} newSyscall
17+
* @returns {CompareSyscallsResult}
18+
*/
319
export function requireIdentical(vatID, originalSyscall, newSyscall) {
420
if (djson.stringify(originalSyscall) !== djson.stringify(newSyscall)) {
521
console.error(`anachrophobia strikes vat ${vatID}`);
@@ -10,6 +26,67 @@ export function requireIdentical(vatID, originalSyscall, newSyscall) {
1026
return undefined;
1127
}
1228

29+
const vcSyscallRE = /^vc\.\d+\.\|(?:schemata|label)$/;
30+
31+
/**
32+
* Liveslots currently has a deficiency which results in [virtual collections
33+
* being sensitive to organic GC](https://github.com/Agoric/agoric-sdk/issues/6360).
34+
*
35+
* XS also has multiple issues causing memory to not be collected identically
36+
* depending on the load from snapshot schedule. This results in organic GC
37+
* triggering at different times based on which snapshot the worker was created
38+
* from.
39+
*
40+
* Combined together, these bugs cause liveslots to emit syscalls at
41+
* different times depending on whether the execution occurred in a worker
42+
* created from a more or less recent snapshot. With a strict check during transcript replay,
43+
* this can cause [anachrophobia errors when restarting SwingSet](https://github.com/Agoric/agoric-sdk/issues/6588),
44+
* or potentially when reloading a vat that was paged out.
45+
*
46+
* Thankfully the syscalls issued by liveslots for these virtual collection
47+
* objects are both easily identifiable and stable over time. That means their
48+
* response is always the same regardless of when the syscall is made.
49+
*
50+
* This function extends the basic identical check and returns sentinel values
51+
* (unique symbols) indicating whether a divergent syscall during replay requires
52+
* skipping an entry from the transcript, or performing the actual syscall because
53+
* its entry is missing from the transcript. This works in conjunction with
54+
* `simulateSyscall` which then performs the appropriate action.
55+
*
56+
* @param {any} vatID
57+
* @param {object} originalSyscall
58+
* @param {object} newSyscall
59+
* @returns {CompareSyscallsResult}
60+
*/
61+
export function requireIdenticalExceptStableVCSyscalls(
62+
vatID,
63+
originalSyscall,
64+
newSyscall,
65+
) {
66+
// Run the strict comparison first; a falsy result means the two syscalls
// match and no mitigation is attempted.
const error = requireIdentical(vatID, originalSyscall, newSyscall);
67+
68+
if (error) {
69+
// The transcript-side check comes first: when both the recorded and the new
// syscall are vc metadata gets, the recorded entry is reported as extra.
if (
70+
originalSyscall[0] === 'vatstoreGet' &&
71+
vcSyscallRE.test(originalSyscall[1])
72+
) {
73+
// The syscall recorded in the transcript is for a virtual collection
74+
// metadata get. It can be safely skipped.
75+
console.warn(`  mitigation: ignoring extra vc syscall`);
76+
return extraSyscall;
77+
}
78+
79+
if (newSyscall[0] === 'vatstoreGet' && vcSyscallRE.test(newSyscall[1])) {
80+
// The syscall performed by the vat is for a virtual collection metadata
81+
// get. It can be safely performed during replay.
82+
console.warn(`  mitigation: falling through to syscall handler`);
83+
return missingSyscall;
84+
}
85+
}
86+
87+
// No mitigation applied: pass the strict-check result through unchanged.
return error;
88+
}
89+
1390
export function makeTranscriptManager(
1491
vatKeeper,
1592
vatID,
@@ -59,13 +136,44 @@ export function makeTranscriptManager(
59136
let replayError;
60137

61138
function simulateSyscall(newSyscall) {
62-
const s = playbackSyscalls.shift();
63-
const newReplayError = compareSyscalls(vatID, s.d, newSyscall);
64-
if (newReplayError) {
65-
replayError = newReplayError;
66-
throw replayError;
139+
while (playbackSyscalls.length) {
140+
const compareError = compareSyscalls(
141+
vatID,
142+
playbackSyscalls[0].d,
143+
newSyscall,
144+
);
145+
146+
if (compareError === missingSyscall) {
147+
// return `undefined` to indicate that this syscall cannot be simulated
148+
// and needs to be performed (virtual collection metadata get)
149+
return undefined;
150+
}
151+
152+
const s = playbackSyscalls.shift();
153+
154+
if (!compareError) {
155+
return s.response;
156+
} else if (compareError !== extraSyscall) {
157+
replayError = compareError;
158+
break;
159+
}
160+
161+
// Check the next transcript entry, skipping any extra syscalls recorded
162+
// in the transcript (virtual collection metadata get)
163+
}
164+
165+
if (!replayError) {
166+
// Error if the vat performs a syscall for which we don't have a
167+
// corresponding entry in the transcript.
168+
169+
// Note that if a vat performed an "allowed" vc metadata get syscall after
170+
// we reach the end of the transcript, we would error instead of
171+
// falling through and performing the syscall. However liveslots does not
172+
// perform vc metadata get syscalls unless it needs to provide an entry
173+
// to the program, which always results in subsequent syscalls.
174+
replayError = new Error(`historical inaccuracy in replay of ${vatID}`);
67175
}
68-
return s.response;
176+
throw replayError;
69177
}
70178

71179
function finishReplayDelivery(dnum) {

0 commit comments

Comments
 (0)