Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit 36efbca

Browse files
authored
Catch-up after Federation Outage (bonus): Catch-up on Synapse Startup (#8322)
Signed-off-by: Olivier Wilkinson (reivilibre) <[email protected]> Co-authored-by: Patrick Cloke <[email protected]> * Fix _set_destination_retry_timings This came about because the code assumed that retry_interval could not be NULL — which has been challenged by catch-up.
1 parent 8a4a418 commit 36efbca

File tree

10 files changed

+218
-5
lines changed

10 files changed

+218
-5
lines changed

changelog.d/8230.bugfix

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix messages over federation being lost until an event is sent into the same room.

changelog.d/8230.misc

-1
This file was deleted.

changelog.d/8247.bugfix

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix messages over federation being lost until an event is sent into the same room.

changelog.d/8247.misc

-1
This file was deleted.

changelog.d/8258.bugfix

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix messages over federation being lost until an event is sent into the same room.

changelog.d/8258.misc

-1
This file was deleted.

changelog.d/8322.bugfix

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix messages over federation being lost until an event is sent into the same room.

synapse/federation/sender/__init__.py

+51
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,15 @@
5555
"Total number of PDUs queued for sending across all destinations",
5656
)
5757

58+
# Time (in s) after Synapse's startup that we will begin to wake up destinations
59+
# that have catch-up outstanding.
60+
CATCH_UP_STARTUP_DELAY_SEC = 15
61+
62+
# Time (in s) to wait in between waking up each destination, i.e. one destination
63+
# will be woken up every <x> seconds after Synapse's startup until we have woken
64+
# every destination has outstanding catch-up.
65+
CATCH_UP_STARTUP_INTERVAL_SEC = 5
66+
5867

5968
class FederationSender:
6069
def __init__(self, hs: "synapse.server.HomeServer"):
@@ -125,6 +134,14 @@ def __init__(self, hs: "synapse.server.HomeServer"):
125134
1000.0 / hs.config.federation_rr_transactions_per_room_per_second
126135
)
127136

137+
# wake up destinations that have outstanding PDUs to be caught up
138+
self._catchup_after_startup_timer = self.clock.call_later(
139+
CATCH_UP_STARTUP_DELAY_SEC,
140+
run_as_background_process,
141+
"wake_destinations_needing_catchup",
142+
self._wake_destinations_needing_catchup,
143+
)
144+
128145
def _get_per_destination_queue(self, destination: str) -> PerDestinationQueue:
129146
"""Get or create a PerDestinationQueue for the given destination
130147
@@ -560,3 +577,37 @@ async def get_replication_rows(
560577
# Dummy implementation for case where federation sender isn't offloaded
561578
# to a worker.
562579
return [], 0, False
580+
581+
async def _wake_destinations_needing_catchup(self):
582+
"""
583+
Wakes up destinations that need catch-up and are not currently being
584+
backed off from.
585+
586+
In order to reduce load spikes, adds a delay between each destination.
587+
"""
588+
589+
last_processed = None # type: Optional[str]
590+
591+
while True:
592+
destinations_to_wake = await self.store.get_catch_up_outstanding_destinations(
593+
last_processed
594+
)
595+
596+
if not destinations_to_wake:
597+
# finished waking all destinations!
598+
self._catchup_after_startup_timer = None
599+
break
600+
601+
destinations_to_wake = [
602+
d
603+
for d in destinations_to_wake
604+
if self._federation_shard_config.should_handle(self._instance_name, d)
605+
]
606+
607+
for last_processed in destinations_to_wake:
608+
logger.info(
609+
"Destination %s has outstanding catch-up, waking up.",
610+
last_processed,
611+
)
612+
self.wake_destination(last_processed)
613+
await self.clock.sleep(CATCH_UP_STARTUP_INTERVAL_SEC)

synapse/storage/databases/main/transactions.py

+64-2
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,7 @@ def _set_destination_retry_timings(
218218
retry_interval = EXCLUDED.retry_interval
219219
WHERE
220220
EXCLUDED.retry_interval = 0
221+
OR destinations.retry_interval IS NULL
221222
OR destinations.retry_interval < EXCLUDED.retry_interval
222223
"""
223224

@@ -249,7 +250,11 @@ def _set_destination_retry_timings(
249250
"retry_interval": retry_interval,
250251
},
251252
)
252-
elif retry_interval == 0 or prev_row["retry_interval"] < retry_interval:
253+
elif (
254+
retry_interval == 0
255+
or prev_row["retry_interval"] is None
256+
or prev_row["retry_interval"] < retry_interval
257+
):
253258
self.db_pool.simple_update_one_txn(
254259
txn,
255260
"destinations",
@@ -397,7 +402,7 @@ async def get_catch_up_room_event_ids(
397402

398403
@staticmethod
399404
def _get_catch_up_room_event_ids_txn(
400-
txn, destination: str, last_successful_stream_ordering: int,
405+
txn: LoggingTransaction, destination: str, last_successful_stream_ordering: int,
401406
) -> List[str]:
402407
q = """
403408
SELECT event_id FROM destination_rooms
@@ -412,3 +417,60 @@ def _get_catch_up_room_event_ids_txn(
412417
)
413418
event_ids = [row[0] for row in txn]
414419
return event_ids
420+
421+
async def get_catch_up_outstanding_destinations(
422+
self, after_destination: Optional[str]
423+
) -> List[str]:
424+
"""
425+
Gets at most 25 destinations which have outstanding PDUs to be caught up,
426+
and are not being backed off from
427+
Args:
428+
after_destination:
429+
If provided, all destinations must be lexicographically greater
430+
than this one.
431+
432+
Returns:
433+
list of up to 25 destinations with outstanding catch-up.
434+
These are the lexicographically first destinations which are
435+
lexicographically greater than after_destination (if provided).
436+
"""
437+
time = self.hs.get_clock().time_msec()
438+
439+
return await self.db_pool.runInteraction(
440+
"get_catch_up_outstanding_destinations",
441+
self._get_catch_up_outstanding_destinations_txn,
442+
time,
443+
after_destination,
444+
)
445+
446+
@staticmethod
447+
def _get_catch_up_outstanding_destinations_txn(
448+
txn: LoggingTransaction, now_time_ms: int, after_destination: Optional[str]
449+
) -> List[str]:
450+
q = """
451+
SELECT destination FROM destinations
452+
WHERE destination IN (
453+
SELECT destination FROM destination_rooms
454+
WHERE destination_rooms.stream_ordering >
455+
destinations.last_successful_stream_ordering
456+
)
457+
AND destination > ?
458+
AND (
459+
retry_last_ts IS NULL OR
460+
retry_last_ts + retry_interval < ?
461+
)
462+
ORDER BY destination
463+
LIMIT 25
464+
"""
465+
txn.execute(
466+
q,
467+
(
468+
# everything is lexicographically greater than "" so this gives
469+
# us the first batch of up to 25.
470+
after_destination or "",
471+
now_time_ms,
472+
),
473+
)
474+
475+
destinations = [row[0] for row in txn]
476+
return destinations

tests/federation/test_federation_catch_up.py

+99
Original file line numberDiff line numberDiff line change
@@ -321,3 +321,102 @@ def test_catch_up_loop(self):
321321
per_dest_queue._last_successful_stream_ordering,
322322
event_5.internal_metadata.stream_ordering,
323323
)
324+
325+
@override_config({"send_federation": True})
326+
def test_catch_up_on_synapse_startup(self):
327+
"""
328+
Tests the behaviour of get_catch_up_outstanding_destinations and
329+
_wake_destinations_needing_catchup.
330+
"""
331+
332+
# list of sorted server names (note that there are more servers than the batch
333+
# size used in get_catch_up_outstanding_destinations).
334+
server_names = ["server%02d" % number for number in range(42)] + ["zzzerver"]
335+
336+
# ARRANGE:
337+
# - a local user (u1)
338+
# - a room which u1 is joined to (and remote users @user:serverXX are
339+
# joined to)
340+
341+
# mark the remotes as online
342+
self.is_online = True
343+
344+
self.register_user("u1", "you the one")
345+
u1_token = self.login("u1", "you the one")
346+
room_id = self.helper.create_room_as("u1", tok=u1_token)
347+
348+
for server_name in server_names:
349+
self.get_success(
350+
event_injection.inject_member_event(
351+
self.hs, room_id, "@user:%s" % server_name, "join"
352+
)
353+
)
354+
355+
# create an event
356+
self.helper.send(room_id, "deary me!", tok=u1_token)
357+
358+
# ASSERT:
359+
# - All servers are up to date so none should have outstanding catch-up
360+
outstanding_when_successful = self.get_success(
361+
self.hs.get_datastore().get_catch_up_outstanding_destinations(None)
362+
)
363+
self.assertEqual(outstanding_when_successful, [])
364+
365+
# ACT:
366+
# - Make the remote servers unreachable
367+
self.is_online = False
368+
369+
# - Mark zzzerver as being backed-off from
370+
now = self.clock.time_msec()
371+
self.get_success(
372+
self.hs.get_datastore().set_destination_retry_timings(
373+
"zzzerver", now, now, 24 * 60 * 60 * 1000 # retry in 1 day
374+
)
375+
)
376+
377+
# - Send an event
378+
self.helper.send(room_id, "can anyone hear me?", tok=u1_token)
379+
380+
# ASSERT (get_catch_up_outstanding_destinations):
381+
# - all remotes are outstanding
382+
# - they are returned in batches of 25, in order
383+
outstanding_1 = self.get_success(
384+
self.hs.get_datastore().get_catch_up_outstanding_destinations(None)
385+
)
386+
387+
self.assertEqual(len(outstanding_1), 25)
388+
self.assertEqual(outstanding_1, server_names[0:25])
389+
390+
outstanding_2 = self.get_success(
391+
self.hs.get_datastore().get_catch_up_outstanding_destinations(
392+
outstanding_1[-1]
393+
)
394+
)
395+
self.assertNotIn("zzzerver", outstanding_2)
396+
self.assertEqual(len(outstanding_2), 17)
397+
self.assertEqual(outstanding_2, server_names[25:-1])
398+
399+
# ACT: call _wake_destinations_needing_catchup
400+
401+
# patch wake_destination to just count the destinations instead
402+
woken = []
403+
404+
def wake_destination_track(destination):
405+
woken.append(destination)
406+
407+
self.hs.get_federation_sender().wake_destination = wake_destination_track
408+
409+
# cancel the pre-existing timer for _wake_destinations_needing_catchup
410+
# this is because we are calling it manually rather than waiting for it
411+
# to be called automatically
412+
self.hs.get_federation_sender()._catchup_after_startup_timer.cancel()
413+
414+
self.get_success(
415+
self.hs.get_federation_sender()._wake_destinations_needing_catchup(), by=5.0
416+
)
417+
418+
# ASSERT (_wake_destinations_needing_catchup):
419+
# - all remotes are woken up, save for zzzerver
420+
self.assertNotIn("zzzerver", woken)
421+
# - all destinations are woken exactly once; they appear once in woken.
422+
self.assertCountEqual(woken, server_names[:-1])

0 commit comments

Comments
 (0)