
Commit 04cc249

Add experimental support for sharding event persister. Again. (#8294)
This is *not* ready for production yet. Caveats:

1. We should write some tests...
2. The stream token that we use for events can get stalled at the minimum position of all writers. This means that new events may not be processed and e.g. sent down sync streams if a writer isn't writing or is slow.
1 parent a9dbe98 commit 04cc249

18 files changed, +211 -80 lines
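
As a rough illustration of caveat 2 above: the stream token readers see can only advance to the lowest position any configured event writer has reached, so a single idle or slow writer holds the token back for everyone. A minimal sketch of that behaviour (hypothetical writer names, not Synapse's actual token generator):

def visible_stream_token(writer_positions):
    # Readers may only advance to the smallest position any writer has
    # confirmed persisting up to.
    return min(writer_positions.values())

positions = {"event_persister1": 1050, "event_persister2": 992}
print(visible_stream_token(positions))  # 992 -- stalled behind the slow writer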

changelog.d/8294.feature (+1)

@@ -0,0 +1 @@
+Add experimental support for sharding event persister.

synapse/config/_base.py (+18 -3)

@@ -832,11 +832,26 @@ class ShardedWorkerHandlingConfig:
     def should_handle(self, instance_name: str, key: str) -> bool:
         """Whether this instance is responsible for handling the given key.
         """
-
-        # If multiple instances are not defined we always return true.
+        # If multiple instances are not defined we always return true
         if not self.instances or len(self.instances) == 1:
             return True
 
+        return self.get_instance(key) == instance_name
+
+    def get_instance(self, key: str) -> str:
+        """Get the instance responsible for handling the given key.
+
+        Note: For things like federation sending the config for which instance
+        is sending is known only to the sender instance if there is only one.
+        Therefore `should_handle` should be used where possible.
+        """
+
+        if not self.instances:
+            return "master"
+
+        if len(self.instances) == 1:
+            return self.instances[0]
+
         # We shard by taking the hash, modulo it by the number of instances and
         # then checking whether this instance matches the instance at that
         # index.
@@ -846,7 +861,7 @@ def should_handle(self, instance_name: str, key: str) -> bool:
         dest_hash = sha256(key.encode("utf8")).digest()
         dest_int = int.from_bytes(dest_hash, byteorder="little")
         remainder = dest_int % (len(self.instances))
-        return self.instances[remainder] == instance_name
+        return self.instances[remainder]
 
 
 __all__ = ["Config", "RootConfig", "ShardedWorkerHandlingConfig"]
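
The routing rule introduced by get_instance() above can be read in isolation: hash the key (a room ID for the events stream), interpret the digest as an integer, and index into the list of writers modulo its length; should_handle() is then just a comparison against the current instance's name. A standalone sketch of the same logic, with hypothetical worker names:

from hashlib import sha256

def pick_instance(instances, key):
    # Mirrors get_instance(): no instances configured means "master",
    # a single instance means that instance, otherwise shard by hash.
    if not instances:
        return "master"
    if len(instances) == 1:
        return instances[0]
    dest_hash = sha256(key.encode("utf8")).digest()
    dest_int = int.from_bytes(dest_hash, byteorder="little")
    return instances[dest_int % len(instances)]

writers = ["event_persister1", "event_persister2"]
print(pick_instance(writers, "!someroom:example.com"))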

synapse/config/_base.pyi (+1)

@@ -142,3 +142,4 @@ class ShardedWorkerHandlingConfig:
     instances: List[str]
     def __init__(self, instances: List[str]) -> None: ...
     def should_handle(self, instance_name: str, key: str) -> bool: ...
+    def get_instance(self, key: str) -> str: ...

synapse/config/workers.py (+27 -10)

@@ -13,12 +13,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from typing import List, Union
+
 import attr
 
 from ._base import Config, ConfigError, ShardedWorkerHandlingConfig
 from .server import ListenerConfig, parse_listener_def
 
 
+def _instance_to_list_converter(obj: Union[str, List[str]]) -> List[str]:
+    """Helper for allowing parsing a string or list of strings to a config
+    option expecting a list of strings.
+    """
+
+    if isinstance(obj, str):
+        return [obj]
+    return obj
+
+
 @attr.s
 class InstanceLocationConfig:
     """The host and port to talk to an instance via HTTP replication.
@@ -33,11 +45,13 @@ class WriterLocations:
     """Specifies the instances that write various streams.
 
     Attributes:
-        events: The instance that writes to the event and backfill streams.
-        events: The instance that writes to the typing stream.
+        events: The instances that write to the event and backfill streams.
+        typing: The instance that writes to the typing stream.
     """
 
-    events = attr.ib(default="master", type=str)
+    events = attr.ib(
+        default=["master"], type=List[str], converter=_instance_to_list_converter
+    )
     typing = attr.ib(default="master", type=str)
 
 
@@ -105,15 +119,18 @@ def read_config(self, config, **kwargs):
         writers = config.get("stream_writers") or {}
         self.writers = WriterLocations(**writers)
 
-        # Check that the configured writer for events and typing also appears in
+        # Check that the configured writers for events and typing also appears in
         # `instance_map`.
         for stream in ("events", "typing"):
-            instance = getattr(self.writers, stream)
-            if instance != "master" and instance not in self.instance_map:
-                raise ConfigError(
-                    "Instance %r is configured to write %s but does not appear in `instance_map` config."
-                    % (instance, stream)
-                )
+            instances = _instance_to_list_converter(getattr(self.writers, stream))
+            for instance in instances:
+                if instance != "master" and instance not in self.instance_map:
+                    raise ConfigError(
+                        "Instance %r is configured to write %s but does not appear in `instance_map` config."
+                        % (instance, stream)
+                    )
+
+        self.events_shard_config = ShardedWorkerHandlingConfig(self.writers.events)
 
     def generate_config_section(self, config_dir_path, server_name, **kwargs):
         return """\

synapse/handlers/federation.py (+30 -14)

@@ -896,7 +896,8 @@ async def backfill(self, dest, room_id, limit, extremities):
                 )
             )
 
-        await self._handle_new_events(dest, ev_infos, backfilled=True)
+        if ev_infos:
+            await self._handle_new_events(dest, room_id, ev_infos, backfilled=True)
 
         # Step 2: Persist the rest of the events in the chunk one by one
         events.sort(key=lambda e: e.depth)
@@ -1189,7 +1190,7 @@ async def get_event(event_id: str):
             event_infos.append(_NewEventInfo(event, None, auth))
 
         await self._handle_new_events(
-            destination, event_infos,
+            destination, room_id, event_infos,
         )
 
     def _sanity_check_event(self, ev):
@@ -1336,15 +1337,15 @@ async def do_invite_join(
             )
 
             max_stream_id = await self._persist_auth_tree(
-                origin, auth_chain, state, event, room_version_obj
+                origin, room_id, auth_chain, state, event, room_version_obj
             )
 
             # We wait here until this instance has seen the events come down
             # replication (if we're using replication) as the below uses caches.
-            #
-            # TODO: Currently the events stream is written to from master
             await self._replication.wait_for_stream_position(
-                self.config.worker.writers.events, "events", max_stream_id
+                self.config.worker.events_shard_config.get_instance(room_id),
+                "events",
+                max_stream_id,
             )
 
         # Check whether this room is the result of an upgrade of a room we already know
@@ -1593,7 +1594,7 @@ async def on_invite_request(
         )
 
         context = await self.state_handler.compute_event_context(event)
-        await self.persist_events_and_notify([(event, context)])
+        await self.persist_events_and_notify(event.room_id, [(event, context)])
 
         return event
 
@@ -1620,7 +1621,9 @@ async def do_remotely_reject_invite(
         await self.federation_client.send_leave(host_list, event)
 
         context = await self.state_handler.compute_event_context(event)
-        stream_id = await self.persist_events_and_notify([(event, context)])
+        stream_id = await self.persist_events_and_notify(
+            event.room_id, [(event, context)]
+        )
 
         return event, stream_id
 
@@ -1868,7 +1871,7 @@ async def _handle_new_event(
             )
 
             await self.persist_events_and_notify(
-                [(event, context)], backfilled=backfilled
+                event.room_id, [(event, context)], backfilled=backfilled
             )
         except Exception:
             run_in_background(
@@ -1881,6 +1884,7 @@ async def _handle_new_event(
     async def _handle_new_events(
         self,
         origin: str,
+        room_id: str,
        event_infos: Iterable[_NewEventInfo],
        backfilled: bool = False,
    ) -> None:
@@ -1912,6 +1916,7 @@ async def prep(ev_info: _NewEventInfo):
         )
 
         await self.persist_events_and_notify(
+            room_id,
             [
                 (ev_info.event, context)
                 for ev_info, context in zip(event_infos, contexts)
@@ -1922,6 +1927,7 @@ async def prep(ev_info: _NewEventInfo):
     async def _persist_auth_tree(
         self,
         origin: str,
+        room_id: str,
         auth_events: List[EventBase],
         state: List[EventBase],
         event: EventBase,
@@ -1936,6 +1942,7 @@ async def _persist_auth_tree(
 
         Args:
             origin: Where the events came from
+            room_id,
             auth_events
             state
             event
@@ -2010,17 +2017,20 @@ async def _persist_auth_tree(
                 events_to_context[e.event_id].rejected = RejectedReason.AUTH_ERROR
 
         await self.persist_events_and_notify(
+            room_id,
             [
                 (e, events_to_context[e.event_id])
                 for e in itertools.chain(auth_events, state)
-            ]
+            ],
         )
 
         new_event_context = await self.state_handler.compute_event_context(
             event, old_state=state
         )
 
-        return await self.persist_events_and_notify([(event, new_event_context)])
+        return await self.persist_events_and_notify(
+            room_id, [(event, new_event_context)]
+        )
 
     async def _prep_event(
         self,
@@ -2871,21 +2881,27 @@ async def _check_key_revocation(self, public_key, url):
 
     async def persist_events_and_notify(
         self,
+        room_id: str,
         event_and_contexts: Sequence[Tuple[EventBase, EventContext]],
         backfilled: bool = False,
     ) -> int:
         """Persists events and tells the notifier/pushers about them, if
         necessary.
 
         Args:
-            event_and_contexts:
+            room_id: The room ID of events being persisted.
+            event_and_contexts: Sequence of events with their associated
+                context that should be persisted. All events must belong to
+                the same room.
             backfilled: Whether these events are a result of
                 backfilling or not
         """
-        if self.config.worker.writers.events != self._instance_name:
+        instance = self.config.worker.events_shard_config.get_instance(room_id)
+        if instance != self._instance_name:
             result = await self._send_events(
-                instance_name=self.config.worker.writers.events,
+                instance_name=instance,
                 store=self.store,
+                room_id=room_id,
                 event_and_contexts=event_and_contexts,
                 backfilled=backfilled,
             )
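
Taken together, the federation.py changes make the room ID the routing key for persistence: persist_events_and_notify() now asks the shard config which instance owns the room and, if that is not the local instance, forwards the batch over replication HTTP. A simplified sketch of that decision (not the real handler; the callables are stand-ins):

async def persist_events_and_notify_sketch(
    shard_config, instance_name, send_events, persist_locally,
    room_id, event_and_contexts, backfilled=False,
):
    instance = shard_config.get_instance(room_id)
    if instance != instance_name:
        # Not the writer for this room: forward over replication HTTP.
        result = await send_events(
            instance_name=instance,
            room_id=room_id,
            event_and_contexts=event_and_contexts,
            backfilled=backfilled,
        )
        return result["max_stream_id"]
    # Otherwise persist on this instance as before.
    return await persist_locally(event_and_contexts, backfilled)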

synapse/handlers/message.py (+8 -6)

@@ -376,9 +376,8 @@ def __init__(self, hs: "HomeServer"):
         self.notifier = hs.get_notifier()
         self.config = hs.config
         self.require_membership_for_aliases = hs.config.require_membership_for_aliases
-        self._is_event_writer = (
-            self.config.worker.writers.events == hs.get_instance_name()
-        )
+        self._events_shard_config = self.config.worker.events_shard_config
+        self._instance_name = hs.get_instance_name()
 
         self.room_invite_state_types = self.hs.config.room_invite_state_types
 
@@ -902,9 +901,10 @@ async def handle_new_client_event(
 
         try:
             # If we're a worker we need to hit out to the master.
-            if not self._is_event_writer:
+            writer_instance = self._events_shard_config.get_instance(event.room_id)
+            if writer_instance != self._instance_name:
                 result = await self.send_event(
-                    instance_name=self.config.worker.writers.events,
+                    instance_name=writer_instance,
                     event_id=event.event_id,
                     store=self.store,
                     requester=requester,
@@ -972,8 +972,10 @@ async def persist_and_notify_client_event(
 
         This should only be run on the instance in charge of persisting events.
         """
-        assert self._is_event_writer
         assert self.storage.persistence is not None
+        assert self._events_shard_config.should_handle(
+            self._instance_name, event.room_id
+        )
 
         if ratelimit:
             # We check if this is a room admin redacting an event so that we
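
The message.py changes apply the same pattern to client events: instead of a single "am I the event writer?" flag, the handler looks up the writer for the event's room and compares it with its own instance name, and persist_and_notify_client_event asserts responsibility per room rather than globally. A minimal sketch of the new guard (names beyond those in the diff are hypothetical):

def assert_responsible(shard_config, instance_name, room_id):
    # Per-room check replacing the old global _is_event_writer flag.
    assert shard_config.should_handle(instance_name, room_id), (
        "%s is not the event persister for %s" % (instance_name, room_id)
    )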

synapse/handlers/room.py (+9 -5)

@@ -804,7 +804,9 @@ async def create_room(
 
         # Always wait for room creation to progate before returning
         await self._replication.wait_for_stream_position(
-            self.hs.config.worker.writers.events, "events", last_stream_id
+            self.hs.config.worker.events_shard_config.get_instance(room_id),
+            "events",
+            last_stream_id,
         )
 
         return result, last_stream_id
@@ -1259,10 +1261,10 @@ async def shutdown_room(
             # We now wait for the create room to come back in via replication so
             # that we can assume that all the joins/invites have propogated before
             # we try and auto join below.
-            #
-            # TODO: Currently the events stream is written to from master
             await self._replication.wait_for_stream_position(
-                self.hs.config.worker.writers.events, "events", stream_id
+                self.hs.config.worker.events_shard_config.get_instance(new_room_id),
+                "events",
+                stream_id,
             )
         else:
             new_room_id = None
@@ -1292,7 +1294,9 @@ async def shutdown_room(
 
             # Wait for leave to come in over replication before trying to forget.
             await self._replication.wait_for_stream_position(
-                self.hs.config.worker.writers.events, "events", stream_id
+                self.hs.config.worker.events_shard_config.get_instance(room_id),
+                "events",
+                stream_id,
             )
 
             await self.room_member_handler.forget(target_requester.user, room_id)

synapse/handlers/room_member.py (-7)

@@ -82,13 +82,6 @@ def __init__(self, hs: "HomeServer"):
         self._enable_lookup = hs.config.enable_3pid_lookup
         self.allow_per_room_profiles = self.config.allow_per_room_profiles
 
-        self._event_stream_writer_instance = hs.config.worker.writers.events
-        self._is_on_event_persistence_instance = (
-            self._event_stream_writer_instance == hs.get_instance_name()
-        )
-        if self._is_on_event_persistence_instance:
-            self.persist_event_storage = hs.get_storage().persistence
-
         self._join_rate_limiter_local = Ratelimiter(
             clock=self.clock,
             rate_hz=hs.config.ratelimiting.rc_joins_local.per_second,

synapse/replication/http/federation.py (+9 -3)

@@ -65,10 +65,11 @@ def __init__(self, hs):
         self.federation_handler = hs.get_handlers().federation_handler
 
     @staticmethod
-    async def _serialize_payload(store, event_and_contexts, backfilled):
+    async def _serialize_payload(store, room_id, event_and_contexts, backfilled):
         """
         Args:
             store
+            room_id (str)
             event_and_contexts (list[tuple[FrozenEvent, EventContext]])
             backfilled (bool): Whether or not the events are the result of
                 backfilling
@@ -88,14 +89,19 @@ async def _serialize_payload(store, event_and_contexts, backfilled):
                 }
             )
 
-        payload = {"events": event_payloads, "backfilled": backfilled}
+        payload = {
+            "events": event_payloads,
+            "backfilled": backfilled,
+            "room_id": room_id,
+        }
 
         return payload
 
     async def _handle_request(self, request):
         with Measure(self.clock, "repl_fed_send_events_parse"):
             content = parse_json_object_from_request(request)
 
+        room_id = content["room_id"]
         backfilled = content["backfilled"]
 
         event_payloads = content["events"]
@@ -120,7 +126,7 @@ async def _handle_request(self, request):
         logger.info("Got %d events from federation", len(event_and_contexts))
 
         max_stream_id = await self.federation_handler.persist_events_and_notify(
-            event_and_contexts, backfilled
+            room_id, event_and_contexts, backfilled
        )
 
        return 200, {"max_stream_id": max_stream_id}
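
With the extra field above, the replication request body carries the room ID alongside the serialized events, so the receiving event persister can pass it straight through to persist_events_and_notify(). A sketch of the payload shape implied by the diff (event payloads elided, room ID hypothetical):

payload = {
    "events": [],  # serialized (event, context) pairs, elided here
    "backfilled": False,
    "room_id": "!someroom:example.com",
}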
