Add whoami-based sync worker routing for user-level sticky sessions

This adds a new routing mechanism for sync workers that resolves access tokens
to usernames via Synapse's whoami endpoint, enabling true user-level sticky
routing regardless of which device or token is used.

Previously, sticky routing relied on parsing the username from native Synapse
tokens (`syt_<base64 username>_...`), which only works with native Synapse auth
and provides device-level stickiness at best. This new approach works with any
auth system (native Synapse, MAS, etc.) because Synapse handles token validation
internally.

Implementation uses nginx's auth_request module with an njs script because:
- The whoami lookup requires an async HTTP subrequest (ngx.fetch)
- js_set handlers must return synchronously and don't support async operations
- auth_request allows the async lookup to complete, then captures the result
  via response headers into nginx variables

The njs script:
- Extracts access tokens from Authorization header or query parameter
- Calls Synapse's whoami endpoint to resolve token -> username
- Caches results in a shared memory zone to minimize latency
- Returns the username via an `X-User-Identifier` header

The username is then used by nginx's upstream hash directive for consistent
worker selection. This leverages nginx's built-in health checking and failover.
This commit is contained in:
Slavi Pantaleev
2026-02-04 03:14:47 +02:00
parent 81f815d19b
commit 5cc69ca7eb
6 changed files with 368 additions and 13 deletions

View File

@@ -28,6 +28,7 @@ matrix_synapse_reverse_proxy_companion_version: 1.29.4-alpine
matrix_synapse_reverse_proxy_companion_base_path: "{{ matrix_synapse_base_path }}/reverse-proxy-companion"
matrix_synapse_reverse_proxy_companion_confd_path: "{{ matrix_synapse_reverse_proxy_companion_base_path }}/conf.d"
matrix_synapse_reverse_proxy_companion_njs_path: "{{ matrix_synapse_reverse_proxy_companion_base_path }}/njs"
# List of systemd services that matrix-synapse-reverse-proxy-companion.service depends on
matrix_synapse_reverse_proxy_companion_systemd_required_services_list: "{{ matrix_synapse_reverse_proxy_companion_systemd_required_services_list_default + matrix_synapse_reverse_proxy_companion_systemd_required_services_list_auto + matrix_synapse_reverse_proxy_companion_systemd_required_services_list_custom }}"
@@ -290,3 +291,77 @@ matrix_synapse_reverse_proxy_companion_synapse_cache_proxy_cache_valid_time: "24
# As such, it trusts the protocol scheme forwarded by the upstream proxy.
matrix_synapse_reverse_proxy_companion_trust_forwarded_proto: true
matrix_synapse_reverse_proxy_companion_x_forwarded_proto_value: "{{ '$http_x_forwarded_proto' if matrix_synapse_reverse_proxy_companion_trust_forwarded_proto else '$scheme' }}"
########################################################################################
# #
# njs module #
# #
########################################################################################
# Controls whether the njs module is loaded.
matrix_synapse_reverse_proxy_companion_njs_enabled: "{{ matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_enabled }}"
########################################################################################
# #
# /njs module #
# #
########################################################################################
########################################################################################
# #
# Whoami-based sync worker routing #
# #
########################################################################################
# Controls whether the whoami-based sync worker router is enabled.
# When enabled, the reverse proxy will call Synapse's /_matrix/client/v3/account/whoami endpoint
# to resolve access tokens to usernames, allowing consistent routing of requests from the same user
# to the same sync worker regardless of which device or token they use.
#
# This works with any authentication system (native Synapse auth, MAS, etc.) because Synapse
# handles the token validation internally.
#
# Without this, sticky routing falls back to parsing the base64-encoded localpart out of the access
# token. That only works with native Synapse tokens of the form syt_<base64 localpart>_...; any other
# token (e.g. one issued by MAS) is hashed as-is, which only provides device-level stickiness
# (same token -> same worker) rather than user-level stickiness.
#
# Enabled by default when there are sync workers, because sync workers benefit from user-level
# stickiness due to their per-user in-memory caches.
matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_enabled: "{{ matrix_synapse_reverse_proxy_companion_synapse_workers_list | selectattr('type', 'equalto', 'sync_worker') | list | length > 0 }}"
# The whoami endpoint path (Matrix spec endpoint).
matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_endpoint: /_matrix/client/v3/account/whoami
# The full URL to the whoami endpoint.
matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_url: "http://{{ matrix_synapse_reverse_proxy_companion_client_api_addr }}{{ matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_endpoint }}"
# Cache duration (in seconds) for whoami lookup results.
# Token -> username mappings are cached to avoid repeated whoami calls.
# A longer TTL reduces load on Synapse but means username changes take longer to take effect.
matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_cache_ttl_seconds: 3600
# Size of the shared memory zone for caching whoami results (in megabytes).
# Each cached entry is approximately 100-200 bytes.
matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_cache_size_mb: 1
# Controls whether verbose logging is enabled for the whoami sync worker router.
# When enabled, logs cache hits/misses and routing decisions.
# Useful for debugging, but should be disabled in production.
matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_logging_enabled: false
# The length of the access token to show in logs when logging is enabled.
# Keeping this short is a good idea from a security perspective.
matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_logging_token_length: 12
# Controls whether debug response headers are added to sync requests.
# When enabled, adds X-Sync-Worker-Router-User-Identifier and X-Sync-Worker-Router-Upstream headers.
# Useful for debugging routing behavior, but should be disabled in production.
matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_debug_headers_enabled: false
########################################################################################
# #
# /Whoami-based sync worker routing #
# #
########################################################################################

View File

@@ -7,14 +7,16 @@
- name: Ensure matrix-synapse-reverse-proxy-companion paths exist
ansible.builtin.file:
path: "{{ item }}"
path: "{{ item.path }}"
state: directory
mode: 0750
owner: "{{ matrix_user_name }}"
group: "{{ matrix_group_name }}"
with_items:
- "{{ matrix_synapse_reverse_proxy_companion_base_path }}"
- "{{ matrix_synapse_reverse_proxy_companion_confd_path }}"
- {path: "{{ matrix_synapse_reverse_proxy_companion_base_path }}", when: true}
- {path: "{{ matrix_synapse_reverse_proxy_companion_confd_path }}", when: true}
- {path: "{{ matrix_synapse_reverse_proxy_companion_njs_path }}", when: "{{ matrix_synapse_reverse_proxy_companion_njs_enabled }}"}
when: item.when | bool
- name: Ensure matrix-synapse-reverse-proxy-companion is configured
ansible.builtin.template:
@@ -33,6 +35,21 @@
- src: "{{ role_path }}/templates/labels.j2"
dest: "{{ matrix_synapse_reverse_proxy_companion_base_path }}/labels"
- name: Ensure matrix-synapse-reverse-proxy-companion whoami sync worker router njs script is deployed
ansible.builtin.template:
src: "{{ role_path }}/templates/nginx/njs/whoami_sync_worker_router.js.j2"
dest: "{{ matrix_synapse_reverse_proxy_companion_njs_path }}/whoami_sync_worker_router.js"
owner: "{{ matrix_user_name }}"
group: "{{ matrix_group_name }}"
mode: 0644
when: matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_enabled | bool
- name: Ensure matrix-synapse-reverse-proxy-companion njs path is removed when njs is disabled
ansible.builtin.file:
path: "{{ matrix_synapse_reverse_proxy_companion_njs_path }}"
state: absent
when: not matrix_synapse_reverse_proxy_companion_njs_enabled
- name: Ensure matrix-synapse-reverse-proxy-companion nginx container image is pulled
community.docker.docker_image:
name: "{{ matrix_synapse_reverse_proxy_companion_container_image }}"

View File

@@ -41,20 +41,48 @@
{% endfor %}
{% endmacro %}
{% macro render_locations_to_upstream_with_whoami_sync_worker_router(locations, upstream_name) %}
{% for location in locations %}
location ~ {{ location }} {
# Use auth_request to call the whoami sync worker router.
# The handler resolves the access token to a user identifier and returns it
# in the X-User-Identifier header, which is then used for upstream hashing.
auth_request /_whoami_sync_worker_router;
auth_request_set $user_identifier $sent_http_x_user_identifier;
{% if matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_debug_headers_enabled %}
add_header X-Sync-Worker-Router-User-Identifier $user_identifier always;
add_header X-Sync-Worker-Router-Upstream $upstream_addr always;
{% endif %}
proxy_pass http://{{ upstream_name }}$request_uri;
proxy_http_version 1.1;
proxy_set_header Connection "";
}
{% endfor %}
{% endmacro %}
{% if matrix_synapse_reverse_proxy_companion_synapse_workers_enabled %}
# Access token to user identifier mapping logic.
# This is used for sticky routing to ensure requests from the same user are routed to the same worker.
{% if not matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_enabled %}
# Extracts the base64-encoded localpart from native Synapse access tokens.
# Native Synapse tokens have the format: syt_<base64 localpart>_<random>_<crc>
# See: https://github.com/element-hq/synapse/blob/1bddd25a85d82b2ef4a2a42f6ecd476108d7dd96/synapse/handlers/auth.py#L1448-L1459
# Maps from https://tcpipuk.github.io/synapse/deployment/nginx.html#mapsconf
# Client username from access token
# Note: This only works with native Synapse tokens, not with MAS or other auth systems.
map $arg_access_token $accesstoken_from_urlparam {
default $arg_access_token;
"~syt_(?<username>.*?)_.*" $username;
default $arg_access_token;
"~syt_(?<b64localpart>.*?)_.*" $b64localpart;
}
# Client username from MXID
map $http_authorization $mxid_localpart {
default $http_authorization;
"~Bearer syt_(?<username>.*?)_.*" $username;
"" $accesstoken_from_urlparam;
map $http_authorization $user_identifier {
default $http_authorization;
"~Bearer syt_(?<b64localpart>.*?)_.*" $b64localpart;
"" $accesstoken_from_urlparam;
}
{% endif %}
# Whether to upgrade HTTP connection
map $http_upgrade $connection_upgrade {
default upgrade;
@@ -76,7 +104,7 @@ map $request_uri $room_name {
{% endif %}
{% if sync_workers | length > 0 %}
{{- render_worker_upstream('sync_workers_upstream', sync_workers, 'hash $mxid_localpart consistent;') }}
{{- render_worker_upstream('sync_workers_upstream', sync_workers, 'hash $user_identifier consistent;') }}
{% endif %}
{% if client_reader_workers | length > 0 %}
@@ -134,6 +162,17 @@ server {
proxy_max_temp_file_size 0;
proxy_set_header Host $host;
{% if matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_enabled %}
# Internal location for whoami-based sync worker routing.
# This is called via auth_request from sync worker locations.
# The njs handler calls the whoami endpoint to resolve access tokens to usernames,
# then returns the username in the X-User-Identifier header for upstream hashing.
location = /_whoami_sync_worker_router {
internal;
js_content whoami_sync_worker_router.handleAuthRequest;
}
{% endif %}
{% if matrix_synapse_reverse_proxy_companion_synapse_workers_enabled %}
# Client-server overrides — These locations must go to the main Synapse process
location ~ {{ matrix_synapse_reverse_proxy_companion_client_server_main_override_locations_regex }} {
@@ -207,7 +246,11 @@ server {
# sync workers
# https://tcpipuk.github.io/synapse/deployment/workers.html
# https://tcpipuk.github.io/synapse/deployment/nginx.html#locationsconf
{% if matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_enabled %}
{{ render_locations_to_upstream_with_whoami_sync_worker_router(matrix_synapse_reverse_proxy_companion_synapse_sync_worker_client_server_locations, 'sync_workers_upstream') }}
{% else %}
{{ render_locations_to_upstream(matrix_synapse_reverse_proxy_companion_synapse_sync_worker_client_server_locations, 'sync_workers_upstream') }}
{% endif %}
{% endif %}
{% if client_reader_workers | length > 0 %}

View File

@@ -8,6 +8,12 @@
# - various temp paths are changed to `/tmp`, so that a non-root user can write to them
# - the `user` directive was removed, as we don't want nginx to switch users
# load_module directives must be first or nginx will choke with:
# > [emerg] "load_module" directive is specified too late.
{% if matrix_synapse_reverse_proxy_companion_njs_enabled %}
load_module modules/ngx_http_js_module.so;
{% endif %}
worker_processes {{ matrix_synapse_reverse_proxy_companion_worker_processes }};
error_log /var/log/nginx/error.log warn;
pid /tmp/nginx.pid;
@@ -22,7 +28,6 @@ events {
{% endfor %}
}
http {
proxy_temp_path /tmp/proxy_temp;
client_body_temp_path /tmp/client_temp;
@@ -33,6 +38,16 @@ http {
include /etc/nginx/mime.types;
default_type application/octet-stream;
{% if matrix_synapse_reverse_proxy_companion_njs_enabled %}
js_path /njs/;
{% endif %}
{% if matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_enabled %}
# njs module for whoami-based sync worker routing
js_import whoami_sync_worker_router from whoami_sync_worker_router.js;
js_shared_dict_zone zone=whoami_sync_worker_router_cache:{{ matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_cache_size_mb }}m;
{% endif %}
log_format main '$remote_addr - $remote_user [$time_local] "$request" '
'$status $body_bytes_sent "$http_referer" '
'"$http_user_agent" "$http_x_forwarded_for"';

View File

@@ -0,0 +1,202 @@
#jinja2: lstrip_blocks: True
// Whoami-based sync worker router
//
// This script resolves access tokens to usernames by calling the whoami endpoint.
// Results are cached to minimize latency impact. The username is returned via the
// X-User-Identifier header, which nginx captures and uses for upstream hashing.
//
// This works with any authentication system (native Synapse auth, MAS, etc.) because
// Synapse handles token validation internally.
//
// Why auth_request instead of js_set?
// -----------------------------------
// A simpler approach would be to use js_set to populate a variable (e.g., $user_identifier)
// and then use that variable in an upstream's `hash` directive. However, this doesn't work
// because:
//
// 1. The whoami lookup requires an HTTP subrequest (ngx.fetch), which is asynchronous.
// 2. js_set handlers must return synchronously - nginx's variable evaluation doesn't support
// async operations. Using async functions with js_set causes errors like:
// "async operation inside variable handler"
//
// The auth_request approach solves this by:
// 1. Making a subrequest to an internal location that uses js_content (which supports async)
// 2. Returning the user identifier via a response header (X-User-Identifier)
// 3. Capturing that header with auth_request_set into $user_identifier
// 4. Using $user_identifier in the upstream's hash directive for consistent routing
// Configuration constants. Each value is a Jinja2 expression that is filled in when
// this template is rendered, and is passed through `to_json` so that it lands in the
// script as a valid JavaScript literal.

// Full URL of the whoami endpoint that token lookups are sent to.
const WHOAMI_URL = {{ matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_url | to_json }};
// Lifetime of a cache entry in milliseconds (the configured TTL is in seconds, hence `* 1000`).
const CACHE_TTL_MS = {{ (matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_cache_ttl_seconds * 1000) | to_json }};
// Whether log() emits anything at all.
const LOGGING_ENABLED = {{ matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_logging_enabled | to_json }};
// Number of leading token characters that truncateToken() keeps in log messages.
const LOGGING_TOKEN_LENGTH = {{ matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_logging_token_length | to_json }};
// Write a diagnostic message to the nginx error log.
// Does nothing unless verbose logging has been switched on via LOGGING_ENABLED.
function log(message) {
  if (!LOGGING_ENABLED) {
    return;
  }

  // WARN is chosen deliberately: nginx's error_log level is fixed at 'warn',
  // so anything logged below that level would never show up.
  ngx.log(ngx.WARN, 'whoami_sync_worker_router: ' + message);
}
// Shorten an access token before it is written to the logs.
// Tokens longer than LOGGING_TOKEN_LENGTH are cut to that many leading characters
// and suffixed with '...'; shorter (or empty/missing) tokens are returned unchanged.
function truncateToken(token) {
  const fitsAsIs = !token || token.length <= LOGGING_TOKEN_LENGTH;
  return fitsAsIs ? token : token.substring(0, LOGGING_TOKEN_LENGTH) + '...';
}
// Extract the access token from a request.
//
// Lookup order:
//   1. The `Authorization: Bearer <token>` header.
//   2. The `access_token` query parameter (deprecated in Matrix v1.11, but
//      homeservers must support it).
//
// Returns the token string, or null when the request carries no usable token.
function extractToken(r) {
  const authHeader = r.headersIn['Authorization'];
  if (authHeader && authHeader.startsWith('Bearer ')) {
    return authHeader.substring(7);
  }

  // njs exposes a query argument that appears more than once as an array of values
  // (e.g. `?access_token=a&access_token=b`). Downstream code treats the token as a
  // string (header concatenation, substring(), cache key), so only the first value
  // is used and anything that is not a non-empty string is treated as "no token".
  const rawQueryToken = r.args.access_token;
  const queryToken = Array.isArray(rawQueryToken) ? rawQueryToken[0] : rawQueryToken;
  if (typeof queryToken === 'string' && queryToken !== '') {
    return queryToken;
  }

  return null;
}
// Pull the localpart out of a Matrix user ID, e.g. "@alice:example.com" -> "alice".
// Returns null when the value is empty, does not begin with '@', or has no ':' separator.
function extractLocalpart(userId) {
  const hasSigil = Boolean(userId) && userId.startsWith('@');
  const separatorAt = hasSigil ? userId.indexOf(':') : -1;

  // The localpart sits between the leading '@' and the first ':'.
  return separatorAt === -1 ? null : userId.slice(1, separatorAt);
}
// Look up the username previously cached for a token.
// Returns the username on a live cache hit and null otherwise (no cache zone,
// no entry, expired entry, or an entry that cannot be read). Expired and
// unreadable entries are removed from the cache as a side effect.
function getCachedUsername(token) {
  const zone = ngx.shared.whoami_sync_worker_router_cache;
  if (!zone) {
    return null;
  }

  const serialized = zone.get(token);
  if (!serialized) {
    return null;
  }

  try {
    const record = JSON.parse(serialized);
    if (record.expires > Date.now()) {
      log('cache hit for token ' + truncateToken(token) + ' -> ' + record.username);
      return record.username;
    }

    // Entry outlived its TTL: drop it so the next request performs a fresh lookup
    log('cache expired for token ' + truncateToken(token));
    zone.delete(token);
  } catch (e) {
    // Entry could not be processed: discard it and treat this as a miss
    zone.delete(token);
  }

  return null;
}
// Remember the username that a token resolved to.
// The entry is stored as JSON together with an absolute expiry timestamp
// (now + CACHE_TTL_MS). Does nothing when the shared cache zone is unavailable;
// a failure to store is logged and otherwise ignored.
function cacheUsername(token, username) {
  const zone = ngx.shared.whoami_sync_worker_router_cache;
  if (zone) {
    try {
      const expiresAt = Date.now() + CACHE_TTL_MS;
      zone.set(token, JSON.stringify({ username: username, expires: expiresAt }));
      log('cached token ' + truncateToken(token) + ' -> ' + username);
    } catch (e) {
      // Caching is best-effort (the zone may be full, for example): report and carry on
      ngx.log(ngx.WARN, 'whoami_sync_worker_router: cache error: ' + e.message);
    }
  }
}
// Resolve a token to a username by asking the whoami endpoint.
// Sends a GET to WHOAMI_URL with the token as a Bearer credential and returns the
// localpart of the reported user_id. Returns null when the token is rejected (401),
// when the endpoint answers with any other error status, when the response carries
// no user_id, or when the request itself fails. Never throws.
async function lookupWhoami(token) {
  log('performing whoami lookup for token ' + truncateToken(token));

  try {
    const response = await ngx.fetch(WHOAMI_URL, {
      method: 'GET',
      headers: {
        'Authorization': 'Bearer ' + token
      }
    });

    if (!response.ok) {
      if (response.status === 401) {
        // An invalid/expired token is an expected outcome for some requests
        log('whoami lookup returned 401 (invalid/expired token)');
      } else {
        ngx.log(ngx.WARN, 'whoami_sync_worker_router: whoami returned status ' + response.status);
      }
      return null;
    }

    const body = await response.json();
    if (!body.user_id) {
      return null;
    }

    const localpart = extractLocalpart(body.user_id);
    log('whoami lookup success: ' + body.user_id + ' -> ' + localpart);
    return localpart;
  } catch (e) {
    ngx.log(ngx.ERR, 'whoami_sync_worker_router: whoami failed: ' + e.message);
  }

  return null;
}
// Publish the identifier that the upstream hash should use.
// The value is written to the X-User-Identifier response header of the request `r`
// and is also passed, verbatim, to log().
function setUserIdentifier(r, identifier) {
  log('resolved user identifier: ' + identifier);

  const responseHeaders = r.headersOut;
  responseHeaders['X-User-Identifier'] = identifier;
}
// Entry point for the auth_request subrequest.
//
// Always answers 200, with the X-User-Identifier header set to the value that the
// upstream hash should use. The identifier is chosen as follows:
//   - no token in the request -> a random placeholder
//   - token found in the cache -> the cached username
//   - whoami lookup succeeds   -> the resolved username (which is then cached)
//   - whoami lookup fails      -> the token itself
async function handleAuthRequest(r) {
  // Every outcome finishes the same way; only the identifier differs.
  const respondWith = (identifier) => {
    setUserIdentifier(r, identifier);
    r.return(200);
  };

  const token = extractToken(r);
  if (!token) {
    // Requests without a token (e.g. OPTIONS preflight requests, which carry no
    // Authorization header) get a random identifier so that they spread across
    // workers. An empty identifier would make them all hash to the same value and
    // pile up on a single worker. The cache is unaffected, since it only ever
    // stores token -> username mappings.
    log('no token found in request, distributing randomly');
    respondWith('_no_token_' + Math.random());
    return;
  }

  // Cheap path: the token was resolved recently
  const cachedUsername = getCachedUsername(token);
  if (cachedUsername) {
    respondWith(cachedUsername);
    return;
  }

  // Slow path: ask the whoami endpoint
  log('cache miss for token ' + truncateToken(token));
  const username = await lookupWhoami(token);

  if (username) {
    cacheUsername(token, username);
  } else {
    // Hashing on the token itself still gives device-level sticky routing
    // (same token -> same worker).
    log('whoami lookup failed, falling back to token-based routing');
  }
  respondWith(username || token);
}
export default { handleAuthRequest };

View File

@@ -36,6 +36,9 @@ ExecStartPre={{ devture_systemd_docker_base_host_command_docker }} create \
{% endif %}
--mount type=bind,src={{ matrix_synapse_reverse_proxy_companion_base_path }}/nginx.conf,dst=/etc/nginx/nginx.conf,ro \
--mount type=bind,src={{ matrix_synapse_reverse_proxy_companion_confd_path }},dst=/etc/nginx/conf.d,ro \
{% if matrix_synapse_reverse_proxy_companion_njs_enabled %}
--mount type=bind,src={{ matrix_synapse_reverse_proxy_companion_njs_path }},dst=/njs,ro \
{% endif %}
--label-file={{ matrix_synapse_reverse_proxy_companion_base_path }}/labels \
{% for arg in matrix_synapse_reverse_proxy_companion_container_arguments %}
{{ arg }} \