Add whoami-based sync worker routing for user-level sticky sessions

This adds a new routing mechanism for sync workers that resolves access tokens to usernames via Synapse's whoami endpoint, enabling true user-level sticky routing regardless of which device or token is used.

Previously, sticky routing relied on parsing the username from native Synapse tokens (`syt_<base64 username>_...`), which only works with native Synapse auth and provides device-level stickiness at best. This new approach works with any auth system (native Synapse, MAS, etc.) because Synapse handles token validation internally.

Implementation uses nginx's auth_request module with an njs script because:
- The whoami lookup requires an async HTTP subrequest (ngx.fetch)
- js_set handlers must return synchronously and don't support async operations
- auth_request allows the async lookup to complete, then captures the result via response headers into nginx variables

The njs script:
- Extracts access tokens from Authorization header or query parameter
- Calls Synapse's whoami endpoint to resolve token -> username
- Caches results in a shared memory zone to minimize latency
- Returns the username via an `X-User-Identifier` header

The username is then used by nginx's upstream hash directive for consistent worker selection. This leverages nginx's built-in health checking and failover.
2026-02-07 22:43:10 +03:00 · 2026-02-04 03:14:47 +02:00
parent 81f815d19b
commit 5cc69ca7eb
6 changed files with 368 additions and 13 deletions
--- a/roles/custom/matrix-synapse-reverse-proxy-companion/defaults/main.yml
+++ b/roles/custom/matrix-synapse-reverse-proxy-companion/defaults/main.yml
@@ -28,6 +28,7 @@ matrix_synapse_reverse_proxy_companion_version: 1.29.4-alpine

 matrix_synapse_reverse_proxy_companion_base_path: "{{ matrix_synapse_base_path }}/reverse-proxy-companion"
 matrix_synapse_reverse_proxy_companion_confd_path: "{{ matrix_synapse_reverse_proxy_companion_base_path }}/conf.d"
+matrix_synapse_reverse_proxy_companion_njs_path: "{{ matrix_synapse_reverse_proxy_companion_base_path }}/njs"

 # List of systemd services that matrix-synapse-reverse-proxy-companion.service depends on
 matrix_synapse_reverse_proxy_companion_systemd_required_services_list: "{{ matrix_synapse_reverse_proxy_companion_systemd_required_services_list_default + matrix_synapse_reverse_proxy_companion_systemd_required_services_list_auto + matrix_synapse_reverse_proxy_companion_systemd_required_services_list_custom }}"
@@ -290,3 +291,77 @@ matrix_synapse_reverse_proxy_companion_synapse_cache_proxy_cache_valid_time: "24
 # As such, it trusts the protocol scheme forwarded by the upstream proxy.
 matrix_synapse_reverse_proxy_companion_trust_forwarded_proto: true
 matrix_synapse_reverse_proxy_companion_x_forwarded_proto_value: "{{ '$http_x_forwarded_proto' if matrix_synapse_reverse_proxy_companion_trust_forwarded_proto else '$scheme' }}"
+
+
+########################################################################################
+#                                                                                      #
+# njs module                                                                           #
+#                                                                                      #
+########################################################################################
+
+# Controls whether the njs module is loaded.
+matrix_synapse_reverse_proxy_companion_njs_enabled: "{{ matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_enabled }}"
+
+########################################################################################
+#                                                                                      #
+# /njs module                                                                          #
+#                                                                                      #
+########################################################################################
+
+
+########################################################################################
+#                                                                                      #
+# Whoami-based sync worker routing                                                     #
+#                                                                                      #
+########################################################################################
+
+# Controls whether the whoami-based sync worker router is enabled.
+# When enabled, the reverse proxy will call Synapse's /_matrix/client/v3/account/whoami endpoint
+# to resolve access tokens to usernames, allowing consistent routing of requests from the same user
+# to the same sync worker regardless of which device or token they use.
+#
+# This works with any authentication system (native Synapse auth, MAS, etc.) because Synapse
+# handles the token validation internally.
+#
+# Without this, sticky routing falls back to parsing the username from the access token. That only
+# works with native Synapse tokens of the form syt_<base64 username>_...; for any other token
+# (e.g. one issued by MAS) the whole token is hashed instead, which only provides device-level
+# stickiness (same token -> same worker) rather than user-level stickiness.
+#
+# Enabled by default when there are sync workers, because sync workers benefit from user-level
+# stickiness due to their per-user in-memory caches.
+matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_enabled: "{{ matrix_synapse_reverse_proxy_companion_synapse_workers_list | selectattr('type', 'equalto', 'sync_worker') | list | length > 0 }}"
+
+# The whoami endpoint path (Matrix spec endpoint).
+matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_endpoint: /_matrix/client/v3/account/whoami
+
+# The full URL to the whoami endpoint.
+matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_url: "http://{{ matrix_synapse_reverse_proxy_companion_client_api_addr }}{{ matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_endpoint }}"
+
+# Cache duration (in seconds) for whoami lookup results.
+# Token -> username mappings are cached to avoid repeated whoami calls.
+# A longer TTL reduces load on Synapse but means username changes take longer to take effect.
+matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_cache_ttl_seconds: 3600
+
+# Size of the shared memory zone for caching whoami results (in megabytes).
+# Each cached entry is approximately 100-200 bytes.
+matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_cache_size_mb: 1
+
+# Controls whether verbose logging is enabled for the whoami sync worker router.
+# When enabled, logs cache hits/misses and routing decisions.
+# Useful for debugging, but should be disabled in production.
+matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_logging_enabled: false
+
+# The length of the access token to show in logs when logging is enabled.
+# Keeping this short is a good idea from a security perspective.
+matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_logging_token_length: 12
+
+# Controls whether debug response headers are added to sync requests.
+# When enabled, adds X-Sync-Worker-Router-User-Identifier and X-Sync-Worker-Router-Upstream headers.
+# Useful for debugging routing behavior, but should be disabled in production.
+matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_debug_headers_enabled: false
+
+########################################################################################
+#                                                                                      #
+# /Whoami-based sync worker routing                                                    #
+#                                                                                      #
+########################################################################################
--- a/roles/custom/matrix-synapse-reverse-proxy-companion/tasks/setup_install.yml
+++ b/roles/custom/matrix-synapse-reverse-proxy-companion/tasks/setup_install.yml
@@ -7,14 +7,16 @@

 - name: Ensure matrix-synapse-reverse-proxy-companion paths exist
  ansible.builtin.file:
-    path: "{{ item }}"
+    path: "{{ item.path }}"
    state: directory
    mode: 0750
    owner: "{{ matrix_user_name }}"
    group: "{{ matrix_group_name }}"
  with_items:
-    - "{{ matrix_synapse_reverse_proxy_companion_base_path }}"
-    - "{{ matrix_synapse_reverse_proxy_companion_confd_path }}"
+    - {path: "{{ matrix_synapse_reverse_proxy_companion_base_path }}", when: true}
+    - {path: "{{ matrix_synapse_reverse_proxy_companion_confd_path }}", when: true}
+    - {path: "{{ matrix_synapse_reverse_proxy_companion_njs_path }}", when: "{{ matrix_synapse_reverse_proxy_companion_njs_enabled }}"}
+  when: item.when | bool

 - name: Ensure matrix-synapse-reverse-proxy-companion is configured
  ansible.builtin.template:
@@ -33,6 +35,21 @@
    - src: "{{ role_path }}/templates/labels.j2"
      dest: "{{ matrix_synapse_reverse_proxy_companion_base_path }}/labels"

+- name: Ensure matrix-synapse-reverse-proxy-companion whoami sync worker router njs script is deployed
+  ansible.builtin.template:
+    src: "{{ role_path }}/templates/nginx/njs/whoami_sync_worker_router.js.j2"
+    dest: "{{ matrix_synapse_reverse_proxy_companion_njs_path }}/whoami_sync_worker_router.js"
+    owner: "{{ matrix_user_name }}"
+    group: "{{ matrix_group_name }}"
+    mode: 0644
+  when: matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_enabled | bool
+
+# Cleans up the njs script directory left behind by a previous run once the njs module is turned off.
+# The flag is cast with `| bool` (like the other conditions in this file) so that a string value
+# such as "false" is not mistaken for a truthy value.
+- name: Ensure matrix-synapse-reverse-proxy-companion njs path is removed when njs is disabled
+  ansible.builtin.file:
+    path: "{{ matrix_synapse_reverse_proxy_companion_njs_path }}"
+    state: absent
+  when: not (matrix_synapse_reverse_proxy_companion_njs_enabled | bool)
+
 - name: Ensure matrix-synapse-reverse-proxy-companion nginx container image is pulled
  community.docker.docker_image:
    name: "{{ matrix_synapse_reverse_proxy_companion_container_image }}"
--- a/roles/custom/matrix-synapse-reverse-proxy-companion/templates/nginx/conf.d/matrix-synapse-reverse-proxy-companion.conf.j2
+++ b/roles/custom/matrix-synapse-reverse-proxy-companion/templates/nginx/conf.d/matrix-synapse-reverse-proxy-companion.conf.j2
@@ -41,20 +41,48 @@
 	{% endfor %}
 {% endmacro %}

+{% macro render_locations_to_upstream_with_whoami_sync_worker_router(locations, upstream_name) %}
+	{% for location in locations %}
+	location ~ {{ location }} {
+		# Use auth_request to call the whoami sync worker router.
+		# The handler resolves the access token to a user identifier and returns it
+		# in the X-User-Identifier header, which is then used for upstream hashing.
+		auth_request /_whoami_sync_worker_router;
+		auth_request_set $user_identifier $sent_http_x_user_identifier;
+
+		{% if matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_debug_headers_enabled %}
+		add_header X-Sync-Worker-Router-User-Identifier $user_identifier always;
+		add_header X-Sync-Worker-Router-Upstream $upstream_addr always;
+		{% endif %}
+
+		proxy_pass http://{{ upstream_name }}$request_uri;
+		proxy_http_version 1.1;
+		proxy_set_header Connection "";
+	}
+	{% endfor %}
+{% endmacro %}
+
 {% if matrix_synapse_reverse_proxy_companion_synapse_workers_enabled %}

+# Access token to user identifier mapping logic.
+# This is used for sticky routing to ensure requests from the same user are routed to the same worker.
+{% if not matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_enabled %}
+# Extracts the base64-encoded localpart from native Synapse access tokens.
+# Native Synapse tokens have the format: syt_<base64 localpart>_<random>_<crc>
+# See: https://github.com/element-hq/synapse/blob/1bddd25a85d82b2ef4a2a42f6ecd476108d7dd96/synapse/handlers/auth.py#L1448-L1459
 # Maps from https://tcpipuk.github.io/synapse/deployment/nginx.html#mapsconf
-# Client username from access token
+# Note: This only works with native Synapse tokens, not with MAS or other auth systems.
 map $arg_access_token $accesstoken_from_urlparam {
-  default                    $arg_access_token;
-  "~syt_(?<username>.*?)_.*" $username;
+  default                          $arg_access_token;
+  "~syt_(?<b64localpart>.*?)_.*"  $b64localpart;
 }
-# Client username from MXID
-map $http_authorization $mxid_localpart {
-  default                           $http_authorization;
-  "~Bearer syt_(?<username>.*?)_.*" $username;
-  ""                                $accesstoken_from_urlparam;
+map $http_authorization $user_identifier {
+  default                                 $http_authorization;
+  "~Bearer syt_(?<b64localpart>.*?)_.*"   $b64localpart;
+  ""                                      $accesstoken_from_urlparam;
 }
+{% endif %}
+
 # Whether to upgrade HTTP connection
 map $http_upgrade $connection_upgrade {
  default upgrade;
@@ -76,7 +104,7 @@ map $request_uri $room_name {
 	{% endif %}

 	{% if sync_workers | length > 0 %}
-	{{- render_worker_upstream('sync_workers_upstream', sync_workers, 'hash $mxid_localpart consistent;') }}
+	{{- render_worker_upstream('sync_workers_upstream', sync_workers, 'hash $user_identifier consistent;') }}
 	{% endif %}

 	{% if client_reader_workers | length > 0 %}
@@ -134,6 +162,17 @@ server {
 	proxy_max_temp_file_size 0;
 	proxy_set_header Host $host;

+	{% if matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_enabled %}
+	# Internal location for whoami-based sync worker routing.
+	# This is called via auth_request from sync worker locations.
+	# The njs handler calls the whoami endpoint to resolve access tokens to usernames,
+	# then returns the username in the X-User-Identifier header for upstream hashing.
+	location = /_whoami_sync_worker_router {
+		internal;
+		js_content whoami_sync_worker_router.handleAuthRequest;
+	}
+	{% endif %}
+
 	{% if matrix_synapse_reverse_proxy_companion_synapse_workers_enabled %}
 		# Client-server overrides — These locations must go to the main Synapse process
 		location ~ {{ matrix_synapse_reverse_proxy_companion_client_server_main_override_locations_regex }} {
@@ -207,7 +246,11 @@ server {
 			# sync workers
 			# https://tcpipuk.github.io/synapse/deployment/workers.html
 			# https://tcpipuk.github.io/synapse/deployment/nginx.html#locationsconf
+			{% if matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_enabled %}
+			{{ render_locations_to_upstream_with_whoami_sync_worker_router(matrix_synapse_reverse_proxy_companion_synapse_sync_worker_client_server_locations, 'sync_workers_upstream') }}
+			{% else %}
 			{{ render_locations_to_upstream(matrix_synapse_reverse_proxy_companion_synapse_sync_worker_client_server_locations, 'sync_workers_upstream') }}
+			{% endif %}
 		{% endif %}

 		{% if client_reader_workers | length > 0 %}
--- a/roles/custom/matrix-synapse-reverse-proxy-companion/templates/nginx/nginx.conf.j2
+++ b/roles/custom/matrix-synapse-reverse-proxy-companion/templates/nginx/nginx.conf.j2
@@ -8,6 +8,12 @@
 # - various temp paths are changed to `/tmp`, so that a non-root user can write to them
 # - the `user` directive was removed, as we don't want nginx to switch users

+# load_module directives must be first or nginx will choke with:
+# > [emerg] "load_module" directive is specified too late.
+{% if matrix_synapse_reverse_proxy_companion_njs_enabled %}
+load_module modules/ngx_http_js_module.so;
+{% endif %}
+
 worker_processes {{ matrix_synapse_reverse_proxy_companion_worker_processes }};
 error_log /var/log/nginx/error.log warn;
 pid /tmp/nginx.pid;
@@ -22,7 +28,6 @@ events {
 {% endfor %}
 }

-
 http {
 	proxy_temp_path /tmp/proxy_temp;
 	client_body_temp_path /tmp/client_temp;
@@ -33,6 +38,16 @@ http {
 	include /etc/nginx/mime.types;
 	default_type application/octet-stream;

+	{% if matrix_synapse_reverse_proxy_companion_njs_enabled %}
+	js_path /njs/;
+	{% endif %}
+
+	{% if matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_enabled %}
+	# njs module for whoami-based sync worker routing
+	js_import whoami_sync_worker_router from whoami_sync_worker_router.js;
+	# `timeout` lets nginx drop entries once the cache TTL has passed, even for tokens that are never
+	# looked up again (the script itself only removes an expired entry when that same token is seen).
+	# `evict` discards the oldest entry when the zone is full, instead of failing every new insert.
+	js_shared_dict_zone zone=whoami_sync_worker_router_cache:{{ matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_cache_size_mb }}m timeout={{ matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_cache_ttl_seconds }}s evict;
+	{% endif %}
+
 	log_format main '$remote_addr - $remote_user [$time_local] "$request" '
 		'$status $body_bytes_sent "$http_referer" '
 		'"$http_user_agent" "$http_x_forwarded_for"';
--- a/roles/custom/matrix-synapse-reverse-proxy-companion/templates/nginx/njs/whoami_sync_worker_router.js.j2
+++ b/roles/custom/matrix-synapse-reverse-proxy-companion/templates/nginx/njs/whoami_sync_worker_router.js.j2
@@ -0,0 +1,202 @@
+#jinja2: lstrip_blocks: True
+// Whoami-based sync worker router
+//
+// This script resolves access tokens to usernames by calling the whoami endpoint.
+// Results are cached to minimize latency impact. The username is returned via the
+// X-User-Identifier header, which nginx captures and uses for upstream hashing.
+//
+// This works with any authentication system (native Synapse auth, MAS, etc.) because
+// Synapse handles token validation internally.
+//
+// Why auth_request instead of js_set?
+// -----------------------------------
+// A simpler approach would be to use js_set to populate a variable (e.g., $user_identifier)
+// and then use that variable in an upstream's `hash` directive. However, this doesn't work
+// because:
+//
+// 1. The whoami lookup requires an HTTP subrequest (ngx.fetch), which is asynchronous.
+// 2. js_set handlers must return synchronously - nginx's variable evaluation doesn't support
+//    async operations. Using async functions with js_set causes errors like:
+//    "async operation inside variable handler"
+//
+// The auth_request approach solves this by:
+// 1. Making a subrequest to an internal location that uses js_content (which supports async)
+// 2. Returning the user identifier via a response header (X-User-Identifier)
+// 3. Capturing that header with auth_request_set into $user_identifier
+// 4. Using $user_identifier in the upstream's hash directive for consistent routing
+
+const WHOAMI_URL = {{ matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_url | to_json }};
+const CACHE_TTL_MS = {{ (matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_cache_ttl_seconds * 1000) | to_json }};
+
+const LOGGING_ENABLED = {{ matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_logging_enabled | to_json }};
+const LOGGING_TOKEN_LENGTH = {{ matrix_synapse_reverse_proxy_companion_whoami_sync_worker_router_logging_token_length | to_json }};
+
+function log(message) {
+    if (LOGGING_ENABLED) {
+        // Using WARN level because nginx's error_log is hardcoded to 'warn' and our logs won't be visible otherwise
+        ngx.log(ngx.WARN, 'whoami_sync_worker_router: ' + message);
+    }
+}
+
+// Truncate token for logging (show first X chars only for security)
+function truncateToken(token) {
+    if (!token || token.length <= LOGGING_TOKEN_LENGTH) {
+        return token;
+    }
+    return token.substring(0, LOGGING_TOKEN_LENGTH) + '...';
+}
+
+// Extract token from request (Authorization header or query parameter)
+function extractToken(r) {
+    // Try Authorization header first
+    const authHeader = r.headersIn['Authorization'];
+    if (authHeader && authHeader.startsWith('Bearer ')) {
+        return authHeader.substring(7);
+    }
+
+    // Fall back to access_token query parameter (deprecated in Matrix v1.11, but homeservers must support it)
+    if (r.args.access_token) {
+        return r.args.access_token;
+    }
+
+    return null;
+}
+
+// Extract localpart from user_id (e.g., "@alice:example.com" -> "alice")
+function extractLocalpart(userId) {
+    if (!userId || !userId.startsWith('@')) {
+        return null;
+    }
+    const colonIndex = userId.indexOf(':');
+    if (colonIndex === -1) {
+        return null;
+    }
+    return userId.substring(1, colonIndex);
+}
+
+// Get cached username for token
+function getCachedUsername(token) {
+    const cache = ngx.shared.whoami_sync_worker_router_cache;
+    if (!cache) {
+        return null;
+    }
+
+    const entry = cache.get(token);
+    if (entry) {
+        try {
+            const data = JSON.parse(entry);
+            if (data.expires > Date.now()) {
+                log('cache hit for token ' + truncateToken(token) + ' -> ' + data.username);
+                return data.username;
+            }
+            // Expired, remove from cache
+            log('cache expired for token ' + truncateToken(token));
+            cache.delete(token);
+        } catch (e) {
+            cache.delete(token);
+        }
+    }
+    return null;
+}
+
+// Cache username for token
+function cacheUsername(token, username) {
+    const cache = ngx.shared.whoami_sync_worker_router_cache;
+    if (!cache) {
+        return;
+    }
+
+    try {
+        const entry = JSON.stringify({
+            username: username,
+            expires: Date.now() + CACHE_TTL_MS
+        });
+        cache.set(token, entry);
+        log('cached token ' + truncateToken(token) + ' -> ' + username);
+    } catch (e) {
+        // Cache full or other error, log and continue
+        ngx.log(ngx.WARN, 'whoami_sync_worker_router: cache error: ' + e.message);
+    }
+}
+
+// Call whoami endpoint to get user_id
+async function lookupWhoami(token) {
+    log('performing whoami lookup for token ' + truncateToken(token));
+    try {
+        const response = await ngx.fetch(WHOAMI_URL, {
+            method: 'GET',
+            headers: {
+                'Authorization': 'Bearer ' + token
+            }
+        });
+
+        if (response.ok) {
+            const data = await response.json();
+            if (data.user_id) {
+                const localpart = extractLocalpart(data.user_id);
+                log('whoami lookup success: ' + data.user_id + ' -> ' + localpart);
+                return localpart;
+            }
+        } else if (response.status === 401) {
+            // Token is invalid/expired - this is expected for some requests
+            log('whoami lookup returned 401 (invalid/expired token)');
+            return null;
+        } else {
+            ngx.log(ngx.WARN, 'whoami_sync_worker_router: whoami returned status ' + response.status);
+        }
+    } catch (e) {
+        ngx.log(ngx.ERR, 'whoami_sync_worker_router: whoami failed: ' + e.message);
+    }
+
+    return null;
+}
+
+// Set response header with the user identifier for upstream hashing
+function setUserIdentifier(r, identifier) {
+    log('resolved user identifier: ' + identifier);
+    r.headersOut['X-User-Identifier'] = identifier;
+}
+
+// Main handler for auth_request subrequest.
+// Returns 200 with X-User-Identifier header containing the user identifier for upstream hashing.
+async function handleAuthRequest(r) {
+    const token = extractToken(r);
+
+    if (!token) {
+        // No token found (e.g., OPTIONS preflight requests don't include Authorization header).
+        // We return a random value to distribute these requests across workers.
+        // Returning an empty string would cause all no-token requests to hash to the same value,
+        // routing them all to a single worker.
+        // This doesn't affect the cache since we only cache token -> username mappings.
+        log('no token found in request, distributing randomly');
+        setUserIdentifier(r, '_no_token_' + Math.random());
+        r.return(200);
+        return;
+    }
+
+    // Check cache first
+    const cachedUsername = getCachedUsername(token);
+    if (cachedUsername) {
+        setUserIdentifier(r, cachedUsername);
+        r.return(200);
+        return;
+    }
+
+    // Perform whoami lookup
+    log('cache miss for token ' + truncateToken(token));
+    const username = await lookupWhoami(token);
+    if (username) {
+        cacheUsername(token, username);
+        setUserIdentifier(r, username);
+        r.return(200);
+        return;
+    }
+
+    // Whoami lookup failed, fall back to using the token itself for hashing.
+    // This still provides device-level sticky routing (same token -> same worker).
+    log('whoami lookup failed, falling back to token-based routing');
+    setUserIdentifier(r, token);
+    r.return(200);
+}
+
+export default { handleAuthRequest };
--- a/roles/custom/matrix-synapse-reverse-proxy-companion/templates/systemd/matrix-synapse-reverse-proxy-companion.service.j2
+++ b/roles/custom/matrix-synapse-reverse-proxy-companion/templates/systemd/matrix-synapse-reverse-proxy-companion.service.j2
@@ -36,6 +36,9 @@ ExecStartPre={{ devture_systemd_docker_base_host_command_docker }} create \
 			{% endif %}
 			--mount type=bind,src={{ matrix_synapse_reverse_proxy_companion_base_path }}/nginx.conf,dst=/etc/nginx/nginx.conf,ro \
 			--mount type=bind,src={{ matrix_synapse_reverse_proxy_companion_confd_path }},dst=/etc/nginx/conf.d,ro \
+			{% if matrix_synapse_reverse_proxy_companion_njs_enabled %}
+			--mount type=bind,src={{ matrix_synapse_reverse_proxy_companion_njs_path }},dst=/njs,ro \
+			{% endif %}
 			--label-file={{ matrix_synapse_reverse_proxy_companion_base_path }}/labels \
 			{% for arg in matrix_synapse_reverse_proxy_companion_container_arguments %}
 			{{ arg }} \