9 Commits

Author SHA1 Message Date
Slavi Pantaleev
014380eecd Upgrade Traefik (v3.6.8-1 -> v3.6.8-2) 2026-02-12 01:04:06 +02:00
Slavi Pantaleev
a77a8753d9 Derive Synapse post-start delay from Traefik's providersThrottleDuration
After Synapse's systemd health check passes, Traefik still needs
providers.providersThrottleDuration to register routes. Derive the
post-start delay from this setting (+1s for healthcheck polling gap)
instead of using a hardcoded value. Defaults to 0 when no Traefik
reverse proxy is used.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-12 00:54:46 +02:00
Slavi Pantaleev
9569633164 Upgrade Traefik (v3.6.8-0 -> v3.6.8-1) 2026-02-12 00:48:13 +02:00
Slavi Pantaleev
9d9e9e9177 Use docker inspect for Synapse systemd health check and lower health interval
Switch the systemd ExecStartPost health check from docker exec + curl
to polling docker inspect for container health status. This piggybacks
on the container image's built-in HEALTHCHECK instead of duplicating it.

Also add a configurable container health interval (5s for Traefik setups,
15s otherwise) to speed up startup readiness detection without affecting
non-Traefik deployments.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-12 00:13:02 +02:00
Slavi Pantaleev
bcddeda5df Make traefik-certs-dumper require the Traefik service to avoid race condition
When both services restart simultaneously (e.g. in all-at-once mode),
Traefik may momentarily truncate or reinitialize acme.json, causing
the certs dumper to read an empty file and panic. By adding
Requires/After on the Traefik service, the certs dumper only starts
after Traefik is fully ready and acme.json is stable.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-12 00:11:28 +02:00
Slavi Pantaleev
59e70b8ca9 Add systemd-healthcheck to Synapse systemd service in an effort to increase reliability (of Synapse-dependent services)
Previously, we had a 10-second magical delay.

Now we first do a healthcheck to figure out when it really is up.
Then, we do the same 10-second magical delay to account for the time it
may take for a reverse-proxy (like Traefik) to pick up Synapse's routes.
2026-02-11 23:32:33 +02:00
Slavi Pantaleev
f8815c0bb9 Upgrade systemd_service_manager (v2.0.0-0 -> v2.0.0-1) 2026-02-11 23:31:13 +02:00
Slavi Pantaleev
2fad873b42 Make addon systemd services depend on the homeserver systemd service as well, not just on Traefik
Addons typically access the homeserver via Traefik, but requests
ultimately lead to the homeserver and it'd better be up or Traefik would
serve a "404 Not Found" error.

This is an attempt (one of many pieces) to make services more reliable,
especially when `devture_systemd_service_manager_service_restart_mode: all-at-once` is used
(which is the default).
2026-02-11 23:27:09 +02:00
Slavi Pantaleev
294cd109fd Upgrade Traefik (v3.6.7-1 -> v3.6.8-0) 2026-02-11 23:26:13 +02:00
4 changed files with 85 additions and 12 deletions

View File

@@ -212,7 +212,20 @@ matrix_homeserver_app_service_config_files_auto: |
matrix_addons_homeserver_container_network: "{{ matrix_playbook_reverse_proxy_container_network if matrix_playbook_internal_matrix_client_api_traefik_entrypoint_enabled else matrix_homeserver_container_network }}"
matrix_addons_homeserver_client_api_url: "{{ ('http://' + matrix_playbook_reverse_proxy_hostname + ':' + matrix_playbook_internal_matrix_client_api_traefik_entrypoint_port | string) if matrix_playbook_internal_matrix_client_api_traefik_entrypoint_enabled else matrix_homeserver_container_url }}"
matrix_addons_homeserver_systemd_services_list: "{{ ([traefik_identifier + '.service'] if matrix_playbook_reverse_proxy_type == 'playbook-managed-traefik' else []) if matrix_playbook_internal_matrix_client_api_traefik_entrypoint_enabled else matrix_homeserver_systemd_services_list }}"
# Systemd services that addons depend on in order to reach the homeserver.
#
# When the internal Matrix Client API Traefik entrypoint is enabled, addons depend on
# the homeserver's own services and, for a playbook-managed Traefik, on the Traefik
# service as well. Otherwise, they depend on the homeserver's services only.
matrix_addons_homeserver_systemd_services_list: |
  {{
    (
      matrix_homeserver_systemd_services_list
      +
      (
        [traefik_identifier + '.service']
        if matrix_playbook_reverse_proxy_type == 'playbook-managed-traefik'
        else []
      )
    )
    if matrix_playbook_internal_matrix_client_api_traefik_entrypoint_enabled
    else matrix_homeserver_systemd_services_list
  }}
########################################################################
# #
@@ -4452,6 +4465,12 @@ matrix_synapse_password_config_enabled: "{{ not matrix_synapse_matrix_authentica
matrix_synapse_register_user_script_matrix_authentication_service_path: "{{ matrix_authentication_service_bin_path }}/register-user"
# Passing the systemd health check only means Synapse itself is up. The reverse proxy
# still has to discover the container and register its routes afterwards.
# For Traefik-based setups, the post-start delay is computed from Traefik's
# providers.providersThrottleDuration setting (+1s grace for our healthcheck polling interval),
# which keeps the two values in sync automatically. Any other setup gets no delay (0).
matrix_synapse_systemd_service_post_start_delay_seconds: "{{ 0 if matrix_playbook_reverse_proxy_type not in ['playbook-managed-traefik', 'other-traefik-container'] else (traefik_config_providers_providersThrottleDuration_seconds | int + 1) }}"
######################################################################
#
# /matrix-synapse
@@ -5650,6 +5669,16 @@ traefik_certs_dumper_gid: "{{ matrix_user_gid }}"
traefik_certs_dumper_ssl_path: "{{ traefik_ssl_dir_path if traefik_enabled else '' }}"
# We make the certs dumper require the Traefik service (not just docker.service),
# because when both restart simultaneously (e.g. in all-at-once mode), Traefik may
# momentarily truncate or reinitialize acme.json, causing the certs dumper to read
# an empty file and panic. By requiring Traefik, the certs dumper only starts after
# Traefik is fully ready and acme.json is stable.
#
# The list is empty when Traefik is not enabled.
traefik_certs_dumper_systemd_required_services_list_auto: |
  {{
    ([traefik_identifier + '.service'] if traefik_enabled else [])
  }}
traefik_certs_dumper_container_image_registry_prefix_upstream: "{{ matrix_container_global_registry_prefix_override if matrix_container_global_registry_prefix_override else traefik_certs_dumper_container_image_registry_prefix_upstream_default }}"
########################################################################

View File

@@ -72,13 +72,13 @@
version: v1.4.1-0
name: systemd_docker_base
- src: git+https://github.com/devture/com.devture.ansible.role.systemd_service_manager.git
version: v2.0.0-0
version: v2.0.0-1
name: systemd_service_manager
- src: git+https://github.com/devture/com.devture.ansible.role.timesync.git
version: v1.1.0-1
name: timesync
- src: git+https://github.com/mother-of-all-self-hosting/ansible-role-traefik.git
version: v3.6.7-1
version: v3.6.8-2
name: traefik
- src: git+https://github.com/mother-of-all-self-hosting/ansible-role-traefik-certs-dumper.git
version: v2.10.0-4

View File

@@ -322,6 +322,22 @@ matrix_synapse_container_labels_public_metrics_middleware_basic_auth_users: ''
# another.label="here"
matrix_synapse_container_labels_additional_labels: ''
# How frequently (in seconds) the container health check runs.
#
# The HEALTHCHECK shipped with the Synapse container image (curl to /health)
# defaults to an interval of 15s, a timeout of 5s, and a start period of 5s.
#
# Traefik-based setups need a short interval, since the interval value
# doubles as the "initial wait time" before the first check.
# This is a Docker (moby) bug: https://github.com/moby/moby/issues/33410
# Traefik will not register the service for reverse-proxying until a healthcheck succeeds.
# A short interval also helps our systemd ExecStartPost health check
# (see matrix_synapse_systemd_healthcheck_enabled) notice readiness sooner at startup.
#
# Non-Traefik setups keep the default healthcheck interval (15s) to decrease overhead.
matrix_synapse_container_health_interval_seconds: "{{ 15 if not matrix_synapse_container_labels_traefik_enabled else 5 }}"
# The same interval, rendered as a duration string with an `s` suffix.
matrix_synapse_container_health_interval: "{{ matrix_synapse_container_health_interval_seconds ~ 's' }}"
# A list of extra arguments to pass to the container
# Also see `matrix_synapse_container_arguments`
matrix_synapse_container_extra_arguments: []
@@ -358,14 +374,37 @@ matrix_synapse_goofys_systemd_required_services_list_default: "{{ [devture_syste
matrix_synapse_goofys_systemd_required_services_list_auto: []
matrix_synapse_goofys_systemd_required_services_list_custom: []
# Controls how long to sleep for after starting the matrix-synapse container.
#
# Delaying, so that the homeserver can manage to fully start and various services
# that depend on it (`matrix_synapse_systemd_required_services_list` and `matrix_synapse_systemd_wanted_services_list`)
# may only start after the homeserver is up and running.
#
# This can be set to 0 to remove the delay.
matrix_synapse_systemd_service_post_start_delay_seconds: 10
# Controls the post-start health check in the systemd service.
# When enabled, ExecStartPost polls Docker's container health status via `docker inspect`,
# keeping the service in "activating (start-post)" state until Synapse is ready.
# Services with After=matrix-synapse.service will properly wait.
# This relies on the container image's built-in HEALTHCHECK (curl to /health),
# with the interval controlled by matrix_synapse_container_health_interval.
matrix_synapse_systemd_healthcheck_enabled: true
# Maximum number of polling attempts made by matrix_synapse_systemd_healthcheck_command
# before it gives up and exits with a failure.
matrix_synapse_systemd_healthcheck_max_retries: 60
# Number of seconds matrix_synapse_systemd_healthcheck_command sleeps between attempts.
matrix_synapse_systemd_healthcheck_interval_seconds: 1
# The command used for the health check in ExecStartPost.
# Polls `docker inspect` for the container's health status until it reports "healthy".
# Exits 0 as soon as the status is "healthy"; exits 1 once
# matrix_synapse_systemd_healthcheck_max_retries attempts have been used up.
#
# NOTE: this is a folded (`>-`) scalar. All lines below must stay at the same indentation level,
# so that they are folded into a single-line command (more-indented lines would keep their newlines).
matrix_synapse_systemd_healthcheck_command: >-
  {{ devture_systemd_docker_base_host_command_sh }} -c
  'for i in $(seq 1 {{ matrix_synapse_systemd_healthcheck_max_retries }}); do
  echo "[Attempt $i/{{ matrix_synapse_systemd_healthcheck_max_retries }}] Synapse systemd health check: checking container health status..";
  status=$( {{ devture_systemd_docker_base_host_command_docker }} inspect --format={{ '"{{' }}.State.Health.Status{{ '}}"' }} matrix-synapse 2>/dev/null);
  if [ "$status" = "healthy" ]; then echo "[Attempt $i/{{ matrix_synapse_systemd_healthcheck_max_retries }}] Synapse systemd health check: passed" && exit 0; fi;
  echo "[Attempt $i/{{ matrix_synapse_systemd_healthcheck_max_retries }}] Synapse systemd health check: not ready yet (status: $status), retrying in {{ matrix_synapse_systemd_healthcheck_interval_seconds }}s..";
  sleep {{ matrix_synapse_systemd_healthcheck_interval_seconds }};
  done; echo "[Attempt $i/{{ matrix_synapse_systemd_healthcheck_max_retries }}] Synapse systemd health check: failed after {{ matrix_synapse_systemd_healthcheck_max_retries }} attempts"; exit 1'
# Controls how long (in seconds) to sleep for after the systemd health check passes.
# Even after Synapse is healthy, the reverse proxy (e.g. Traefik) needs time to discover
# the container and register its routes. Traefik waits `providers.providersThrottleDuration`
# (see https://doc.traefik.io/traefik/v3.3/providers/overview/#providersprovidersthrottleduration)
# before applying new configuration from Docker events.
# Without this delay, services depending on Synapse may encounter 404 errors
# when connecting through the reverse proxy.
# This value is meant to be wired to the Traefik throttle duration by the playbook's group vars.
# The role-level default of 0 means that no delay is applied.
matrix_synapse_systemd_service_post_start_delay_seconds: 0
matrix_synapse_in_container_python_packages_path: "/usr/local/lib/python3.13/site-packages"

View File

@@ -33,6 +33,7 @@ ExecStartPre={{ devture_systemd_docker_base_host_command_docker }} create \
--read-only \
--tmpfs=/tmp:rw,noexec,nosuid,size={{ matrix_synapse_tmp_directory_size_mb }}m \
--network={{ matrix_synapse_container_network }} \
--health-interval={{ matrix_synapse_container_health_interval }} \
{% if matrix_synapse_container_client_api_host_bind_port %}
-p {{ matrix_synapse_container_client_api_host_bind_port }}:{{ matrix_synapse_container_client_api_port }} \
{% endif %}
@@ -69,8 +70,12 @@ ExecStartPre={{ devture_systemd_docker_base_host_command_docker }} network conne
ExecStart={{ devture_systemd_docker_base_host_command_docker }} start --attach matrix-synapse
{# Runs the health check command after start. It has no `-` prefix, so a failing check fails the unit start. #}
{% if matrix_synapse_systemd_healthcheck_enabled %}
ExecStartPost={{ matrix_synapse_systemd_healthcheck_command }}
{% endif %}
{# The delay may arrive as a string (templated or quoted value), so it is cast to an integer before comparing. #}
{% if matrix_synapse_systemd_service_post_start_delay_seconds | int > 0 %}
ExecStartPost=-{{ matrix_host_command_sleep }} {{ matrix_synapse_systemd_service_post_start_delay_seconds }}
ExecStartPost=-{{ matrix_host_command_sleep }} {{ matrix_synapse_systemd_service_post_start_delay_seconds }}
{% endif %}
ExecStop=-{{ devture_systemd_docker_base_host_command_sh }} -c '{{ devture_systemd_docker_base_host_command_docker }} stop -t {{ devture_systemd_docker_base_container_stop_grace_time_seconds }} matrix-synapse 2>/dev/null || true'