Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found
Select Git revision
Loading items

Target

Select target project
  • ai3/float
  • micah/float
2 results
Select Git revision
Loading items
Show changes
Commits on Source (469)
Showing
with 737 additions and 417 deletions
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
stages: stages:
- docker_build - docker_build
- test - test
- cleanup
variables: variables:
IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG
...@@ -20,67 +21,110 @@ variables: ...@@ -20,67 +21,110 @@ variables:
--passwords=${TEST_DIR}/passwords.yml --passwords=${TEST_DIR}/passwords.yml
--num-hosts=1 --num-hosts=1
${LIBVIRT:+-e libvirt.remote_host=${LIBVIRT#*@} -e libvirt.remote_user=${LIBVIRT%@*}} ${LIBVIRT:+-e libvirt.remote_host=${LIBVIRT#*@} -e libvirt.remote_user=${LIBVIRT%@*}}
-e ansible_cfg.defaults.strategy=mitogen_linear ${MITOGEN:+-e ansible_cfg.defaults.strategy_plugins=${MITOGEN}/ansible_mitogen/plugins/strategy}
${APT_PROXY:+-e config.apt_proxy=${APT_PROXY}} ${APT_PROXY:+-e config.apt_proxy=${APT_PROXY}}
$CREATE_ENV_VARS $BUILD_DIR $CREATE_ENV_VARS $BUILD_DIR
- with-ssh-key ./scripts/floatup.py ${LIBVIRT:+--ssh $LIBVIRT} --inventory $BUILD_DIR/hosts.yml --ram 2048 --cpu 2 --image ${VM_IMAGE:-buster} up - with-ssh-key floatup ${LIBVIRT:+--ssh $LIBVIRT} --inventory $BUILD_DIR/hosts.yml --ram 2048 --cpu 2 --image ${VM_IMAGE:-bookworm} ${FLOATUP_ARGS} up
- ls -al /root/.ssh
- cat /root/.ssh/config
- cat $BUILD_DIR/hosts.yml
- with-ssh-key ./test-driver init --no-vagrant $BUILD_DIR - with-ssh-key ./test-driver init --no-vagrant $BUILD_DIR
- with-ssh-key ./test-driver run $BUILD_DIR - with-ssh-key ./test-driver run $BUILD_DIR
after_script: after_script:
- with-ssh-key ./test-driver cleanup --no-vagrant $BUILD_DIR - with-ssh-key ./test-driver cleanup --no-vagrant $BUILD_DIR
- with-ssh-key ./scripts/floatup.py ${LIBVIRT:+--ssh $LIBVIRT} down - with-ssh-key floatup ${LIBVIRT:+--ssh $LIBVIRT} down
variables: variables:
CREATE_ENV_VARS: "" CREATE_ENV_VARS: ""
TEST_DIR: "" TEST_DIR: ""
tags: [ai3] tags: [ai3]
# Some artifacts may be missing, depending on the specific job.
artifacts: artifacts:
when: on_failure when: always
expire_in: 1 week expire_in: 1 week
name: "${CI_JOB_NAME}_${CI_COMMIT_REF_SLUG}_${CI_COMMIT_SHORT_SHA}" name: "${CI_JOB_NAME}_${CI_COMMIT_REF_SLUG}_${CI_COMMIT_SHORT_SHA}"
reports:
dotenv: deploy.env
junit: pytest.xml
paths: paths:
- ".vmine_group_review*"
- "${BUILD_DIR}/ansible.log" - "${BUILD_DIR}/ansible.log"
- "${BUILD_DIR}/logs" - "${BUILD_DIR}/logs"
base_test: base_test:
<<: *base_test <<: *base_test
variables: variables:
CREATE_ENV_VARS: "-e config.float_debian_dist=buster" VM_IMAGE: "bookworm"
TEST_DIR: "test/base.ref" TEST_DIR: "test/base.ref"
base_bullseye_test: trixie_test:
<<: *base_test <<: *base_test
# Need a more recent Ansible version, for Python 3.12 targets.
image: registry.git.autistici.org/ai3/docker/float-runner:trixie
variables: variables:
VM_IMAGE: "bullseye" VM_IMAGE: "trixie"
CREATE_ENV_VARS: "-e config.float_debian_dist=bullseye -e inventory.group_vars.vagrant.ansible_python_interpreter=/usr/bin/python3" CREATE_ENV_VARS: "-e config.float_debian_dist=trixie"
TEST_DIR: "test/base.ref" TEST_DIR: "test/base.ref"
full_test: full_test:
<<: *base_test <<: *base_test
variables: variables:
CREATE_ENV_VARS: "-e config.float_debian_dist=buster" VM_IMAGE: "bookworm"
TEST_DIR: "test/full.ref" TEST_DIR: "test/full.ref"
rules:
- if: $CI_MERGE_REQUEST_ID == ''
full_bullseye_test: full_test_review:
<<: *base_test <<: *base_test
after_script:
- with-ssh-key ./test-driver cleanup --no-vagrant $BUILD_DIR
variables: variables:
VM_IMAGE: "bullseye" VM_IMAGE: "bookworm"
CREATE_ENV_VARS: "-e config.float_debian_dist=bullseye -e inventory.group_vars.vagrant.ansible_python_interpreter=/usr/bin/python3" CREATE_ENV_VARS: "-e inventory.group_vars.vagrant.ansible_python_interpreter=/usr/bin/python3"
FLOATUP_ARGS: "--state-file .vmine_group_review_$CI_MERGE_REQUEST_ID --ttl 6h --env deploy.env --dashboard-url https://vm.investici.org"
TEST_DIR: "test/full.ref" TEST_DIR: "test/full.ref"
allow_failure: true
environment:
name: review/$CI_COMMIT_REF_SLUG
url: $VMINE_GROUP_URL
on_stop: stop_full_test_review
auto_stop_in: "6 hours"
rules:
- if: $CI_MERGE_REQUEST_ID
stop_full_test_review:
stage: cleanup
dependencies: [full_test_review]
image: registry.git.autistici.org/ai3/docker/float-runner:master
script:
- with-ssh-key floatup --state-file .vmine_group_review_$CI_MERGE_REQUEST_ID ${LIBVIRT:+--ssh $LIBVIRT} down
allow_failure: true
environment:
name: review/$CI_COMMIT_REF_SLUG
action: stop
rules:
- if: $CI_MERGE_REQUEST_ID
when: manual
#backup_test:
# <<: *base_test
# variables:
# VM_IMAGE: "bullseye"
# CREATE_ENV_VARS: "--additional-config test/backup.ref/config-backup.yml --playbook test/backup.ref/site.yml"
# TEST_DIR: "test/backup.ref"
docker_build_and_release_tests: docker_build_and_release_tests:
stage: docker_build stage: docker_build
image: docker:latest image: quay.io/podman/stable
services: tags: [podman]
- docker:dind
script: script:
- docker login -u gitlab-ci-token -p $CI_JOB_TOKEN registry.git.autistici.org - echo -n "$CI_JOB_TOKEN" | podman login -u gitlab-ci-token --password-stdin $CI_REGISTRY
- cd test && docker build --build-arg ci_token=$CI_JOB_TOKEN --pull -t $IMAGE_TAG . - cd test && podman build --build-arg ci_token=$CI_JOB_TOKEN --pull -t $IMAGE_TAG .
- docker tag $IMAGE_TAG $CI_REGISTRY_IMAGE:integration-test - podman tag $IMAGE_TAG $CI_REGISTRY_IMAGE:integration-test
- docker push $CI_REGISTRY_IMAGE:integration-test - podman push $CI_REGISTRY_IMAGE:integration-test
only: only:
changes: changes:
- test/float_integration_test/** - test/float_integration_test/**
- test/Dockerfile
refs: refs:
- master - master
...@@ -77,8 +77,8 @@ on the local machine using [Go](https://golang.org): ...@@ -77,8 +77,8 @@ on the local machine using [Go](https://golang.org):
```shell ```shell
sudo apt-get install golang sudo apt-get install golang
go get -u git.autistici.org/ale/x509ca go install git.autistici.org/ale/x509ca@latest
go get -u git.autistici.org/ale/ed25519gen go install git.autistici.org/ale/ed25519gen@latest
export PATH=$PATH:$HOME/go/bin export PATH=$PATH:$HOME/go/bin
``` ```
......
# Build a simple float-based environment off a
# services.yml/passwords.yml combination.
#
# Example usage:
#
# include:
# - project: ai3/float
# file: ci/deploy.yml
#
# variables:
# FLOATUP_ARGS: "--ssh user@jump.host"
#
# deploy:
# variables:
# SERVICES_FILE: my-services.yml
# PASSWORDS_FILE: my-passwords.yml
#
# Pipeline layout: "deploy" creates the environment and test VMs,
# "run" applies the configuration and runs tests (the manual teardown
# job also lives in the "run" stage).
stages:
  - deploy
  - run
# These are all configurable, globally and per-build.
variables:
  # Project inputs, layered on top of float's own defaults by create-env.
  SERVICES_FILE: services.yml
  PASSWORDS_FILE: passwords.yml
  PLAYBOOK_FILE: site.yml
  ROLES_PATH: "${CI_PROJECT_DIR}/roles"
  DOMAIN: example.com
  # Test VM sizing and lifetime.
  NUM_HOSTS: 1
  VM_IMAGE: bullseye
  VM_RAM: 2048
  VM_CPU: 1
  VM_TTL: "4h"
  # Distinguishes parallel environments within the same pipeline.
  VM_TAG: default
  VM_DASHBOARD_URL: "https://vm.investici.org"
  # Extra command-line arguments passed through to floatup / create-env.
  FLOATUP_ARGS: ""
  CREATE_ENV_ARGS: ""
# Template for jobs that build a float environment from a
# services/passwords configuration and bring up the test VMs.
# Consumers extend this (see the "deploy" job below) and may override
# SERVICES_FILE / PASSWORDS_FILE and the VM_* variables.
.deploy_template:
  stage: deploy
  image: registry.git.autistici.org/ai3/docker/float-runner:master
  variables:
    # Internal, do not change.
    DEPLOY_ENV_FILE: "deploy-${VM_TAG}.env"
    # This can be any temporary directory as long as it is unique
    # (multiple jobs may be running off the same CI_PROJECT_DIR).
    BUILD_DIR: "${CI_PROJECT_DIR}/env-${VM_TAG}-${CI_JOB_ID}"
    # State file used by floatup to track the VM group, keyed by merge
    # request and VM tag so parallel review environments do not collide.
    VMINE_STATE_FILE: ".vmine_state_${CI_MERGE_REQUEST_ID}_${VM_TAG}"
  before_script:
    - mkdir -p $BUILD_DIR
  script:
    # Fetch the float toolkit itself: this template is meant to be
    # included from other projects, which only carry configuration.
    - (cd $BUILD_DIR && git clone --depth 1 https://git.autistici.org/ai3/float.git)
    # Generate a complete test environment, layering the project's own
    # services / passwords / playbook on top of float's core defaults.
    - >
      $BUILD_DIR/float/float create-env
      --domain=${DOMAIN}
      --services=${BUILD_DIR}/float/services.core.yml
      --services=${SERVICES_FILE}
      --passwords=${BUILD_DIR}/float/passwords.yml.default
      --passwords=${PASSWORDS_FILE}
      --playbook=${BUILD_DIR}/float/playbooks/all.yml
      --playbook=${PLAYBOOK_FILE}
      --num-hosts=${NUM_HOSTS}
      --roles-path=${ROLES_PATH}
      -e ansible_cfg.defaults.strategy=mitogen_linear
      -e config.docker_registry_url=${CI_REGISTRY}
      -e config.docker_registry_username=${CI_REGISTRY_USER}
      -e config.docker_registry_password=${CI_REGISTRY_PASSWORD}
      ${APT_PROXY:+-e config.apt_proxy=${APT_PROXY}}
      ${CREATE_ENV_ARGS}
      ${BUILD_DIR}
    # Bring up the VMs; floatup writes connection details to
    # $DEPLOY_ENV_FILE, which is exported below as a dotenv report.
    - with-ssh-key floatup $FLOATUP_ARGS --inventory $BUILD_DIR/hosts.yml --ram $VM_RAM --cpu $VM_CPU --image $VM_IMAGE --state-file $VMINE_STATE_FILE --env $DEPLOY_ENV_FILE --ttl $VM_TTL --dashboard-url $VM_DASHBOARD_URL up
    - with-ssh-key $BUILD_DIR/float/test-driver init --no-vagrant $BUILD_DIR
    # Propagate BUILD_DIR to the later pipeline stages via dotenv.
    - echo BUILD_DIR=$BUILD_DIR >> $DEPLOY_ENV_FILE
  allow_failure: true
  artifacts:
    when: always
    expire_in: "1 day"
    reports:
      # Variables in this file (BUILD_DIR, floatup outputs) are
      # injected into subsequent jobs of the pipeline.
      dotenv: "$DEPLOY_ENV_FILE"
    paths:
      - "${BUILD_DIR}"
      - "$VMINE_STATE_FILE"
  environment:
    name: "review/$CI_COMMIT_REF_SLUG"
    url: "$VMINE_GROUP_URL"
    # NOTE(review): auto-stop (3 hours) is shorter than the default VM
    # TTL (VM_TTL: 4h) -- confirm the mismatch is intentional.
    auto_stop_in: "3 hours"
# Template for jobs that run the Ansible playbooks (and float's
# integration test suite) against the VMs created in the deploy stage.
# Relies on BUILD_DIR being injected by the deploy job's dotenv report.
.run_template:
  stage: run
  image: registry.git.autistici.org/ai3/docker/float-runner:master
  script:
    - cd $BUILD_DIR
    # Deploy the environment, then run the integration tests on it.
    - with-ssh-key ./float/float run -e docker_registry_password=$CI_REGISTRY_PASSWORD site.yml
    - with-ssh-key ./float/float run -e docker_registry_password=$CI_REGISTRY_PASSWORD ./float/test/integration-test.yml
  after_script:
    # Collect logs from the target hosts even when the run failed.
    - cd $BUILD_DIR
    - with-ssh-key ./float/test-driver cleanup --no-vagrant .
  artifacts:
    when: always
    paths:
      - "${BUILD_DIR}/logs"
# Template for the teardown job: destroys the VM group tracked by the
# floatup state file and marks the review environment as stopped.
.stop_deploy_template:
  stage: run
  image: registry.git.autistici.org/ai3/docker/float-runner:master
  allow_failure: true
  variables:
    # Internal, do not change.
    # Must match VMINE_STATE_FILE in .deploy_template so this job
    # tears down the same VM group that the deploy job created.
    VMINE_STATE_FILE: ".vmine_state_${CI_MERGE_REQUEST_ID}_${VM_TAG}"
  script:
    - with-ssh-key floatup $FLOATUP_ARGS --state-file $VMINE_STATE_FILE down
  environment:
    name: "review/$CI_COMMIT_REF_SLUG"
    action: stop
# Concrete jobs wiring the templates together: "deploy" brings the
# environment up, "run" exercises it, and "stop_deploy" tears it down.
deploy:
  extends: .deploy_template
  environment:
    # Stopping the review environment triggers the teardown job.
    on_stop: "stop_deploy"
run:
  extends: .run_template
stop_deploy:
  extends: .stop_deploy_template
  rules:
    # Teardown runs only when triggered manually (or via the
    # environment's auto-stop).
    - when: manual
...@@ -11,13 +11,12 @@ stretch build host), and distribute it with alternative methods. ...@@ -11,13 +11,12 @@ stretch build host), and distribute it with alternative methods.
These can normally be built with standard Debian development tools, These can normally be built with standard Debian development tools,
such as *dpkg-buildpackage*. such as *dpkg-buildpackage*.
* [ai/sso](https://git.autistici.org/ai/sso)
* [id/auth](https://git.autistici.org/id/auth) * [id/auth](https://git.autistici.org/id/auth)
* [id/go-sso](https://git.autistici.org/id/go-sso) * [id/sso-server](https://git.autistici.org/id/sso-server)
* [id/keystore](https://git.autistici.org/id/keystore) * [id/keystore](https://git.autistici.org/id/keystore)
* [id/usermetadb](https://git.autistici.org/id/usermetadb) * [id/usermetadb](https://git.autistici.org/id/usermetadb)
* [ale/zonetool](https://git.autistici.org/ale/zonetool) * [ai3/tools/zonetool](https://git.autistici.org/ai3/tools/zonetool)
* [ai3/tools/cgroups-exporter](https://git.autistici.org/ai3/tools/cgroups-exporter) * [ai3/tools/cgroups-exporter](https://git.autistici.org/ai3/tools/cgroups-exporter)
* [ai3/tools/runcron](https://git.autistici.org/ai3/tools/runcron) * [ai3/tools/runcron](https://git.autistici.org/ai3/tools/runcron)
* [ai3/tools/audisp-json](https://git.autistici.org/ai3/tools/audisp-json) * [ai3/tools/audisp-json](https://git.autistici.org/ai3/tools/audisp-json)
...@@ -28,7 +27,7 @@ such as *dpkg-buildpackage*. ...@@ -28,7 +27,7 @@ such as *dpkg-buildpackage*.
* [ai3/tools/tabacco](https://git.autistici.org/ai3/tools/tabacco) * [ai3/tools/tabacco](https://git.autistici.org/ai3/tools/tabacco)
* [ai3/thirdparty/rsyslog-exporter](https://git.autistici.org/ai3/thirdparty/rsyslog-exporter) * [ai3/thirdparty/rsyslog-exporter](https://git.autistici.org/ai3/thirdparty/rsyslog-exporter)
* [ai3/thirdparty/restic](https://git.autistici.org/ai3/thirdparty/restic) * [ai3/thirdparty/litestream](https://git.autistici.org/ai3/thirdparty/litestream)
These are distributed via our own package repository at These are distributed via our own package repository at
*deb.autistici.org*, which currently supports the *amd64* and *arm64* *deb.autistici.org*, which currently supports the *amd64* and *arm64*
......
Playbook
===
This document describes how to perform some common operations in
*float*.
## Applying changes
### Rolling back the configuration
If you are using a Git repository as your configuration source,
*float* will keep track of which commit has been pushed to production
last, and it will try to prevent you from pushing an old version of
the configuration, failing immediately with an error. This is a simple
check to make sure that people do not inadvertently roll back the
production configuration by pushing from an out-of-date client.
In most cases what you want to do is simply run *git pull* and bring
your copy of the repository up to date. But if you really need to push
an old version of the configuration in an emergency, you can do so by
setting the *rollback* value to *true* on the command line:
```shell
$ float run -e rollback=true site.yml
```
## For administrators
### SSH Client Setup
If you delegated SSH management to float by setting *enable_ssh* to
true (see the [configuration reference](configuration.md)), float will
create a SSH CA to sign all your host keys.
You will find the public key for this CA in the
*credentials/ssh/key.pub* file; it will be created the first time you
run the "init-credentials" playbook.
Assuming that all your target hosts share the same domain (so you can
use a wildcard), you should add the following entry to
*~/.ssh/known_hosts*:
```
@cert_authority *.example.com ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAA....
```
Since all logins happen as root, it may be convenient to also add a
section to your *~/.ssh/config* file like the following:
```
Host *.example.com
User root
```
### Adding an admin account
Adding a new administrator account is just a matter of editing the
*admins* [configuration variable](configuration.md) and add a new
entry to it.
The first thing you will need is a hashed version of your
password. The authentication service in float supports a number of
legacy hashing schemes, including those supported by the system
crypt(). The most secure hashing scheme supported is Argon2, and you
can use our custom tool to generate a valid hash. To install it:
```shell
$ go install git.autistici.org/ai3/go-common/cmd/pwtool
```
Run the *pwtool* utility with your new password as an argument, as
shown below:
```shell
# Do not save your password in the history of your shell
$ export HISTIGNORE="pwtool*"
$ pwtool PASSWORD
```
where PASSWORD is your desired password.
It will output the hashed password.
Then modify the YAML file *group_vars/all/admins.yml*. At the bare
minimum the new account should have *name*, *email*, *password* and
*ssh_keys* attributes, e.g.:
```yaml
---
admins:
- name: "foo"
email: "foo@example.com"
password: "$a2$3$32768$4$abcdef...."
ssh_keys:
- "ssh-ed25519 AAAAC3Nza..."
```
Here above "ssh_keys:" needs to be populated with your public key,
possibly stripped of the trailing user@hostname text (which may leak
your personal information), and "password:" must be the hashed
password you got from *pwtool* earlier.
### Setting up OTP for an admin account
First you need to manually generate the OTP secret on your computer:
```shell
$ SECRET=$(dd if=/dev/urandom bs=20 count=1 2>/dev/null | base32)
$ echo $SECRET
EVUVNACTWRAIERATIZUQA6YQ4WS63RN2
```
Install the *qrencode* package (for example with `apt install
qrencode`) and feed the OTP secret to it:
```shell
$ EMAIL="sub@krutt.org"
$ qrencode -t UTF8 "otpauth://totp/example.com:${EMAIL}?secret=${SECRET}&issuer=example.com&algorithm=SHA1&digits=6&period=30"
```
and read the qrcode with your favourite app.
Then add it to your user object in *group_vars/all/admins.yml* as the
*totp_secret* attribute:
```yaml
---
admins:
- name: "foo"
totp_secret: "EVUVNACTWRAIERATIZUQA6YQ4WS63RN2"
...
```
Finally, configure your TOTP client (app, YubiKey, etc.) with the same
secret.
Note that the secret is stored in cleartext in the git repository, so
using a hardware token (U2F) is preferred.
### Registering a U2F hardware token for an admin account
In the *group_vars/all/admins.yml* file, you can add the
*u2f_registrations* attribute to accounts, which is a list of the
allowed U2F device registrations.
To register a new device, you are going to need the *pamu2fcfg* tool
(part of the *pamu2fcfg* Debian package). The following snippet should
produce the two YAML attributes that you need to set:
```shell
$ pamu2fcfg --nouser --appid https://accounts.example.com \
| tr -d : \
| awk -F, '{print "key_handle: \"" $1 "\"\npublic_key: \"" $2 "\""}'
```
press enter, touch the key, copy the output and insert it in
*group_vars/all/admins.yml*; the final result should look like:
```yaml
---
admins:
- name: "foo"
email: "foo@example.com"
password: "$a2$3$32768$4$abcdef...."
ssh_keys:
- "ssh-ed25519 AAAAC3Nza..."
u2f_registrations:
- key_handle: "r4wWRHgzJjl..."
public_key: "04803e4aff4..."
```
**NOTE**: the above will work with *pam_u2f* version 1.0.7, but it will *not*
work with pam_u2f version 1.1.0 due to changes in the output format!
...@@ -31,8 +31,8 @@ dipendenze possono essere installate con questo comando: ...@@ -31,8 +31,8 @@ dipendenze possono essere installate con questo comando:
```shell ```shell
sudo apt install golang ansible vagrant sudo apt install golang ansible vagrant
go get -u git.autistici.org/ale/x509ca go install git.autistici.org/ale/x509ca@latest
go get -u git.autistici.org/ale/ed25519gen go install git.autistici.org/ale/ed25519gen@latest
export PATH=$PATH:$HOME/go/bin export PATH=$PATH:$HOME/go/bin
``` ```
......
...@@ -35,8 +35,8 @@ other dependencies can be installed with the following commands: ...@@ -35,8 +35,8 @@ other dependencies can be installed with the following commands:
```shell ```shell
sudo apt install golang ansible vagrant sudo apt install golang ansible vagrant
go get -u git.autistici.org/ale/x509ca go install git.autistici.org/ale/x509ca@latest
go get -u git.autistici.org/ale/ed25519gen go install git.autistici.org/ale/ed25519gen@latest
export PATH=$PATH:$HOME/go/bin export PATH=$PATH:$HOME/go/bin
``` ```
...@@ -106,7 +106,7 @@ files for Ansible and Vagrant, with default values filled in by ...@@ -106,7 +106,7 @@ files for Ansible and Vagrant, with default values filled in by
*create-env* automatically generates a default *admin* user, with *create-env* automatically generates a default *admin* user, with
password *password*. password *password*.
You can read the [configuration reference](configuration.md) for You can read the [configuration reference](reference.md) for
details on the configuration file syntax and what the various options details on the configuration file syntax and what the various options
mean. mean.
......
...@@ -311,23 +311,42 @@ datasets only once (on the service master host). ...@@ -311,23 +311,42 @@ datasets only once (on the service master host).
### Backups ### Backups
If provided with credentials for an external data repository, float If provided with credentials for an external data repository, float
will automatically make backups of your configured datasets. Float will automatically make backups of your configured datasets. These
runs its own backup management system aren't just used for disaster recovery, but are an integral part of
([tabacco](https://git.autistici.org/ai3/tools/tabacco)) on top of float's service management approach: when a service is scheduled on a
Restic, which adds additional metadata to Restic snapshots to map new host, for instance as a result of a re-scheduling, float will
float datasets. attempt to automatically restore the associated datasets from their
backups. Restores can of course also be triggered manually whenever
necessary.
When a service is scheduled on a new host, for instance as a result of Float offers two backup mechanisms for datasets:
a re-scheduling, float will attempt to restore the associated datasets
from their backups. While this is not a practical failover solution
for complex services, we've found it works pretty well for a category
of services with "important, but small - can afford to lose one day of
changes" datasets that is quite common and useful in itself. For these
services, running with num_instances=1 and counting on the
backup/restore data move mechanism might provide sufficient
availability and reliability.
Restores can of course also be triggered manually whenever necessary. * For bulk data, it can use its own backup management system
([tabacco](https://git.autistici.org/ai3/tools/tabacco)) on top of
Restic, which adds additional metadata to Restic snapshots to map
float datasets. This can be used as a primitive failover solution
for services that aren't "important" enough to afford their own
distributed storage abstractions, and where losing up to one day of
changes is tolerable. An alternative, "live" solution, that would
favor correctness over availability, is also in the works. This
backup mechanism is *extensible* to understand the structure and
metadata of specific services' entities and accounts, if necessary.
* There are a number of instances, in float, of a specific category of
service, single-hosted small API services that run off a simple
SQLite database, some of which are critical to float's operation
(for example the backup metadata service itself). For this
particular use case, float supports backups with
[Litestream](https://litestream.io), an asynchronous replication
solution for SQLite, that offers point-in-time restore capabilities
(less than 1 second of data loss window) in case of disaster or when
the service is rescheduled.
Litestream requires an S3-compatible backend (Minio, AWS, etc).
Note that float does not, in its default configuration, provide the
data storage services used by its backup mechanisms. These are treated
as third-party (external) resources.
### Volumes ### Volumes
...@@ -549,7 +568,7 @@ Ansible roles to configure them. ...@@ -549,7 +568,7 @@ Ansible roles to configure them.
Note that, in its default setup, float will naturally assume a Note that, in its default setup, float will naturally assume a
two-tier service topology, with "frontend" hosts handling traffic two-tier service topology, with "frontend" hosts handling traffic
routing in a stateless fashion, and "backend" hosts running the actual routing in a stateless fashion, and "backend" hosts running the actual
services. The default *services.yml.default* service description file services. The default *services.default.yml* service description file
literally expects the *frontend* and *backend* Ansible groups to be literally expects the *frontend* and *backend* Ansible groups to be
defined in your inventory. However, these are just roles, and there is defined in your inventory. However, these are just roles, and there is
nothing inherent in float that limits you to this kind of topology. nothing inherent in float that limits you to this kind of topology.
...@@ -1556,6 +1575,8 @@ provided: ...@@ -1556,6 +1575,8 @@ provided:
specify a regex (with a capture group) to extract back the host specify a regex (with a capture group) to extract back the host
name from the target; the default regex will extract the short name from the target; the default regex will extract the short
host name from URLs and host:port targets. host name from URLs and host:port targets.
* (optionally) a *scrape_interval* if for some reason it should be
different than the default *prometheus_probe_scrape_interval*.
So, in the context of the previous example, if we wanted to probe So, in the context of the previous example, if we wanted to probe
another float service called *myservice*, which hypothetically serves another float service called *myservice*, which hypothetically serves
...@@ -1573,6 +1594,27 @@ prometheus_additional_blackbox_probers: ...@@ -1573,6 +1594,27 @@ prometheus_additional_blackbox_probers:
target_regex: "http://\\1:2020" target_regex: "http://\\1:2020"
``` ```
### Customizing alert timeouts for additional blackbox probes
The Prometheus configuration for the default *float* blackbox probes
is appropriate for high-frequency, high-accuracy probes (with 10s
polling and a 5m alert timeout). This is not going to be suitable
for all use cases, such as more complex probes that require less
frequent polling.
Float provides a way to configure alert timeouts on a *prober*
(i.e. float service) basis, by using the optional
*prober_alert_timeout* attribute in the service description
metadata. For instance, to set a 30 minute alert timeout in the
context of the previous example, the services.yml file should be
modified:
```yaml
my-prober:
...
prober_alert_timeout: 30m
```
## Log Collection and Analysis ## Log Collection and Analysis
Logs are forwarded by all machines to a set of (one or more) Logs are forwarded by all machines to a set of (one or more)
...@@ -1789,7 +1831,7 @@ pairs that define group variables. ...@@ -1789,7 +1831,7 @@ pairs that define group variables.
### Groups ### Groups
While you can define any host groups you want, the default service While you can define any host groups you want, the default service
configuration in float (*services.yml.default*) expects you to define configuration in float (*services.default.yml*) expects you to define
at least two: at least two:
* *frontend*, for the public-facing reverse proxy hosts * *frontend*, for the public-facing reverse proxy hosts
...@@ -1808,19 +1850,21 @@ Variables can be Ansible variables: SSH parameters, etc., usually with ...@@ -1808,19 +1850,21 @@ Variables can be Ansible variables: SSH parameters, etc., usually with
an *ansible_* prefix. But some host variables have special meaning for an *ansible_* prefix. But some host variables have special meaning for
float: float:
`ip` (mandatory) is the IPv4 address of this host that other hosts `ips` (mandatory) is the list of IP addresses of this host that other
(i.e. internal services) should use to reach it hosts (i.e. internal services) should use to reach it. You can specify
one or more IP addresses, IPv4 or IPv6. Note that this is a **list**.
For legacy reasons, float still also understands the `ip` (singular)
attribute, which is expected to be a single IPv4 address, but this
support will eventually be retired, so on new inventories you should
use the `ips` list attribute.
`ip6` (optional) is the IPv6 version of the above `public_ips` (optional) is the list of IP addresses for this host that
will be advertised in the public-facing DNS zones. If unset it
`public_ip` (optional) is the IPv4 address that will be advertised in defaults to `ips`.
the public-facing DNS zones, if unset it defaults to `ip`
`public_ip6` (optional) is the IPv6 version of the above (if unset,
it will default to `ip6`)
`ip_<name>` (optional) defines the IPv4 address for this host on the `ip_<name>` (optional) defines the IPv4 address for this host on the
overlay network called *name* overlay network called *name*. Note that as opposed to `ips` this is
not a list but a single IPv4 address.
`groups` (optional) is a list of Ansible groups that this host should `groups` (optional) is a list of Ansible groups that this host should
be a member of be a member of
...@@ -1880,12 +1924,12 @@ Service metadata is encoded as a dictionary of *service name*: ...@@ -1880,12 +1924,12 @@ Service metadata is encoded as a dictionary of *service name*:
Metadata for services that are part of the core infrastructure ships Metadata for services that are part of the core infrastructure ships
embedded with this repository, so when writing your own `services.yml` embedded with this repository, so when writing your own `services.yml`
file, you only need to add your services to it. You should include the file, you only need to add your services to it. You should include the
*services.yml.default* file shipped with the float source, which *services.default.yml* file shipped with the float source, which
defines all the built-in services: defines all the built-in services:
```yaml ```yaml
include: include:
- "/path/to/float/services.yml.default" - "/path/to/float/services.default.yml"
``` ```
The `include` directive is special: it does not define a service, but The `include` directive is special: it does not define a service, but
...@@ -1979,21 +2023,24 @@ service. ...@@ -1979,21 +2023,24 @@ service.
Each entry in the *monitoring_endpoints* list can have the following Each entry in the *monitoring_endpoints* list can have the following
attributes: attributes:
`job_name`: Job name in Prometheus, defaults to the service name.
`type` (deprecated): Selects the service discovery mechanism used by
Prometheus to find the service endpoints. This can only have the value
*static*, which is also the default.
`port`: Port where the `/metrics` endpoint is exported. `port`: Port where the `/metrics` endpoint is exported.
`scheme`: HTTP scheme for the service endpoint. The default is *https*. `scheme`: HTTP scheme for the service endpoint. The default is *https*.
`healthcheck_http_method`: HTTP method to use for checking job status. The default is *HEAD* to query the endpoint without transferring all the metric data. Not all endpoints support this method, so if the probe fails set it to a method that it does support (worst case: *GET*).
`metrics_path`: Path for metrics if different from the default of `/metrics`. `metrics_path`: Path for metrics if different from the default of `/metrics`.
`labels`: An optional dictionary of key/value labels to set for this `labels`: An optional dictionary of key/value labels to set for this
target (they will be added to all metrics scraped from it). target (they will be added to all metrics scraped from it).
`scrape_interval`: Optionally override the scrape interval for this
target.
The Prometheus *job* labels for service targets will be automatically
generated by *float* to include the service name and the endpoint
port.
### Traffic routing ### Traffic routing
Services can define *public* HTTP and TCP endpoints, that will be Services can define *public* HTTP and TCP endpoints, that will be
...@@ -2041,6 +2088,12 @@ using single sign-on, allowing access only to administrators (members ...@@ -2041,6 +2088,12 @@ using single sign-on, allowing access only to administrators (members
of the *admins* group). This is quite useful for admin web interfaces of the *admins* group). This is quite useful for admin web interfaces
of internal services that do not support SSO integration of their own. of internal services that do not support SSO integration of their own.
`enable_api_proxy`: If true, place the service behind authentication
using a mechanism more appropriate for non-interactive APIs (HTTP
Basic Authentication using Application-Specific Passwords). Only members
of the *admins* group will have access. When this option is set, you
also need to specify a unique `auth_service` to be used for ASPs.
#### HTTP (All domains) #### HTTP (All domains)
`horizontal_endpoints`: List of HTTP endpoints exported by the `horizontal_endpoints`: List of HTTP endpoints exported by the
...@@ -2067,10 +2120,10 @@ attributes, all required: ...@@ -2067,10 +2120,10 @@ attributes, all required:
`name`: Name of the endpoint. `name`: Name of the endpoint.
`port`: Port where the service is running. Also the port that will be `port`: Port where the service is running.
publicly exported (at least in the current implementation), which
unfortunately means that the service itself shouldn't be running on `public_port`: Port that should be exposed to the Internet. Defaults
*frontend* nodes. to `port` if unset.
`use_proxy_protocol`: When true, enable the HAProxy proxy protocol for
the service, to propagate the original client IP to the backends.
...@@ -2126,6 +2179,19 @@ option automatically sets *drop_capabilities* to false. ...@@ -2126,6 +2179,19 @@ option automatically sets *drop_capabilities* to false.
drop all capabilities for this container. Otherwise, the capability drop all capabilities for this container. Otherwise, the capability
set will be controlled by systemd. set will be controlled by systemd.
`egress_policy` (default: *allow-all*): selects the network egress
policy for this container. This allows broad control over network
connections made by the process running in the container, and it can
take one of the following values:
* *allow-all*, allows all traffic
* *internal*, only allows traffic to float's internal private networks
(necessary for containers serving public_endpoints, of course)
* *none*, only allows traffic to localhost
These policies are implemented using BPF filters, which at the moment
are quite simplistic, hence the limited configurability.
### Non-container services ### Non-container services
`systemd_services`: List of systemd service units that are associated `systemd_services`: List of systemd service units that are associated
...@@ -2212,6 +2278,37 @@ The LVs are created in the volume specified by the `volumes_vg` global ...@@ -2212,6 +2278,37 @@ The LVs are created in the volume specified by the `volumes_vg` global
configuration variable, which by default is *vg0*. The VG must already configuration variable, which by default is *vg0*. The VG must already
exist, float will not attempt to create it. exist, float will not attempt to create it.
### Annotations
`annotations`: Dictionary with service-specific annotations
Annotations are manually curated metadata associated with the service,
intended for debugging purposes. This is data meant for humans to
consume, with the idea of helping the operators understand and debug
your services and their interconnections.
Annotations are for now only displayed on the float admin dashboard.
`summary`: A short summary (description) of the service.
#### Dependency graphs
`dependencies`: A list of additional service dependencies.
Float can automatically compute part of the dependency graph between
your services, at least insofar as the structure of *public_endpoints*
is concerned. Since this data can be quite useful in understanding the
structure of a service, it is possible to extend the dependency graph
manually by specifying additional edges (representing the dependencies
between services).
Edges of the dependency graphs are specified as objects with `client`
and `server` attributes, identifying a specific container or systemd
unit in either the current service or a different one. If you're
referring to an entity within the same service, you can just use its
name, while for external services the syntax is
*service-name*/*entity-name* (e.g. "log-collector/elasticsearch").
### Examples ### Examples
Let's look at some example *services.yml* files: Let's look at some example *services.yml* files:
...@@ -2361,8 +2458,11 @@ each a dictionary with the following attributes: ...@@ -2361,8 +2458,11 @@ each a dictionary with the following attributes:
documentation](https://git.autistici.org/id/auth/blob/master/README.md#password-encoding). documentation](https://git.autistici.org/id/auth/blob/master/README.md#password-encoding).
* `totp_secret` - TOTP secret for 2FA, base32-encoded * `totp_secret` - TOTP secret for 2FA, base32-encoded
* `ssh_keys` - a list of strings representing SSH public keys * `ssh_keys` - a list of strings representing SSH public keys
* `u2f_registrations` - a list of objects representing U2F token * `webauthn_registrations` - a list of objects representing
registrations WebAuthN(U2F) token registrations
* `u2f_registrations` - a list of objects representing legacy U2F
token registrations, only supported for old registrations created
before the switch to WebAuthN. Don't add new entries to this list.
### Authentication and SSO ### Authentication and SSO
...@@ -2400,9 +2500,12 @@ attributes that specify static DNS entries that will be added to ...@@ -2400,9 +2500,12 @@ attributes that specify static DNS entries that will be added to
`nginx_cache_keys_mem` is the memory size of the key buffer for the `nginx_cache_keys_mem` is the memory size of the key buffer for the
global NGINX HTTP cache. global NGINX HTTP cache.
`nginx_cache_fs_size` is the maximum on-disk size of the NGINX HTTP `nginx_cache_custom_params` are additional parameters for customizing
cache (note that NGINX might use as much as twice what specified here, the *proxy_cache_path* NGINX configuration directive for the global
depending on expiration policy). cache. The most important attribute you might want to set is possibly
*max_size*, which controls the maximum size of the on-disk cache (note
that NGINX might use as much as twice what specified, depending on
expiration policy).
`nginx_global_custom_headers` - a dictionary of {header: value} pairs `nginx_global_custom_headers` - a dictionary of {header: value} pairs
corresponding to HTTP headers that must be set on *every* response. corresponding to HTTP headers that must be set on *every* response.
...@@ -2417,8 +2520,8 @@ tuples used for redirecting top-level domains to specific destinations ...@@ -2417,8 +2520,8 @@ tuples used for redirecting top-level domains to specific destinations
service which is normally part of the log-collector infrastructure. As service which is normally part of the log-collector infrastructure. As
this is a large Java daemon with significant memory requirements, it this is a large Java daemon with significant memory requirements, it
is often useful to disable it for testing environments. Note that in is often useful to disable it for testing environments. Note that in
this case one should also import *services.yml.no-elasticsearch* this case one should import *services.core.yml*
instead of the default *services.yml.default*. instead of the default *services.default.yml*.
`es_log_keep_days` is a dictionary that specifies the retention time `es_log_keep_days` is a dictionary that specifies the retention time
for the various log types, in days. The default is `{ audit: 60, for the various log types, in days. The default is `{ audit: 60,
...@@ -2444,6 +2547,10 @@ instances should scrape their targets (default 10s). ...@@ -2444,6 +2547,10 @@ instances should scrape their targets (default 10s).
`prometheus_lts_scrape_interval` sets how often the long-term `prometheus_lts_scrape_interval` sets how often the long-term
Prometheus instances should scrape the primary ones (default 1m). Prometheus instances should scrape the primary ones (default 1m).
`prometheus_probe_scrape_interval` controls the default
scrape_interval setting for all blackbox probes, and it just defaults
to the value of prometheus_scrape_interval if unset.
`prometheus_external_targets` allows adding additional targets to Prometheus `prometheus_external_targets` allows adding additional targets to Prometheus
beyond those that are described by the service metadata. It is a list of entries beyond those that are described by the service metadata. It is a list of entries
with *name*, *targets* attributes. Optionally, you may specify a *scheme* with *name*, *targets* attributes. Optionally, you may specify a *scheme*
...@@ -2483,6 +2590,15 @@ to be notified about resolved alerts (default False). ...@@ -2483,6 +2590,15 @@ to be notified about resolved alerts (default False).
### Third-party services ### Third-party services
#### ACME
Float's ACME certificate generation service does not require any
configuration, as it will automatically generate a Letsencrypt
account. It is possible, however, to tell it to use a specific account
by providing it with a private key:
`acme_private_key` - ACME private key, in PEM format
#### Private Docker registry #### Private Docker registry
You can have float use a private Docker registry by providing it with You can have float use a private Docker registry by providing it with
...@@ -2543,16 +2659,53 @@ but it will still be active and functional (via *amtool*). ...@@ -2543,16 +2659,53 @@ but it will still be active and functional (via *amtool*).
#### Backups #### Backups
To configure the backup system, you're going to need credentials for To configure the backup system, you're going to need credentials for
an external repository. The backup system the third-party (external) data storage services. While it is possible
uses [restic](https://restic.net), so check its documentation for the to run a production service *without* backups configured, note that
URI syntax. the cluster's functionality will be incomplete unless at least a
Litestream backend is configured.
##### Bulk backup (Restic)
`backup_repository_uri` - URI of the global (shared) restic
repository. Though Restic supports [numerous
backends](https://restic.readthedocs.io/en/stable/030_preparing_a_new_repo.html),
float works best with Restic's own [REST
Server](https://github.com/restic/rest-server).
`backup_repository_restic_password` - password used to encrypt the
restic repository.
##### Asynchronous SQLite replication (Litestream)
`backup_repository_uri` - URI of the global (shared) restic repository Litestream requires a S3-compatible API to store its SQLite WAL
snapshots.
`backup_repository_restic_password` - the password used to encrypt `backup_litestream_config` is the object that configures the
the restic repository. Litestream replica target, and it corresponds to the "replica" field
of the Litestream configuration, so you can check the [Litestream
documentation](https://litestream.io/reference/config/#replica-settings)
for reference. The most important fields to set are `endpoint` (the
URL of the storage service API), and `bucket` (the name of the bucket
to use). The *path* attribute will be automatically set by float,
based on the dataset name.
`backup_litestream_credentials` is a dictionary of environment
variables to configure credentials for access to the backend storage
service. Keys will depend on which type of API is being used, but for
the default *s3* type they should be `LITESTREAM_ACCESS_KEY_ID` and
`LITESTREAM_SECRET_ACCESS_KEY`.
An example of a (fictional) litestream configuration:
```yaml
backup_litestream_config:
type: s3
endpoint: "https://backup.service:9000/"
bucket: "mybackups"
backup_litestream_credentials:
LITESTREAM_ACCESS_KEY_ID: "minio"
LITESTREAM_SECRET_ACCESS_KEY: "miniopassword"
```
# Operations # Operations
...@@ -2568,8 +2721,8 @@ unsupported. ...@@ -2568,8 +2721,8 @@ unsupported.
```shell ```shell
sudo apt install golang ansible sudo apt install golang ansible
go get git.autistici.org/ale/x509ca go install git.autistici.org/ale/x509ca@latest
go get git.autistici.org/ale/ed25519gen go install git.autistici.org/ale/ed25519gen@latest
export PATH=$PATH:$HOME/go/bin export PATH=$PATH:$HOME/go/bin
``` ```
...@@ -2654,7 +2807,7 @@ There are some minimal requirements on how your Ansible environment ...@@ -2654,7 +2807,7 @@ There are some minimal requirements on how your Ansible environment
should be set up for this to work: should be set up for this to work:
* you must have a *group_vars/all* directory (this is where we'll * you must have a *group_vars/all* directory (this is where we'll
write the autogenerated application credentials file *secrets.yml*q) write the autogenerated application credentials file *secrets.yml*)
* you must include float's *playbooks/all.yml* playbook file from the * you must include float's *playbooks/all.yml* playbook file from the
toolkit source directory at the beginning of your playbook toolkit source directory at the beginning of your playbook
* you should use the *float* wrapper instead of running * you should use the *float* wrapper instead of running
...@@ -3097,7 +3250,7 @@ Install the package qrencode, and feed the OTP secret to it. ...@@ -3097,7 +3250,7 @@ Install the package qrencode, and feed the OTP secret to it.
For example with apt ["apt install qrencode" of course]. For example with apt ["apt install qrencode" of course].
```shell ```shell
$ EMAIL="sub@krutt.org" $ EMAIL="foo@example.com"
$ qrencode -t UTF8 "otpauth://totp/example.com:${EMAIL}?secret=${SECRET}&issuer=example.com&algorithm=SHA1&digits=6&period=30" $ qrencode -t UTF8 "otpauth://totp/example.com:${EMAIL}?secret=${SECRET}&issuer=example.com&algorithm=SHA1&digits=6&period=30"
``` ```
...@@ -3123,21 +3276,23 @@ using a hardware token (U2F) is preferred. ...@@ -3123,21 +3276,23 @@ using a hardware token (U2F) is preferred.
### Registering a U2F hardware token for an admin account ### Registering a U2F hardware token for an admin account
In the *group_vars/all/admins.yml* file, you can add the In the *group_vars/all/admins.yml* file, you can add the
*u2f_registrations* attribute to accounts, which is a list of the *webauthn_registrations* attribute to accounts, which is a list of the
allowed U2F device registrations. allowed WebAuthN/U2F device registrations.
To register a new device, you are going to need the *pamu2fcfg* tool To register a new device, you are going to need to install another
(part of the *pamu2fcfg* Debian package). The following snippet should small custom tool:
produce the two YAML attributes that you need to set: [webauthn-cred](https://git.autistici.org/ai3/tools/webauthn-cred). Follow
its installation instructions to obtain the *webauthn-cred* binary,
then invoke it to make a new registration:
```shell ```shell
$ pamu2fcfg --nouser --appid https://accounts.example.com \ $ webauthn-cred --rpid accounts.example.com
| tr -d : \
| awk -F, '{print "key_handle: \"" $1 "\"\npublic_key: \"" $2 "\""}'
``` ```
press enter, touch the key, copy the output and insert it in follow the instructions, copy the output and insert it in
*group_vars/all/admins.yml*, the final results should look like: *group_vars/all/admins.yml* as a new item in the
*webauthn_registrations* attribute of your user. The final results
should look like:
```yaml ```yaml
--- ---
...@@ -3147,14 +3302,11 @@ admins: ...@@ -3147,14 +3302,11 @@ admins:
password: "$a2$3$32768$4$abcdef...." password: "$a2$3$32768$4$abcdef...."
ssh_keys: ssh_keys:
- "ssh-ed25519 AAAAC3Nza..." - "ssh-ed25519 AAAAC3Nza..."
u2f_registrations: webauthn_registrations:
- key_handle: "r4wWRHgzJjl..." - key_handle: "r4wWRHgzJjl..."
public_key: "04803e4aff4..." public_key: "ajgh73-31bc..."
``` ```
**NOTE**: the above will work with *pam_u2f* version 1.0.7, but it will *not*
work with pam_u2f version 1.1.0 due to changes in the output format!
### Upgrading Debian version on target hosts ### Upgrading Debian version on target hosts
Float generally targets the current Debian *stable* distribution, but Float generally targets the current Debian *stable* distribution, but
...@@ -3168,13 +3320,25 @@ process: ...@@ -3168,13 +3320,25 @@ process:
* Set *float_debian_dist* to the new codename (e.g. "buster") in your * Set *float_debian_dist* to the new codename (e.g. "buster") in your
group_vars/all configuration. group_vars/all configuration.
* Run *float*, which will install the correct APT sources for the new * Run *float*, which will install the correct APT sources for the new
release. release and upgrade the servers.
* Run *apt dist-upgrade* manually or via Ansible. This part is not * Reboot the servers into the new kernels.
automated yet due to the large variety in possible scenarios.
* Run *float* again: it will now detect that the distribution has If you want more control over this process (Debian upgrades have been
changed and reconfigure packages as needed. event-less for a while now, but it's not always been the case) you
can of course run the upgrade manually.
### Decommissioning a host
When turning down a host, it is necessary, at some point, to
reschedule the services that were there onto some other hosts. To
achieve a smooth transition, this is best done while the host is still
available.
To do this, set the *turndown* attribute to *true* in the inventory
for the host you want to turn down, and then run *float* once more.
This should safely reschedule all services, and remove them from the
target host. It is then possible to simply shut down the target host
and wipe its data.
# Example scenarios # Example scenarios
...@@ -3201,7 +3365,7 @@ available) for the service. ...@@ -3201,7 +3365,7 @@ available) for the service.
```yaml ```yaml
include: include:
- "/path/to/float/services.yml.default" - "/path/to/float/services.default.yml"
ok: ok:
scheduling_group: backend scheduling_group: backend
num_instances: 1 num_instances: 1
...@@ -3276,7 +3440,7 @@ The services.yml file: ...@@ -3276,7 +3440,7 @@ The services.yml file:
```yaml ```yaml
include: include:
- "/path/to/float/services.yml.default" - "/path/to/float/services.default.yml"
videoconf: videoconf:
scheduling_group: videoconf scheduling_group: videoconf
num_instances: all num_instances: all
......
No preview for this file type
...@@ -51,7 +51,7 @@ Vagrant.configure(2) do |config| ...@@ -51,7 +51,7 @@ Vagrant.configure(2) do |config|
libvirt.memory = {{ ram }} libvirt.memory = {{ ram }}
libvirt.random_hostname = true libvirt.random_hostname = true
libvirt.cpu_mode = 'host-passthrough' libvirt.cpu_mode = 'host-passthrough'
libvirt.volume_cache = 'unsafe' libvirt.disk_driver :cache => 'unsafe'
{% if libvirt.remote_host %} {% if libvirt.remote_host %}
libvirt.host = "{{ libvirt.remote_host }}" libvirt.host = "{{ libvirt.remote_host }}"
libvirt.username = "{{ libvirt.remote_user }}" libvirt.username = "{{ libvirt.remote_user }}"
...@@ -65,7 +65,7 @@ Vagrant.configure(2) do |config| ...@@ -65,7 +65,7 @@ Vagrant.configure(2) do |config|
hosts.each do |hostname, hostvars| hosts.each do |hostname, hostvars|
config.vm.define hostname do |m| config.vm.define hostname do |m|
m.vm.hostname = hostname m.vm.hostname = hostname
m.vm.network "private_network", ip: hostvars["ip"], libvirt__dhcp_enabled: false, libvirt__network_name: network_name m.vm.network "private_network", ip: hostvars["ips"][0], libvirt__dhcp_enabled: false, libvirt__network_name: network_name
end end
end end
end end
...@@ -116,14 +116,18 @@ VVSaq+sWqN+ugjpj9sJ++/O1uSiUPNZdIwIBAg== ...@@ -116,14 +116,18 @@ VVSaq+sWqN+ugjpj9sJ++/O1uSiUPNZdIwIBAg==
'services.yml': '''--- 'services.yml': '''---
include: include:
{% if services_yml_path %} {% if services_yml_path %}
- "{{ services_yml_path | relpath(targetdir) }}" {% for p in services_yml_path %}
- "{{ p | relpath(targetdir) }}"
{% endfor %}
{% else %} {% else %}
- "{{ srcdir | relpath(targetdir) }}/services.yml.no-elasticsearch" - "{{ srcdir | relpath(targetdir) }}/services.core.yml"
{% endif %} {% endif %}
''', ''',
'passwords.yml': '''--- 'passwords.yml': '''---
{% if passwords_yml_path %} {% if passwords_yml_path %}
- include: "{{ passwords_yml_path | relpath(targetdir) }}" {% for p in passwords_yml_path %}
- include: "{{ p | relpath(targetdir) }}"
{% endfor %}
{% else %} {% else %}
- include: "{{ srcdir | relpath(targetdir) }}/passwords.yml.default" - include: "{{ srcdir | relpath(targetdir) }}/passwords.yml.default"
{% endif %} {% endif %}
...@@ -142,8 +146,8 @@ DEFAULT_VARS = { ...@@ -142,8 +146,8 @@ DEFAULT_VARS = {
# Paths, some set by command-line options. # Paths, some set by command-line options.
'srcdir': SRCDIR, 'srcdir': SRCDIR,
'targetdir': None, 'targetdir': None,
'services_yml_path': None, 'services_yml_path': [],
'passwords_yml_path': None, 'passwords_yml_path': [],
'playbooks': [], 'playbooks': [],
# Memory for the virtual machines (MB). # Memory for the virtual machines (MB).
...@@ -158,13 +162,7 @@ DEFAULT_VARS = { ...@@ -158,13 +162,7 @@ DEFAULT_VARS = {
# Ansible inventory (hosts are created dynamically). # Ansible inventory (hosts are created dynamically).
'inventory': { 'inventory': {
'hosts': {}, 'hosts': {},
'group_vars': { 'group_vars': {},
'vagrant': {
'ansible_user': 'vagrant',
'ansible_become': True,
'ansible_ssh_private_key_file': '~/.vagrant.d/insecure_private_key',
},
},
}, },
# Ansible configuration. # Ansible configuration.
...@@ -177,10 +175,12 @@ DEFAULT_VARS = { ...@@ -177,10 +175,12 @@ DEFAULT_VARS = {
'callback_plugins': '{{ srcdir | relpath(targetdir) }}/plugins/callback', 'callback_plugins': '{{ srcdir | relpath(targetdir) }}/plugins/callback',
'force_handlers': True, 'force_handlers': True,
'log_path': 'ansible.log', 'log_path': 'ansible.log',
'retry_files_enabled': False,
'interpreter_python': '/usr/bin/python3',
'nocows': 1, 'nocows': 1,
'display_skipped_hosts': False, 'display_skipped_hosts': False,
'callback_whitelist': 'float_ci', 'callbacks_enabled': 'float_ci',
'stdout_callback': 'float_ci', 'stdout_callback': 'float_ci',
'host_key_checking': False, 'host_key_checking': False,
'forks': 50, 'forks': 50,
...@@ -203,7 +203,7 @@ DEFAULT_VARS = { ...@@ -203,7 +203,7 @@ DEFAULT_VARS = {
'domain_public': [], 'domain_public': [],
'testing': True, 'testing': True,
'float_debian_dist': 'bullseye', 'float_debian_dist': 'bookworm',
'net_overlays': [{ 'net_overlays': [{
'name': 'vpn0', 'name': 'vpn0',
'network': '192.168.13.0/24', 'network': '192.168.13.0/24',
...@@ -296,7 +296,7 @@ def _random_hosts(num_hosts, extra_memberships): ...@@ -296,7 +296,7 @@ def _random_hosts(num_hosts, extra_memberships):
hostvars = { hostvars = {
'name': hostname, 'name': hostname,
'ansible_host': f'{net}.{i+10}', 'ansible_host': f'{net}.{i+10}',
'ip': f'{net}.{i+10}', 'ips': [f'{net}.{i+10}'],
'ip_vpn0': f'192.168.13.{i+10}', 'ip_vpn0': f'192.168.13.{i+10}',
} }
hostgroups = ['vagrant'] hostgroups = ['vagrant']
...@@ -341,7 +341,7 @@ def _render_skel(target_dir, ctx): ...@@ -341,7 +341,7 @@ def _render_skel(target_dir, ctx):
def command_create_env(path, services, passwords, playbooks, def command_create_env(path, services, passwords, playbooks,
roles_path, num_hosts, additional_host_groups, roles_path, num_hosts, additional_host_groups,
additional_configs, ram, domain, infra_domain, additional_configs, ram, domain, infra_domain,
extra_vars): become, extra_vars):
all_vars = DEFAULT_VARS all_vars = DEFAULT_VARS
# Set paths in the internal config. # Set paths in the internal config.
...@@ -350,6 +350,20 @@ def command_create_env(path, services, passwords, playbooks, ...@@ -350,6 +350,20 @@ def command_create_env(path, services, passwords, playbooks,
all_vars['passwords_yml_path'] = passwords all_vars['passwords_yml_path'] = passwords
all_vars['playbooks'] = playbooks all_vars['playbooks'] = playbooks
# Set connection-related user parameters.
if become == 'root':
all_vars['inventory']['group_vars']['vagrant'] = {
'ansible_user': 'root',
'ansible_become': False,
}
else:
all_vars['inventory']['group_vars']['vagrant'] = {
'ansible_user': become,
'ansible_become': True,
# For legacy compatibility reasons.
'ansible_ssh_private_key_file': '~/.vagrant.d/insecure_private_key',
}
# Extend the Ansible roles_path. # Extend the Ansible roles_path.
if roles_path: if roles_path:
for rpath in roles_path.split(':'): for rpath in roles_path.split(':'):
...@@ -359,14 +373,13 @@ def command_create_env(path, services, passwords, playbooks, ...@@ -359,14 +373,13 @@ def command_create_env(path, services, passwords, playbooks,
# Catch ValueError to handle parsing errors for composite-valued # Catch ValueError to handle parsing errors for composite-valued
# options and print a friendly message. # options and print a friendly message.
try: try:
all_vars['inventory']['hosts'] = _random_hosts( extra_memberships = _parse_additional_host_groups(additional_host_groups)
num_hosts,
_parse_additional_host_groups(additional_host_groups),
)
except ValueError: except ValueError:
print('Unable to parse additional-host-group spec', file=sys.stderr) print('Unable to parse additional-host-group spec', file=sys.stderr)
return 1 return 1
all_vars['inventory']['hosts'] = _random_hosts(num_hosts, extra_memberships)
all_vars['ram'] = ram all_vars['ram'] = ram
all_vars['config']['domain_public'] = [domain] all_vars['config']['domain_public'] = [domain]
all_vars['config']['domain'] = ( all_vars['config']['domain'] = (
...@@ -436,7 +449,7 @@ def command_run(config, playbooks, ...@@ -436,7 +449,7 @@ def command_run(config, playbooks,
print('Running playbook %s...' % (arg,)) print('Running playbook %s...' % (arg,))
os.environ['LC_ALL'] = 'C' os.environ['LC_ALL'] = 'C.UTF-8'
_fix_ansible_vault_password_file() _fix_ansible_vault_password_file()
cmd = [os.getenv('ANSIBLE_PLAYBOOK', 'ansible-playbook'), cmd = [os.getenv('ANSIBLE_PLAYBOOK', 'ansible-playbook'),
'-i', config] '-i', config]
...@@ -528,9 +541,11 @@ memberships, using the --additional-host-group command-line option. ...@@ -528,9 +541,11 @@ memberships, using the --additional-host-group command-line option.
help='infrastructural domain to use (default: "infra." + domain)') help='infrastructural domain to use (default: "infra." + domain)')
create_env_parser.add_argument( create_env_parser.add_argument(
'--services', metavar='FILE', '--services', metavar='FILE',
action='append', default=[],
help='your custom services.yml') help='your custom services.yml')
create_env_parser.add_argument( create_env_parser.add_argument(
'--passwords', metavar='FILE', '--passwords', metavar='FILE',
action='append', default=[],
help='your custom passwords.yml') help='your custom passwords.yml')
create_env_parser.add_argument( create_env_parser.add_argument(
'--playbook', metavar='FILE', '--playbook', metavar='FILE',
...@@ -542,6 +557,9 @@ memberships, using the --additional-host-group command-line option. ...@@ -542,6 +557,9 @@ memberships, using the --additional-host-group command-line option.
create_env_parser.add_argument( create_env_parser.add_argument(
'--ram', metavar='MB', type=int, default=3072, '--ram', metavar='MB', type=int, default=3072,
help='RAM for each VM when using --vagrant (default: 3072)') help='RAM for each VM when using --vagrant (default: 3072)')
create_env_parser.add_argument(
'--become', metavar='USER', default='root',
help='ansible_user, disable ansible_become if "root"')
create_env_parser.add_argument( create_env_parser.add_argument(
'--additional-host-group', metavar='GROUP=HOST1[,HOST2...]', '--additional-host-group', metavar='GROUP=HOST1[,HOST2...]',
dest='additional_host_groups', dest='additional_host_groups',
......
...@@ -43,3 +43,6 @@ ...@@ -43,3 +43,6 @@
roles: roles:
- float-infra-sso-server - float-infra-sso-server
- hosts: assets
roles:
- float-infra-assetmon
...@@ -2,6 +2,9 @@ ...@@ -2,6 +2,9 @@
- hosts: all - hosts: all
tasks: tasks:
- copy:
src: ../roles/float-base/files/apt/deb_autistici_org.gpg
dest: /usr/share/keyrings/deb.autistici.org.gpg
- apt: - apt:
update_cache: yes update_cache: yes
upgrade: "yes" upgrade: "yes"
......
...@@ -13,10 +13,10 @@ ...@@ -13,10 +13,10 @@
roles: roles:
- float-infra-dns - float-infra-dns
- hosts: admin_dashboard - hosts: service_dashboard
gather_facts: no gather_facts: no
roles: roles:
- float-infra-admin-dashboard - float-infra-service-dashboard
- hosts: acme - hosts: acme
gather_facts: no gather_facts: no
......
...@@ -28,7 +28,6 @@ ...@@ -28,7 +28,6 @@
- dnssec - dnssec
- ssh - ssh
- sso - sso
- x509
# First of all, generate secrets from the passwords.yml file. # First of all, generate secrets from the passwords.yml file.
- name: Initialize secrets - name: Initialize secrets
...@@ -50,8 +49,17 @@ ...@@ -50,8 +49,17 @@
- name: Generate SSO credentials - name: Generate SSO credentials
local_action: ed25519 privkey="{{ credentials_dir }}/sso/secret.key" pubkey="{{ credentials_dir }}/sso/public.key" local_action: ed25519 privkey="{{ credentials_dir }}/sso/secret.key" pubkey="{{ credentials_dir }}/sso/public.key"
- name: Generate global DH params - set_fact:
local_action: command openssl dhparam -out "{{ credentials_dir }}/x509/dhparam" "{{ dhparam_bits | default('2048') }}" creates="{{ credentials_dir }}/x509/dhparam" default_x509_ca_list:
- {tag: x509}
- name: Create X509 CA directory
local_action: file path="{{ credentials_dir }}/{{ item.tag }}" state=directory
loop: "{{ x509_ca_list | default(default_x509_ca_list) }}"
- name: Generate the X509 CA certificate - name: Generate the X509 CA certificate
local_action: x509_ca ca_subject="{{ x509_ca_subject | default('CN=Service CA') }}" ca_cert_path="{{ credentials_dir }}/x509/ca.pem" ca_key_path="{{ credentials_dir }}/x509/ca_private_key.pem" local_action: x509_ca ca_subject="{{ item.subject | default('CN=Service CA') }}" ca_cert_path="{{ credentials_dir }}/{{ item.tag }}/ca.pem" ca_key_path="{{ credentials_dir }}/{{ item.tag }}/ca_private_key.pem"
loop: "{{ x509_ca_list | default(default_x509_ca_list) }}"
- name: Generate global DH params
local_action: command openssl dhparam -out "{{ credentials_dir }}/x509/dhparam-{{ dhparam_bits | default('2048') }}" "{{ dhparam_bits | default('2048') }}" creates="{{ credentials_dir }}/x509/dhparam-{{ dhparam_bits | default('2048') }}"
# Prepare a SSH authorized_keys file content using float 'admins'.
from ansible.plugins.action import ActionBase
class ActionModule(ActionBase):
    """Build an SSH authorized_keys blob from the float 'admins' variable.

    Exposes the result as the 'float_authorized_keys' Ansible fact: one
    line per SSH public key, with the owning admin's username appended
    as the key comment.
    """

    TRANSFERS_FILES = False

    def run(self, tmp=None, task_vars=None):
        # 'admins' is a list of dicts with at least a 'name' attribute
        # and, optionally, a list of 'ssh_keys' strings — TODO confirm
        # shape against the group_vars/all/admins.yml schema.
        admins = self._templar.template('{{ admins }}')
        authorized_keys = []
        # For each SSH key, add a comment with the owner's username.
        for entry in admins:
            username = entry['name']
            # Admins without any SSH keys contribute nothing.
            if 'ssh_keys' not in entry:
                continue
            for key in entry['ssh_keys']:
                # Keep only the 'type base64data' fields, replacing any
                # original comment with the owner's username.
                key_without_comment = ' '.join(key.split()[:2])
                authorized_keys.append(f'{key_without_comment} {username}\n')
        # Modern zero-argument super(), consistent with the other float
        # action plugins in this tree.
        result = super().run(tmp, task_vars)
        result['ansible_facts'] = {
            'float_authorized_keys': ''.join(authorized_keys),
        }
        # The fact is derived data; never report a change.
        result['changed'] = False
        return result
from ansible.plugins.action import ActionBase
# Mount flags applied to every tmpfs mount: world-writable with the
# sticky bit, matching the conventional permissions of /tmp.
TMPFS_FLAGS = 'tmpfs-mode=01777'
# Size cap used for tmpfs mounts when the service does not specify one.
DEFAULT_TMPFS_SIZE = '64M'
class ActionModule(ActionBase):
    """Compute the container runtime command-line options for a service.

    Reads the 'service' and 'container' task arguments (float service
    metadata) and returns the assembled list of podman/docker options in
    result['options']: environment variables, mounts, network settings,
    capability restrictions and any raw 'docker_options' passthrough.
    """

    TRANSFERS_FILES = False

    # Options to set the container environment.
    def _environment_options(self, service, container):
        # Float identity variables are always set; the container's own
        # 'env' map can extend or override them.
        service_name = service['name']
        hostname = self._templar.template('{{ inventory_hostname }}')
        domain = self._templar.template('{{ domain }}')
        env = {
            'FLOAT_SERVICE': f'{service_name}.{domain}',
            'FLOAT_INSTANCE_NAME': f'{hostname}.{service_name}.{domain}',
            'FLOAT_CONTAINER_IMAGE': container['image'],
            'FLOAT_CONTAINER_NAME': f'{service_name}-{container["name"]}',
        }
        if 'env' in container:
            env.update(container['env'])
        options = []
        # Sorted for a stable option order, so the generated
        # configuration is deterministic across runs.
        for key, value in sorted(env.items()):
            options.append(f'--env={key}={value}')
        return options

    # Options for volumes (tmpfs, bind mounts).
    def _mount_options(self, service, container):
        options = []
        # Whether we still need to provide a tmpfs for /tmp ourselves.
        add_tmpfs = True

        # Helper: bind-mount src from the host at dst in the container.
        def _bind(src, dst):
            options.append(f'--mount=type=bind,source={src},destination={dst}')

        # Helper: mount a tmpfs at dst, with optional extra flags.
        def _tmpfs(dst, flags=None):
            opt = f'--mount=type=tmpfs,destination={dst},{TMPFS_FLAGS}'
            if flags:
                opt += f',{flags}'
            options.append(opt)

        if container.get('readonly', True):
            options.append('--read-only')
            # NOTE(review): presumably the runtime's --read-only mode
            # provides its own tmpfs on /tmp (podman's
            # --read-only-tmpfs default) — confirm.
            add_tmpfs = False
        for vol in container.get('volumes', []):
            for src, dst in sorted(vol.items()):
                # A service-provided /tmp volume supersedes ours.
                if dst == '/tmp':
                    add_tmpfs = False
                if src == 'tmpfs':
                    _tmpfs(dst, f'tmpfs-size={DEFAULT_TMPFS_SIZE}')
                elif src.startswith('tmpfs/'):
                    # 'tmpfs/<size>' syntax: explicit size after prefix.
                    sz = src[6:]
                    _tmpfs(dst, f'tmpfs-size={sz}')
                else:
                    _bind(src, dst)
        # Standard mounts every container gets: a writable /run, the
        # host syslog socket, and the system X509 trust store.
        _tmpfs('/run', 'tmpfs-size=16M,exec=true,notmpcopyup')
        _bind('/dev/log', '/dev/log')
        _bind('/etc/credentials/system', '/etc/ssl/certs')
        if add_tmpfs:
            _tmpfs('/tmp', f'tmpfs-size={DEFAULT_TMPFS_SIZE},notmpcopyup')
        # Bind-mount each of the service's credential directories at
        # the same path inside the container.
        for creds in service.get('service_credentials', []):
            creds_name = creds['name']
            ca_tag = creds.get('ca_tag', 'x509')
            creds_path = f'/etc/credentials/{ca_tag}/{creds_name}'
            _bind(creds_path, creds_path)
        return options

    # Network options (ports).
    def _network_options(self, container):
        # Containers share the host network namespace; ports are only
        # declared via --expose.
        options = ['--network=host']
        ports = []
        if 'ports' in container:
            ports = container['ports']
        elif 'port' in container:
            ports = [container['port']]
        for port in sorted(ports):
            options.append(f'--expose={port}')
        return options

    def run(self, tmp=None, task_vars=None):
        service = self._task.args['service']
        container = self._task.args['container']
        options = []
        options.extend(self._environment_options(service, container))
        options.extend(self._mount_options(service, container))
        options.extend(self._network_options(container))
        # Capabilities are dropped by default unless the container asks
        # to run as root; 'drop_capabilities' overrides either way.
        is_root = container.get('root')
        if container.get('drop_capabilities', not is_root):
            options.append('--security-opt=no-new-privileges')
            options.append('--cap-drop=all')
        # Raw passthrough of additional runtime flags (whitespace-split).
        if 'docker_options' in container:
            options.extend(container['docker_options'].split())
        result = super().run(tmp, task_vars)
        result['options'] = options
        # Computing options never changes remote state.
        result['changed'] = False
        return result
# Generate a host configuration file for tinc (fetching the public key
# from the remote host), and store the result in an Ansible fact.
from ansible.plugins.action import ActionBase
from ansible.errors import AnsibleFileNotFound
from ansible.module_utils._text import to_text
# Jinja template for a tinc hosts-file entry. The tinc_host_subnet,
# tinc_host_public_key and tinc_config variables are injected into the
# templar before rendering (see ActionModule.run below).
HOST_TEMPLATE = '''
Address = {{ ip }}
{% if ip6 is defined %}Address = {{ ip6 }}{% endif %}
Port = {{ tinc_config.port | default('655') }}
Cipher = {{ tinc_config.cipher | default('aes-128-cbc') }}
Digest = {{ tinc_config.digest | default('sha256') }}
Compression = {{ tinc_config.compression | default('0') }}
PMTU = {{ tinc_config.pmtu | default('1460') }}
Subnet = {{ tinc_host_subnet }}
{{ tinc_host_public_key }}
'''
class ActionModule(ActionBase):
    """Generate the tinc host configuration file for one overlay.

    Fetches the host's RSA public key from the remote machine, renders
    HOST_TEMPLATE with overlay-specific parameters, and stores the
    rendered text in the 'tinc_host_config' Ansible fact.
    """

    TRANSFERS_FILES = False

    def _cmd(self, task_vars, args, creates=None):
        # Run a remote command through the 'command' module and return
        # the raw module result dictionary.
        args = {
            '_raw_params': ' '.join(args),
            'creates': creates,
        }
        return self._execute_module(
            module_name='command',
            module_args=args,
            task_vars=task_vars,
            wrap_async=False)

    def run(self, tmp=None, task_vars=None):
        overlay = self._task.args['overlay']
        subnet = self._templar.template('{{ ip_%s }}/32' % overlay)
        # Find the overlay configuration by scanning the 'net_overlays'
        # configuration variable, which is a list - it would be simpler with
        # a dictionary.
        net_overlays = self._templar.template('{{ net_overlays|default([]) }}')
        overlay_config = {'name': overlay}
        for n in net_overlays:
            if n['name'] == overlay:
                overlay_config = n
                break
        result = super(ActionModule, self).run(tmp, task_vars)
        # Fetch the host public key. Use .get() because a failed module
        # invocation may return no 'stdout' key at all; in that case we
        # want the explicit error below, not a KeyError.
        cmd_result = self._cmd(task_vars, [
            '/bin/cat', '/etc/tinc/%s/rsa_key.pub' % overlay])
        pubkey = cmd_result.get('stdout', '')
        if not pubkey:
            result['failed'] = True
            result['msg'] = "could not fetch host public key"
            return result
        # Generate the template, adding some custom variables of our own.
        self._templar._available_variables['tinc_host_subnet'] = subnet
        self._templar._available_variables['tinc_host_public_key'] = pubkey
        self._templar._available_variables['tinc_config'] = overlay_config
        data = self._templar.do_template(HOST_TEMPLATE,
                                         preserve_trailing_newlines=True,
                                         escape_backslashes=False)
        result['ansible_facts'] = {'tinc_host_config': data}
        result['changed'] = False
        return result
...@@ -38,6 +38,9 @@ DEFAULT_SERVICE_CREDENTIALS = [ ...@@ -38,6 +38,9 @@ DEFAULT_SERVICE_CREDENTIALS = [
{ {
'name': 'auth-server', 'name': 'auth-server',
}, },
{
'name': 'assetmon-client',
},
] ]
...@@ -172,30 +175,31 @@ def _host_groups(name, inventory, assignments=None): ...@@ -172,30 +175,31 @@ def _host_groups(name, inventory, assignments=None):
# Return all host IP addresses for the specified overlay. # Return all host IP addresses for the specified overlay.
def _host_net_overlay_addrs(name, inventory, overlay): def _host_net_overlay_addrs(name, inventory, overlay):
if overlay == 'public': if overlay == 'public':
keys = ('ip', 'ip6') return inventory['hosts'][name]['public_ips']
else:
keys = ('ip_' + overlay,)
addrs = [] addrs = []
for k in keys: key = 'ip_' + overlay
v = inventory['hosts'][name].get(k) if key in inventory['hosts'][name]:
if v: addrs.append(inventory['hosts'][name][key])
addrs.append(v)
return addrs return addrs
# Return all host IP addresses, on all interfaces. # Return all host IP addresses, on all interfaces.
def _host_addrs(name, inventory): def _host_addrs(name, inventory):
return [ addrs = []
v for k, v in inventory['hosts'][name].items() for ip in inventory['hosts'][name]['ips']:
if k == 'ip' or k == 'ip6' or k.startswith('ip_')] addrs.append(ip)
for k, v in inventory['hosts'][name].items():
if k.startswith('ip_'):
addrs.append(v)
return addrs
def _host_dns_map(name, inventory): def _host_dns_map(name, inventory):
dns = {} dns = {}
dns[name] = inventory['hosts'][name]['ips']
for k, v in inventory['hosts'][name].items(): for k, v in inventory['hosts'][name].items():
if k == 'ip' or k == 'ip6': if k.startswith('ip_'):
dns.setdefault(name, []).append(v)
elif k.startswith('ip_'):
dns.setdefault(name + '.' + k[3:], []).append(v) dns.setdefault(name + '.' + k[3:], []).append(v)
return dns return dns
...@@ -278,6 +282,16 @@ def _global_dns_map(inventory): ...@@ -278,6 +282,16 @@ def _global_dns_map(inventory):
return dns return dns
# Return the hosts that are not available for scheduling, as a
# Python set.
def _unavailable_hosts(inventory):
unavail = set()
for name, values in inventory['hosts'].items():
if values.get('turndown'):
unavail.add(name)
return unavail
# Build a group -> hosts map out of an inventory. # Build a group -> hosts map out of an inventory.
def _build_group_map(inventory, assignments=None): def _build_group_map(inventory, assignments=None):
group_map = {} group_map = {}
...@@ -318,6 +332,7 @@ def _build_public_endpoints_map(services): ...@@ -318,6 +332,7 @@ def _build_public_endpoints_map(services):
'name': upstream_name, 'name': upstream_name,
'service_name': service_name, 'service_name': service_name,
'port': pe['port'], 'port': pe['port'],
'enable_api_proxy': pe.get('enable_api_proxy', False),
'enable_sso_proxy': pe.get('enable_sso_proxy', False), 'enable_sso_proxy': pe.get('enable_sso_proxy', False),
'sharded': pe.get('sharded', False), 'sharded': pe.get('sharded', False),
} }
...@@ -348,6 +363,14 @@ def _build_public_endpoints_map(services): ...@@ -348,6 +363,14 @@ def _build_public_endpoints_map(services):
return upstreams, endpoints return upstreams, endpoints
def _build_public_endpoint_port_map(services):
endpoints_by_port = {}
for svc in services.values():
for pe in svc.get('public_endpoints', []):
endpoints_by_port[pe['port']] = pe['name']
return endpoints_by_port
# Build the map of upstreams for 'horizontal' (well-known etc) HTTP # Build the map of upstreams for 'horizontal' (well-known etc) HTTP
# public endpoints. # public endpoints.
# #
...@@ -363,6 +386,7 @@ def _build_horizontal_upstreams_map(services): ...@@ -363,6 +386,7 @@ def _build_horizontal_upstreams_map(services):
'name': upstream_name, 'name': upstream_name,
'service_name': service_name, 'service_name': service_name,
'port': ep['port'], 'port': ep['port'],
'enable_api_proxy': False,
'enable_sso_proxy': False, 'enable_sso_proxy': False,
'sharded': False, 'sharded': False,
} }
...@@ -487,7 +511,10 @@ class Assignments(object): ...@@ -487,7 +511,10 @@ class Assignments(object):
return str(self._fwd) return str(self._fwd)
@classmethod @classmethod
def _available_hosts(cls, service, group_map): def _available_hosts(cls, service, group_map, service_hosts_map,
unavailable_hosts={}):
if 'schedule_with' in service:
return service_hosts_map[service['schedule_with']]
scheduling_groups = ['all'] scheduling_groups = ['all']
if 'scheduling_group' in service: if 'scheduling_group' in service:
scheduling_groups = [service['scheduling_group']] scheduling_groups = [service['scheduling_group']]
...@@ -495,8 +522,10 @@ class Assignments(object): ...@@ -495,8 +522,10 @@ class Assignments(object):
scheduling_groups = service['scheduling_groups'] scheduling_groups = service['scheduling_groups']
available_hosts = set() available_hosts = set()
for g in scheduling_groups: for g in scheduling_groups:
if g not in group_map:
          raise Exception(f'The scheduling_group "{g}" is not defined in inventory')
available_hosts.update(group_map[g]) available_hosts.update(group_map[g])
return list(available_hosts) return list(available_hosts.difference(unavailable_hosts))
@classmethod @classmethod
def schedule(cls, services, inventory): def schedule(cls, services, inventory):
...@@ -509,24 +538,32 @@ class Assignments(object): ...@@ -509,24 +538,32 @@ class Assignments(object):
""" """
service_hosts_map = {} service_hosts_map = {}
service_master_map = {} service_master_map = {}
unavailable_hosts = _unavailable_hosts(inventory)
group_map = _build_group_map(inventory) group_map = _build_group_map(inventory)
host_occupation = collections.defaultdict(int) host_occupation = collections.defaultdict(int)
# Iterations should happen over sorted items for reproducible # Iterations should happen over sorted items for reproducible
# results. The sort function combines the 'scheduling_order' # results. The sort function combines the 'scheduling_order'
# attribute (default -1) and the service name. # attribute (default -1), the presence of the 'schedule_with'
# attribute, and the service name.
def _sort_key(service_name): def _sort_key(service_name):
return (services[service_name].get('scheduling_order', -1), service_name) return (services[service_name].get('scheduling_order', -1),
1 if 'schedule_with' in services[service_name] else 0,
service_name)
for service_name in sorted(services.keys(), key=_sort_key): for service_name in sorted(services.keys(), key=_sort_key):
service = services[service_name] service = services[service_name]
available_hosts = cls._available_hosts(service, group_map) available_hosts = cls._available_hosts(service, group_map,
service_hosts_map,
unavailable_hosts)
num_instances = service.get('num_instances', 'all') num_instances = service.get('num_instances', 'all')
if num_instances == 'all': if num_instances == 'all':
service_hosts = sorted(available_hosts) service_hosts = sorted(available_hosts)
else: else:
service_hosts = sorted(_binpack( service_hosts = sorted(_binpack(
available_hosts, host_occupation, num_instances)) available_hosts, host_occupation, num_instances))
if not service_hosts:
raise Exception(f'No hosts available to schedule service {service_name}')
service_hosts_map[service_name] = service_hosts service_hosts_map[service_name] = service_hosts
for h in service_hosts: for h in service_hosts:
host_occupation[h] += 1 host_occupation[h] += 1
...@@ -551,10 +588,34 @@ def _any_attribute_set(services, attr): ...@@ -551,10 +588,34 @@ def _any_attribute_set(services, attr):
return False return False
# Pre-process inventory entries, to normalize host variables and
# provide defaults (thus simplifying the jinja template logic).
def _preprocess_inventory(inventory):
for host in inventory['hosts'].values():
# Set 'ips' if the legacy variables are set.
if 'ips' not in host:
host['ips'] = []
if 'ip' in host:
host['ips'].append(host['ip'])
if 'ip6' in host:
host['ips'].append(host['ip6'])
# Same for 'public_ips'.
if 'public_ips' not in host:
host['public_ips'] = []
if 'public_ip' in host:
host['public_ips'].append(host['public_ip'])
if 'public_ip6' in host:
host['public_ips'].append(host['public_ip6'])
# Default public_ips to ips.
if not host['public_ips']:
host['public_ips'] = host['ips']
# Run the scheduler, and return inventory and groups for Ansible. # Run the scheduler, and return inventory and groups for Ansible.
def run_scheduler(config): def run_scheduler(config):
services = config['services'] services = config['services']
inventory = config['inventory'] inventory = config['inventory']
_preprocess_inventory(inventory)
assignments = Assignments.schedule(services, inventory) assignments = Assignments.schedule(services, inventory)
# Augment all data structures with autogenerated and # Augment all data structures with autogenerated and
...@@ -583,6 +644,7 @@ def run_scheduler(config): ...@@ -583,6 +644,7 @@ def run_scheduler(config):
# The following variables are just used for debugging purposes (dashboards). # The following variables are just used for debugging purposes (dashboards).
'float_service_assignments': assignments._fwd, 'float_service_assignments': assignments._fwd,
'float_service_masters': assignments._masters, 'float_service_masters': assignments._masters,
'float_http_endpoints_by_port': _build_public_endpoint_port_map(services),
}) })
# Set the HTTP frontend configuration on the 'frontend' group. # Set the HTTP frontend configuration on the 'frontend' group.
...@@ -590,8 +652,6 @@ def run_scheduler(config): ...@@ -590,8 +652,6 @@ def run_scheduler(config):
inventory['group_vars'].setdefault('frontend', {}).update({ inventory['group_vars'].setdefault('frontend', {}).update({
'float_enable_http_frontend': _any_attribute_set( 'float_enable_http_frontend': _any_attribute_set(
services, 'public_endpoints'), services, 'public_endpoints'),
'float_enable_tcp_frontend': _any_attribute_set(
services, 'public_tcp_endpoints'),
'float_http_upstreams': http_upstreams, 'float_http_upstreams': http_upstreams,
'float_http_endpoints': http_endpoints, 'float_http_endpoints': http_endpoints,
}) })
......
--- ---
{% set all_ips = hostvars.values() | rejectattr('ips', 'undefined') | map(attribute='ips') | flatten | sort %}
backends_dir: /etc/auth-server/backends.d backends_dir: /etc/auth-server/backends.d
services_dir: /etc/auth-server/services.d services_dir: /etc/auth-server/services.d
services: {} services: {}
webauthn:
rp_id: "{{ webauthn_rp_id }}"
rp_origin: "https://{{ webauthn_rp_id }}"
rp_display_name: "{{ webauthn_rp_display_name | default(webauthn_rp_id) }}"
{% if 'user-meta-server' in services %} {% if 'user-meta-server' in services %}
user_meta_server: user_meta_server:
url: "https://user-meta-server.{{ domain }}:5505" url: "https://user-meta-server.{{ domain }}:5505"
...@@ -25,9 +32,9 @@ rate_limits: ...@@ -25,9 +32,9 @@ rate_limits:
value: "127.0.0.1" value: "127.0.0.1"
- key: ip - key: ip
value: "::1" value: "::1"
{% for h in groups['all'] | sort %} {% for ip in all_ips %}
- key: ip - key: ip
value: "{{ hostvars[h]['ip'] }}" value: "{{ ip }}"
{% endfor %} {% endfor %}
# Per-IP rate limiter specific to account recovery, with stricter limits. # Per-IP rate limiter specific to account recovery, with stricter limits.
...@@ -40,9 +47,9 @@ rate_limits: ...@@ -40,9 +47,9 @@ rate_limits:
value: "127.0.0.1" value: "127.0.0.1"
- key: ip - key: ip
value: "::1" value: "::1"
{% for h in groups['all'] | sort %} {% for ip in all_ips %}
- key: ip - key: ip
value: "{{ hostvars[h]['ip'] }}" value: "{{ ip }}"
{% endfor %} {% endfor %}
# Blacklist users with too many failed account recovery attempts. # Blacklist users with too many failed account recovery attempts.
...@@ -73,9 +80,9 @@ rate_limits: ...@@ -73,9 +80,9 @@ rate_limits:
value: "127.0.0.1" value: "127.0.0.1"
- key: ip - key: ip
value: "::1" value: "::1"
{% for h in groups['all'] | sort %} {% for ip in all_ips %}
- key: ip - key: ip
value: "{{ hostvars[h]['ip'] }}" value: "{{ ip }}"
{% endfor %} {% endfor %}
{% if 'auth-cache' in services %} {% if 'auth-cache' in services %}
......