Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found
Select Git revision
Loading items

Target

Select target project
  • ai3/float
  • micah/float
2 results
Select Git revision
Loading items
Show changes
Commits on Source (469)
Showing
with 737 additions and 417 deletions
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
stages: stages:
- docker_build - docker_build
- test - test
- cleanup
variables: variables:
IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG
...@@ -20,67 +21,110 @@ variables: ...@@ -20,67 +21,110 @@ variables:
--passwords=${TEST_DIR}/passwords.yml --passwords=${TEST_DIR}/passwords.yml
--num-hosts=1 --num-hosts=1
${LIBVIRT:+-e libvirt.remote_host=${LIBVIRT#*@} -e libvirt.remote_user=${LIBVIRT%@*}} ${LIBVIRT:+-e libvirt.remote_host=${LIBVIRT#*@} -e libvirt.remote_user=${LIBVIRT%@*}}
-e ansible_cfg.defaults.strategy=mitogen_linear ${MITOGEN:+-e ansible_cfg.defaults.strategy_plugins=${MITOGEN}/ansible_mitogen/plugins/strategy}
${APT_PROXY:+-e config.apt_proxy=${APT_PROXY}} ${APT_PROXY:+-e config.apt_proxy=${APT_PROXY}}
$CREATE_ENV_VARS $BUILD_DIR $CREATE_ENV_VARS $BUILD_DIR
- with-ssh-key ./scripts/floatup.py ${LIBVIRT:+--ssh $LIBVIRT} --inventory $BUILD_DIR/hosts.yml --ram 2048 --cpu 2 --image ${VM_IMAGE:-buster} up - with-ssh-key floatup ${LIBVIRT:+--ssh $LIBVIRT} --inventory $BUILD_DIR/hosts.yml --ram 2048 --cpu 2 --image ${VM_IMAGE:-bookworm} ${FLOATUP_ARGS} up
- ls -al /root/.ssh
- cat /root/.ssh/config
- cat $BUILD_DIR/hosts.yml
- with-ssh-key ./test-driver init --no-vagrant $BUILD_DIR - with-ssh-key ./test-driver init --no-vagrant $BUILD_DIR
- with-ssh-key ./test-driver run $BUILD_DIR - with-ssh-key ./test-driver run $BUILD_DIR
after_script: after_script:
- with-ssh-key ./test-driver cleanup --no-vagrant $BUILD_DIR - with-ssh-key ./test-driver cleanup --no-vagrant $BUILD_DIR
- with-ssh-key ./scripts/floatup.py ${LIBVIRT:+--ssh $LIBVIRT} down - with-ssh-key floatup ${LIBVIRT:+--ssh $LIBVIRT} down
variables: variables:
CREATE_ENV_VARS: "" CREATE_ENV_VARS: ""
TEST_DIR: "" TEST_DIR: ""
tags: [ai3] tags: [ai3]
# Some artifacts may be missing, depending on the specific job.
artifacts: artifacts:
when: on_failure when: always
expire_in: 1 week expire_in: 1 week
name: "${CI_JOB_NAME}_${CI_COMMIT_REF_SLUG}_${CI_COMMIT_SHORT_SHA}" name: "${CI_JOB_NAME}_${CI_COMMIT_REF_SLUG}_${CI_COMMIT_SHORT_SHA}"
reports:
dotenv: deploy.env
junit: pytest.xml
paths: paths:
- ".vmine_group_review*"
- "${BUILD_DIR}/ansible.log" - "${BUILD_DIR}/ansible.log"
- "${BUILD_DIR}/logs" - "${BUILD_DIR}/logs"
base_test: base_test:
<<: *base_test <<: *base_test
variables: variables:
CREATE_ENV_VARS: "-e config.float_debian_dist=buster" VM_IMAGE: "bookworm"
TEST_DIR: "test/base.ref" TEST_DIR: "test/base.ref"
base_bullseye_test: trixie_test:
<<: *base_test <<: *base_test
# Need a more recent Ansible version, for Python 3.12 targets.
image: registry.git.autistici.org/ai3/docker/float-runner:trixie
variables: variables:
VM_IMAGE: "bullseye" VM_IMAGE: "trixie"
CREATE_ENV_VARS: "-e config.float_debian_dist=bullseye -e inventory.group_vars.vagrant.ansible_python_interpreter=/usr/bin/python3" CREATE_ENV_VARS: "-e config.float_debian_dist=trixie"
TEST_DIR: "test/base.ref" TEST_DIR: "test/base.ref"
full_test: full_test:
<<: *base_test <<: *base_test
variables: variables:
CREATE_ENV_VARS: "-e config.float_debian_dist=buster" VM_IMAGE: "bookworm"
TEST_DIR: "test/full.ref" TEST_DIR: "test/full.ref"
rules:
- if: $CI_MERGE_REQUEST_ID == ''
full_bullseye_test: full_test_review:
<<: *base_test <<: *base_test
after_script:
- with-ssh-key ./test-driver cleanup --no-vagrant $BUILD_DIR
variables: variables:
VM_IMAGE: "bullseye" VM_IMAGE: "bookworm"
CREATE_ENV_VARS: "-e config.float_debian_dist=bullseye -e inventory.group_vars.vagrant.ansible_python_interpreter=/usr/bin/python3" CREATE_ENV_VARS: "-e inventory.group_vars.vagrant.ansible_python_interpreter=/usr/bin/python3"
FLOATUP_ARGS: "--state-file .vmine_group_review_$CI_MERGE_REQUEST_ID --ttl 6h --env deploy.env --dashboard-url https://vm.investici.org"
TEST_DIR: "test/full.ref" TEST_DIR: "test/full.ref"
allow_failure: true
environment:
name: review/$CI_COMMIT_REF_SLUG
url: $VMINE_GROUP_URL
on_stop: stop_full_test_review
auto_stop_in: "6 hours"
rules:
- if: $CI_MERGE_REQUEST_ID
stop_full_test_review:
stage: cleanup
dependencies: [full_test_review]
image: registry.git.autistici.org/ai3/docker/float-runner:master
script:
- with-ssh-key floatup --state-file .vmine_group_review_$CI_MERGE_REQUEST_ID ${LIBVIRT:+--ssh $LIBVIRT} down
allow_failure: true
environment:
name: review/$CI_COMMIT_REF_SLUG
action: stop
rules:
- if: $CI_MERGE_REQUEST_ID
when: manual
#backup_test:
# <<: *base_test
# variables:
# VM_IMAGE: "bullseye"
# CREATE_ENV_VARS: "--additional-config test/backup.ref/config-backup.yml --playbook test/backup.ref/site.yml"
# TEST_DIR: "test/backup.ref"
docker_build_and_release_tests: docker_build_and_release_tests:
stage: docker_build stage: docker_build
image: docker:latest image: quay.io/podman/stable
services: tags: [podman]
- docker:dind
script: script:
- docker login -u gitlab-ci-token -p $CI_JOB_TOKEN registry.git.autistici.org - echo -n "$CI_JOB_TOKEN" | podman login -u gitlab-ci-token --password-stdin $CI_REGISTRY
- cd test && docker build --build-arg ci_token=$CI_JOB_TOKEN --pull -t $IMAGE_TAG . - cd test && podman build --build-arg ci_token=$CI_JOB_TOKEN --pull -t $IMAGE_TAG .
- docker tag $IMAGE_TAG $CI_REGISTRY_IMAGE:integration-test - podman tag $IMAGE_TAG $CI_REGISTRY_IMAGE:integration-test
- docker push $CI_REGISTRY_IMAGE:integration-test - podman push $CI_REGISTRY_IMAGE:integration-test
only: only:
changes: changes:
- test/float_integration_test/** - test/float_integration_test/**
- test/Dockerfile
refs: refs:
- master - master
...@@ -77,8 +77,8 @@ on the local machine using [Go](https://golang.org): ...@@ -77,8 +77,8 @@ on the local machine using [Go](https://golang.org):
```shell ```shell
sudo apt-get install golang sudo apt-get install golang
go get -u git.autistici.org/ale/x509ca go install git.autistici.org/ale/x509ca@latest
go get -u git.autistici.org/ale/ed25519gen go install git.autistici.org/ale/ed25519gen@latest
export PATH=$PATH:$HOME/go/bin export PATH=$PATH:$HOME/go/bin
``` ```
......
# Build a simple float-based environment off a
# services.yml/passwords.yml combination.
#
# Example usage:
#
# include:
# - project: ai3/float
# file: ci/deploy.yml
#
# variables:
# FLOATUP_ARGS: "--ssh user@jump.host"
#
# deploy:
# variables:
# SERVICES_FILE: my-services.yml
# PASSWORDS_FILE: my-passwords.yml
#
# Pipeline layout: "deploy" creates the environment and test VMs,
# "run" applies the configuration and runs tests (the manual teardown
# job also lives in the "run" stage).
stages:
  - deploy
  - run
# These are all configurable, globally and per-build.
variables:
  # Project inputs, layered on top of float's own defaults by create-env.
  SERVICES_FILE: services.yml
  PASSWORDS_FILE: passwords.yml
  PLAYBOOK_FILE: site.yml
  ROLES_PATH: "${CI_PROJECT_DIR}/roles"
  DOMAIN: example.com
  # Test VM sizing and lifetime.
  NUM_HOSTS: 1
  VM_IMAGE: bullseye
  VM_RAM: 2048
  VM_CPU: 1
  VM_TTL: "4h"
  # Distinguishes parallel environments within the same pipeline.
  VM_TAG: default
  VM_DASHBOARD_URL: "https://vm.investici.org"
  # Extra command-line arguments passed through to floatup / create-env.
  FLOATUP_ARGS: ""
  CREATE_ENV_ARGS: ""
# Template for jobs that build a float environment from a
# services/passwords configuration and bring up the test VMs.
# Consumers extend this (see the "deploy" job below) and may override
# SERVICES_FILE / PASSWORDS_FILE and the VM_* variables.
.deploy_template:
  stage: deploy
  image: registry.git.autistici.org/ai3/docker/float-runner:master
  variables:
    # Internal, do not change.
    DEPLOY_ENV_FILE: "deploy-${VM_TAG}.env"
    # This can be any temporary directory as long as it is unique
    # (multiple jobs may be running off the same CI_PROJECT_DIR).
    BUILD_DIR: "${CI_PROJECT_DIR}/env-${VM_TAG}-${CI_JOB_ID}"
    # State file used by floatup to track the VM group, keyed by merge
    # request and VM tag so parallel review environments do not collide.
    VMINE_STATE_FILE: ".vmine_state_${CI_MERGE_REQUEST_ID}_${VM_TAG}"
  before_script:
    - mkdir -p $BUILD_DIR
  script:
    # Fetch the float toolkit itself: this template is meant to be
    # included from other projects, which only carry configuration.
    - (cd $BUILD_DIR && git clone --depth 1 https://git.autistici.org/ai3/float.git)
    # Generate a complete test environment, layering the project's own
    # services / passwords / playbook on top of float's core defaults.
    - >
      $BUILD_DIR/float/float create-env
      --domain=${DOMAIN}
      --services=${BUILD_DIR}/float/services.core.yml
      --services=${SERVICES_FILE}
      --passwords=${BUILD_DIR}/float/passwords.yml.default
      --passwords=${PASSWORDS_FILE}
      --playbook=${BUILD_DIR}/float/playbooks/all.yml
      --playbook=${PLAYBOOK_FILE}
      --num-hosts=${NUM_HOSTS}
      --roles-path=${ROLES_PATH}
      -e ansible_cfg.defaults.strategy=mitogen_linear
      -e config.docker_registry_url=${CI_REGISTRY}
      -e config.docker_registry_username=${CI_REGISTRY_USER}
      -e config.docker_registry_password=${CI_REGISTRY_PASSWORD}
      ${APT_PROXY:+-e config.apt_proxy=${APT_PROXY}}
      ${CREATE_ENV_ARGS}
      ${BUILD_DIR}
    # Bring up the VMs; floatup writes connection details to
    # $DEPLOY_ENV_FILE, which is exported below as a dotenv report.
    - with-ssh-key floatup $FLOATUP_ARGS --inventory $BUILD_DIR/hosts.yml --ram $VM_RAM --cpu $VM_CPU --image $VM_IMAGE --state-file $VMINE_STATE_FILE --env $DEPLOY_ENV_FILE --ttl $VM_TTL --dashboard-url $VM_DASHBOARD_URL up
    - with-ssh-key $BUILD_DIR/float/test-driver init --no-vagrant $BUILD_DIR
    # Propagate BUILD_DIR to the later pipeline stages via dotenv.
    - echo BUILD_DIR=$BUILD_DIR >> $DEPLOY_ENV_FILE
  allow_failure: true
  artifacts:
    when: always
    expire_in: "1 day"
    reports:
      # Variables in this file (BUILD_DIR, floatup outputs) are
      # injected into subsequent jobs of the pipeline.
      dotenv: "$DEPLOY_ENV_FILE"
    paths:
      - "${BUILD_DIR}"
      - "$VMINE_STATE_FILE"
  environment:
    name: "review/$CI_COMMIT_REF_SLUG"
    url: "$VMINE_GROUP_URL"
    # NOTE(review): auto-stop (3 hours) is shorter than the default VM
    # TTL (VM_TTL: 4h) -- confirm the mismatch is intentional.
    auto_stop_in: "3 hours"
# Template for jobs that run the Ansible playbooks (and float's
# integration test suite) against the VMs created in the deploy stage.
# Relies on BUILD_DIR being injected by the deploy job's dotenv report.
.run_template:
  stage: run
  image: registry.git.autistici.org/ai3/docker/float-runner:master
  script:
    - cd $BUILD_DIR
    # Deploy the environment, then run the integration tests on it.
    - with-ssh-key ./float/float run -e docker_registry_password=$CI_REGISTRY_PASSWORD site.yml
    - with-ssh-key ./float/float run -e docker_registry_password=$CI_REGISTRY_PASSWORD ./float/test/integration-test.yml
  after_script:
    # Collect logs from the target hosts even when the run failed.
    - cd $BUILD_DIR
    - with-ssh-key ./float/test-driver cleanup --no-vagrant .
  artifacts:
    when: always
    paths:
      - "${BUILD_DIR}/logs"
# Template for the teardown job: destroys the VM group tracked by the
# floatup state file and marks the review environment as stopped.
.stop_deploy_template:
  stage: run
  image: registry.git.autistici.org/ai3/docker/float-runner:master
  allow_failure: true
  variables:
    # Internal, do not change.
    # Must match VMINE_STATE_FILE in .deploy_template so this job
    # tears down the same VM group that the deploy job created.
    VMINE_STATE_FILE: ".vmine_state_${CI_MERGE_REQUEST_ID}_${VM_TAG}"
  script:
    - with-ssh-key floatup $FLOATUP_ARGS --state-file $VMINE_STATE_FILE down
  environment:
    name: "review/$CI_COMMIT_REF_SLUG"
    action: stop
# Concrete jobs wiring the templates together: "deploy" brings the
# environment up, "run" exercises it, and "stop_deploy" tears it down.
deploy:
  extends: .deploy_template
  environment:
    # Stopping the review environment triggers the teardown job.
    on_stop: "stop_deploy"
run:
  extends: .run_template
stop_deploy:
  extends: .stop_deploy_template
  rules:
    # Teardown runs only when triggered manually (or via the
    # environment's auto-stop).
    - when: manual
...@@ -11,13 +11,12 @@ stretch build host), and distribute it with alternative methods. ...@@ -11,13 +11,12 @@ stretch build host), and distribute it with alternative methods.
These can normally be built with standard Debian development tools, These can normally be built with standard Debian development tools,
such as *dpkg-buildpackage*. such as *dpkg-buildpackage*.
* [ai/sso](https://git.autistici.org/ai/sso)
* [id/auth](https://git.autistici.org/id/auth) * [id/auth](https://git.autistici.org/id/auth)
* [id/go-sso](https://git.autistici.org/id/go-sso) * [id/sso-server](https://git.autistici.org/id/sso-server)
* [id/keystore](https://git.autistici.org/id/keystore) * [id/keystore](https://git.autistici.org/id/keystore)
* [id/usermetadb](https://git.autistici.org/id/usermetadb) * [id/usermetadb](https://git.autistici.org/id/usermetadb)
* [ale/zonetool](https://git.autistici.org/ale/zonetool) * [ai3/tools/zonetool](https://git.autistici.org/ai3/tools/zonetool)
* [ai3/tools/cgroups-exporter](https://git.autistici.org/ai3/tools/cgroups-exporter) * [ai3/tools/cgroups-exporter](https://git.autistici.org/ai3/tools/cgroups-exporter)
* [ai3/tools/runcron](https://git.autistici.org/ai3/tools/runcron) * [ai3/tools/runcron](https://git.autistici.org/ai3/tools/runcron)
* [ai3/tools/audisp-json](https://git.autistici.org/ai3/tools/audisp-json) * [ai3/tools/audisp-json](https://git.autistici.org/ai3/tools/audisp-json)
...@@ -28,7 +27,7 @@ such as *dpkg-buildpackage*. ...@@ -28,7 +27,7 @@ such as *dpkg-buildpackage*.
* [ai3/tools/tabacco](https://git.autistici.org/ai3/tools/tabacco) * [ai3/tools/tabacco](https://git.autistici.org/ai3/tools/tabacco)
* [ai3/thirdparty/rsyslog-exporter](https://git.autistici.org/ai3/thirdparty/rsyslog-exporter) * [ai3/thirdparty/rsyslog-exporter](https://git.autistici.org/ai3/thirdparty/rsyslog-exporter)
* [ai3/thirdparty/restic](https://git.autistici.org/ai3/thirdparty/restic) * [ai3/thirdparty/litestream](https://git.autistici.org/ai3/thirdparty/litestream)
These are distributed via our own package repository at These are distributed via our own package repository at
*deb.autistici.org*, which currently supports the *amd64* and *arm64* *deb.autistici.org*, which currently supports the *amd64* and *arm64*
......
Playbook
===
This document describes how to perform some common operations in
*float*.
## Applying changes
### Rolling back the configuration
If you are using a Git repository as your configuration source,
*float* will keep track of which commit has been pushed to production
last, and it will try to prevent you from pushing an old version of
the configuration, failing immediately with an error. This is a simple
check to make sure that people do not inadvertently roll back the
production configuration by pushing from an out-of-date client.
In most cases what you want to do is simply run *git pull* and bring
your copy of the repository up to date. But if you really need to push
an old version of the configuration in an emergency, you can do so by
setting the *rollback* value to *true* on the command line:
```shell
$ float run -e rollback=true site.yml
```
## For administrators
### SSH Client Setup
If you delegated SSH management to float by setting *enable_ssh* to
true (see the [configuration reference](configuration.md)), float will
create a SSH CA to sign all your host keys.
You will find the public key for this CA in the
*credentials/ssh/key.pub* file; it will be created the first time you
run the "init-credentials" playbook.
Assuming that all your target hosts share the same domain (so you can
use a wildcard), you should add the following entry to
*~/.ssh/known_hosts*:
```
@cert_authority *.example.com ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAA....
```
Since all logins happen as root, it may be convenient to also add a
section to your *~/.ssh/config* file like the following:
```
Host *.example.com
User root
```
### Adding an admin account
Adding a new administrator account is just a matter of editing the
*admins* [configuration variable](configuration.md) and add a new
entry to it.
The first thing you will need is a hashed version of your
password. The authentication service in float supports a number of
legacy hashing schemes, including those supported by the system
crypt(). The most secure hashing scheme supported is Argon2, and you
can use our custom tool to generate a valid hash. To install it:
```shell
$ go install git.autistici.org/ai3/go-common/cmd/pwtool
```
Run the *pwtool* utility with your new password as an argument, as
shown below:
```shell
# Do not save your password in the history of your shell
$ export HISTIGNORE="pwtool*"
$ pwtool PASSWORD
```
where PASSWORD is your desired password.
It will output the hashed password.
Then modify the YAML file *group_vars/all/admins.yml*. At the bare
minimum the new account should have *name*, *email*, *password* and
*ssh_keys* attributes, e.g.:
```yaml
---
admins:
- name: "foo"
email: "foo@example.com"
password: "$a2$3$32768$4$abcdef...."
ssh_keys:
- "ssh-ed25519 AAAAC3Nza..."
```
Here above "ssh_keys:" needs to be populated with your public key,
possibly stripped of the trailing user@hostname text (which may leak
your personal information), and "password:" must be the hashed
password you got from *pwtool* earlier.
### Setting up OTP for an admin account
First you need to manually generate the OTP secret on your computer:
```shell
$ SECRET=$(dd if=/dev/urandom bs=20 count=1 2>/dev/null | base32)
$ echo $SECRET
EVUVNACTWRAIERATIZUQA6YQ4WS63RN2
```
Install the *qrencode* package (for example with `apt install
qrencode`) and feed the OTP secret to it:
```shell
$ EMAIL="sub@krutt.org"
$ qrencode -t UTF8 "otpauth://totp/example.com:${EMAIL}?secret=${SECRET}&issuer=example.com&algorithm=SHA1&digits=6&period=30"
```
and read the qrcode with your favourite app.
Then add it to your user object in *group_vars/all/admins.yml* as the
*totp_secret* attribute:
```yaml
---
admins:
- name: "foo"
totp_secret: "EVUVNACTWRAIERATIZUQA6YQ4WS63RN2"
...
```
Finally, configure your TOTP client (app, YubiKey, etc.) with the same
secret.
Note that the secret is stored in cleartext in the git repository, so
using a hardware token (U2F) is preferred.
### Registering a U2F hardware token for an admin account
In the *group_vars/all/admins.yml* file, you can add the
*u2f_registrations* attribute to accounts, which is a list of the
allowed U2F device registrations.
To register a new device, you are going to need the *pamu2fcfg* tool
(part of the *pamu2fcfg* Debian package). The following snippet should
produce the two YAML attributes that you need to set:
```shell
$ pamu2fcfg --nouser --appid https://accounts.example.com \
| tr -d : \
| awk -F, '{print "key_handle: \"" $1 "\"\npublic_key: \"" $2 "\""}'
```
press enter, touch the key, copy the output and insert it in
*group_vars/all/admins.yml*; the final result should look like:
```yaml
---
admins:
- name: "foo"
email: "foo@example.com"
password: "$a2$3$32768$4$abcdef...."
ssh_keys:
- "ssh-ed25519 AAAAC3Nza..."
u2f_registrations:
- key_handle: "r4wWRHgzJjl..."
public_key: "04803e4aff4..."
```
**NOTE**: the above will work with *pam_u2f* version 1.0.7, but it will *not*
work with pam_u2f version 1.1.0 due to changes in the output format!
...@@ -31,8 +31,8 @@ dipendenze possono essere installate con questo comando: ...@@ -31,8 +31,8 @@ dipendenze possono essere installate con questo comando:
```shell ```shell
sudo apt install golang ansible vagrant sudo apt install golang ansible vagrant
go get -u git.autistici.org/ale/x509ca go install git.autistici.org/ale/x509ca@latest
go get -u git.autistici.org/ale/ed25519gen go install git.autistici.org/ale/ed25519gen@latest
export PATH=$PATH:$HOME/go/bin export PATH=$PATH:$HOME/go/bin
``` ```
......
...@@ -35,8 +35,8 @@ other dependencies can be installed with the following commands: ...@@ -35,8 +35,8 @@ other dependencies can be installed with the following commands:
```shell ```shell
sudo apt install golang ansible vagrant sudo apt install golang ansible vagrant
go get -u git.autistici.org/ale/x509ca go install git.autistici.org/ale/x509ca@latest
go get -u git.autistici.org/ale/ed25519gen go install git.autistici.org/ale/ed25519gen@latest
export PATH=$PATH:$HOME/go/bin export PATH=$PATH:$HOME/go/bin
``` ```
...@@ -106,7 +106,7 @@ files for Ansible and Vagrant, with default values filled in by ...@@ -106,7 +106,7 @@ files for Ansible and Vagrant, with default values filled in by
*create-env* automatically generates a default *admin* user, with *create-env* automatically generates a default *admin* user, with
password *password*. password *password*.
You can read the [configuration reference](configuration.md) for You can read the [configuration reference](reference.md) for
details on the configuration file syntax and what the various options details on the configuration file syntax and what the various options
mean. mean.
......
...@@ -311,23 +311,42 @@ datasets only once (on the service master host). ...@@ -311,23 +311,42 @@ datasets only once (on the service master host).
### Backups ### Backups
If provided with credentials for an external data repository, float If provided with credentials for an external data repository, float
will automatically make backups of your configured datasets. Float will automatically make backups of your configured datasets. These
runs its own backup management system aren't just used for disaster recovery, but are an integral part of
([tabacco](https://git.autistici.org/ai3/tools/tabacco)) on top of float's service management approach: when a service is scheduled on a
Restic, which adds additional metadata to Restic snapshots to map new host, for instance as a result of a re-scheduling, float will
float datasets. attempt to automatically restore the associated datasets from their
backups. Restores can of course also be triggered manually whenever
necessary.
When a service is scheduled on a new host, for instance as a result of Float offers two backup mechanisms for datasets:
a re-scheduling, float will attempt to restore the associated datasets
from their backups. While this is not a practical failover solution
for complex services, we've found it works pretty well for a category
of services with "important, but small - can afford to lose one day of
changes" datasets that is quite common and useful in itself. For these
services, running with num_instances=1 and counting on the
backup/restore data move mechanism might provide sufficient
availability and reliability.
Restores can of course also be triggered manually whenever necessary. * For bulk data, it can use its own backup management system
([tabacco](https://git.autistici.org/ai3/tools/tabacco)) on top of
Restic, which adds additional metadata to Restic snapshots to map
float datasets. This can be used as a primitive failover solution
for services that aren't "important" enough to afford their own
distributed storage abstractions, and where losing up to one day of
changes is tolerable. An alternative, "live" solution, that would
favor correctness over availability, is also in the works. This
backup mechanism is *extensible* to understand the structure and
metadata of specific services' entities and accounts, if necessary.
* There are a number of instances, in float, of a specific category of
service, single-hosted small API services that run off a simple
SQLite database, some of which are critical to float's operation
(for example the backup metadata service itself). For this
particular use case, float supports backups with
[Litestream](https://litestream.io), an asynchronous replication
solution for SQLite, that offers point-in-time restore capabilities
(less than 1 second of data loss window) in case of disaster or when
the service is rescheduled.
Litestream requires an S3-compatible backend (Minio, AWS, etc).
Note that float does not, in its default configuration, provide the
data storage services used by its backup mechanisms. These are treated
as third-party (external) resources.
### Volumes ### Volumes
...@@ -549,7 +568,7 @@ Ansible roles to configure them. ...@@ -549,7 +568,7 @@ Ansible roles to configure them.
Note that, in its default setup, float will naturally assume a Note that, in its default setup, float will naturally assume a
two-tier service topology, with "frontend" hosts handling traffic two-tier service topology, with "frontend" hosts handling traffic
routing in a stateless fashion, and "backend" hosts running the actual routing in a stateless fashion, and "backend" hosts running the actual
services. The default *services.yml.default* service description file services. The default *services.default.yml* service description file
literally expects the *frontend* and *backend* Ansible groups to be literally expects the *frontend* and *backend* Ansible groups to be
defined in your inventory. However, these are just roles, and there is defined in your inventory. However, these are just roles, and there is
nothing inherent in float that limits you to this kind of topology. nothing inherent in float that limits you to this kind of topology.
...@@ -1556,6 +1575,8 @@ provided: ...@@ -1556,6 +1575,8 @@ provided:
specify a regex (with a capture group) to extract back the host specify a regex (with a capture group) to extract back the host
name from the target; the default regex will extract the short name from the target; the default regex will extract the short
host name from URLs and host:port targets. host name from URLs and host:port targets.
* (optionally) a *scrape_interval* if for some reason it should be
different than the default *prometheus_probe_scrape_interval*.
So, in the context of the previous example, if we wanted to probe So, in the context of the previous example, if we wanted to probe
another float service called *myservice*, which hypothetically serves another float service called *myservice*, which hypothetically serves
...@@ -1573,6 +1594,27 @@ prometheus_additional_blackbox_probers: ...@@ -1573,6 +1594,27 @@ prometheus_additional_blackbox_probers:
target_regex: "http://\\1:2020" target_regex: "http://\\1:2020"
``` ```
### Customizing alert timeouts for additional blackbox probes
The Prometheus configuration for the default *float* blackbox probes
is appropriate for high-frequency, high-accuracy probes (with 10s
polling and a 5m alert timeout). This is not going to be suitable
for all use cases, such as more complex probes that require less
frequent polling.
Float provides a way to configure alert timeouts on a *prober*
(i.e. float service) basis, by using the optional
*prober_alert_timeout* attribute in the service description
metadata. For instance, to set a 30 minute alert timeout in the
context of the previous example, the services.yml file should be
modified:
```yaml
my-prober:
...
prober_alert_timeout: 30m
```
## Log Collection and Analysis ## Log Collection and Analysis
Logs are forwarded by all machines to a set of (one or more) Logs are forwarded by all machines to a set of (one or more)
...@@ -1789,7 +1831,7 @@ pairs that define group variables. ...@@ -1789,7 +1831,7 @@ pairs that define group variables.
### Groups ### Groups
While you can define any host groups you want, the default service While you can define any host groups you want, the default service
configuration in float (*services.yml.default*) expects you to define configuration in float (*services.default.yml*) expects you to define
at least two: at least two:
* *frontend*, for the public-facing reverse proxy hosts * *frontend*, for the public-facing reverse proxy hosts
...@@ -1808,19 +1850,21 @@ Variables can be Ansible variables: SSH parameters, etc., usually with ...@@ -1808,19 +1850,21 @@ Variables can be Ansible variables: SSH parameters, etc., usually with
an *ansible_* prefix. But some host variables have special meaning for an *ansible_* prefix. But some host variables have special meaning for
float: float:
`ip` (mandatory) is the IPv4 address of this host that other hosts `ips` (mandatory) is the list of IP addresses of this host that other
(i.e. internal services) should use to reach it hosts (i.e. internal services) should use to reach it. You can specify
one or more IP addresses, IPv4 or IPv6. Note that this is a **list**.
For legacy reasons, float still also understands the `ip` (singular)
attribute, which is expected to be a single IPv4 address, but this
support will eventually be retired, so on new inventories you should
use the `ips` list attribute.
`ip6` (optional) is the IPv6 version of the above `public_ips` (optional) is the list of IP addresses for this host that
will be advertised in the public-facing DNS zones. If unset it
`public_ip` (optional) is the IPv4 address that will be advertised in defaults to `ips`.
the public-facing DNS zones, if unset it defaults to `ip`
`public_ip6` (optional) is the IPv6 version of the above (if unset,
it will default to `ip6`)
`ip_<name>` (optional) defines the IPv4 address for this host on the `ip_<name>` (optional) defines the IPv4 address for this host on the
overlay network called *name* overlay network called *name*. Note that as opposed to `ips` this is
not a list but a single IPv4 address.
`groups` (optional) is a list of Ansible groups that this host should `groups` (optional) is a list of Ansible groups that this host should
be a member of be a member of
...@@ -1880,12 +1924,12 @@ Service metadata is encoded as a dictionary of *service name*: ...@@ -1880,12 +1924,12 @@ Service metadata is encoded as a dictionary of *service name*:
Metadata for services that are part of the core infrastructure ships Metadata for services that are part of the core infrastructure ships
embedded with this repository, so when writing your own `services.yml` embedded with this repository, so when writing your own `services.yml`
file, you only need to add your services to it. You should include the file, you only need to add your services to it. You should include the
*services.yml.default* file shipped with the float source, which *services.default.yml* file shipped with the float source, which
defines all the built-in services: defines all the built-in services:
```yaml ```yaml
include: include:
- "/path/to/float/services.yml.default" - "/path/to/float/services.default.yml"
``` ```
The `include` directive is special: it does not define a service, but The `include` directive is special: it does not define a service, but
...@@ -1979,21 +2023,24 @@ service. ...@@ -1979,21 +2023,24 @@ service.
Each entry in the *monitoring_endpoints* list can have the following Each entry in the *monitoring_endpoints* list can have the following
attributes: attributes:
`job_name`: Job name in Prometheus, defaults to the service name.
`type` (deprecated): Selects the service discovery mechanism used by
Prometheus to find the service endpoints. This can only have the value
*static*, which is also the default.
`port`: Port where the `/metrics` endpoint is exported. `port`: Port where the `/metrics` endpoint is exported.
`scheme`: HTTP scheme for the service endpoint. The default is *https*. `scheme`: HTTP scheme for the service endpoint. The default is *https*.
`healthcheck_http_method`: HTTP method to use for checking job status. The default is *HEAD* to query the endpoint without transferring all the metric data. Not all endpoints support this method, so if the probe fails set it to a method that it does support (worst case: *GET*).
`metrics_path`: Path for metrics if different from the default of `/metrics`. `metrics_path`: Path for metrics if different from the default of `/metrics`.
`labels`: An optional dictionary of key/value labels to set for this `labels`: An optional dictionary of key/value labels to set for this
target (they will be added to all metrics scraped from it). target (they will be added to all metrics scraped from it).
`scrape_interval`: Optionally override the scrape interval for this
target.
The Prometheus *job* labels for service targets will be automatically
generated by *float* to include the service name and the endpoint
port.
### Traffic routing ### Traffic routing
Services can define *public* HTTP and TCP endpoints, that will be Services can define *public* HTTP and TCP endpoints, that will be
...@@ -2041,6 +2088,12 @@ using single sign-on, allowing access only to administrators (members ...@@ -2041,6 +2088,12 @@ using single sign-on, allowing access only to administrators (members
of the *admins* group). This is quite useful for admin web interfaces of the *admins* group). This is quite useful for admin web interfaces
of internal services that do not support SSO integration of their own. of internal services that do not support SSO integration of their own.
`enable_api_proxy`: If true, place the service behind authentication
using a mechanism more appropriate for non-interactive APIs (HTTP
Basic Authentication using Application-Specific Passwords). Only members
of the *admins* group will have access. When this option is set, you
also need to specify a unique `auth_service` to be used for ASPs.
#### HTTP (All domains) #### HTTP (All domains)
`horizontal_endpoints`: List of HTTP endpoints exported by the `horizontal_endpoints`: List of HTTP endpoints exported by the
...@@ -2067,10 +2120,10 @@ attributes, all required: ...@@ -2067,10 +2120,10 @@ attributes, all required:
`name`: Name of the endpoint. `name`: Name of the endpoint.
`port`: Port where the service is running. Also the port that will be `port`: Port where the service is running.
publicly exported (at least in the current implementation), which
unfortunately means that the service itself shouldn't be running on `public_port`: Port that should be exposed to the Internet. Defaults
*frontend* nodes. to `port` if unset.
`use_proxy_protocol`: When true, enable the HAProxy proxy protocol for
the service, to propagate the original client IP to the backends.
...@@ -2126,6 +2179,19 @@ option automatically sets *drop_capabilities* to false. ...@@ -2126,6 +2179,19 @@ option automatically sets *drop_capabilities* to false.
drop all capabilities for this container. Otherwise, the capability drop all capabilities for this container. Otherwise, the capability
set will be controlled by systemd. set will be controlled by systemd.
`egress_policy` (default: *allow-all*): selects the network egress
policy for this container. This allows broad control over network
connections made by the process running in the container, and it can
take one of the following values:
* *allow-all*, allows all traffic
* *internal*, only allows traffic to float's internal private networks
(necessary for containers serving public_endpoints, of course)
* *none*, only allows traffic to localhost
These policies are implemented using BPF filters, which at the moment
are quite simplistic, hence the limited configurability.
### Non-container services ### Non-container services
`systemd_services`: List of systemd service units that are associated `systemd_services`: List of systemd service units that are associated
...@@ -2212,6 +2278,37 @@ The LVs are created in the volume specified by the `volumes_vg` global ...@@ -2212,6 +2278,37 @@ The LVs are created in the volume specified by the `volumes_vg` global
configuration variable, which by default is *vg0*. The VG must already configuration variable, which by default is *vg0*. The VG must already
exist, float will not attempt to create it. exist, float will not attempt to create it.
### Annotations
`annotations`: Dictionary with service-specific annotations
Annotations are manually curated metadata associated with the service,
intended for debugging purposes. This is data meant for humans to
consume, with the idea of helping the operators understand and debug
your services and their interconnections.
Annotations are for now only displayed on the float admin dashboard.
`summary`: A short summary (description) of the service.
#### Dependency graphs
`dependencies`: A list of additional service dependencies.
Float can automatically compute part of the dependency graph between
your services, at least insofar as the structure of *public_endpoints*
is concerned. Since this data can be quite useful in understanding the
structure of a service, it is possible to extend the dependency graph
manually by specifying additional edges (representing the dependencies
between services).
Edges of the dependency graphs are specified as objects with `client`
and `server` attributes, identifying a specific container or systemd
unit in either the current service or a different one. If you're
referring to an entity within the same service, you can just use its
name, while for external services the syntax is
*service-name*/*entity-name* (e.g. "log-collector/elasticsearch").
### Examples ### Examples
Let's look at some example *services.yml* files: Let's look at some example *services.yml* files:
...@@ -2361,8 +2458,11 @@ each a dictionary with the following attributes: ...@@ -2361,8 +2458,11 @@ each a dictionary with the following attributes:
documentation](https://git.autistici.org/id/auth/blob/master/README.md#password-encoding). documentation](https://git.autistici.org/id/auth/blob/master/README.md#password-encoding).
* `totp_secret` - TOTP secret for 2FA, base32-encoded * `totp_secret` - TOTP secret for 2FA, base32-encoded
* `ssh_keys` - a list of strings representing SSH public keys * `ssh_keys` - a list of strings representing SSH public keys
* `u2f_registrations` - a list of objects representing U2F token * `webauthn_registrations` - a list of objects representing
registrations WebAuthN(U2F) token registrations
* `u2f_registrations` - a list of objects representing legacy U2F
token registrations, only supported for old registrations created
before the switch to WebAuthN. Don't add new entries to this list.
### Authentication and SSO ### Authentication and SSO
...@@ -2400,9 +2500,12 @@ attributes that specify static DNS entries that will be added to ...@@ -2400,9 +2500,12 @@ attributes that specify static DNS entries that will be added to
`nginx_cache_keys_mem` is the memory size of the key buffer for the `nginx_cache_keys_mem` is the memory size of the key buffer for the
global NGINX HTTP cache. global NGINX HTTP cache.
`nginx_cache_fs_size` is the maximum on-disk size of the NGINX HTTP `nginx_cache_custom_params` are additional parameters for customizing
cache (note that NGINX might use as much as twice what specified here, the *proxy_cache_path* NGINX configuration directive for the global
depending on expiration policy). cache. The most important attribute you might want to set is possibly
*max_size*, which controls the maximum size of the on-disk cache (note
that NGINX might use as much as twice what specified, depending on
expiration policy).
`nginx_global_custom_headers` - a dictionary of {header: value} pairs `nginx_global_custom_headers` - a dictionary of {header: value} pairs
corresponding to HTTP headers that must be set on *every* response. corresponding to HTTP headers that must be set on *every* response.
...@@ -2417,8 +2520,8 @@ tuples used for redirecting top-level domains to specific destinations ...@@ -2417,8 +2520,8 @@ tuples used for redirecting top-level domains to specific destinations
service which is normally part of the log-collector infrastructure. As service which is normally part of the log-collector infrastructure. As
this is a large Java daemon with significant memory requirements, it this is a large Java daemon with significant memory requirements, it
is often useful to disable it for testing environments. Note that in is often useful to disable it for testing environments. Note that in
this case one should also import *services.yml.no-elasticsearch* this case one should import *services.core.yml*
instead of the default *services.yml.default*. instead of the default *services.default.yml*.
`es_log_keep_days` is a dictionary that specifies the retention time `es_log_keep_days` is a dictionary that specifies the retention time
for the various log types, in days. The default is `{ audit: 60, for the various log types, in days. The default is `{ audit: 60,
...@@ -2444,6 +2547,10 @@ instances should scrape their targets (default 10s). ...@@ -2444,6 +2547,10 @@ instances should scrape their targets (default 10s).
`prometheus_lts_scrape_interval` sets how often the long-term `prometheus_lts_scrape_interval` sets how often the long-term
Prometheus instances should scrape the primary ones (default 1m). Prometheus instances should scrape the primary ones (default 1m).
`prometheus_probe_scrape_interval` controls the default
scrape_interval setting for all blackbox probes, and it just defaults
to the value of prometheus_scrape_interval if unset.
`prometheus_external_targets` allows adding additional targets to Prometheus `prometheus_external_targets` allows adding additional targets to Prometheus
beyond those that are described by the service metadata. It is a list of entries beyond those that are described by the service metadata. It is a list of entries
with *name*, *targets* attributes. Optionally, you may specify a *scheme* with *name*, *targets* attributes. Optionally, you may specify a *scheme*
...@@ -2483,6 +2590,15 @@ to be notified about resolved alerts (default False). ...@@ -2483,6 +2590,15 @@ to be notified about resolved alerts (default False).
### Third-party services ### Third-party services
#### ACME
Float's ACME certificate generation service does not require any
configuration, as it will automatically generate a Letsencrypt
account. It is possible, however, to tell it to use a specific account
by providing it with a private key:
`acme_private_key` - ACME private key, in PEM format
#### Private Docker registry #### Private Docker registry
You can have float use a private Docker registry by providing it with You can have float use a private Docker registry by providing it with
...@@ -2543,16 +2659,53 @@ but it will still be active and functional (via *amtool*). ...@@ -2543,16 +2659,53 @@ but it will still be active and functional (via *amtool*).
#### Backups #### Backups
To configure the backup system, you're going to need credentials for To configure the backup system, you're going to need credentials for
an external repository. The backup system the third-party (external) data storage services. While it is possible
uses [restic](https://restic.net), so check its documentation for the to run a production service *without* backups configured, note that
URI syntax. the cluster's functionality will be incomplete unless at least a
Litestream backend is configured.
##### Bulk backup (Restic)
`backup_repository_uri` - URI of the global (shared) restic
repository. Though Restic supports [numerous
backends](https://restic.readthedocs.io/en/stable/030_preparing_a_new_repo.html),
float works best with Restic's own [REST
Server](https://github.com/restic/rest-server).
`backup_repository_restic_password` - password used to encrypt the
restic repository.
##### Asynchronous SQLite replication (Litestream)
`backup_repository_uri` - URI of the global (shared) restic repository Litestream requires a S3-compatible API to store its SQLite WAL
snapshots.
`backup_repository_restic_password` - the password used to encrypt `backup_litestream_config` is the object that configures the
the restic repository. Litestream replica target, and it corresponds to the "replica" field
of the Litestream configuration, so you can check the [Litestream
documentation](https://litestream.io/reference/config/#replica-settings)
for reference. The most important fields to set are `endpoint` (the
URL of the storage service API), and `bucket` (the name of the bucket
to use). The *path* attribute will be automatically set by float,
based on the dataset name.
`backup_litestream_credentials` is a dictionary of environment
variables to configure credentials for access to the backend storage
service. Keys will depend on which type of API is being used, but for
the default *s3* type they should be `LITESTREAM_ACCESS_KEY_ID` and
`LITESTREAM_SECRET_ACCESS_KEY`.
An example of a (fictional) litestream configuration:
```yaml
backup_litestream_config:
type: s3
endpoint: "https://backup.service:9000/"
bucket: "mybackups"
backup_litestream_credentials:
LITESTREAM_ACCESS_KEY_ID: "minio"
LITESTREAM_SECRET_ACCESS_KEY: "miniopassword"
```
# Operations # Operations
...@@ -2568,8 +2721,8 @@ unsupported. ...@@ -2568,8 +2721,8 @@ unsupported.
```shell ```shell
sudo apt install golang ansible sudo apt install golang ansible
go get git.autistici.org/ale/x509ca go install git.autistici.org/ale/x509ca@latest
go get git.autistici.org/ale/ed25519gen go install git.autistici.org/ale/ed25519gen@latest
export PATH=$PATH:$HOME/go/bin export PATH=$PATH:$HOME/go/bin
``` ```
...@@ -2654,7 +2807,7 @@ There are some minimal requirements on how your Ansible environment ...@@ -2654,7 +2807,7 @@ There are some minimal requirements on how your Ansible environment
should be set up for this to work: should be set up for this to work:
* you must have a *group_vars/all* directory (this is where we'll * you must have a *group_vars/all* directory (this is where we'll
write the autogenerated application credentials file *secrets.yml*q) write the autogenerated application credentials file *secrets.yml*)
* you must include float's *playbooks/all.yml* playbook file from the * you must include float's *playbooks/all.yml* playbook file from the
toolkit source directory at the beginning of your playbook toolkit source directory at the beginning of your playbook
* you should use the *float* wrapper instead of running * you should use the *float* wrapper instead of running
...@@ -3097,7 +3250,7 @@ Install the package qrencode, and feed the OTP secret to it. ...@@ -3097,7 +3250,7 @@ Install the package qrencode, and feed the OTP secret to it.
For example with apt ["apt install qrencode" of course]. For example with apt ["apt install qrencode" of course].
```shell ```shell
$ EMAIL="sub@krutt.org" $ EMAIL="foo@example.com"
$ qrencode -t UTF8 "otpauth://totp/example.com:${EMAIL}?secret=${SECRET}&issuer=example.com&algorithm=SHA1&digits=6&period=30" $ qrencode -t UTF8 "otpauth://totp/example.com:${EMAIL}?secret=${SECRET}&issuer=example.com&algorithm=SHA1&digits=6&period=30"
``` ```
...@@ -3123,21 +3276,23 @@ using a hardware token (U2F) is preferred. ...@@ -3123,21 +3276,23 @@ using a hardware token (U2F) is preferred.
### Registering a U2F hardware token for an admin account ### Registering a U2F hardware token for an admin account
In the *group_vars/all/admins.yml* file, you can add the In the *group_vars/all/admins.yml* file, you can add the
*u2f_registrations* attribute to accounts, which is a list of the *webauthn_registrations* attribute to accounts, which is a list of the
allowed U2F device registrations. allowed WebAuthN/U2F device registrations.
To register a new device, you are going to need the *pamu2fcfg* tool To register a new device, you are going to need to install another
(part of the *pamu2fcfg* Debian package). The following snippet should small custom tool:
produce the two YAML attributes that you need to set: [webauthn-cred](https://git.autistici.org/ai3/tools/webauthn-cred). Follow
its installation instructions to obtain the *webauthn-cred* binary,
then invoke it to make a new registration:
```shell ```shell
$ pamu2fcfg --nouser --appid https://accounts.example.com \ $ webauthn-cred --rpid accounts.example.com
| tr -d : \
| awk -F, '{print "key_handle: \"" $1 "\"\npublic_key: \"" $2 "\""}'
``` ```
press enter, touch the key, copy the output and insert it in follow the instructions, copy the output and insert it in
*group_vars/all/admins.yml*, the final results should look like: *group_vars/all/admins.yml* as a new item in the
*webauthn_registrations* attribute of your user. The final results
should look like:
```yaml ```yaml
--- ---
...@@ -3147,14 +3302,11 @@ admins: ...@@ -3147,14 +3302,11 @@ admins:
password: "$a2$3$32768$4$abcdef...." password: "$a2$3$32768$4$abcdef...."
ssh_keys: ssh_keys:
- "ssh-ed25519 AAAAC3Nza..." - "ssh-ed25519 AAAAC3Nza..."
u2f_registrations: webauthn_registrations:
- key_handle: "r4wWRHgzJjl..." - key_handle: "r4wWRHgzJjl..."
public_key: "04803e4aff4..." public_key: "ajgh73-31bc..."
``` ```
**NOTE**: the above will work with *pam_u2f* version 1.0.7, but it will *not*
work with pam_u2f version 1.1.0 due to changes in the output format!
### Upgrading Debian version on target hosts ### Upgrading Debian version on target hosts
Float generally targets the current Debian *stable* distribution, but Float generally targets the current Debian *stable* distribution, but
...@@ -3168,13 +3320,25 @@ process: ...@@ -3168,13 +3320,25 @@ process:
* Set *float_debian_dist* to the new codename (e.g. "buster") in your * Set *float_debian_dist* to the new codename (e.g. "buster") in your
group_vars/all configuration. group_vars/all configuration.
* Run *float*, which will install the correct APT sources for the new * Run *float*, which will install the correct APT sources for the new
release. release and upgrade the servers.
* Run *apt dist-upgrade* manually or via Ansible. This part is not * Reboot the servers into the new kernels.
automated yet due to the large variety in possible scenarios.
* Run *float* again: it will now detect that the distribution has If you want more control over this process (Debian upgrades have been
changed and reconfigure packages as needed. event-less for a while now, but it's not always been the case) you
can of course run the upgrade manually.
### Decommissioning a host
When turning down a host, it is necessary, at some point, to
reschedule the services that were there onto some other hosts. To
achieve a smooth transition, this is best done while the host is still
available.
To do this, set the *turndown* attribute to *true* in the inventory
for the host you want to turn down, and then run *float* once more.
This should safely reschedule all services, and remove them from the
target host. It is then possible to simply shut down the target host
and wipe its data.
# Example scenarios # Example scenarios
...@@ -3201,7 +3365,7 @@ available) for the service. ...@@ -3201,7 +3365,7 @@ available) for the service.
```yaml ```yaml
include: include:
- "/path/to/float/services.yml.default" - "/path/to/float/services.default.yml"
ok: ok:
scheduling_group: backend scheduling_group: backend
num_instances: 1 num_instances: 1
...@@ -3276,7 +3440,7 @@ The services.yml file: ...@@ -3276,7 +3440,7 @@ The services.yml file:
```yaml ```yaml
include: include:
- "/path/to/float/services.yml.default" - "/path/to/float/services.default.yml"
videoconf: videoconf:
scheduling_group: videoconf scheduling_group: videoconf
num_instances: all num_instances: all
......
No preview for this file type
...@@ -51,7 +51,7 @@ Vagrant.configure(2) do |config| ...@@ -51,7 +51,7 @@ Vagrant.configure(2) do |config|
libvirt.memory = {{ ram }} libvirt.memory = {{ ram }}
libvirt.random_hostname = true libvirt.random_hostname = true
libvirt.cpu_mode = 'host-passthrough' libvirt.cpu_mode = 'host-passthrough'
libvirt.volume_cache = 'unsafe' libvirt.disk_driver :cache => 'unsafe'
{% if libvirt.remote_host %} {% if libvirt.remote_host %}
libvirt.host = "{{ libvirt.remote_host }}" libvirt.host = "{{ libvirt.remote_host }}"
libvirt.username = "{{ libvirt.remote_user }}" libvirt.username = "{{ libvirt.remote_user }}"
...@@ -65,7 +65,7 @@ Vagrant.configure(2) do |config| ...@@ -65,7 +65,7 @@ Vagrant.configure(2) do |config|
hosts.each do |hostname, hostvars| hosts.each do |hostname, hostvars|
config.vm.define hostname do |m| config.vm.define hostname do |m|
m.vm.hostname = hostname m.vm.hostname = hostname
m.vm.network "private_network", ip: hostvars["ip"], libvirt__dhcp_enabled: false, libvirt__network_name: network_name m.vm.network "private_network", ip: hostvars["ips"][0], libvirt__dhcp_enabled: false, libvirt__network_name: network_name
end end
end end
end end
...@@ -116,14 +116,18 @@ VVSaq+sWqN+ugjpj9sJ++/O1uSiUPNZdIwIBAg== ...@@ -116,14 +116,18 @@ VVSaq+sWqN+ugjpj9sJ++/O1uSiUPNZdIwIBAg==
'services.yml': '''--- 'services.yml': '''---
include: include:
{% if services_yml_path %} {% if services_yml_path %}
- "{{ services_yml_path | relpath(targetdir) }}" {% for p in services_yml_path %}
- "{{ p | relpath(targetdir) }}"
{% endfor %}
{% else %} {% else %}
- "{{ srcdir | relpath(targetdir) }}/services.yml.no-elasticsearch" - "{{ srcdir | relpath(targetdir) }}/services.core.yml"
{% endif %} {% endif %}
''', ''',
'passwords.yml': '''--- 'passwords.yml': '''---
{% if passwords_yml_path %} {% if passwords_yml_path %}
- include: "{{ passwords_yml_path | relpath(targetdir) }}" {% for p in passwords_yml_path %}
- include: "{{ p | relpath(targetdir) }}"
{% endfor %}
{% else %} {% else %}
- include: "{{ srcdir | relpath(targetdir) }}/passwords.yml.default" - include: "{{ srcdir | relpath(targetdir) }}/passwords.yml.default"
{% endif %} {% endif %}
...@@ -142,8 +146,8 @@ DEFAULT_VARS = { ...@@ -142,8 +146,8 @@ DEFAULT_VARS = {
# Paths, some set by command-line options. # Paths, some set by command-line options.
'srcdir': SRCDIR, 'srcdir': SRCDIR,
'targetdir': None, 'targetdir': None,
'services_yml_path': None, 'services_yml_path': [],
'passwords_yml_path': None, 'passwords_yml_path': [],
'playbooks': [], 'playbooks': [],
# Memory for the virtual machines (MB). # Memory for the virtual machines (MB).
...@@ -158,13 +162,7 @@ DEFAULT_VARS = { ...@@ -158,13 +162,7 @@ DEFAULT_VARS = {
# Ansible inventory (hosts are created dynamically). # Ansible inventory (hosts are created dynamically).
'inventory': { 'inventory': {
'hosts': {}, 'hosts': {},
'group_vars': { 'group_vars': {},
'vagrant': {
'ansible_user': 'vagrant',
'ansible_become': True,
'ansible_ssh_private_key_file': '~/.vagrant.d/insecure_private_key',
},
},
}, },
# Ansible configuration. # Ansible configuration.
...@@ -177,10 +175,12 @@ DEFAULT_VARS = { ...@@ -177,10 +175,12 @@ DEFAULT_VARS = {
'callback_plugins': '{{ srcdir | relpath(targetdir) }}/plugins/callback', 'callback_plugins': '{{ srcdir | relpath(targetdir) }}/plugins/callback',
'force_handlers': True, 'force_handlers': True,
'log_path': 'ansible.log', 'log_path': 'ansible.log',
'retry_files_enabled': False,
'interpreter_python': '/usr/bin/python3',
'nocows': 1, 'nocows': 1,
'display_skipped_hosts': False, 'display_skipped_hosts': False,
'callback_whitelist': 'float_ci', 'callbacks_enabled': 'float_ci',
'stdout_callback': 'float_ci', 'stdout_callback': 'float_ci',
'host_key_checking': False, 'host_key_checking': False,
'forks': 50, 'forks': 50,
...@@ -203,7 +203,7 @@ DEFAULT_VARS = { ...@@ -203,7 +203,7 @@ DEFAULT_VARS = {
'domain_public': [], 'domain_public': [],
'testing': True, 'testing': True,
'float_debian_dist': 'bullseye', 'float_debian_dist': 'bookworm',
'net_overlays': [{ 'net_overlays': [{
'name': 'vpn0', 'name': 'vpn0',
'network': '192.168.13.0/24', 'network': '192.168.13.0/24',
...@@ -296,7 +296,7 @@ def _random_hosts(num_hosts, extra_memberships): ...@@ -296,7 +296,7 @@ def _random_hosts(num_hosts, extra_memberships):
hostvars = { hostvars = {
'name': hostname, 'name': hostname,
'ansible_host': f'{net}.{i+10}', 'ansible_host': f'{net}.{i+10}',
'ip': f'{net}.{i+10}', 'ips': [f'{net}.{i+10}'],
'ip_vpn0': f'192.168.13.{i+10}', 'ip_vpn0': f'192.168.13.{i+10}',
} }
hostgroups = ['vagrant'] hostgroups = ['vagrant']
...@@ -341,7 +341,7 @@ def _render_skel(target_dir, ctx): ...@@ -341,7 +341,7 @@ def _render_skel(target_dir, ctx):
def command_create_env(path, services, passwords, playbooks, def command_create_env(path, services, passwords, playbooks,
roles_path, num_hosts, additional_host_groups, roles_path, num_hosts, additional_host_groups,
additional_configs, ram, domain, infra_domain, additional_configs, ram, domain, infra_domain,
extra_vars): become, extra_vars):
all_vars = DEFAULT_VARS all_vars = DEFAULT_VARS
# Set paths in the internal config. # Set paths in the internal config.
...@@ -350,6 +350,20 @@ def command_create_env(path, services, passwords, playbooks, ...@@ -350,6 +350,20 @@ def command_create_env(path, services, passwords, playbooks,
all_vars['passwords_yml_path'] = passwords all_vars['passwords_yml_path'] = passwords
all_vars['playbooks'] = playbooks all_vars['playbooks'] = playbooks
# Set connection-related user parameters.
if become == 'root':
all_vars['inventory']['group_vars']['vagrant'] = {
'ansible_user': 'root',
'ansible_become': False,
}
else:
all_vars['inventory']['group_vars']['vagrant'] = {
'ansible_user': become,
'ansible_become': True,
# For legacy compatibility reasons.
'ansible_ssh_private_key_file': '~/.vagrant.d/insecure_private_key',
}
# Extend the Ansible roles_path. # Extend the Ansible roles_path.
if roles_path: if roles_path:
for rpath in roles_path.split(':'): for rpath in roles_path.split(':'):
...@@ -359,14 +373,13 @@ def command_create_env(path, services, passwords, playbooks, ...@@ -359,14 +373,13 @@ def command_create_env(path, services, passwords, playbooks,
# Catch ValueError to handle parsing errors for composite-valued # Catch ValueError to handle parsing errors for composite-valued
# options and print a friendly message. # options and print a friendly message.
try: try:
all_vars['inventory']['hosts'] = _random_hosts( extra_memberships = _parse_additional_host_groups(additional_host_groups)
num_hosts,
_parse_additional_host_groups(additional_host_groups),
)
except ValueError: except ValueError:
print('Unable to parse additional-host-group spec', file=sys.stderr) print('Unable to parse additional-host-group spec', file=sys.stderr)
return 1 return 1
all_vars['inventory']['hosts'] = _random_hosts(num_hosts, extra_memberships)
all_vars['ram'] = ram all_vars['ram'] = ram
all_vars['config']['domain_public'] = [domain] all_vars['config']['domain_public'] = [domain]
all_vars['config']['domain'] = ( all_vars['config']['domain'] = (
...@@ -436,7 +449,7 @@ def command_run(config, playbooks, ...@@ -436,7 +449,7 @@ def command_run(config, playbooks,
print('Running playbook %s...' % (arg,)) print('Running playbook %s...' % (arg,))
os.environ['LC_ALL'] = 'C' os.environ['LC_ALL'] = 'C.UTF-8'
_fix_ansible_vault_password_file() _fix_ansible_vault_password_file()
cmd = [os.getenv('ANSIBLE_PLAYBOOK', 'ansible-playbook'), cmd = [os.getenv('ANSIBLE_PLAYBOOK', 'ansible-playbook'),
'-i', config] '-i', config]
...@@ -528,9 +541,11 @@ memberships, using the --additional-host-group command-line option. ...@@ -528,9 +541,11 @@ memberships, using the --additional-host-group command-line option.
help='infrastructural domain to use (default: "infra." + domain)') help='infrastructural domain to use (default: "infra." + domain)')
create_env_parser.add_argument( create_env_parser.add_argument(
'--services', metavar='FILE', '--services', metavar='FILE',
action='append', default=[],
help='your custom services.yml') help='your custom services.yml')
create_env_parser.add_argument( create_env_parser.add_argument(
'--passwords', metavar='FILE', '--passwords', metavar='FILE',
action='append', default=[],
help='your custom passwords.yml') help='your custom passwords.yml')
create_env_parser.add_argument( create_env_parser.add_argument(
'--playbook', metavar='FILE', '--playbook', metavar='FILE',
...@@ -542,6 +557,9 @@ memberships, using the --additional-host-group command-line option. ...@@ -542,6 +557,9 @@ memberships, using the --additional-host-group command-line option.
create_env_parser.add_argument( create_env_parser.add_argument(
'--ram', metavar='MB', type=int, default=3072, '--ram', metavar='MB', type=int, default=3072,
help='RAM for each VM when using --vagrant (default: 3072)') help='RAM for each VM when using --vagrant (default: 3072)')
create_env_parser.add_argument(
'--become', metavar='USER', default='root',
help='ansible_user, disable ansible_become if "root"')
create_env_parser.add_argument( create_env_parser.add_argument(
'--additional-host-group', metavar='GROUP=HOST1[,HOST2...]', '--additional-host-group', metavar='GROUP=HOST1[,HOST2...]',
dest='additional_host_groups', dest='additional_host_groups',
......
...@@ -43,3 +43,6 @@ ...@@ -43,3 +43,6 @@
roles: roles:
- float-infra-sso-server - float-infra-sso-server
- hosts: assets
roles:
- float-infra-assetmon
...@@ -2,6 +2,9 @@ ...@@ -2,6 +2,9 @@
- hosts: all - hosts: all
tasks: tasks:
- copy:
src: ../roles/float-base/files/apt/deb_autistici_org.gpg
dest: /usr/share/keyrings/deb.autistici.org.gpg
- apt: - apt:
update_cache: yes update_cache: yes
upgrade: "yes" upgrade: "yes"
......
...@@ -13,10 +13,10 @@ ...@@ -13,10 +13,10 @@
roles: roles:
- float-infra-dns - float-infra-dns
- hosts: admin_dashboard - hosts: service_dashboard
gather_facts: no gather_facts: no
roles: roles:
- float-infra-admin-dashboard - float-infra-service-dashboard
- hosts: acme - hosts: acme
gather_facts: no gather_facts: no
......
...@@ -28,7 +28,6 @@ ...@@ -28,7 +28,6 @@
- dnssec - dnssec
- ssh - ssh
- sso - sso
- x509
# First of all, generate secrets from the passwords.yml file. # First of all, generate secrets from the passwords.yml file.
- name: Initialize secrets - name: Initialize secrets
...@@ -50,8 +49,17 @@ ...@@ -50,8 +49,17 @@
- name: Generate SSO credentials - name: Generate SSO credentials
local_action: ed25519 privkey="{{ credentials_dir }}/sso/secret.key" pubkey="{{ credentials_dir }}/sso/public.key" local_action: ed25519 privkey="{{ credentials_dir }}/sso/secret.key" pubkey="{{ credentials_dir }}/sso/public.key"
- name: Generate global DH params - set_fact:
local_action: command openssl dhparam -out "{{ credentials_dir }}/x509/dhparam" "{{ dhparam_bits | default('2048') }}" creates="{{ credentials_dir }}/x509/dhparam" default_x509_ca_list:
- {tag: x509}
- name: Create X509 CA directory
local_action: file path="{{ credentials_dir }}/{{ item.tag }}" state=directory
loop: "{{ x509_ca_list | default(default_x509_ca_list) }}"
- name: Generate the X509 CA certificate - name: Generate the X509 CA certificate
local_action: x509_ca ca_subject="{{ x509_ca_subject | default('CN=Service CA') }}" ca_cert_path="{{ credentials_dir }}/x509/ca.pem" ca_key_path="{{ credentials_dir }}/x509/ca_private_key.pem" local_action: x509_ca ca_subject="{{ item.subject | default('CN=Service CA') }}" ca_cert_path="{{ credentials_dir }}/{{ item.tag }}/ca.pem" ca_key_path="{{ credentials_dir }}/{{ item.tag }}/ca_private_key.pem"
loop: "{{ x509_ca_list | default(default_x509_ca_list) }}"
- name: Generate global DH params
local_action: command openssl dhparam -out "{{ credentials_dir }}/x509/dhparam-{{ dhparam_bits | default('2048') }}" "{{ dhparam_bits | default('2048') }}" creates="{{ credentials_dir }}/x509/dhparam-{{ dhparam_bits | default('2048') }}"
# Prepare a SSH authorized_keys file content using float 'admins'.
from ansible.plugins.action import ActionBase
class ActionModule(ActionBase):
    """Build an SSH authorized_keys blob from the float 'admins' variable.

    Exposes the result as the 'float_authorized_keys' Ansible fact: one
    line per SSH public key, with the owning admin's username appended
    as the key comment.
    """

    TRANSFERS_FILES = False

    def run(self, tmp=None, task_vars=None):
        # 'admins' is a list of dicts with at least a 'name' attribute
        # and, optionally, a list of 'ssh_keys' strings — TODO confirm
        # shape against the group_vars/all/admins.yml schema.
        admins = self._templar.template('{{ admins }}')
        authorized_keys = []
        # For each SSH key, add a comment with the owner's username.
        for entry in admins:
            username = entry['name']
            # Admins without any SSH keys contribute nothing.
            if 'ssh_keys' not in entry:
                continue
            for key in entry['ssh_keys']:
                # Keep only the 'type base64data' fields, replacing any
                # original comment with the owner's username.
                key_without_comment = ' '.join(key.split()[:2])
                authorized_keys.append(f'{key_without_comment} {username}\n')
        # Modern zero-argument super(), consistent with the other float
        # action plugins in this tree.
        result = super().run(tmp, task_vars)
        result['ansible_facts'] = {
            'float_authorized_keys': ''.join(authorized_keys),
        }
        # The fact is derived data; never report a change.
        result['changed'] = False
        return result
from ansible.plugins.action import ActionBase
# Mount flags applied to every tmpfs mount: world-writable with the
# sticky bit, matching the conventional permissions of /tmp.
TMPFS_FLAGS = 'tmpfs-mode=01777'
# Size cap used for tmpfs mounts when the service does not specify one.
DEFAULT_TMPFS_SIZE = '64M'
class ActionModule(ActionBase):
    """Compute the container runtime command-line options for a service.

    Reads the 'service' and 'container' task arguments (float service
    metadata) and returns the assembled list of podman/docker options in
    result['options']: environment variables, mounts, network settings,
    capability restrictions and any raw 'docker_options' passthrough.
    """

    TRANSFERS_FILES = False

    # Options to set the container environment.
    def _environment_options(self, service, container):
        # Float identity variables are always set; the container's own
        # 'env' map can extend or override them.
        service_name = service['name']
        hostname = self._templar.template('{{ inventory_hostname }}')
        domain = self._templar.template('{{ domain }}')
        env = {
            'FLOAT_SERVICE': f'{service_name}.{domain}',
            'FLOAT_INSTANCE_NAME': f'{hostname}.{service_name}.{domain}',
            'FLOAT_CONTAINER_IMAGE': container['image'],
            'FLOAT_CONTAINER_NAME': f'{service_name}-{container["name"]}',
        }
        if 'env' in container:
            env.update(container['env'])
        options = []
        # Sorted for a stable option order, so the generated
        # configuration is deterministic across runs.
        for key, value in sorted(env.items()):
            options.append(f'--env={key}={value}')
        return options

    # Options for volumes (tmpfs, bind mounts).
    def _mount_options(self, service, container):
        options = []
        # Whether we still need to provide a tmpfs for /tmp ourselves.
        add_tmpfs = True

        # Helper: bind-mount src from the host at dst in the container.
        def _bind(src, dst):
            options.append(f'--mount=type=bind,source={src},destination={dst}')

        # Helper: mount a tmpfs at dst, with optional extra flags.
        def _tmpfs(dst, flags=None):
            opt = f'--mount=type=tmpfs,destination={dst},{TMPFS_FLAGS}'
            if flags:
                opt += f',{flags}'
            options.append(opt)

        if container.get('readonly', True):
            options.append('--read-only')
            # NOTE(review): presumably the runtime's --read-only mode
            # provides its own tmpfs on /tmp (podman's
            # --read-only-tmpfs default) — confirm.
            add_tmpfs = False
        for vol in container.get('volumes', []):
            for src, dst in sorted(vol.items()):
                # A service-provided /tmp volume supersedes ours.
                if dst == '/tmp':
                    add_tmpfs = False
                if src == 'tmpfs':
                    _tmpfs(dst, f'tmpfs-size={DEFAULT_TMPFS_SIZE}')
                elif src.startswith('tmpfs/'):
                    # 'tmpfs/<size>' syntax: explicit size after prefix.
                    sz = src[6:]
                    _tmpfs(dst, f'tmpfs-size={sz}')
                else:
                    _bind(src, dst)
        # Standard mounts every container gets: a writable /run, the
        # host syslog socket, and the system X509 trust store.
        _tmpfs('/run', 'tmpfs-size=16M,exec=true,notmpcopyup')
        _bind('/dev/log', '/dev/log')
        _bind('/etc/credentials/system', '/etc/ssl/certs')
        if add_tmpfs:
            _tmpfs('/tmp', f'tmpfs-size={DEFAULT_TMPFS_SIZE},notmpcopyup')
        # Bind-mount each of the service's credential directories at
        # the same path inside the container.
        for creds in service.get('service_credentials', []):
            creds_name = creds['name']
            ca_tag = creds.get('ca_tag', 'x509')
            creds_path = f'/etc/credentials/{ca_tag}/{creds_name}'
            _bind(creds_path, creds_path)
        return options

    # Network options (ports).
    def _network_options(self, container):
        # Containers share the host network namespace; ports are only
        # declared via --expose.
        options = ['--network=host']
        ports = []
        if 'ports' in container:
            ports = container['ports']
        elif 'port' in container:
            ports = [container['port']]
        for port in sorted(ports):
            options.append(f'--expose={port}')
        return options

    def run(self, tmp=None, task_vars=None):
        service = self._task.args['service']
        container = self._task.args['container']
        options = []
        options.extend(self._environment_options(service, container))
        options.extend(self._mount_options(service, container))
        options.extend(self._network_options(container))
        # Capabilities are dropped by default unless the container asks
        # to run as root; 'drop_capabilities' overrides either way.
        is_root = container.get('root')
        if container.get('drop_capabilities', not is_root):
            options.append('--security-opt=no-new-privileges')
            options.append('--cap-drop=all')
        # Raw passthrough of additional runtime flags (whitespace-split).
        if 'docker_options' in container:
            options.extend(container['docker_options'].split())
        result = super().run(tmp, task_vars)
        result['options'] = options
        # Computing options never changes remote state.
        result['changed'] = False
        return result
# Generate a host configuration file for tinc (fetching the public key
# from the remote host), and store the result in an Ansible fact.
from ansible.plugins.action import ActionBase
from ansible.errors import AnsibleFileNotFound
from ansible.module_utils._text import to_text
# Jinja template for a tinc hosts-file entry. The tinc_host_subnet,
# tinc_host_public_key and tinc_config variables are injected into the
# templar before rendering (see ActionModule.run below).
HOST_TEMPLATE = '''
Address = {{ ip }}
{% if ip6 is defined %}Address = {{ ip6 }}{% endif %}
Port = {{ tinc_config.port | default('655') }}
Cipher = {{ tinc_config.cipher | default('aes-128-cbc') }}
Digest = {{ tinc_config.digest | default('sha256') }}
Compression = {{ tinc_config.compression | default('0') }}
PMTU = {{ tinc_config.pmtu | default('1460') }}
Subnet = {{ tinc_host_subnet }}
{{ tinc_host_public_key }}
'''
class ActionModule(ActionBase):
    """Generate the tinc host configuration file for one overlay.

    Fetches the host's RSA public key from the remote machine, renders
    HOST_TEMPLATE with overlay-specific parameters, and stores the
    rendered text in the 'tinc_host_config' Ansible fact.
    """

    TRANSFERS_FILES = False

    def _cmd(self, task_vars, args, creates=None):
        # Run a remote command through the 'command' module and return
        # the raw module result dictionary.
        args = {
            '_raw_params': ' '.join(args),
            'creates': creates,
        }
        return self._execute_module(
            module_name='command',
            module_args=args,
            task_vars=task_vars,
            wrap_async=False)

    def run(self, tmp=None, task_vars=None):
        overlay = self._task.args['overlay']
        subnet = self._templar.template('{{ ip_%s }}/32' % overlay)
        # Find the overlay configuration by scanning the 'net_overlays'
        # configuration variable, which is a list - it would be simpler with
        # a dictionary.
        net_overlays = self._templar.template('{{ net_overlays|default([]) }}')
        overlay_config = {'name': overlay}
        for n in net_overlays:
            if n['name'] == overlay:
                overlay_config = n
                break
        result = super(ActionModule, self).run(tmp, task_vars)
        # Fetch the host public key. Use .get() because a failed module
        # invocation may return no 'stdout' key at all; in that case we
        # want the explicit error below, not a KeyError.
        cmd_result = self._cmd(task_vars, [
            '/bin/cat', '/etc/tinc/%s/rsa_key.pub' % overlay])
        pubkey = cmd_result.get('stdout', '')
        if not pubkey:
            result['failed'] = True
            result['msg'] = "could not fetch host public key"
            return result
        # Generate the template, adding some custom variables of our own.
        self._templar._available_variables['tinc_host_subnet'] = subnet
        self._templar._available_variables['tinc_host_public_key'] = pubkey
        self._templar._available_variables['tinc_config'] = overlay_config
        data = self._templar.do_template(HOST_TEMPLATE,
                                         preserve_trailing_newlines=True,
                                         escape_backslashes=False)
        result['ansible_facts'] = {'tinc_host_config': data}
        result['changed'] = False
        return result
...@@ -38,6 +38,9 @@ DEFAULT_SERVICE_CREDENTIALS = [ ...@@ -38,6 +38,9 @@ DEFAULT_SERVICE_CREDENTIALS = [
{ {
'name': 'auth-server', 'name': 'auth-server',
}, },
{
'name': 'assetmon-client',
},
] ]
...@@ -172,30 +175,31 @@ def _host_groups(name, inventory, assignments=None): ...@@ -172,30 +175,31 @@ def _host_groups(name, inventory, assignments=None):
# Return all host IP addresses for the specified overlay. # Return all host IP addresses for the specified overlay.
def _host_net_overlay_addrs(name, inventory, overlay): def _host_net_overlay_addrs(name, inventory, overlay):
if overlay == 'public': if overlay == 'public':
keys = ('ip', 'ip6') return inventory['hosts'][name]['public_ips']
else:
keys = ('ip_' + overlay,)
addrs = [] addrs = []
for k in keys: key = 'ip_' + overlay
v = inventory['hosts'][name].get(k) if key in inventory['hosts'][name]:
if v: addrs.append(inventory['hosts'][name][key])
addrs.append(v)
return addrs return addrs
# Return all host IP addresses, on all interfaces. # Return all host IP addresses, on all interfaces.
def _host_addrs(name, inventory): def _host_addrs(name, inventory):
return [ addrs = []
v for k, v in inventory['hosts'][name].items() for ip in inventory['hosts'][name]['ips']:
if k == 'ip' or k == 'ip6' or k.startswith('ip_')] addrs.append(ip)
for k, v in inventory['hosts'][name].items():
if k.startswith('ip_'):
addrs.append(v)
return addrs
def _host_dns_map(name, inventory): def _host_dns_map(name, inventory):
dns = {} dns = {}
dns[name] = inventory['hosts'][name]['ips']
for k, v in inventory['hosts'][name].items(): for k, v in inventory['hosts'][name].items():
if k == 'ip' or k == 'ip6': if k.startswith('ip_'):
dns.setdefault(name, []).append(v)
elif k.startswith('ip_'):
dns.setdefault(name + '.' + k[3:], []).append(v) dns.setdefault(name + '.' + k[3:], []).append(v)
return dns return dns
...@@ -278,6 +282,16 @@ def _global_dns_map(inventory): ...@@ -278,6 +282,16 @@ def _global_dns_map(inventory):
return dns return dns
# Return the hosts that are not available for scheduling, as a
# Python set.
def _unavailable_hosts(inventory):
unavail = set()
for name, values in inventory['hosts'].items():
if values.get('turndown'):
unavail.add(name)
return unavail
# Build a group -> hosts map out of an inventory. # Build a group -> hosts map out of an inventory.
def _build_group_map(inventory, assignments=None): def _build_group_map(inventory, assignments=None):
group_map = {} group_map = {}
...@@ -318,6 +332,7 @@ def _build_public_endpoints_map(services): ...@@ -318,6 +332,7 @@ def _build_public_endpoints_map(services):
'name': upstream_name, 'name': upstream_name,
'service_name': service_name, 'service_name': service_name,
'port': pe['port'], 'port': pe['port'],
'enable_api_proxy': pe.get('enable_api_proxy', False),
'enable_sso_proxy': pe.get('enable_sso_proxy', False), 'enable_sso_proxy': pe.get('enable_sso_proxy', False),
'sharded': pe.get('sharded', False), 'sharded': pe.get('sharded', False),
} }
...@@ -348,6 +363,14 @@ def _build_public_endpoints_map(services): ...@@ -348,6 +363,14 @@ def _build_public_endpoints_map(services):
return upstreams, endpoints return upstreams, endpoints
def _build_public_endpoint_port_map(services):
endpoints_by_port = {}
for svc in services.values():
for pe in svc.get('public_endpoints', []):
endpoints_by_port[pe['port']] = pe['name']
return endpoints_by_port
# Build the map of upstreams for 'horizontal' (well-known etc) HTTP # Build the map of upstreams for 'horizontal' (well-known etc) HTTP
# public endpoints. # public endpoints.
# #
...@@ -363,6 +386,7 @@ def _build_horizontal_upstreams_map(services): ...@@ -363,6 +386,7 @@ def _build_horizontal_upstreams_map(services):
'name': upstream_name, 'name': upstream_name,
'service_name': service_name, 'service_name': service_name,
'port': ep['port'], 'port': ep['port'],
'enable_api_proxy': False,
'enable_sso_proxy': False, 'enable_sso_proxy': False,
'sharded': False, 'sharded': False,
} }
...@@ -487,7 +511,10 @@ class Assignments(object): ...@@ -487,7 +511,10 @@ class Assignments(object):
return str(self._fwd) return str(self._fwd)
@classmethod @classmethod
def _available_hosts(cls, service, group_map): def _available_hosts(cls, service, group_map, service_hosts_map,
unavailable_hosts={}):
if 'schedule_with' in service:
return service_hosts_map[service['schedule_with']]
scheduling_groups = ['all'] scheduling_groups = ['all']
if 'scheduling_group' in service: if 'scheduling_group' in service:
scheduling_groups = [service['scheduling_group']] scheduling_groups = [service['scheduling_group']]
...@@ -495,8 +522,10 @@ class Assignments(object): ...@@ -495,8 +522,10 @@ class Assignments(object):
scheduling_groups = service['scheduling_groups'] scheduling_groups = service['scheduling_groups']
available_hosts = set() available_hosts = set()
for g in scheduling_groups: for g in scheduling_groups:
if g not in group_map:
          raise Exception(f'The scheduling_group "{g}" is not defined in inventory')
available_hosts.update(group_map[g]) available_hosts.update(group_map[g])
return list(available_hosts) return list(available_hosts.difference(unavailable_hosts))
@classmethod @classmethod
def schedule(cls, services, inventory): def schedule(cls, services, inventory):
...@@ -509,24 +538,32 @@ class Assignments(object): ...@@ -509,24 +538,32 @@ class Assignments(object):
""" """
service_hosts_map = {} service_hosts_map = {}
service_master_map = {} service_master_map = {}
unavailable_hosts = _unavailable_hosts(inventory)
group_map = _build_group_map(inventory) group_map = _build_group_map(inventory)
host_occupation = collections.defaultdict(int) host_occupation = collections.defaultdict(int)
# Iterations should happen over sorted items for reproducible # Iterations should happen over sorted items for reproducible
# results. The sort function combines the 'scheduling_order' # results. The sort function combines the 'scheduling_order'
# attribute (default -1) and the service name. # attribute (default -1), the presence of the 'schedule_with'
# attribute, and the service name.
def _sort_key(service_name): def _sort_key(service_name):
return (services[service_name].get('scheduling_order', -1), service_name) return (services[service_name].get('scheduling_order', -1),
1 if 'schedule_with' in services[service_name] else 0,
service_name)
for service_name in sorted(services.keys(), key=_sort_key): for service_name in sorted(services.keys(), key=_sort_key):
service = services[service_name] service = services[service_name]
available_hosts = cls._available_hosts(service, group_map) available_hosts = cls._available_hosts(service, group_map,
service_hosts_map,
unavailable_hosts)
num_instances = service.get('num_instances', 'all') num_instances = service.get('num_instances', 'all')
if num_instances == 'all': if num_instances == 'all':
service_hosts = sorted(available_hosts) service_hosts = sorted(available_hosts)
else: else:
service_hosts = sorted(_binpack( service_hosts = sorted(_binpack(
available_hosts, host_occupation, num_instances)) available_hosts, host_occupation, num_instances))
if not service_hosts:
raise Exception(f'No hosts available to schedule service {service_name}')
service_hosts_map[service_name] = service_hosts service_hosts_map[service_name] = service_hosts
for h in service_hosts: for h in service_hosts:
host_occupation[h] += 1 host_occupation[h] += 1
...@@ -551,10 +588,34 @@ def _any_attribute_set(services, attr): ...@@ -551,10 +588,34 @@ def _any_attribute_set(services, attr):
return False return False
# Pre-process inventory entries, to normalize host variables and
# provide defaults (thus simplifying the jinja template logic).
def _preprocess_inventory(inventory):
for host in inventory['hosts'].values():
# Set 'ips' if the legacy variables are set.
if 'ips' not in host:
host['ips'] = []
if 'ip' in host:
host['ips'].append(host['ip'])
if 'ip6' in host:
host['ips'].append(host['ip6'])
# Same for 'public_ips'.
if 'public_ips' not in host:
host['public_ips'] = []
if 'public_ip' in host:
host['public_ips'].append(host['public_ip'])
if 'public_ip6' in host:
host['public_ips'].append(host['public_ip6'])
# Default public_ips to ips.
if not host['public_ips']:
host['public_ips'] = host['ips']
# Run the scheduler, and return inventory and groups for Ansible. # Run the scheduler, and return inventory and groups for Ansible.
def run_scheduler(config): def run_scheduler(config):
services = config['services'] services = config['services']
inventory = config['inventory'] inventory = config['inventory']
_preprocess_inventory(inventory)
assignments = Assignments.schedule(services, inventory) assignments = Assignments.schedule(services, inventory)
# Augment all data structures with autogenerated and # Augment all data structures with autogenerated and
...@@ -583,6 +644,7 @@ def run_scheduler(config): ...@@ -583,6 +644,7 @@ def run_scheduler(config):
# The following variables are just used for debugging purposes (dashboards). # The following variables are just used for debugging purposes (dashboards).
'float_service_assignments': assignments._fwd, 'float_service_assignments': assignments._fwd,
'float_service_masters': assignments._masters, 'float_service_masters': assignments._masters,
'float_http_endpoints_by_port': _build_public_endpoint_port_map(services),
}) })
# Set the HTTP frontend configuration on the 'frontend' group. # Set the HTTP frontend configuration on the 'frontend' group.
...@@ -590,8 +652,6 @@ def run_scheduler(config): ...@@ -590,8 +652,6 @@ def run_scheduler(config):
inventory['group_vars'].setdefault('frontend', {}).update({ inventory['group_vars'].setdefault('frontend', {}).update({
'float_enable_http_frontend': _any_attribute_set( 'float_enable_http_frontend': _any_attribute_set(
services, 'public_endpoints'), services, 'public_endpoints'),
'float_enable_tcp_frontend': _any_attribute_set(
services, 'public_tcp_endpoints'),
'float_http_upstreams': http_upstreams, 'float_http_upstreams': http_upstreams,
'float_http_endpoints': http_endpoints, 'float_http_endpoints': http_endpoints,
}) })
......
--- ---
{% set all_ips = hostvars.values() | rejectattr('ips', 'undefined') | map(attribute='ips') | flatten | sort %}
backends_dir: /etc/auth-server/backends.d backends_dir: /etc/auth-server/backends.d
services_dir: /etc/auth-server/services.d services_dir: /etc/auth-server/services.d
services: {} services: {}
webauthn:
rp_id: "{{ webauthn_rp_id }}"
rp_origin: "https://{{ webauthn_rp_id }}"
rp_display_name: "{{ webauthn_rp_display_name | default(webauthn_rp_id) }}"
{% if 'user-meta-server' in services %} {% if 'user-meta-server' in services %}
user_meta_server: user_meta_server:
url: "https://user-meta-server.{{ domain }}:5505" url: "https://user-meta-server.{{ domain }}:5505"
...@@ -25,9 +32,9 @@ rate_limits: ...@@ -25,9 +32,9 @@ rate_limits:
value: "127.0.0.1" value: "127.0.0.1"
- key: ip - key: ip
value: "::1" value: "::1"
{% for h in groups['all'] | sort %} {% for ip in all_ips %}
- key: ip - key: ip
value: "{{ hostvars[h]['ip'] }}" value: "{{ ip }}"
{% endfor %} {% endfor %}
# Per-IP rate limiter specific to account recovery, with stricter limits. # Per-IP rate limiter specific to account recovery, with stricter limits.
...@@ -40,9 +47,9 @@ rate_limits: ...@@ -40,9 +47,9 @@ rate_limits:
value: "127.0.0.1" value: "127.0.0.1"
- key: ip - key: ip
value: "::1" value: "::1"
{% for h in groups['all'] | sort %} {% for ip in all_ips %}
- key: ip - key: ip
value: "{{ hostvars[h]['ip'] }}" value: "{{ ip }}"
{% endfor %} {% endfor %}
# Blacklist users with too many failed account recovery attempts. # Blacklist users with too many failed account recovery attempts.
...@@ -73,9 +80,9 @@ rate_limits: ...@@ -73,9 +80,9 @@ rate_limits:
value: "127.0.0.1" value: "127.0.0.1"
- key: ip - key: ip
value: "::1" value: "::1"
{% for h in groups['all'] | sort %} {% for ip in all_ips %}
- key: ip - key: ip
value: "{{ hostvars[h]['ip'] }}" value: "{{ ip }}"
{% endfor %} {% endfor %}
{% if 'auth-cache' in services %} {% if 'auth-cache' in services %}
......