Commit b35324a5 authored by ale

Add status handlers and an overrun policy manager for scheduled jobs

parent a7b33a75
......@@ -119,3 +119,10 @@ type Repository interface {
RestoreStream(context.Context, Backup, Dataset, string, io.Writer) error
Close() error
}
// Manager for backups and restores.
type Manager interface {
Backup(context.Context, []SourceSpec) (Backup, error)
Restore(context.Context, FindRequest, string) error
Close() error
}
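
A minimal caller sketch against the new Manager interface (hypothetical code, not part of this commit; it assumes a Manager obtained from NewManager, and the runOnce name and restore target path are illustrative):

func runOnce(ctx context.Context, mgr Manager, specs []SourceSpec) error {
	defer mgr.Close() // nolint: errcheck

	backup, err := mgr.Backup(ctx, specs)
	if err != nil {
		return err
	}
	log.Printf("backup %s done", backup.ID)

	// Restore the most recent version of whatever the (empty, illustrative)
	// FindRequest matches into a scratch directory.
	return mgr.Restore(ctx, FindRequest{}, "/tmp/restore-target")
}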
......@@ -8,55 +8,104 @@ import (
"time"
)
// ExecOp is an operation (a func(Context) error) that can be run
// Op is an operation (a func(Context) error) that can be run
// asynchronously by a worker thread and waited for.
type ExecOp struct {
id string
fn func(context.Context) error
timeout time.Duration
done chan struct{}
err error
type Op struct {
id string
desc string
fn func(context.Context) error
timeout time.Duration
startedAt time.Time
done chan struct{}
err error
}
// NewExecOp wraps a function in an ExecOp (with timeout).
func NewExecOp(fn func(context.Context) error, timeout time.Duration) *ExecOp {
return &ExecOp{
// NewOp wraps a function in an Op (with timeout).
func NewOp(desc string, fn func(context.Context) error, timeout time.Duration) *Op {
return &Op{
id: randomID(),
desc: desc,
fn: fn,
timeout: timeout,
done: make(chan struct{}),
}
}
func (op *ExecOp) run(ctx context.Context) {
func (op *Op) run(ctx context.Context) {
ctx, cancel := context.WithTimeout(ctx, op.timeout)
op.startedAt = time.Now()
op.err = op.fn(ctx)
cancel()
close(op.done)
}
// Wait for the operation to run, return its error.
func (op *ExecOp) Wait() error {
func (op *Op) Wait() error {
<-op.done
return op.err
}
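
The timeout passed to NewOp is applied inside run() via context.WithTimeout, so the wrapped function is expected to watch ctx.Done(). A sketch (exampleOp and the simulated work are invented for illustration); note that Wait only returns once an Executor worker has actually run the op:

func exampleOp() *Op {
	return NewOp("slow copy", func(ctx context.Context) error {
		select {
		case <-time.After(10 * time.Minute): // simulated long-running work
			return nil
		case <-ctx.Done():
			return ctx.Err() // context.DeadlineExceeded after one minute
		}
	}, time.Minute)
}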
// executorStatus holds the currently running jobs, for debugging.
type executorStatus struct {
mx sync.Mutex
running map[string]*Op
failed []Op
}
func newExecutorStatus() *executorStatus {
return &executorStatus{
running: make(map[string]*Op),
}
}
func (s *executorStatus) push(op *Op) {
s.mx.Lock()
s.running[op.id] = op
s.mx.Unlock()
}
const maxFailedLength = 20
func (s *executorStatus) pop(op *Op) {
s.mx.Lock()
delete(s.running, op.id)
if op.err != nil {
s.failed = append(s.failed, *op)
if n := len(s.failed); n > maxFailedLength {
s.failed = s.failed[n-maxFailedLength : n]
}
}
s.mx.Unlock()
}
func (s *executorStatus) getStatus() ([]Op, []Op) {
var out []Op
s.mx.Lock()
defer s.mx.Unlock()
for _, op := range s.running {
out = append(out, *op)
}
return out, s.failed
}
// Executor is a task scheduler that keeps a two-level priority queue
// (for high-priority and low-priority tasks respectively).
type Executor struct {
queueCh chan struct{}
cancel context.CancelFunc
wg sync.WaitGroup
running *executorStatus
mx sync.Mutex
queueHigh []*ExecOp
queueLow []*ExecOp
queueHigh []*Op
queueLow []*Op
}
// newUnstartedExecutor returns an executor without starting the worker threads.
func newUnstartedExecutor() *Executor {
return &Executor{
queueCh: make(chan struct{}, 1),
running: newExecutorStatus(),
}
}
......@@ -90,7 +139,9 @@ func (e *Executor) runWorker(ctx context.Context) {
}
log.Printf("executing op %s", op.id)
e.running.push(op)
op.run(ctx)
e.running.pop(op)
}
}
......@@ -101,13 +152,13 @@ func (e *Executor) Stop() {
}
// Enqueue an operation (possibly with high priority).
func (e *Executor) Enqueue(op *ExecOp, hiPri bool) {
e.EnqueueBatch([]*ExecOp{op}, hiPri)
func (e *Executor) Enqueue(op *Op, hiPri bool) {
e.EnqueueBatch([]*Op{op}, hiPri)
}
// EnqueueBatch schedules a batch of operations all at once (possibly
// with high priority). Easier on lock contention.
func (e *Executor) EnqueueBatch(b []*ExecOp, hiPri bool) {
func (e *Executor) EnqueueBatch(b []*Op, hiPri bool) {
e.mx.Lock()
if hiPri {
e.queueHigh = append(e.queueHigh, b...)
......@@ -124,7 +175,7 @@ func (e *Executor) EnqueueBatch(b []*ExecOp, hiPri bool) {
// Pop an element from the queue (if it has to sleep, it can be
// interrupted by closing stopCh).
func (e *Executor) Pop(ctx context.Context) (op *ExecOp, err error) {
func (e *Executor) Pop(ctx context.Context) (op *Op, err error) {
e.mx.Lock()
for len(e.queueHigh) == 0 && len(e.queueLow) == 0 {
e.mx.Unlock()
......@@ -146,3 +197,9 @@ func (e *Executor) Pop(ctx context.Context) (op *ExecOp, err error) {
e.mx.Unlock()
return
}
// GetStatus returns the running jobs, and a short list of recent
// failures.
func (e *Executor) GetStatus() ([]Op, []Op) {
return e.running.getStatus()
}
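
A usage sketch patterned on the tests further down in this commit (it reuses the unexported newUnstartedExecutor/start helpers, since an exported constructor is not visible in this excerpt; the op itself is invented):

func exampleExecutorStatus() {
	e := newUnstartedExecutor()
	op := NewOp("noop", func(_ context.Context) error { return nil }, time.Minute)
	e.Enqueue(op, true) // high priority
	e.start(1)          // one worker goroutine
	defer e.Stop()

	_ = op.Wait()
	running, failed := e.GetStatus()
	log.Printf("running=%d recent-failures=%d", len(running), len(failed))
}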
......@@ -16,9 +16,9 @@ func TestExecutor_Priority(t *testing.T) {
// Run 10 ops at low priority and 1 at high, and verify that
// the latter gets invoked first.
var ops []*ExecOp
var ops []*Op
for i := 0; i < 10; i++ {
op := NewExecOp(func(_ context.Context) error {
op := NewOp("lo", func(_ context.Context) error {
mx.Lock()
defer mx.Unlock()
lowDone = true
......@@ -27,7 +27,7 @@ func TestExecutor_Priority(t *testing.T) {
ops = append(ops, op)
}
e.EnqueueBatch(ops, false)
hiOp := NewExecOp(func(_ context.Context) error {
hiOp := NewOp("hi", func(_ context.Context) error {
mx.Lock()
defer mx.Unlock()
if lowDone {
......@@ -49,3 +49,28 @@ func TestExecutor_Priority(t *testing.T) {
t.Fatalf("hi-prio op error: %v", err)
}
}
func TestExecutor_GetStatus(t *testing.T) {
e := newUnstartedExecutor()
op := NewOp("fail", func(_ context.Context) error {
return errors.New("failure")
}, 10*time.Second)
e.Enqueue(op, false)
e.start(1)
defer e.Stop()
op.Wait()
running, failures := e.GetStatus()
if len(running) != 0 {
t.Fatalf("op still reported as running: %v", running)
}
if len(failures) != 1 {
t.Fatalf("too many/few failures: %v", failures)
}
failedOp := failures[0]
if op.id != failedOp.id {
t.Fatalf("failed op is not original op? got %+v, expected %+v", failedOp, op)
}
}
......@@ -2,20 +2,17 @@ package tabacco
import (
"context"
"crypto/rand"
"encoding/hex"
"errors"
"fmt"
"log"
"os"
"strings"
"time"
)
var defaultOpTimeout = 6 * time.Hour
// Manager for backups and restores.
type Manager struct {
type tabaccoManager struct {
handlerMap map[string]Handler
repo Repository
ms MetadataStore
......@@ -24,13 +21,13 @@ type Manager struct {
}
// NewManager creates a new Manager.
func NewManager(ctx context.Context, handlerSpecs []HandlerSpec, repoSpec RepositorySpec, ms MetadataStore, exec *Executor, shell *Shell) (*Manager, error) {
func NewManager(ctx context.Context, handlerSpecs []HandlerSpec, repoSpec RepositorySpec, ms MetadataStore, exec *Executor, shell *Shell) (Manager, error) {
handlerMap, repo, err := parseSpecs(ctx, handlerSpecs, repoSpec, shell)
if err != nil {
return nil, err
}
return &Manager{
return &tabaccoManager{
handlerMap: handlerMap,
repo: repo,
ms: ms,
......@@ -41,15 +38,15 @@ func NewManager(ctx context.Context, handlerSpecs []HandlerSpec, repoSpec Reposi
// Close the Manager and free all associated resources (those owned by
// this object).
func (m *Manager) Close() error {
func (m *tabaccoManager) Close() error {
return m.repo.Close()
}
// Prepare the repository for a new backup. This is a synchronous
// operation: we need to wait for it to complete to avoid running the
// backup tasks too soon.
func (m *Manager) prepareBackup(ctx context.Context, backup Backup) error {
op := NewExecOp(func(ctx context.Context) error {
func (m *tabaccoManager) prepareBackup(ctx context.Context, backup Backup) error {
op := NewOp("prepare repository", func(ctx context.Context) error {
if err := m.repo.Init(ctx); err != nil {
log.Printf("repository init failed: %v", err)
return err
......@@ -62,7 +59,7 @@ func (m *Manager) prepareBackup(ctx context.Context, backup Backup) error {
}
// Backup all known sources to the configured destination.
func (m *Manager) Backup(ctx context.Context, sourceSpecs []SourceSpec) (Backup, error) {
func (m *tabaccoManager) Backup(ctx context.Context, sourceSpecs []SourceSpec) (Backup, error) {
// Parse the source specs and obtain Datasets. Errors here are
// logged but *not* fatal, unless there are errors and the
// list of non-erroring sources is nil.
......@@ -81,12 +78,12 @@ func (m *Manager) Backup(ctx context.Context, sourceSpecs []SourceSpec) (Backup,
}
// Run all backup tasks, scheduling them using the executor.
var ops []*ExecOp
var ops []*Op
for _, ds := range datasets {
// Bind 'ds' to the closure via an anonymous
// function. Required because of the loop.
func(ds Dataset) {
ops = append(ops, NewExecOp(func(ctx context.Context) (err error) {
ops = append(ops, NewOp(fmt.Sprintf("backup dataset %s", ds.Name), func(ctx context.Context) (err error) {
h, ok := m.handlerMap[ds.Handler]
if !ok {
return fmt.Errorf("%s: unknown handler '%s'", ds.Name, ds.Handler)
......@@ -117,17 +114,18 @@ func (m *Manager) Backup(ctx context.Context, sourceSpecs []SourceSpec) (Backup,
// Restore the results of the FindRequest (with NumVersions=1) onto
// the given target directory.
func (m *Manager) Restore(ctx context.Context, req FindRequest, target string) error {
func (m *tabaccoManager) Restore(ctx context.Context, req FindRequest, target string) error {
// Find the atoms relevant to this restore.
req.NumVersions = 1
versions, err := m.ms.FindAtoms(ctx, req)
if err != nil {
return err
}
var ops []*ExecOp
var ops []*Op
for _, vv := range versions {
func(ds Dataset, backup Backup) {
ops = append(ops, NewExecOp(func(ctx context.Context) error {
ops = append(ops, NewOp(fmt.Sprintf("restore dataset %s", ds.Name), func(ctx context.Context) error {
log.Printf("restoring %+v %+v", ds, backup)
h, ok := m.handlerMap[ds.Handler]
if !ok {
......@@ -148,49 +146,11 @@ func (m *Manager) Restore(ctx context.Context, req FindRequest, target string) e
return merr.orNil()
}
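
The "bind 'ds' to the closure" comments in Backup and Restore above work around Go's per-loop range variable (pre-Go 1.22 semantics). A standalone illustration of the pitfall, unrelated to the tabacco types:

package main

import "fmt"

func main() {
	var fns []func() string
	for _, s := range []string{"a", "b", "c"} {
		s := s // re-bind; without this every closure shares the single loop variable
		fns = append(fns, func() string { return s })
	}
	for _, fn := range fns {
		fmt.Println(fn()) // a, b, c — without the re-bind it would print c, c, c
	}
}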
type multiError struct {
errors []error
}
func (m *multiError) Add(err error) {
m.errors = append(m.errors, err)
}
func (m *multiError) Error() string {
var tmp []string
for _, e := range m.errors {
tmp = append(tmp, e.Error())
}
return strings.Join(tmp, ", ")
}
func (m *multiError) Errors() []error {
return m.errors
}
func (m *multiError) orNil() error {
if len(m.errors) > 0 {
return m
}
return nil
}
// Generate a random unique ID. It will return an identifier
// consisting of 32 ascii-friendly bytes (16 random bytes,
// hex-encoded).
func randomID() string {
var b [16]byte
if _, err := rand.Read(b[:]); err != nil {
panic(err)
}
return hex.EncodeToString(b[:])
}
// Create a new Backup object with its own unique ID (which actually
// consists of 16 random bytes, hex-encoded).
func newBackup(host string) Backup {
if host == "" {
host, _ = os.Hostname()
host, _ = os.Hostname() // nolint
}
return Backup{
ID: randomID(),
......
......@@ -54,11 +54,11 @@ func newResticRepository(params map[string]interface{}, shell *Shell) (Repositor
return nil, err
}
if _, err := io.WriteString(tmpf, password); err != nil {
os.Remove(tmpf.Name()) // nolint: errcheck
os.Remove(tmpf.Name()) // nolint
return nil, err
}
if err := tmpf.Close(); err != nil {
os.Remove(tmpf.Name()) // nolint: errcheck
os.Remove(tmpf.Name()) // nolint
return nil, err
}
......
......@@ -2,19 +2,116 @@ package tabacco
import (
"context"
crand "crypto/rand"
"encoding/binary"
"fmt"
"io/ioutil"
"log"
"math/rand"
"sort"
"strings"
"sync"
"time"
"github.com/robfig/cron"
)
// The scheduler runs backup jobs periodically, according to the
// The overrunPolicy decides what to do with jobs that run over their
// schedule period. If killOverrun is true, when the next scheduled
// job starts the previous one will be killed, otherwise the new job
// will be skipped. String keys are used to identify identical jobs.
//
// To do this, we create cancelable contexts for each job, and keep
// track of which ones are running: so the wrapped function must
// terminate when the context is canceled.
type overrunPolicy struct {
killOverrun bool
mx sync.Mutex
running map[string]context.CancelFunc
waiting map[string]chan struct{}
}
func newOverrunPolicy(killOverrun bool) *overrunPolicy {
return &overrunPolicy{
killOverrun: killOverrun,
running: make(map[string]context.CancelFunc),
waiting: make(map[string]chan struct{}),
}
}
func (p *overrunPolicy) killAndWaitLocked(key string) {
c := make(chan struct{})
p.waiting[key] = c
p.mx.Unlock()
<-c
p.mx.Lock()
}
func (p *overrunPolicy) run(ctx context.Context, job jobContext) {
key := job.Key()
p.mx.Lock()
defer p.mx.Unlock()
if cancel, ok := p.running[key]; ok {
if p.killOverrun {
log.Printf("sched: job %s is overrun, killing", key)
cancel()
p.killAndWaitLocked(key)
} else {
log.Printf("sched: job %s is overrun, skipping", key)
return
}
}
fctx, cancel := context.WithCancel(ctx)
p.running[key] = cancel
p.mx.Unlock()
job.RunContext(fctx)
p.mx.Lock()
delete(p.running, key)
if c, ok := p.waiting[key]; ok {
delete(p.waiting, key)
close(c)
}
}
func (p *overrunPolicy) wrapJob(ctx context.Context, job jobContext) cron.Job {
return &policyJob{
wrapped: job,
parentCtx: ctx,
policy: p,
}
}
type policyJob struct {
wrapped jobContext
parentCtx context.Context
policy *overrunPolicy
}
func (j *policyJob) Run() {
j.policy.run(j.parentCtx, j.wrapped)
}
func (j *policyJob) WrappedJob() jobContext {
return j.wrapped
}
type wrappedJob interface {
cron.Job
WrappedJob() jobContext
}
type jobContext interface {
RunContext(context.Context)
Key() string
}
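
The jobContext contract is what makes the kill-overrun path work: RunContext must return soon after its context is canceled, otherwise killAndWaitLocked blocks forever. A minimal hypothetical implementation (sleepJob is invented for illustration):

// sleepJob pretends to work for a minute but bails out as soon as the
// overrun policy cancels its context.
type sleepJob struct {
	name string
}

func (j *sleepJob) Key() string { return j.name }

func (j *sleepJob) RunContext(ctx context.Context) {
	select {
	case <-time.After(time.Minute): // simulated work
	case <-ctx.Done(): // canceled by overrunPolicy when killOverrun is set
	}
}

Wrapping it with newOverrunPolicy(true).wrapJob(ctx, &sleepJob{name: "db"}) yields a cron.Job suitable for cron's Schedule call, as used in NewScheduler below.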
// The Scheduler runs backup jobs periodically, according to the
// schedule specified in the source spec.
//
// The standard cron syntax (documentation available at
......@@ -27,23 +124,28 @@ import (
// a random offset within the given period. The offset stays constant
// over time because the random seed it's generated from is saved in a
// file.
type scheduler struct {
c *cron.Cron
}
type Scheduler struct {
c *cron.Cron
policy *overrunPolicy
func runScheduledBackup(ctx context.Context, m *Manager, spec SourceSpec) {
log.Printf("running scheduled backup for source %s", spec.Name)
backup, err := m.Backup(ctx, []SourceSpec{spec})
if err != nil {
log.Printf("scheduled backup for source %s failed: %v", spec.Name, err)
} else {
log.Printf("scheduled backup for source %s succeeded (backup id %s)", spec.Name, backup.ID)
}
logMx sync.Mutex
logs map[string][]JobStatus
}
func newScheduler(ctx context.Context, m *Manager, sourceSpecs []SourceSpec) (*scheduler, error) {
rnd := mustGetRand("/var/tmp/.tabacco_scheduler_seed")
// NewScheduler creates a new Scheduler.
func NewScheduler(ctx context.Context, m Manager, sourceSpecs []SourceSpec, seedFile string) (*Scheduler, error) {
if seedFile == "" {
seedFile = "/var/tmp/.tabacco_scheduler_seed"
}
rnd := mustGetRand(seedFile)
c := cron.New()
policy := newOverrunPolicy(false)
s := &Scheduler{
c: c,
policy: policy,
logs: make(map[string][]JobStatus),
}
merr := new(multiError)
for _, spec := range sourceSpecs {
if spec.Schedule != "" {
......@@ -52,12 +154,165 @@ func newScheduler(ctx context.Context, m *Manager, sourceSpecs []SourceSpec) (*s
merr.Add(fmt.Errorf("%s: bad schedule: %v", spec.Name, err))
continue
}
c.Schedule(sched, cron.FuncJob(func() {
runScheduledBackup(ctx, m, spec)
}))
c.Schedule(sched, policy.wrapJob(ctx, newScheduledBackupJob(s, m, spec)))
}
}
return &scheduler{c}, merr.orNil()
c.Start()
return s, merr.orNil()
}
// Stop the scheduler (won't affect running jobs).
func (s *Scheduler) Stop() {
s.c.Stop()
}
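
A hypothetical wiring sketch (the startScheduler helper and spec values are illustrative; the random-schedule keyword referenced in the comment above is not shown in this excerpt, so a plain cron descriptor is used, assuming the parser accepts it):

func startScheduler(ctx context.Context, mgr Manager) (*Scheduler, error) {
	specs := []SourceSpec{
		{Name: "home", Schedule: "@daily"}, // assumed schedule value
	}
	// Passing "" falls back to /var/tmp/.tabacco_scheduler_seed. NewScheduler
	// also starts the cron loop, so callers only need to arrange for Stop().
	return NewScheduler(ctx, mgr, specs, "")
}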
const logsToKeep = 20
func (s *Scheduler) addLog(status JobStatus) {
s.logMx.Lock()
l := s.logs[status.Name]
l = append(l, status)
if len(l) > logsToKeep {
l = l[len(l)-logsToKeep:]
}
s.logs[status.Name] = l
s.logMx.Unlock()
}
// JobStatus represents the status of a job, either scheduled,
// running, or terminated in the past.
type JobStatus struct {
Name string `json:"name"`
Running bool `json:"running"`
Schedule string `json:"schedule"`
Prev time.Time `json:"prev,omitempty"`
Next time.Time `json:"next,omitempty"`
RunAt time.Time `json:"run_at,omitempty"`
Error string `json:"error,omitempty"`
}
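
For illustration only (field values invented): scheduled entries fill Prev/Next as in GetStatus below, while completed entries recorded by addLog would presumably carry RunAt and, on failure, Error:

// Illustrative JobStatus for a failed run of a source called "home".
var _ = JobStatus{
	Name:     "home",
	Schedule: "@daily",
	RunAt:    time.Now(),
	Error:    "repository init failed",
}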
type jobStatusList struct {
list []JobStatus
lessFn func(int, int) bool
}
func (l *jobStatusList) Len() int { return len(l.list) }
func (l *jobStatusList) Swap(i, j int) { l.list[i], l.list[j] = l.list[j], l.list[i] }
func (l *jobStatusList) Less(i, j int) bool { return l.lessFn(i, j) }
func jobStatusListOrderByName(list []JobStatus) *jobStatusList {
return &jobStatusList{
list: list,
lessFn: func(i, j int) bool {
return list[i].Name < list[j].Name
},
}
}
func jobStatusListOrderByRunTime(list []JobStatus) *jobStatusList {
return &jobStatusList{
list: list,
lessFn: func(i, j int) bool {
return list[i].RunAt.After(list[j].RunAt)
},
}
}
// SchedulerStatus holds information about the scheduler state, and
// the past executions.
type SchedulerStatus struct {
Scheduled []JobStatus `json:"scheduled"`
Completed []JobStatus `json:"completed"`
}
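
The commit message mentions status handlers, but none appear in this excerpt; a hypothetical HTTP handler over GetStatus (defined below) might look like this (handler name is invented; assumes net/http and encoding/json):

func schedulerStatusHandler(s *Scheduler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		if err := json.NewEncoder(w).Encode(s.GetStatus()); err != nil {
			log.Printf("error encoding scheduler status: %v", err)
		}
	})
}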
// GetStatus returns the current status of the scheduled jobs.
func (s *Scheduler) GetStatus() *SchedulerStatus {
status := new(SchedulerStatus)
// Get status of scheduled jobs.
for _, entry := range s.c.Entries() {
if jobw, ok := entry.Job.(wrappedJob); ok {
if job, ok := jobw.WrappedJob().(*scheduledBackupJob); ok {
status.Scheduled = append(status.Scheduled, JobStatus{
Name: job.spec.Name,
Running: job.isRunning(),
Schedule: job.spec.Schedule,
Prev: entry.Prev,
Next: entry.Next,
})
}
}
}
// Get (all!) logs of past executions.
s.logMx.Lock()
for _, ll := range s.logs {
status.Completed = append(status.Completed, ll...)