Skip to content
Snippets Groups Projects
  • ale's avatar
    c5ec7eb8
    Add multi-file output · c5ec7eb8
    ale authored
    The output stage can now write to size-limited, rotating WARC files
    using a user-specified pattern, so that output files are always
    unique.
    c5ec7eb8
    History
    Add multi-file output
    ale authored
    The output stage can now write to size-limited, rotating WARC files
    using a user-specified pattern, so that output files are always
    unique.
multi.go 2.03 KiB
package warc

import (
	"bufio"
	"encoding/binary"
	"encoding/hex"
	"fmt"
	"io"
	"os"
	"sync/atomic"
	"time"
)

func newID() string {
	var b [8]byte
	binary.BigEndian.PutUint64(b[:], uint64(time.Now().UnixNano()))
	return hex.EncodeToString(b[:])
}

type meteredWriter struct {
	io.WriteCloser
	bytes uint64
}

func (m *meteredWriter) Write(b []byte) (int, error) {
	n, err := m.WriteCloser.Write(b)
	if n > 0 {
		atomic.AddUint64(&m.bytes, uint64(n))
	}
	return n, err
}

func (m *meteredWriter) Bytes() uint64 {
	return atomic.LoadUint64(&m.bytes)
}

type bufferedWriter struct {
	*bufio.Writer
	io.Closer
}

func newBufferedWriter(w io.WriteCloser) *bufferedWriter {
	return &bufferedWriter{
		Writer: bufio.NewWriter(w),
		Closer: w,
	}
}

func (w *bufferedWriter) Close() error {
	if err := w.Writer.Flush(); err != nil {
		return err
	}
	return w.Closer.Close()
}

func openFile(path string) (*meteredWriter, error) {
	f, err := os.Create(path)
	if err != nil {
		return nil, err
	}
	return &meteredWriter{WriteCloser: newBufferedWriter(f)}, nil
}

// Unsafe for concurrent access.
type multiWriter struct {
	pattern string
	maxSize uint64

	cur *meteredWriter
}

func newMultiWriter(pattern string, maxSize uint64) rawWriter {
	if maxSize == 0 {
		maxSize = 100 * 1024 * 1024
	}
	return &multiWriter{
		pattern: pattern,
		maxSize: maxSize,
	}
}

func (w *multiWriter) newFilename() string {
	return fmt.Sprintf(w.pattern, newID())
}

func (w *multiWriter) NewRecord() (err error) {
	if w.cur == nil || w.cur.Bytes() > w.maxSize {
		if w.cur != nil {
			if err = w.cur.Close(); err != nil {
				return
			}
		}
		w.cur, err = openFile(w.newFilename())
	}
	return
}

func (w *multiWriter) Write(b []byte) (int, error) {
	return w.cur.Write(b)
}

func (w *multiWriter) Close() error {
	return w.cur.Close()
}

type simpleWriter struct {
	*bufferedWriter
}

func newSimpleWriter(w io.WriteCloser) rawWriter {
	return &simpleWriter{newBufferedWriter(w)}
}

func (w *simpleWriter) NewRecord() error {
	return nil
}

type rawWriter interface {
	io.WriteCloser
	NewRecord() error
}