Commit d7c0de28 by ale

Add vendor dependencies

1 parent 7c9a51e5
Pipeline #161 passed
in 11 seconds
The MIT License (MIT)
Copyright (c) [year] [fullname]
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
[![Build Status](https://travis-ci.org/clarkduvall/hyperloglog.svg?branch=master)](https://travis-ci.org/clarkduvall/hyperloglog) [![Coverage Status](https://img.shields.io/coveralls/clarkduvall/hyperloglog.svg)](https://coveralls.io/r/clarkduvall/hyperloglog?branch=master) [![GoDoc](https://godoc.org/github.com/clarkduvall/hyperloglog?status.svg)](http://godoc.org/github.com/clarkduvall/hyperloglog)
# HyperLogLog and HyperLogLog++
Implements the HyperLogLog and HyperLogLog++ algorithms.
HyperLogLog paper: http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf
HyperLogLog++ paper: http://research.google.com/pubs/pub40671.html
## Documentation
Documentation can be found [here](http://godoc.org/github.com/clarkduvall/hyperloglog).
## Comparison of Algorithms
The HyperLogLog++ algorithm has much lower error for small cardinalities because
it stores small sets in a separate, sparse representation of the data.
Data generated using this library shows the difference for N < 10000:
![N < 10000](10000.png)
HyperLogLog++ also applies bias correction, which helps offset estimation errors
in the original HyperLogLog algorithm. The effect of this correction can be seen
here, again with data generated by this library:
![N < 80000](80000.png)
## Future Improvements
- Right now HLL++ uses 8 bits per register. It could use 6 bits and take less
memory.
- The list compression algorithm could be improved, allowing the sparse
representation to be used longer.
package hyperloglog
import "math"
type (
	// Hash32 is satisfied by any value that can report a 32-bit hash of
	// itself through Sum32.
	Hash32 interface {
		Sum32() uint32
	}

	// Hash64 is satisfied by any value that can report a 64-bit hash of
	// itself through Sum64.
	Hash64 interface {
		Sum64() uint64
	}
)
// sortableSlice is a []uint32 carrying the Len/Less/Swap methods needed to
// sort it in ascending numeric order.
type sortableSlice []uint32

// Len reports how many elements the slice holds.
func (s sortableSlice) Len() int {
	return len(s)
}

// Less reports whether the element at index i is smaller than the one at j.
func (s sortableSlice) Less(i, j int) bool {
	return s[i] < s[j]
}

// Swap exchanges the elements at indices i and j.
func (s sortableSlice) Swap(i, j int) {
	tmp := s[i]
	s[i] = s[j]
	s[j] = tmp
}
// set is a collection of distinct uint32 values; members are the map keys
// and are stored with the value true.
type set map[uint32]bool

// Add records i as a member of the set. Adding a value that is already
// present leaves the set unchanged.
func (s set) Add(i uint32) {
	s[i] = true
}
// alpha returns the multiplicative constant used by calculateEstimate for a
// sketch with m registers. Fixed values are used for m of 16, 32 and 64;
// every other m uses the closed-form approximation.
func alpha(m uint32) float64 {
	switch m {
	case 16:
		return 0.673
	case 32:
		return 0.697
	case 64:
		return 0.709
	default:
		return 0.7213 / (1 + 1.079/float64(m))
	}
}
// clzLookup[i] is the number of leading zero bits of i when i (0..15) is
// viewed as a 32-bit value.
var clzLookup = []uint8{
	32, 31, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28,
}

// clz32 returns the number of leading zero bits in x (32 when x is 0).
//
// It locates the highest non-zero nibble of x with three comparisons, then
// finishes with a table lookup on that nibble. The approach is from:
// http://embeddedgurus.com/state-space/2014/09/
// fast-deterministic-and-portable-counting-leading-zeros/
func clz32(x uint32) uint8 {
	// shift ends up as the bit offset (a multiple of 4) of the highest
	// non-zero nibble, or 0 when x < 16.
	var shift uint8
	if x>>16 != 0 {
		shift = 16
	}
	if x>>(shift+8) != 0 {
		shift += 8
	}
	if x>>(shift+4) != 0 {
		shift += 4
	}
	return clzLookup[x>>shift] - shift
}
// clz64 returns the number of leading zero bits in x (64 when x is 0). It
// walks a single-bit probe down from bit 63 until it hits a set bit of x.
func clz64(x uint64) uint8 {
	var zeros uint8
	for probe := uint64(1) << 63; probe != 0; probe >>= 1 {
		if x&probe != 0 {
			break
		}
		zeros++
	}
	return zeros
}
// eb32 extracts a bit field from bits using LSB 0 numbering. The field
// starts at bit lo (included) and is hi-lo bits wide, so bit hi itself is
// not part of the result. The field is returned shifted down to bit 0.
func eb32(bits uint32, hi uint8, lo uint8) uint32 {
	width := hi - lo
	// When width reaches 32 the shift yields 0 and the subtraction wraps to
	// an all-ones mask.
	mask := (uint32(1) << width) - 1
	return (bits >> lo) & mask
}
// eb64 extracts a bit field from bits using LSB 0 numbering. The field
// starts at bit lo (included) and is hi-lo bits wide, so bit hi itself is
// not part of the result. The field is returned shifted down to bit 0.
func eb64(bits uint64, hi uint8, lo uint8) uint64 {
	width := hi - lo
	// When width reaches 64 the shift yields 0 and the subtraction wraps to
	// an all-ones mask.
	mask := (uint64(1) << width) - 1
	return (bits >> lo) & mask
}
// linearCounting returns m * ln(m/v). Callers in this package pass the
// number of registers as m and the number of registers that are still zero
// as v. With v == 0 the result is +Inf.
func linearCounting(m uint32, v uint32) float64 {
	total := float64(m)
	ratio := total / float64(v)
	return total * math.Log(ratio)
}
// countZeros returns how many elements of s are equal to zero.
func countZeros(s []uint8) uint32 {
	var zeros uint32
	for i := 0; i < len(s); i++ {
		if s[i] != 0 {
			continue
		}
		zeros++
	}
	return zeros
}
// calculateEstimate computes the raw cardinality estimate for the register
// slice s: alpha(m) * m^2 divided by the sum of 2^-s[i] over all registers,
// where m is len(s).
func calculateEstimate(s []uint8) float64 {
	var inverseSum float64
	for i := range s {
		inverseSum += 1.0 / float64(uint64(1)<<s[i])
	}

	registers := uint32(len(s))
	size := float64(registers)
	return alpha(registers) * size * size / inverseSum
}
package hyperloglog
// iterable is the contract a list must satisfy to be walked by an iterator.
type iterable interface {
	// decode reads the element that starts at position i, given the
	// previously returned element last, and returns the element together
	// with the position of the following one.
	decode(i int, last uint32) (uint32, int)
	// Len returns the position at which iteration ends.
	Len() int
	// Iter returns an iterator over the list.
	Iter() *iterator
}

// iterator walks an iterable from front to back.
type iterator struct {
	i    int      // position of the next element to decode
	last uint32   // element returned by the most recent Next
	v    iterable // list being walked
}

// Next decodes the element at the current position, advances past it and
// returns it.
func (iter *iterator) Next() uint32 {
	value, pos := iter.v.decode(iter.i, iter.last)
	iter.i, iter.last = pos, value
	return value
}

// Peek returns the element Next would return, without advancing.
func (iter *iterator) Peek() uint32 {
	value, _ := iter.v.decode(iter.i, iter.last)
	return value
}

// HasNext reports whether the current position is still before the end of
// the underlying list.
func (iter iterator) HasNext() bool {
	return iter.v.Len() > iter.i
}
// compressedList stores a sequence of uint32 values as the differences
// between consecutive values, each difference written into a
// variableLengthList.
type compressedList struct {
	Count uint32             // number of values appended so far
	b     variableLengthList // encoded differences
	last  uint32             // most recently appended value
}

// newCompressedList returns an empty compressedList whose byte buffer has
// capacity size.
func newCompressedList(size int) *compressedList {
	return &compressedList{b: make(variableLengthList, 0, size)}
}

// Len returns the length in bytes of the encoded buffer (not the number of
// values; see Count for that).
func (v *compressedList) Len() int {
	return len(v.b)
}

// decode reads the difference stored at byte position i and adds it to
// last, returning the reconstructed value and the position of the next
// entry.
func (v *compressedList) decode(i int, last uint32) (uint32, int) {
	delta, next := v.b.decode(i, last)
	return last + delta, next
}

// Append adds x to the list, storing it as its difference from the
// previously appended value.
func (v *compressedList) Append(x uint32) {
	delta := x - v.last
	v.b = v.b.Append(delta)
	v.last = x
	v.Count++
}

// Iter returns an iterator positioned at the start of the list.
func (v *compressedList) Iter() *iterator {
	return &iterator{i: 0, last: 0, v: v}
}
// variableLengthList is a byte buffer holding uint32 values in a
// variable-length encoding: each value is split into 7-bit groups, least
// significant group first, and every byte except the last of a value has its
// high bit (0x80) set.
type variableLengthList []uint8

// Len returns the length of the buffer in bytes.
func (v variableLengthList) Len() int {
	return len(v)
}

// Iter returns an iterator positioned at the start of the list.
func (v *variableLengthList) Iter() *iterator {
	return &iterator{i: 0, last: 0, v: v}
}

// decode reads the value whose encoding starts at byte position i and
// returns it with the position of the next value. The last argument is not
// used; it is only present to match the iterable interface.
func (v variableLengthList) decode(i int, last uint32) (uint32, int) {
	var (
		value uint32
		shift uint
	)
	pos := i
	// Bytes with the high bit set are continuation bytes.
	for v[pos]&0x80 != 0 {
		value |= uint32(v[pos]&0x7f) << shift
		shift += 7
		pos++
	}
	value |= uint32(v[pos]) << shift
	return value, pos + 1
}

// Append encodes x at the end of the list and returns the extended list.
func (v variableLengthList) Append(x uint32) variableLengthList {
	for ; x >= 0x80; x >>= 7 {
		v = append(v, uint8(x&0x7f)|0x80)
	}
	return append(v, uint8(x&0x7f))
}
// Package hyperloglog implements the HyperLogLog and HyperLogLog++ cardinality
// estimation algorithms.
// These algorithms are used for accurately estimating the cardinality of a
// multiset using constant memory. HyperLogLog++ has multiple improvements over
// HyperLogLog, with a much lower error rate for smaller cardinalities.
//
// HyperLogLog is described here:
// http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf
//
// HyperLogLog++ is described here:
// http://research.google.com/pubs/pub40671.html
package hyperloglog
import (
"bytes"
"encoding/gob"
"errors"
"math"
)
// two32 is 2^32, the size of the 32-bit hash space.
const two32 = 1 << 32

// HyperLogLog is a cardinality estimator fed with 32-bit hashes.
type HyperLogLog struct {
	reg []uint8 // one register per bucket
	m   uint32  // number of registers, 2^p
	p   uint8   // precision: number of hash bits used to pick a register
}

// New returns a new initialized HyperLogLog with 2^precision registers.
// It returns an error when precision is outside the range 4 to 16.
func New(precision uint8) (*HyperLogLog, error) {
	if precision < 4 || precision > 16 {
		return nil, errors.New("precision must be between 4 and 16")
	}

	size := uint32(1) << precision
	return &HyperLogLog{
		reg: make([]uint8, size),
		m:   size,
		p:   precision,
	}, nil
}

// Clear sets HyperLogLog h back to its initial state by replacing its
// registers with a freshly allocated, zeroed slice.
func (h *HyperLogLog) Clear() {
	fresh := make([]uint8, h.m)
	h.reg = fresh
}
// Add adds a new item to HyperLogLog h.
//
// The top p bits of the item's 32-bit hash pick a register. The register is
// raised, if needed, to one more than the number of leading zeros in the
// remaining bits.
func (h *HyperLogLog) Add(item Hash32) {
	hash := item.Sum32()
	idx := eb32(hash, 32, 32-h.p) // {x31,...,x32-p}
	// {x32-p,...,x0}, with a guard bit set just below the remaining hash
	// bits so the zero count stays bounded.
	rest := hash<<h.p | 1<<(h.p-1)
	if rank := clz32(rest) + 1; rank > h.reg[idx] {
		h.reg[idx] = rank
	}
}
// Merge takes another HyperLogLog and combines it with HyperLogLog h by
// keeping, for every register, the larger of the two values. other is not
// modified. An error is returned when the two precisions differ.
func (h *HyperLogLog) Merge(other *HyperLogLog) error {
	if h.p != other.p {
		return errors.New("precisions must be equal")
	}

	for i := range other.reg {
		if h.reg[i] < other.reg[i] {
			h.reg[i] = other.reg[i]
		}
	}
	return nil
}
// Count returns the cardinality estimate.
//
// The raw estimate is used as is in the middle range. When it is at most
// 2.5 times the register count and some register is still zero, linear
// counting is used instead. When it is at least 2^32/30, it is corrected for
// collisions in the 32-bit hash space.
func (h *HyperLogLog) Count() uint64 {
	raw := calculateEstimate(h.reg)

	switch {
	case raw <= float64(h.m)*2.5:
		if zeros := countZeros(h.reg); zeros != 0 {
			return uint64(linearCounting(h.m, zeros))
		}
		return uint64(raw)
	case raw < two32/30:
		return uint64(raw)
	default:
		return uint64(-two32 * math.Log(1-raw/two32))
	}
}
// GobEncode serializes HyperLogLog h as three consecutive gob values: the
// registers, the register count and the precision, in that order.
func (h *HyperLogLog) GobEncode() ([]byte, error) {
	var buf bytes.Buffer
	enc := gob.NewEncoder(&buf)

	for _, field := range []interface{}{h.reg, h.m, h.p} {
		if err := enc.Encode(field); err != nil {
			return nil, err
		}
	}
	return buf.Bytes(), nil
}
// GobDecode restores HyperLogLog h from bytes written by GobEncode, reading
// the registers, the register count and the precision in that order. It
// stops at the first decoding error and returns it.
func (h *HyperLogLog) GobDecode(b []byte) error {
	dec := gob.NewDecoder(bytes.NewBuffer(b))

	for _, target := range []interface{}{&h.reg, &h.m, &h.p} {
		if err := dec.Decode(target); err != nil {
			return err
		}
	}
	return nil
}
package hyperloglog
import (
"bytes"
"encoding/gob"
"errors"
"sort"
)
const (
	// pPrime is the number of hash bits kept as the index in the sparse
	// representation.
	pPrime = 25
	// mPrime is 2^(pPrime-1); Count passes it to linearCounting as the
	// register count while the sketch is sparse.
	mPrime = 1 << (pPrime - 1)
)

// threshold[p-4] is the largest linear-counting result that Count will
// return for a dense sketch of precision p (4 through 18).
var threshold = []uint{
	10, 20, 40, 80, 220, 400, 900, 1800, 3100,
	6500, 11500, 20000, 50000, 120000, 350000,
}
// HyperLogLogPlus is a cardinality estimator fed with 64-bit hashes. It
// starts in a sparse representation (tmpSet plus sparseList) and switches to
// the dense register array reg once the sparse data grows too large.
type HyperLogLogPlus struct {
	reg        []uint8         // dense registers; nil while sparse
	p          uint8           // precision
	m          uint32          // number of dense registers, 2^p
	sparse     bool            // true while the sparse representation is in use
	tmpSet     set             // encoded hashes not yet merged into sparseList
	sparseList *compressedList // sorted, compressed encoded hashes
}
// encodeHash encodes the 64-bit hash x into the 32-bit form used by the
// sparse representation.
//
// The top pPrime bits of x are the index. If the index bits below the top p
// are not all zero, the result is the index shifted left by one (low bit 0).
// Otherwise the leading-zero rank of the remaining 64-pPrime bits is stored
// alongside the index: index<<7 | rank<<1 | 1.
func (h *HyperLogLogPlus) encodeHash(x uint64) uint32 {
	idx := uint32(eb64(x, 64, 64-pPrime))
	if eb64(x, 64-h.p, 64-pPrime) != 0 {
		return idx << 1
	}

	// Move the bits below the index to the top and fill the vacated low
	// bits with ones so the zero count stays bounded.
	tail := eb64(x, 64-pPrime, 0)<<pPrime | (1<<pPrime - 1)
	rank := clz64(tail) + 1
	return idx<<7 | uint32(rank<<1) | 1
}
// getIndex returns the precision-p register index held in the sparse
// encoding k. Where the index sits depends on the low bit of k, which
// distinguishes the two layouts produced by encodeHash.
func (h *HyperLogLogPlus) getIndex(k uint32) uint32 {
	if k&1 == 0 {
		return eb32(k, pPrime+1, pPrime-h.p+1)
	}
	return eb32(k, 32, 32-h.p)
}
// decodeHash unpacks the sparse encoding k into a precision-p register
// index and the rank to store in that register.
func (h *HyperLogLogPlus) decodeHash(k uint32) (uint32, uint8) {
	idx := h.getIndex(k)
	if k&1 == 1 {
		// The rank was stored explicitly in bits 1..6; add the index bits
		// between p and pPrime, which count as leading zeros in this layout.
		return idx, uint8(eb32(k, 7, 1)) + pPrime - h.p
	}
	// Otherwise the rank is determined by the index bits below the top p.
	return idx, clz32(k<<(32-pPrime+h.p-1)) + 1
}
// mergeSparse folds the contents of tmpSet into sparseList. The set's keys
// are sorted and merged with the already sorted list into a new list, with
// values present in both written once. tmpSet is emptied afterwards.
func (h *HyperLogLogPlus) mergeSparse() {
	pending := make(sortableSlice, 0, len(h.tmpSet))
	for k := range h.tmpSet {
		pending = append(pending, k)
	}
	sort.Sort(pending)

	merged := newCompressedList(int(h.m))
	iter := h.sparseList.Iter()
	next := 0
	for iter.HasNext() || next < len(pending) {
		switch {
		case !iter.HasNext():
			// Old list exhausted: drain the pending keys.
			merged.Append(pending[next])
			next++
		case next >= len(pending):
			// Pending keys exhausted: drain the old list.
			merged.Append(iter.Next())
		default:
			existing, added := iter.Peek(), pending[next]
			switch {
			case existing == added:
				merged.Append(iter.Next())
				next++
			case existing > added:
				merged.Append(added)
				next++
			default:
				merged.Append(iter.Next())
			}
		}
	}

	h.sparseList = merged
	h.tmpSet = set{}
}
// NewPlus returns a new initialized HyperLogLogPlus that uses the HyperLogLog++
// algorithm. The sketch starts in the sparse representation. An error is
// returned when precision is outside the range 4 to 18.
func NewPlus(precision uint8) (*HyperLogLogPlus, error) {
	if precision < 4 || precision > 18 {
		return nil, errors.New("precision must be between 4 and 18")
	}

	size := uint32(1) << precision
	return &HyperLogLogPlus{
		p:          precision,
		m:          size,
		sparse:     true,
		tmpSet:     set{},
		sparseList: newCompressedList(int(size)),
	}, nil
}
// Clear sets HyperLogLogPlus h back to its initial state: the dense
// registers are dropped and an empty sparse representation is installed.
func (h *HyperLogLogPlus) Clear() {
	h.reg = nil
	h.sparseList = newCompressedList(int(h.m))
	h.tmpSet = set{}
	h.sparse = true
}
// toNormal converts HyperLogLogPlus h from the sparse representation to the
// normal (dense) one. Pending entries in tmpSet are merged first, then every
// sparse entry is decoded into its register, keeping the largest rank seen.
// The sparse structures are released afterwards.
func (h *HyperLogLogPlus) toNormal() {
	if len(h.tmpSet) != 0 {
		h.mergeSparse()
	}

	h.reg = make([]uint8, h.m)
	iter := h.sparseList.Iter()
	for iter.HasNext() {
		idx, rank := h.decodeHash(iter.Next())
		if rank > h.reg[idx] {
			h.reg[idx] = rank
		}
	}

	h.sparseList = nil
	h.tmpSet = nil
	h.sparse = false
}
// Add adds a new item to HyperLogLogPlus h.
//
// In the dense representation the top p bits of the 64-bit hash pick a
// register, which is raised to one more than the number of leading zeros in
// the remaining bits. In the sparse representation the encoded hash goes
// into tmpSet; once tmpSet holds more than m/100 entries it is merged into
// sparseList, and if that list then exceeds m bytes the sketch becomes
// dense.
func (h *HyperLogLogPlus) Add(item Hash64) {
	x := item.Sum64()

	if !h.sparse {
		idx := eb64(x, 64, 64-h.p)  // {x63,...,x64-p}
		rest := x<<h.p | 1<<(h.p-1) // {x63-p,...,x0}
		if rank := clz64(rest) + 1; rank > h.reg[idx] {
			h.reg[idx] = rank
		}
		return
	}

	h.tmpSet.Add(h.encodeHash(x))
	if uint32(len(h.tmpSet))*100 <= h.m {
		return
	}
	h.mergeSparse()
	if uint32(h.sparseList.Len()) > h.m {
		h.toNormal()
	}
}
// Merge takes another HyperLogLogPlus and combines it with HyperLogLogPlus h.
// If HyperLogLogPlus h is using the sparse representation, it will be converted
// to the normal representation. other is read in whichever representation it
// currently uses. An error is returned when the precisions differ.
func (h *HyperLogLogPlus) Merge(other *HyperLogLogPlus) error {
	if h.p != other.p {
		return errors.New("precisions must be equal")
	}

	if h.sparse {
		h.toNormal()
	}

	if !other.sparse {
		for i := range other.reg {
			if h.reg[i] < other.reg[i] {
				h.reg[i] = other.reg[i]
			}
		}
		return nil
	}

	// absorb decodes one sparse entry of other and raises h's matching
	// register if the entry's rank is larger.
	absorb := func(k uint32) {
		if i, r := other.decodeHash(k); h.reg[i] < r {
			h.reg[i] = r
		}
	}
	for k := range other.tmpSet {
		absorb(k)
	}
	iter := other.sparseList.Iter()
	for iter.HasNext() {
		absorb(iter.Next())
	}
	return nil
}
// estimateBias returns the bias to subtract from the raw estimate est, using
// the empirically determined tables for precision h.p.
//
// rawEstimateData[h.p-4] holds sample raw estimates and biasData[h.p-4] the
// bias measured at each of them. For est between two samples the bias is
// linearly interpolated. For est at or beyond either end of the table the
// bias of the nearest sample is returned.
//
// NOTE(review): the scan assumes the estimate table is sorted in ascending
// order — confirm against the table definitions.
func (h *HyperLogLogPlus) estimateBias(est float64) float64 {
	estTable, biasTable := rawEstimateData[h.p-4], biasData[h.p-4]

	// At or below the first sample, clamp to the first bias. The comparison
	// includes equality so the interpolation below never needs an entry
	// before index 0.
	if est <= estTable[0] {
		return biasTable[0]
	}

	// At or above the last sample, clamp to the last bias. The return value
	// must be a bias, not "estimate minus bias", because the caller
	// subtracts it from est.
	if est >= estTable[len(estTable)-1] {
		return biasTable[len(biasTable)-1]
	}

	// Find the first sample that is not below est. The guards above ensure
	// 1 <= i <= len(estTable)-1.
	i := 1
	for estTable[i] < est {
		i++
	}

	e1, b1 := estTable[i-1], biasTable[i-1]
	e2, b2 := estTable[i], biasTable[i]

	c := (est - e1) / (e2 - e1)
	return b1*(1-c) + b2*c
}
// Count returns the cardinality estimate.
//
// While sparse, pending entries are merged and linear counting is applied
// over mPrime registers using the number of sparse entries. Once dense, the
// raw estimate is bias-corrected when it is at most 5m; if any register is
// still zero and linear counting gives a result no larger than the
// threshold for this precision, the linear-counting result is returned
// instead.
func (h *HyperLogLogPlus) Count() uint64 {
	if h.sparse {
		h.mergeSparse()
		return uint64(linearCounting(mPrime, mPrime-uint32(h.sparseList.Count)))
	}

	raw := calculateEstimate(h.reg)
	if raw <= float64(h.m)*5.0 {
		raw -= h.estimateBias(raw)
	}

	zeros := countZeros(h.reg)
	if zeros == 0 {
		return uint64(raw)
	}
	if lc := linearCounting(h.m, zeros); lc <= float64(threshold[h.p-4]) {
		return uint64(lc)
	}
	return uint64(raw)
}
// GobEncode serializes HyperLogLogPlus h as a sequence of gob values: the
// registers, the register count, the precision and the sparse flag. When
// the sketch is sparse these are followed by tmpSet and the sparse list's
// Count, byte buffer and last value.
func (h *HyperLogLogPlus) GobEncode() ([]byte, error) {
	var buf bytes.Buffer
	enc := gob.NewEncoder(&buf)

	fields := []interface{}{h.reg, h.m, h.p, h.sparse}
	if h.sparse {
		fields = append(fields,
			h.tmpSet,
			h.sparseList.Count,
			h.sparseList.b,
			h.sparseList.last,
		)
	}

	for _, field := range fields {
		if err := enc.Encode(field); err != nil {
			return nil, err
		}
	}
	return buf.Bytes(), nil
}
// GobDecode restores HyperLogLogPlus h from bytes written by GobEncode. The
// registers, register count, precision and sparse flag are always read; the
// sparse structures are read only when the decoded flag is true. It stops at
// the first decoding error and returns it.
func (h *HyperLogLogPlus) GobDecode(b []byte) error {
	dec := gob.NewDecoder(bytes.NewBuffer(b))

	for _, target := range []interface{}{&h.reg, &h.m, &h.p, &h.sparse} {
		if err := dec.Decode(target); err != nil {
			return err
		}
	}
	if !h.sparse {
		return nil
	}

	if err := dec.Decode(&h.tmpSet); err != nil {
		return err
	}
	h.sparseList = &compressedList{}
	list := h.sparseList
	for _, target := range []interface{}{&list.Count, &list.b, &list.last} {
		if err := dec.Decode(target); err != nil {
			return err
		}
	}
	return nil
}
{
"comment": "",
"ignore": "test",
"package": [
{
"checksumSHA1": "hYsU8oEn8ty3NM29nqeKDUOLT+0=",
"path": "github.com/clarkduvall/hyperloglog",
"revision": "97d5a1424b9f48accd787c7195755a003a670967",
"revisionTime": "2016-12-12T23:19:04Z"
}
],
"rootPath": "git.autistici.org/ale/iprep"
}
Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!