Commit 37aff84a authored by ale's avatar ale

updated dependencies

parent 0b69db2f
......@@ -7,7 +7,7 @@
"Deps": [
{
"ImportPath": "github.com/blevesearch/bleve",
"Rev": "1006bf54b4782b0644e66a600c27e80652937d91"
"Rev": "0b171c85da0922a0baf5fd559ec8515dd35cadb8"
},
{
"ImportPath": "github.com/blevesearch/go-porterstemmer",
......@@ -20,8 +20,8 @@
},
{
"ImportPath": "github.com/boltdb/bolt",
"Comment": "v1.1.0-67-g2f846c3",
"Rev": "2f846c3551b76d7710f159be840d66c3d064abbe"
"Comment": "v1.2.0-11-g831b652",
"Rev": "831b652a7f8dbefaf94da0eb66abd46c0c4bcf23"
},
{
"ImportPath": "github.com/golang/protobuf/proto",
......@@ -29,7 +29,7 @@
},
{
"ImportPath": "github.com/golang/snappy",
"Rev": "c2359a1bd0bd4a2de4f1bd92ccd045fb60d0a994"
"Rev": "5f1c01d9f64b941dd9582c638279d046eda6ca31"
},
{
"ImportPath": "github.com/gorilla/context",
......@@ -53,7 +53,7 @@
},
{
"ImportPath": "github.com/syndtr/goleveldb/leveldb",
"Rev": "e7e6f5b5ef25adb580feac515f9ccec514d0bda8"
"Rev": "93fc893f2dadb96ffde441c7546cc67ea290a3a8"
},
{
"ImportPath": "github.com/willf/bitset",
......@@ -66,19 +66,19 @@
},
{
"ImportPath": "golang.org/x/text/encoding",
"Rev": "07b9a78963006a15c538ec5175243979025fa7a8"
"Rev": "1b466db55e0ba5d56ef5315c728216b42f796491"
},
{
"ImportPath": "golang.org/x/text/internal/utf8internal",
"Rev": "07b9a78963006a15c538ec5175243979025fa7a8"
"Rev": "1b466db55e0ba5d56ef5315c728216b42f796491"
},
{
"ImportPath": "golang.org/x/text/runes",
"Rev": "07b9a78963006a15c538ec5175243979025fa7a8"
"Rev": "1b466db55e0ba5d56ef5315c728216b42f796491"
},
{
"ImportPath": "golang.org/x/text/transform",
"Rev": "07b9a78963006a15c538ec5175243979025fa7a8"
"Rev": "1b466db55e0ba5d56ef5315c728216b42f796491"
}
]
}
......@@ -4,6 +4,8 @@
.#*
.project
.settings
**/.idea/
**/*.iml
.DS_Store
/analysis/token_filters/cld2/cld2-read-only
/analysis/token_filters/cld2/libcld2_full.a
......
# ![bleve](docs/bleve.png) bleve
[![Build Status](https://travis-ci.org/blevesearch/bleve.svg?branch=master)](https://travis-ci.org/blevesearch/bleve) [![Coverage Status](https://coveralls.io/repos/blevesearch/bleve/badge.png?branch=master)](https://coveralls.io/r/blevesearch/bleve?branch=master) [![GoDoc](https://godoc.org/github.com/blevesearch/bleve?status.svg)](https://godoc.org/github.com/blevesearch/bleve) [![Join the chat at https://gitter.im/blevesearch/bleve](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/blevesearch/bleve?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
[![Build Status](https://travis-ci.org/blevesearch/bleve.svg?branch=master)](https://travis-ci.org/blevesearch/bleve) [![Coverage Status](https://coveralls.io/repos/blevesearch/bleve/badge.png?branch=master)](https://coveralls.io/r/blevesearch/bleve?branch=master) [![GoDoc](https://godoc.org/github.com/blevesearch/bleve?status.svg)](https://godoc.org/github.com/blevesearch/bleve) [![Join the chat at https://gitter.im/blevesearch/bleve](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/blevesearch/bleve?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)[![codebeat](https://codebeat.co/badges/38a7cbc9-9cf5-41c0-a315-0746178230f4)](https://codebeat.co/projects/github-com-blevesearch-bleve)
modern text indexing in go - [blevesearch.com](http://www.blevesearch.com/)
......
......@@ -12,14 +12,14 @@ package simple_analyzer
import (
"git.autistici.org/ale/liber/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"git.autistici.org/ale/liber/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"git.autistici.org/ale/liber/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/unicode"
"git.autistici.org/ale/liber/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/letter"
"git.autistici.org/ale/liber/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
const Name = "simple"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
tokenizer, err := cache.TokenizerNamed(unicode.Name)
tokenizer, err := cache.TokenizerNamed(letter.Name)
if err != nil {
return nil, err
}
......
......@@ -7,6 +7,13 @@
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// Package en implements an analyzer with reasonable defaults for processing
// English text.
//
// It strips possessive suffixes ('s), transforms tokens to lower case,
// removes stopwords from a built-in list, and applies porter stemming.
//
// The built-in stopwords list is defined in EnglishStopWords.
package en
import (
......
......@@ -16,6 +16,8 @@ import (
"git.autistici.org/ale/liber/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// PossessiveName is the name PossessiveFilter is registered as
// in the bleve registry.
const PossessiveName = "possessive_en"
const rightSingleQuotationMark = '’'
......@@ -24,6 +26,11 @@ const fullWidthApostrophe = '''
const apostropheChars = rightSingleQuotationMark + apostrophe + fullWidthApostrophe
// PossessiveFilter implements a TokenFilter which
// strips the English possessive suffix ('s) from tokens.
// It handles a variety of apostrophe types, is case-insensitive,
// and doesn't distinguish between possessives and contractions
// (e.g. "She's So Rad" becomes "She So Rad").
type PossessiveFilter struct {
}
......
......@@ -7,10 +7,11 @@ import (
const StopName = "stop_en"
// EnglishStopWords is the built-in list of stopwords used by the "stop_en" TokenFilter.
//
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
// ` was changed to ' to allow for literal string
var EnglishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/english/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
......
......@@ -7,6 +7,8 @@
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// Package lower_case_filter implements a TokenFilter which converts
// tokens to lower case according to unicode rules.
package lower_case_filter
import (
......@@ -18,6 +20,7 @@ import (
"git.autistici.org/ale/liber/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// Name is the name used to register LowerCaseFilter in the bleve registry
const Name = "to_lower"
type LowerCaseFilter struct {
......
......@@ -23,6 +23,9 @@ func NewTokenMap() TokenMap {
return make(TokenMap, 0)
}
// LoadFile reads in a list of tokens from a text file,
// one per line.
// Comments are supported using `#` or `|`
func (t TokenMap) LoadFile(filename string) error {
data, err := ioutil.ReadFile(filename)
if err != nil {
......@@ -31,6 +34,9 @@ func (t TokenMap) LoadFile(filename string) error {
return t.LoadBytes(data)
}
// LoadBytes reads in a list of tokens from memory,
// one per line.
// Comments are supported using `#` or `|`
func (t TokenMap) LoadBytes(data []byte) error {
bytesReader := bytes.NewReader(data)
bufioReader := bufio.NewReader(bytesReader)
......
// Copyright (c) 2014 Couchbase, Inc.
// Copyright (c) 2016 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
......@@ -7,77 +7,65 @@
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ja
package character
import (
"git.autistici.org/ale/liber/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"git.autistici.org/ale/liber/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
"unicode/utf8"
"github.com/ikawaha/kagome/tokenizer"
"git.autistici.org/ale/liber/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
)
const TokenizerName = "kagome"
type KagomeMorphTokenizer struct {
tok tokenizer.Tokenizer
}
func init() {
_ = tokenizer.SysDic() // prepare system dictionary
}
type IsTokenRune func(r rune) bool
func NewKagomeMorphTokenizer() *KagomeMorphTokenizer {
return &KagomeMorphTokenizer{
tok: tokenizer.New(),
}
type CharacterTokenizer struct {
isTokenRun IsTokenRune
}
func NewKagomeMorphTokenizerWithUserDic(userdic tokenizer.UserDic) *KagomeMorphTokenizer {
k := tokenizer.New()
k.SetUserDic(userdic)
return &KagomeMorphTokenizer{
tok: k,
func NewCharacterTokenizer(f IsTokenRune) *CharacterTokenizer {
return &CharacterTokenizer{
isTokenRun: f,
}
}
func (t *KagomeMorphTokenizer) Tokenize(input []byte) analysis.TokenStream {
var (
morphs []tokenizer.Token
prevstart int
)
rv := make(analysis.TokenStream, 0, len(input))
if len(input) < 1 {
return rv
}
morphs = t.tok.Analyze(string(input), tokenizer.Search)
for i, m := range morphs {
if m.Surface == "EOS" || m.Surface == "BOS" {
continue
}
surfacelen := len(m.Surface)
token := &analysis.Token{
Term: []byte(m.Surface),
Position: i,
Start: prevstart,
End: prevstart + surfacelen,
Type: analysis.Ideographic,
func (c *CharacterTokenizer) Tokenize(input []byte) analysis.TokenStream {
rv := make(analysis.TokenStream, 0, 1024)
offset := 0
start := 0
end := 0
count := 0
for currRune, size := utf8.DecodeRune(input[offset:]); currRune != utf8.RuneError; currRune, size = utf8.DecodeRune(input[offset:]) {
isToken := c.isTokenRun(currRune)
if isToken {
end = offset + size
} else {
if end-start > 0 {
// build token
rv = append(rv, &analysis.Token{
Term: input[start:end],
Start: start,
End: end,
Position: count + 1,
Type: analysis.AlphaNumeric,
})
count++
}
start = offset + size
end = start
}
prevstart = prevstart + surfacelen
rv = append(rv, token)
offset += size
}
// if we ended in the middle of a token, finish it
if end-start > 0 {
// build token
rv = append(rv, &analysis.Token{
Term: input[start:end],
Start: start,
End: end,
Position: count + 1,
Type: analysis.AlphaNumeric,
})
}
return rv
}
func KagomeMorphTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
return NewKagomeMorphTokenizer(), nil
}
func init() {
registry.RegisterTokenizer(TokenizerName, KagomeMorphTokenizerConstructor)
}
// Copyright (c) 2014 Couchbase, Inc.
// Copyright (c) 2016 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
......@@ -7,31 +7,22 @@
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ja
package letter
import (
"unicode"
"git.autistici.org/ale/liber/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"git.autistici.org/ale/liber/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/unicode_normalize"
"git.autistici.org/ale/liber/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/character"
"git.autistici.org/ale/liber/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
const AnalyzerName = "ja"
const Name = "letter"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
kagomeTokenizer, err := cache.TokenizerNamed(TokenizerName)
if err != nil {
return nil, err
}
normalizeFilter := unicode_normalize.MustNewUnicodeNormalizeFilter(unicode_normalize.NFKD)
rv := analysis.Analyzer{
Tokenizer: kagomeTokenizer,
TokenFilters: []analysis.TokenFilter{
normalizeFilter,
},
}
return &rv, nil
// TokenizerConstructor builds the "letter" tokenizer: a character
// tokenizer that emits maximal runs of letter runes (unicode.IsLetter)
// as tokens. The config and cache arguments are unused.
func TokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
return character.NewCharacterTokenizer(unicode.IsLetter), nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
registry.RegisterTokenizer(Name, TokenizerConstructor)
}
......@@ -88,6 +88,7 @@ import (
_ "git.autistici.org/ale/liber/Godeps/_workspace/src/github.com/blevesearch/bleve/index/store/boltdb"
_ "git.autistici.org/ale/liber/Godeps/_workspace/src/github.com/blevesearch/bleve/index/store/goleveldb"
_ "git.autistici.org/ale/liber/Godeps/_workspace/src/github.com/blevesearch/bleve/index/store/gtreap"
_ "git.autistici.org/ale/liber/Godeps/_workspace/src/github.com/blevesearch/bleve/index/store/moss"
// index types
_ "git.autistici.org/ale/liber/Godeps/_workspace/src/github.com/blevesearch/bleve/index/firestorm"
......
......@@ -12,5 +12,5 @@
package config
import (
_ "git.autistici.org/ale/liber/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ja"
_ "github.com/blevesearch/blevex/lang/ja"
)
......@@ -55,3 +55,18 @@ func (d *Document) GoString() string {
}
return fmt.Sprintf("&document.Document{ID:%s, Fields: %s, CompositeFields: %s}", d.ID, fields, compositeFields)
}
// NumPlainTextBytes returns the total number of plain text bytes this
// document represents: the sum of NumPlainTextBytes over all plain
// fields, plus each field's bytes counted once more for every composite
// field that includes it.
func (d *Document) NumPlainTextBytes() uint64 {
	var total uint64
	for _, f := range d.Fields {
		total += f.NumPlainTextBytes()
	}
	// Composite fields carry no text of their own; attribute to each
	// composite the bytes of every plain field it includes.
	for _, cf := range d.CompositeFields {
		for _, f := range d.Fields {
			if cf.includesField(f.Name()) {
				total += f.NumPlainTextBytes()
			}
		}
	}
	return total
}
......@@ -26,4 +26,9 @@ type Field interface {
Options() IndexingOptions
Analyze() (int, analysis.TokenFrequencies)
Value() []byte
// NumPlainTextBytes should return the number of plain text bytes
// that this field represents - this is a common metric for tracking
// the rate of indexing
NumPlainTextBytes() uint64
}
......@@ -18,10 +18,11 @@ import (
const DefaultBooleanIndexingOptions = StoreField | IndexField
type BooleanField struct {
name string
arrayPositions []uint64
options IndexingOptions
value []byte
name string
arrayPositions []uint64
options IndexingOptions
value []byte
numPlainTextBytes uint64
}
func (b *BooleanField) Name() string {
......@@ -66,12 +67,17 @@ func (b *BooleanField) GoString() string {
return fmt.Sprintf("&document.BooleanField{Name:%s, Options: %s, Value: %s}", b.name, b.options, b.value)
}
// NumPlainTextBytes returns the number of plain text bytes this field
// represents, recorded when the field was constructed.
func (b *BooleanField) NumPlainTextBytes() uint64 {
return b.numPlainTextBytes
}
func NewBooleanFieldFromBytes(name string, arrayPositions []uint64, value []byte) *BooleanField {
return &BooleanField{
name: name,
arrayPositions: arrayPositions,
value: value,
options: DefaultNumericIndexingOptions,
name: name,
arrayPositions: arrayPositions,
value: value,
options: DefaultNumericIndexingOptions,
numPlainTextBytes: uint64(len(value)),
}
}
......@@ -80,14 +86,17 @@ func NewBooleanField(name string, arrayPositions []uint64, b bool) *BooleanField
}
func NewBooleanFieldWithIndexingOptions(name string, arrayPositions []uint64, b bool, options IndexingOptions) *BooleanField {
numPlainTextBytes := 5
v := []byte("F")
if b {
numPlainTextBytes = 4
v = []byte("T")
}
return &BooleanField{
name: name,
arrayPositions: arrayPositions,
value: v,
options: options,
name: name,
arrayPositions: arrayPositions,
value: v,
options: options,
numPlainTextBytes: uint64(numPlainTextBytes),
}
}
......@@ -69,7 +69,11 @@ func (c *CompositeField) Value() []byte {
return []byte{}
}
func (c *CompositeField) Compose(field string, length int, freq analysis.TokenFrequencies) {
// NumPlainTextBytes always returns 0: a composite field has no plain
// text of its own — its content is composed from other fields.
func (c *CompositeField) NumPlainTextBytes() uint64 {
return 0
}
func (c *CompositeField) includesField(field string) bool {
shouldInclude := c.defaultInclude
_, fieldShouldBeIncluded := c.includedFields[field]
if fieldShouldBeIncluded {
......@@ -79,8 +83,11 @@ func (c *CompositeField) Compose(field string, length int, freq analysis.TokenFr
if fieldShouldBeExcluded {
shouldInclude = false
}
return shouldInclude
}
if shouldInclude {
func (c *CompositeField) Compose(field string, length int, freq analysis.TokenFrequencies) {
if c.includesField(field) {
c.totalLength += length
c.compositeFrequencies.MergeAll(field, freq)
}
......
......@@ -25,10 +25,11 @@ var MinTimeRepresentable = time.Unix(0, math.MinInt64)
var MaxTimeRepresentable = time.Unix(0, math.MaxInt64)
type DateTimeField struct {
name string
arrayPositions []uint64
options IndexingOptions
value numeric_util.PrefixCoded
name string
arrayPositions []uint64
options IndexingOptions
value numeric_util.PrefixCoded
numPlainTextBytes uint64
}
func (n *DateTimeField) Name() string {
......@@ -95,12 +96,17 @@ func (n *DateTimeField) GoString() string {
return fmt.Sprintf("&document.DateField{Name:%s, Options: %s, Value: %s}", n.name, n.options, n.value)
}
// NumPlainTextBytes returns the number of plain text bytes this field
// represents, recorded when the field was constructed.
func (n *DateTimeField) NumPlainTextBytes() uint64 {
return n.numPlainTextBytes
}
func NewDateTimeFieldFromBytes(name string, arrayPositions []uint64, value []byte) *DateTimeField {
return &DateTimeField{
name: name,
arrayPositions: arrayPositions,
value: value,
options: DefaultDateTimeIndexingOptions,
name: name,
arrayPositions: arrayPositions,
value: value,
options: DefaultDateTimeIndexingOptions,
numPlainTextBytes: uint64(len(value)),
}
}
......@@ -117,6 +123,9 @@ func NewDateTimeFieldWithIndexingOptions(name string, arrayPositions []uint64, d
arrayPositions: arrayPositions,
value: prefixCoded,
options: options,
// not correct, just a place holder until we revisit how fields are
// represented and can fix this better
numPlainTextBytes: uint64(8),
}, nil
}
return nil, fmt.Errorf("cannot represent %s in this type", dt)
......
......@@ -21,10 +21,11 @@ const DefaultNumericIndexingOptions = StoreField | IndexField
const DefaultPrecisionStep uint = 4
type NumericField struct {
name string
arrayPositions []uint64
options IndexingOptions
value numeric_util.PrefixCoded
name string
arrayPositions []uint64
options IndexingOptions
value numeric_util.PrefixCoded
numPlainTextBytes uint64
}
func (n *NumericField) Name() string {
......@@ -91,12 +92,17 @@ func (n *NumericField) GoString() string {
return fmt.Sprintf("&document.NumericField{Name:%s, Options: %s, Value: %s}", n.name, n.options, n.value)
}
// NumPlainTextBytes returns the number of plain text bytes this field
// represents, recorded when the field was constructed.
func (n *NumericField) NumPlainTextBytes() uint64 {
return n.numPlainTextBytes
}
func NewNumericFieldFromBytes(name string, arrayPositions []uint64, value []byte) *NumericField {
return &NumericField{
name: name,
arrayPositions: arrayPositions,
value: value,
options: DefaultNumericIndexingOptions,
name: name,
arrayPositions: arrayPositions,
value: value,
options: DefaultNumericIndexingOptions,
numPlainTextBytes: uint64(len(value)),
}
}
......@@ -112,5 +118,8 @@ func NewNumericFieldWithIndexingOptions(name string, arrayPositions []uint64, nu
arrayPositions: arrayPositions,
value: prefixCoded,
options: options,
// not correct, just a place holder until we revisit how fields are
// represented and can fix this better
numPlainTextBytes: uint64(8),
}
}
......@@ -18,11 +18,12 @@ import (
const DefaultTextIndexingOptions = IndexField
type TextField struct {
name string
arrayPositions []uint64
options IndexingOptions
analyzer *analysis.Analyzer
value []byte
name string
arrayPositions []uint64
options IndexingOptions
analyzer *analysis.Analyzer
value []byte
numPlainTextBytes uint64
}
func (t *TextField) Name() string {
......@@ -72,35 +73,42 @@ func (t *TextField) GoString() string {
return fmt.Sprintf("&document.TextField{Name:%s, Options: %s, Analyzer: %v, Value: %s, ArrayPositions: %v}", t.name, t.options, t.analyzer, t.value, t.arrayPositions)
}
// NumPlainTextBytes returns the number of plain text bytes this field
// represents, recorded when the field was constructed.
func (t *TextField) NumPlainTextBytes() uint64 {
return t.numPlainTextBytes
}
// NewTextField returns a TextField over value using
// DefaultTextIndexingOptions.
func NewTextField(name string, arrayPositions []uint64, value []byte) *TextField {
return NewTextFieldWithIndexingOptions(name, arrayPositions, value, DefaultTextIndexingOptions)
}
func NewTextFieldWithIndexingOptions(name string, arrayPositions []uint64, value []byte, options IndexingOptions) *TextField {
return &TextField{
name: name,
arrayPositions: arrayPositions,
options: options,
value: value,
name: name,
arrayPositions: arrayPositions,
options: options,
value: value,
numPlainTextBytes: uint64(len(value)),
}
}
func NewTextFieldWithAnalyzer(name string, arrayPositions []uint64, value []byte, analyzer *analysis.Analyzer) *TextField {
return &TextField{
name: name,
arrayPositions: arrayPositions,
options: DefaultTextIndexingOptions,
analyzer: analyzer,
value: value,
name: name,