Skip to content
Snippets Groups Projects
Commit 1050328b authored by ale's avatar ale
Browse files

Merge branch 'renovate/github.com-puerkitobio-purell-1.x' into 'master'

Update module github.com/PuerkitoBio/purell to v1

See merge request !6
parents bd810938 55b2a516
No related branches found
No related tags found
1 merge request!6Update module github.com/PuerkitoBio/purell to v1
Pipeline #42517 passed
Showing
with 12715 additions and 0 deletions
// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package idna
// appendMapping appends to b the mapping of the rune whose UTF-8 encoding is s
// and returns the extended slice. isMapped must be true. A mapping is a
// categorization of a rune as defined in UTS #46.
//
// Depending on the bits of c the mapping is either copied from the mappings
// table or derived from s by XOR-ing its trailing bytes with a pattern.
func (c info) appendMapping(b []byte, s string) []byte {
	idx := int(c >> indexShift)

	// Without the xor bit, idx addresses a length-prefixed entry in mappings:
	// the first byte is the length, the mapped bytes follow.
	if c&xorBit == 0 {
		entry := mappings[idx:]
		return append(b, entry[1:entry[0]+1]...)
	}

	// XOR mappings start from a copy of the input encoding.
	b = append(b, s...)
	end := len(b)

	if c&inlineXOR == inlineXOR {
		// The index bits themselves are the mask for the last byte.
		// TODO: support and handle two-byte inline masks
		b[end-1] ^= byte(idx)
		return b
	}

	// xorData[idx] is the pattern length; the pattern bytes follow it and are
	// applied to the last bytes of b.
	start := end - int(xorData[idx])
	for p := start; p < end; p++ {
		idx++
		b[p] ^= xorData[idx]
	}
	return b
}
// Sparse block handling code.
//
// valueRange is one entry of a sparse block. A block starts with a header
// entry followed by range entries; sparseBlocks.lookup interprets the fields
// differently for the two kinds of entry.
type valueRange struct {
// value is the base value of a range entry; in a header entry it holds the
// stride (header: value:stride).
value uint16
// lo and hi are the inclusive byte bounds of a range entry; in a header entry
// lo holds the number of range entries that follow (header: lo:n).
lo, hi byte
}
// sparseBlocks is a collection of sparse blocks stored back to back.
type sparseBlocks struct {
// values holds the header and range entries of all blocks.
values []valueRange
// offset maps a block number to the index in values of that block's header.
offset []uint16
}
// idnaSparse exposes the idnaSparseValues and idnaSparseOffset tables as a
// sparseBlocks value so they can be queried with lookup.
var idnaSparse = sparseBlocks{
values: idnaSparseValues[:],
offset: idnaSparseOffset[:],
}
// trie is the package-wide IDNA trie value.
// Don't use newIdnaTrie to avoid unconditional linking in of the table.
var trie = &idnaTrie{}
// lookup returns the value stored for byte b in sparse block n.
//
// The block starts at t.values[t.offset[n]] with a header entry whose lo field
// is the number of range entries that follow and whose value field is the
// stride. The range entries are binary-searched for one containing b. Given a
// matching range r, the value for b is given by r.value + (b - r.lo) * stride.
// If no range contains b, lookup returns 0.
func (t *sparseBlocks) lookup(n uint32, b byte) uint16 {
	start := t.offset[n]
	hdr := t.values[start]
	stride := hdr.value

	low := start + 1
	high := low + uint16(hdr.lo)
	for low < high {
		mid := low + (high-low)/2
		switch rng := t.values[mid]; {
		case b < rng.lo:
			high = mid
		case b > rng.hi:
			low = mid + 1
		default:
			return rng.value + uint16(b-rng.lo)*stride
		}
	}
	return 0
}
// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
package idna
// This file contains definitions for interpreting the trie value of the idna
// trie generated by "go run gen*.go". It is shared by both the generator
// program and the resultant package. Sharing is achieved by the generator
// copying gen_trieval.go to trieval.go and changing what's above this comment.
// info holds information from the IDNA mapping table for a single rune. It is
// the value returned by a trie lookup. In most cases, all information fits in
// a 16-bit value. For mappings, this value may contain an index into a slice
// with the mapped string. Such mappings can consist of the actual mapped value
// or an XOR pattern to be applied to the bytes of the UTF8 encoding of the
// input rune. This technique is used by the cases packages and reduces the
// table size significantly.
//
// The per-rune values have the following format:
//
//	if mapped {
//	  if inlinedXOR {
//	    15..13 inline XOR marker
//	    12..11 unused
//	    10..3  inline XOR mask
//	  } else {
//	    15..3  index into xor or mapping table
//	  }
//	} else {
//	    15..14 unused
//	    13     mayNeedNorm
//	    12..11 attributes
//	    10..8  joining type
//	     7..3  category type
//	}
//	   2  use xor pattern
//	1..0  mapped category
//
// See the definitions below for a more detailed description of the various
// bits.
type info uint16
// Masks and shifts used to decode an info value.
const (
// catSmallMask selects bits 1..0, the mapped category; non-zero means mapped.
catSmallMask = 0x3
// catBigMask selects bits 7..3, the category of an unmapped rune.
catBigMask = 0xF8
// indexShift is the right shift that extracts the index (or inline mask).
indexShift = 3
xorBit = 0x4 // interpret the index as an xor pattern
inlineXOR = 0xE000 // These bits are set if the XOR pattern is inlined.
// joinShift and joinMask extract the joining type: (c >> joinShift) & joinMask.
joinShift = 8
joinMask = 0x07
// Attributes
// attributesMask selects bits 12..11.
attributesMask = 0x1800
// viramaModifier has both attribute bits set.
viramaModifier = 0x1800
// modifier is bit 12; note that viramaModifier includes this bit.
modifier = 0x1000
// rtl is bit 11. NOTE(review): presumably marks right-to-left runes; it is
// not referenced in the visible code.
rtl = 0x0800
// mayNeedNorm is bit 13 of an unmapped value.
mayNeedNorm = 0x2000
)
// A category corresponds to a category defined in the IDNA mapping table.
// Its value is what info.category extracts from an info value.
type category uint16
// Categories that fit in the two low bits of an info value (catSmallMask).
// The non-zero values are the ones for which info.isMapped reports true.
const (
unknown category = 0 // not currently defined in unicode.
mapped category = 1
disallowedSTD3Mapped category = 2
deviation category = 3
)
// Categories stored in bits 7..3 of an info value (catBigMask). info.category
// returns one of these only when the two low bits are zero.
const (
valid category = 0x08
validNV8 category = 0x18
validXV8 category = 0x28
disallowed category = 0x40
disallowedSTD3Valid category = 0x80
ignored category = 0xC0
)
// join types and additional rune information
//
// The values start at 1, so 0 means "no join type"; info.joinType returns 0
// for mapped runes.
const (
joiningL = (iota + 1)
joiningD
joiningT
joiningR
//the following types are derived during processing
joinZWJ
joinZWNJ
joinVirama
// numJoinTypes is one more than the largest join type value.
numJoinTypes
)
// isMapped reports whether either of the two low bits of c (the mapped
// category) is set.
func (c info) isMapped() bool {
	const mappedCategoryBits = 0x3
	return c&mappedCategoryBits != 0
}
// category returns the category encoded in c: the mapped category from the
// low bits when it is non-zero, otherwise the category stored in the
// catBigMask bits.
func (c info) category() category {
	if lowBits := c & catSmallMask; lowBits != 0 {
		return category(lowBits)
	}
	return category(c & catBigMask)
}
// joinType returns the joining type bits of c, or 0 when c is a mapped value
// (mapped values use those bits for the index instead).
func (c info) joinType() info {
	if !c.isMapped() {
		return (c >> joinShift) & joinMask
	}
	return 0
}
// isModifier reports whether c is an unmapped value (low bits zero) with the
// modifier bit set.
func (c info) isModifier() bool {
	const relevantBits = modifier | catSmallMask
	return c&relevantBits == modifier
}
// isViramaModifier reports whether c is an unmapped value (low bits zero)
// whose attribute bits equal viramaModifier.
func (c info) isViramaModifier() bool {
	const relevantBits = attributesMask | catSmallMask
	return c&relevantBits == viramaModifier
}
# This source code refers to The Go Authors for copyright purposes.
# The master list of authors is in the main Go distribution,
# visible at http://tip.golang.org/AUTHORS.
# This source code was written by the Go contributors.
# The master list of contributors is in the main Go distribution,
# visible at http://tip.golang.org/CONTRIBUTORS.
Copyright (c) 2009 The Go Authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Additional IP Rights Grant (Patents)
"This implementation" means the copyrightable works distributed by
Google as part of the Go project.
Google hereby grants to You a perpetual, worldwide, non-exclusive,
no-charge, royalty-free, irrevocable (except as stated in this section)
patent license to make, have made, use, offer to sell, sell, import,
transfer and otherwise run, modify and propagate the contents of this
implementation of Go, where such license applies only to those patent
claims, both currently owned or controlled by Google and acquired in
the future, licensable by Google that are necessarily infringed by this
implementation of Go. This grant does not include claims that would be
infringed only as a consequence of further modification of this
implementation. If you or your agent or exclusive licensee institute or
order or agree to the institution of patent litigation against any
entity (including a cross-claim or counterclaim in a lawsuit) alleging
that this implementation of Go or any code incorporated within this
implementation of Go constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any patent
rights granted to you under this License for this implementation of Go
shall terminate as of the date such litigation is filed.
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package bidirule implements the Bidi Rule defined by RFC 5893.
//
// This package is under development. The API may change without notice and
// without preserving backward compatibility.
package bidirule
import (
"errors"
"unicode/utf8"
"golang.org/x/text/transform"
"golang.org/x/text/unicode/bidi"
)
// This file contains an implementation of RFC 5893: Right-to-Left Scripts for
// Internationalized Domain Names for Applications (IDNA)
//
// A label is an individual component of a domain name. Labels are usually
// shown separated by dots; for example, the domain name "www.example.com" is
// composed of three labels: "www", "example", and "com".
//
// An RTL label is a label that contains at least one character of class R, AL,
// or AN. An LTR label is any label that is not an RTL label.
//
// A "Bidi domain name" is a domain name that contains at least one RTL label.
//
// The following guarantees can be made based on the above:
//
// o In a domain name consisting of only labels that satisfy the rule,
// the requirements of Section 3 are satisfied. Note that even LTR
// labels and pure ASCII labels have to be tested.
//
// o In a domain name consisting of only LDH labels (as defined in the
// Definitions document [RFC5890]) and labels that satisfy the rule,
// the requirements of Section 3 are satisfied as long as a label
// that starts with an ASCII digit does not come after a
// right-to-left label.
//
// No guarantee is given for other combinations.
// ErrInvalid indicates a label is invalid according to the Bidi Rule. It is
// the error reported by Span, and passed on by Transform, for input that does
// not conform.
var ErrInvalid = errors.New("bidirule: failed Bidi Rule")
// ruleState is a state of the state machine that checks a label against the
// Bidi Rule. The transitions table is indexed by ruleState.
type ruleState uint8
const (
// ruleInitial is the zero value: no character has been processed yet.
ruleInitial ruleState = iota
// ruleLTR: inside an LTR label, after a character that may not end it.
ruleLTR
// ruleLTRFinal: inside an LTR label, after a character that may end it.
ruleLTRFinal
// ruleRTL: inside an RTL label, after a character that may not end it.
ruleRTL
// ruleRTLFinal: inside an RTL label, after a character that may end it.
ruleRTLFinal
// ruleInvalid: a character was seen that no transition allows.
ruleInvalid
)
// ruleTransition is one candidate transition of the state machine.
type ruleTransition struct {
// next is the state entered when the transition is taken.
next ruleState
// mask is a bit set of bidi classes (bit 1<<class) that trigger the transition.
mask uint16
}
// transitions lists, for every ruleState, two candidate transitions. When a
// character is processed its class bit is tested against the mask of the first
// candidate, then the second; if neither matches the state becomes
// ruleInvalid (see advance and advanceString).
var transitions = [...][2]ruleTransition{
// [2.1] The first character must be a character with Bidi property L, R, or
// AL. If it has the R or AL property, it is an RTL label; if it has the L
// property, it is an LTR label.
ruleInitial: {
{ruleLTRFinal, 1 << bidi.L},
{ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL},
},
ruleRTL: {
// [2.3] In an RTL label, the end of the label must be a character with
// Bidi property R, AL, EN, or AN, followed by zero or more characters
// with Bidi property NSM.
{ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL | 1<<bidi.EN | 1<<bidi.AN},
// [2.2] In an RTL label, only characters with the Bidi properties R,
// AL, AN, EN, ES, CS, ET, ON, BN, or NSM are allowed.
// We exclude the entries from [2.3]
{ruleRTL, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN | 1<<bidi.NSM},
},
ruleRTLFinal: {
// [2.3] In an RTL label, the end of the label must be a character with
// Bidi property R, AL, EN, or AN, followed by zero or more characters
// with Bidi property NSM.
{ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL | 1<<bidi.EN | 1<<bidi.AN | 1<<bidi.NSM},
// [2.2] In an RTL label, only characters with the Bidi properties R,
// AL, AN, EN, ES, CS, ET, ON, BN, or NSM are allowed.
// We exclude the entries from [2.3] and NSM.
{ruleRTL, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN},
},
ruleLTR: {
// [2.6] In an LTR label, the end of the label must be a character with
// Bidi property L or EN, followed by zero or more characters with Bidi
// property NSM.
{ruleLTRFinal, 1<<bidi.L | 1<<bidi.EN},
// [2.5] In an LTR label, only characters with the Bidi properties L,
// EN, ES, CS, ET, ON, BN, or NSM are allowed.
// We exclude the entries from [2.6].
{ruleLTR, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN | 1<<bidi.NSM},
},
ruleLTRFinal: {
// [2.6] In an LTR label, the end of the label must be a character with
// Bidi property L or EN, followed by zero or more characters with Bidi
// property NSM.
{ruleLTRFinal, 1<<bidi.L | 1<<bidi.EN | 1<<bidi.NSM},
// [2.5] In an LTR label, only characters with the Bidi properties L,
// EN, ES, CS, ET, ON, BN, or NSM are allowed.
// We exclude the entries from [2.6].
{ruleLTR, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN},
},
// Once invalid, always invalid: the empty masks never match.
ruleInvalid: {
{ruleInvalid, 0},
{ruleInvalid, 0},
},
}
// exclusiveRTL is the set of class bits that may not both be present:
// [2.4] In an RTL label, if an EN is present, no AN may be present, and
// vice versa. advance and advanceString reject the input as soon as both
// bits have been seen.
const exclusiveRTL = uint16(1<<bidi.EN | 1<<bidi.AN)
// From RFC 5893
// An RTL label is a label that contains at least one character of type
// R, AL, or AN.
//
// An LTR label is any label that is not an RTL label.
// Direction reports the direction of the given label as defined by RFC 5893.
// The Bidi Rule does not have to be applied to labels of the category
// LeftToRight.
//
// It returns bidi.RightToLeft as soon as a character of class R, AL or AN is
// found and bidi.LeftToRight otherwise. Bytes for which bidi.Lookup reports a
// size of 0 are skipped one at a time without being classified, exactly as
// DirectionString does.
func Direction(b []byte) bidi.Direction {
	for i := 0; i < len(b); {
		e, sz := bidi.Lookup(b[i:])
		if sz == 0 {
			// Nothing was decoded: step over one byte and do not classify
			// the empty Properties value.
			i++
			continue
		}
		switch e.Class() {
		case bidi.R, bidi.AL, bidi.AN:
			return bidi.RightToLeft
		}
		i += sz
	}
	return bidi.LeftToRight
}
// DirectionString reports the direction of the given label as defined by RFC
// 5893. The Bidi Rule does not have to be applied to labels of the category
// LeftToRight.
//
// It returns bidi.RightToLeft as soon as a character of class R, AL or AN is
// found and bidi.LeftToRight otherwise. Bytes for which bidi.LookupString
// reports a size of 0 are skipped one at a time.
func DirectionString(s string) bidi.Direction {
	rest := s
	for len(rest) > 0 {
		props, size := bidi.LookupString(rest)
		if size == 0 {
			rest = rest[1:]
			continue
		}
		switch props.Class() {
		case bidi.R, bidi.AL, bidi.AN:
			return bidi.RightToLeft
		}
		rest = rest[size:]
	}
	return bidi.LeftToRight
}
// Valid reports whether b conforms to the BiDi rule: a fresh Transformer must
// accept all of b and end in a final state.
func Valid(b []byte) bool {
	var t Transformer
	n, ok := t.advance(b)
	return ok && n >= len(b) && t.isFinal()
}
// ValidString reports whether s conforms to the BiDi rule: a fresh Transformer
// must accept all of s and end in a final state.
func ValidString(s string) bool {
	var t Transformer
	n, ok := t.advanceString(s)
	return ok && n >= len(s) && t.isFinal()
}
// New returns a Transformer, in its initial state, that verifies that input
// adheres to the Bidi Rule.
func New() *Transformer {
	return new(Transformer)
}
// Transformer implements transform.Transformer. It copies its input unchanged
// and reports ErrInvalid when the input violates the Bidi Rule. The zero value
// is ready to use.
type Transformer struct {
// state is the current state of the rule state machine.
state ruleState
// NOTE(review): hasRTL is not referenced by the code visible here.
hasRTL bool
// seen is the set of bidi classes encountered so far (bit 1<<class).
seen uint16
}
// isRTL reports whether a character of class R, AL or AN has been seen.
// A rule can only be violated for "Bidi Domain names", meaning if one of these
// categories has been observed.
func (t *Transformer) isRTL() bool {
	const rtlClasses = 1<<bidi.R | 1<<bidi.AL | 1<<bidi.AN
	return (t.seen & rtlClasses) != 0
}
// Reset implements transform.Transformer. It returns t to its zero state.
func (t *Transformer) Reset() {
	var zero Transformer
	*t = zero
}
// Transform implements transform.Transformer. This Transformer has state and
// needs to be reset between uses.
//
// The bytes of src accepted by Span are copied to dst unchanged, so nDst and
// nSrc are always equal. If dst is shorter than src only the part that fits is
// examined, and transform.ErrShortDst is returned unless Span reports an error
// other than transform.ErrShortSrc for that part.
func (t *Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	truncated := len(dst) < len(src)
	if truncated {
		// Only look at what fits; more input will follow on the next call.
		src, atEOF = src[:len(dst)], false
	}

	n, spanErr := t.Span(src, atEOF)
	copy(dst, src[:n])

	switch {
	case !truncated:
		err = spanErr
	case spanErr != nil && spanErr != transform.ErrShortSrc:
		err = spanErr
	default:
		err = transform.ErrShortDst
	}
	return n, n, err
}
// Span returns the first n bytes of src that conform to the Bidi rule.
//
// It returns ErrInvalid if the Transformer is already in the invalid state
// for an RTL label, if advance rejects the input, if input is left over at
// EOF, or if all input was consumed but the state is not final. It returns
// transform.ErrShortSrc if input is left over and atEOF is false.
func (t *Transformer) Span(src []byte, atEOF bool) (n int, err error) {
	if t.state == ruleInvalid && t.isRTL() {
		return 0, ErrInvalid
	}

	consumed, ok := t.advance(src)
	if !ok {
		return consumed, ErrInvalid
	}
	if consumed < len(src) {
		// advance stopped early on an incomplete encoding.
		if atEOF {
			return consumed, ErrInvalid
		}
		return consumed, transform.ErrShortSrc
	}
	if !t.isFinal() {
		return consumed, ErrInvalid
	}
	return consumed, nil
}
// asciiTable caches the bidi properties of the 128 ASCII characters; it is
// filled in by init.
// Precomputing the ASCII values decreases running time for the ASCII fast path
// by about 30%.
var asciiTable [128]bidi.Properties
// init fills asciiTable with the bidi properties of every ASCII character.
func init() {
	for r := rune(0); int(r) < len(asciiTable); r++ {
		asciiTable[r], _ = bidi.LookupRune(r)
	}
}
// advance runs the rule state machine over s and returns the number of bytes
// processed. ok is false when the input is rejected: on invalid UTF-8, when
// both EN and AN have been seen, or when no transition matches and an RTL
// character has been seen. When it stops at an incomplete UTF-8 encoding it
// returns the bytes processed so far with ok set to true.
func (t *Transformer) advance(s []byte) (n int, ok bool) {
	for n < len(s) {
		var (
			props bidi.Properties
			size  int
		)
		if ch := s[n]; ch < utf8.RuneSelf {
			// ASCII fast path.
			props, size = asciiTable[ch], 1
		} else {
			props, size = bidi.Lookup(s[n:])
			switch {
			case size == 1:
				// We always consider invalid UTF-8 to be invalid, even if
				// the string has not yet been determined to be RTL.
				// TODO: is this correct?
				return n, false
			case size < 1:
				return n, true // incomplete UTF-8 encoding
			}
		}

		// TODO: using CompactClass would result in noticeable speedup.
		// See unicode/bidi/prop.go:Properties.CompactClass.
		bit := uint16(1 << props.Class())
		t.seen |= bit
		if t.seen&exclusiveRTL == exclusiveRTL {
			t.state = ruleInvalid
			return n, false
		}

		candidates := transitions[t.state]
		if candidates[0].mask&bit != 0 {
			t.state = candidates[0].next
		} else if candidates[1].mask&bit != 0 {
			t.state = candidates[1].next
		} else {
			// Not allowed here; only fatal once the label is known to be RTL.
			t.state = ruleInvalid
			if t.isRTL() {
				return n, false
			}
		}
		n += size
	}
	return n, true
}
// advanceString is the string counterpart of advance: it runs the rule state
// machine over s and returns the number of bytes processed. ok is false when
// the input is rejected: on invalid UTF-8, when both EN and AN have been seen,
// or when no transition matches and an RTL character has been seen. When it
// stops at an incomplete UTF-8 encoding it returns the bytes processed so far
// with ok set to true.
func (t *Transformer) advanceString(s string) (n int, ok bool) {
	for n < len(s) {
		var (
			props bidi.Properties
			size  int
		)
		if ch := s[n]; ch < utf8.RuneSelf {
			// ASCII fast path.
			props, size = asciiTable[ch], 1
		} else {
			props, size = bidi.LookupString(s[n:])
			switch {
			case size == 1:
				return n, false // invalid UTF-8
			case size < 1:
				return n, true // incomplete UTF-8 encoding
			}
		}

		// TODO: using CompactClass results in noticeable speedup.
		// See unicode/bidi/prop.go:Properties.CompactClass.
		bit := uint16(1 << props.Class())
		t.seen |= bit
		if t.seen&exclusiveRTL == exclusiveRTL {
			t.state = ruleInvalid
			return n, false
		}

		candidates := transitions[t.state]
		if candidates[0].mask&bit != 0 {
			t.state = candidates[0].next
		} else if candidates[1].mask&bit != 0 {
			t.state = candidates[1].next
		} else {
			// Not allowed here; only fatal once the label is known to be RTL.
			t.state = ruleInvalid
			if t.isRTL() {
				return n, false
			}
		}
		n += size
	}
	return n, true
}
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build go1.10
// +build go1.10
package bidirule
// isFinal reports whether the state machine is in a state in which a label
// may end: one of the two final states, or the initial state.
func (t *Transformer) isFinal() bool {
	switch t.state {
	case ruleLTRFinal, ruleRTLFinal, ruleInitial:
		return true
	}
	return false
}
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !go1.10
// +build !go1.10
package bidirule
// isFinal reports whether a label may end in the current state. In this
// (pre-go1.10) variant a label in which no RTL character has been seen is
// always accepted; otherwise the state must be one of the two final states or
// the initial state.
func (t *Transformer) isFinal() bool {
	if t.isRTL() {
		switch t.state {
		case ruleLTRFinal, ruleRTLFinal, ruleInitial:
			return true
		}
		return false
	}
	return true
}
This diff is collapsed.
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:generate go run gen.go gen_trieval.go gen_ranges.go
// Package bidi contains functionality for bidirectional text support.
//
// See https://www.unicode.org/reports/tr9.
//
// NOTE: UNDER CONSTRUCTION. This API may change in backwards incompatible ways
// and without notice.
package bidi // import "golang.org/x/text/unicode/bidi"
// TODO
// - Transformer for reordering?
// - Transformer (validator, really) for Bidi Rule.
import (
"bytes"
)
// This API tries to avoid dealing with embedding levels for now. Under the hood
// these will be computed, but the question is to which extent the user should
// know they exist. We should at some point allow the user to specify an
// embedding hierarchy, though.
// A Direction indicates the overall flow of text. The zero value is
// LeftToRight.
type Direction int
const (
// LeftToRight indicates the text contains no right-to-left characters and
// that either there are some left-to-right characters or the option
// DefaultDirection(LeftToRight) was passed.
LeftToRight Direction = iota
// RightToLeft indicates the text contains no left-to-right characters and
// that either there are some right-to-left characters or the option
// DefaultDirection(RightToLeft) was passed.
RightToLeft
// Mixed indicates text contains both left-to-right and right-to-left
// characters.
Mixed
// Neutral means that text contains no left-to-right and right-to-left
// characters and that no default direction has been set.
Neutral
)
// options collects the settings that Option values can change.
type options struct {
// defaultDirection is set by DefaultDirection; its zero value is LeftToRight.
defaultDirection Direction
}
// An Option is an option for Bidi processing. It is a function that modifies
// the options value it is given.
type Option func(*options)
// ICU allows the user to define embedding levels. This may be used, for example,
// to use hierarchical structure of markup languages to define embeddings.
// The following option may be a way to expose this functionality in this API.
// // LevelFunc sets a function that associates nesting levels with the given text.
// // The levels function will be called with monotonically increasing values for p.
// func LevelFunc(levels func(p int) int) Option {
// panic("unimplemented")
// }
// DefaultDirection sets the default direction for a Paragraph. The direction is
// overridden if the text contains directional characters.
func DefaultDirection(d Direction) Option {
	setDefault := func(o *options) { o.defaultDirection = d }
	return setDefault
}
// A Paragraph holds a single Paragraph for Bidi processing.
type Paragraph struct {
// p is the text passed to SetBytes or SetString.
p []byte
// o is the ordering most recently computed by Order.
o Ordering
// opts are the options passed to SetBytes or SetString; Order applies them.
opts []Option
// types, pairTypes and pairValues hold one entry per rune of the paragraph,
// filled in by prepareInput: the bidi class, the bracket type, and the
// bracket rune (0 for non-brackets).
types []Class
pairTypes []bracketType
pairValues []rune
// runes is the decoded form of p.
runes []rune
// options is the result of applying opts.
options options
}
// prepareInput initializes p.runes, p.types, p.pairTypes and p.pairValues from
// the input previously stored in p.p by SetBytes or SetString. Processing
// stops after the first paragraph separator (bidi class B), which is counted
// but not recorded in the per-rune slices.
//
// The function p.Order() needs these values to be set, so this preparation
// could be postponed. But since SetBytes and SetString return the length of
// the input up to the paragraph separator, the whole input needs to be
// processed anyway and should not be done twice.
//
// The function has the same return values as SetBytes() / SetString(): the sum
// of the sizes LookupRune reports for the processed runes, and a nil error.
func (p *Paragraph) prepareInput() (n int, err error) {
	p.runes = bytes.Runes(p.p)
	// Drop state left over from an earlier SetString or SetBytes call.
	p.pairTypes, p.pairValues, p.types = nil, nil, nil

	consumed := 0
	for _, r := range p.runes {
		props, size := LookupRune(r)
		consumed += size

		class := props.Class()
		if class == B {
			return consumed, nil
		}
		p.types = append(p.types, class)

		kind, value := bpNone, rune(0)
		switch {
		case props.IsOpeningBracket():
			kind, value = bpOpen, r
		case props.IsBracket():
			// Not an opening bracket, so this must be a closing one.
			kind, value = bpClose, r
		}
		p.pairTypes = append(p.pairTypes, kind)
		p.pairValues = append(p.pairValues, value)
	}
	return consumed, nil
}
// SetBytes configures p for the given paragraph text. It replaces text
// previously set by SetBytes or SetString. If b contains a paragraph separator
// it will only process the first paragraph and report the number of bytes
// consumed from b including this separator. Error may be non-nil if options are
// given.
//
// b is stored as is, not copied; opts are kept and applied by Order.
func (p *Paragraph) SetBytes(b []byte, opts ...Option) (n int, err error) {
	p.p, p.opts = b, opts
	return p.prepareInput()
}
// SetString configures p for the given paragraph text. It replaces text
// previously set by SetBytes or SetString. If s contains a paragraph separator
// it will only process the first paragraph and report the number of bytes
// consumed from s including this separator. Error may be non-nil if options are
// given.
//
// opts are kept and applied by Order.
func (p *Paragraph) SetString(s string, opts ...Option) (n int, err error) {
	p.p, p.opts = []byte(s), opts
	return p.prepareInput()
}
// IsLeftToRight reports whether the principal direction of rendering for this
// paragraph is left-to-right. If this returns false, the principal direction
// of rendering is right-to-left.
func (p *Paragraph) IsLeftToRight() bool {
	dir := p.Direction()
	return dir == LeftToRight
}
// Direction returns the direction of the text of this paragraph, as reported
// by the paragraph's stored Ordering.
//
// The direction may be LeftToRight, RightToLeft, Mixed, or Neutral.
func (p *Paragraph) Direction() Direction {
	ordering := &p.o
	return ordering.Direction()
}
// TODO: what happens if the position is > len(input)? This should return an error.

// RunAt reports the Run at the given position of the input text.
//
// pos is compared against the cumulative rune counts of the runs of the stored
// ordering, and the first run whose end lies beyond pos is returned. If pos is
// not inside any run, the first run is returned.
//
// This method can be used for computing line breaks on paragraphs.
func (p *Paragraph) RunAt(pos int) Run {
	end := 0
	runNumber := 0
	for i, r := range p.o.runes {
		end += len(r)
		if pos < end {
			// Stop at the first run containing pos; end only grows, so
			// without the break every later run would match as well and
			// the last run would always win.
			runNumber = i
			break
		}
	}
	return p.o.Run(runNumber)
}
// calculateOrdering splits runes into runs of equal direction according to
// levels and returns them as an Ordering. Even levels (0, 2, 4, ...) are
// left-to-right, all other levels are right-to-left. A final run covering
// runes[start of last run:] is always emitted, even when levels is empty, in
// which case its direction is Neutral.
func calculateOrdering(levels []level, runes []rune) Ordering {
	var result Ordering
	runStart, runDir := 0, Neutral

	// emit records segment as a run with the current start and direction.
	emit := func(segment []rune) {
		result.runes = append(result.runes, segment)
		result.directions = append(result.directions, runDir)
		result.startpos = append(result.startpos, runStart)
	}

	for i, lvl := range levels {
		dir := RightToLeft
		if lvl%2 == 0 {
			dir = LeftToRight
		}
		if dir == runDir {
			continue
		}
		// Direction change: close the previous run, if there is one.
		if i > 0 {
			emit(runes[runStart:i])
		}
		runStart, runDir = i, dir
	}
	emit(runes[runStart:])
	return result
}
// Order computes the visual ordering of all the runs in a Paragraph.
//
// It returns an empty Ordering if no text has been prepared. Otherwise it
// applies the stored options, resolves the levels of the paragraph and stores
// the resulting Ordering in p before returning it. The paragraph level passed
// to newParagraph is 1 when the default direction is RightToLeft and -1
// otherwise.
func (p *Paragraph) Order() (Ordering, error) {
	if len(p.types) == 0 {
		return Ordering{}, nil
	}

	for _, apply := range p.opts {
		apply(&p.options)
	}

	var baseLevel level = -1
	if p.options.defaultDirection == RightToLeft {
		baseLevel = 1
	}

	para, err := newParagraph(p.types, p.pairTypes, p.pairValues, baseLevel)
	if err != nil {
		return Ordering{}, err
	}

	p.o = calculateOrdering(para.getLevels([]int{len(p.types)}), p.runes)
	return p.o, nil
}
// Line computes the visual ordering of runs for a single line starting and
// ending at the given positions in the original text.
//
// start and end are used as slice bounds (start inclusive, end exclusive) into
// the per-rune data of the paragraph; they are not range-checked here. The
// result is returned without being stored in p.
func (p *Paragraph) Line(start, end int) (Ordering, error) {
	types := p.types[start:end]
	pairTypes := p.pairTypes[start:end]
	pairValues := p.pairValues[start:end]

	para, err := newParagraph(types, pairTypes, pairValues, -1)
	if err != nil {
		return Ordering{}, err
	}

	levels := para.getLevels([]int{len(types)})
	return calculateOrdering(levels, p.runes[start:end]), nil
}
// An Ordering holds the computed visual order of runs of a Paragraph. Calling
// SetBytes or SetString on the originating Paragraph invalidates an Ordering.
// The methods of an Ordering should only be called by one goroutine at a time.
//
// The three slices are parallel: entry i of each describes run i.
type Ordering struct {
// runes holds the text of each run.
runes [][]rune
// directions holds the direction of each run.
directions []Direction
// startpos holds the rune index at which each run starts.
startpos []int
}
// Direction reports the directionality of the runs. As implemented it is the
// direction of the first run; it panics if the Ordering has no runs.
//
// The direction may be LeftToRight, RightToLeft, Mixed, or Neutral.
func (o *Ordering) Direction() Direction {
	first := o.directions[0]
	return first
}
// NumRuns returns the number of runs.
func (o *Ordering) NumRuns() int {
	count := len(o.runes)
	return count
}
// Run returns the ith run within the ordering. It panics if i is out of
// range.
func (o *Ordering) Run(i int) Run {
	return Run{
		runes:     o.runes[i],
		direction: o.directions[i],
		startpos:  o.startpos[i],
	}
}
// TODO: perhaps with options.
// // Reorder creates a reader that reads the runes in visual order per character.
// // Modifiers remain after the runes they modify.
// func (l *Runs) Reorder() io.Reader {
// panic("unimplemented")
// }
// A Run is a continuous sequence of characters of a single direction.
type Run struct {
// runes is the text of the run.
runes []rune
// direction is the direction shared by all characters of the run.
direction Direction
// startpos is the rune index at which the run starts.
startpos int
}
// String returns the text of the run in its original order.
func (r *Run) String() string {
	text := string(r.runes)
	return text
}
// Bytes returns the text of the run in its original order, UTF-8 encoded.
func (r *Run) Bytes() []byte {
	return []byte(string(r.runes))
}
// TODO: methods for
// - Display order
// - headers and footers
// - bracket replacement.
// Direction reports the direction of the run.
func (r *Run) Direction() Direction {
	dir := r.direction
	return dir
}
// Pos returns the position of the Run within the text passed to SetBytes or
// SetString of the originating Paragraph value. Both values are rune indexes
// and end is inclusive: it is the index of the last rune of the run.
func (r *Run) Pos() (start, end int) {
	first := r.startpos
	last := first + len(r.runes) - 1
	return first, last
}
// AppendReverse reverses the order of characters of in, appends them to out,
// and returns the result. Modifiers will still follow the runes they modify.
// Brackets are replaced with their counterparts.
//
// The result is always a newly allocated slice; out itself is not modified.
// Invalid UTF-8 in in is decoded by bytes.Runes as U+FFFD, whose encoding is
// longer than the byte it replaces, so the reversed text is appended rather
// than copied into a buffer of len(in) bytes, which would cut it short.
func AppendReverse(out, in []byte) []byte {
	inRunes := bytes.Runes(in)

	// Swap every bracket for its counterpart.
	for i, r := range inRunes {
		if prop, _ := LookupRune(r); prop.IsBracket() {
			inRunes[i] = prop.reverseBracket(r)
		}
	}

	// Reverse in place.
	for i, j := 0, len(inRunes)-1; i < j; i, j = i+1, j-1 {
		inRunes[i], inRunes[j] = inRunes[j], inRunes[i]
	}

	ret := make([]byte, len(out), len(out)+len(in))
	copy(ret, out)
	return append(ret, string(inRunes)...)
}
// ReverseString reverses the order of characters in s and returns a new string.
// Modifiers will still follow the runes they modify. Brackets are replaced with
// their counterparts.
func ReverseString(s string) string {
	runes := []rune(s)
	reversed := make([]rune, len(runes))

	// Fill reversed from the back while walking runes from the front.
	dst := len(runes)
	for _, r := range runes {
		dst--
		if prop, _ := LookupRune(r); prop.IsBracket() {
			r = prop.reverseBracket(r)
		}
		reversed[dst] = r
	}
	return string(reversed)
}
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bidi
import (
"container/list"
"fmt"
"sort"
)
// This file contains a port of the reference implementation of the
// Bidi Parentheses Algorithm:
// https://www.unicode.org/Public/PROGRAMS/BidiReferenceJava/BidiPBAReference.java
//
// The implementation in this file covers definitions BD14-BD16 and rule N0
// of UAX#9.
//
// Some preprocessing is done for each rune before data is passed to this
// algorithm:
// - opening and closing brackets are identified
// - a bracket pair type, like '(' and ')' is assigned a unique identifier that
// is identical for the opening and closing bracket. It is left to the caller
// to do these mappings.
// - The BPA algorithm requires that bracket characters that are canonical
// equivalents of each other be able to be substituted for each other.
// It is the responsibility of the caller to do this canonicalization.
//
// In implementing BD16, this implementation departs slightly from the "logical"
// algorithm defined in UAX#9. In particular, the stack referenced there
// supports operations that go beyond a "basic" stack. An equivalent
// implementation based on a linked list is used here.
// bracketType is the Bidi_Paired_Bracket_Type of a character.
//
// BD14. An opening paired bracket is a character whose
// Bidi_Paired_Bracket_Type property value is Open.
//
// BD15. A closing paired bracket is a character whose
// Bidi_Paired_Bracket_Type property value is Close.
type bracketType byte
const (
// bpNone marks a character that is not a paired bracket; it is the zero value.
bpNone bracketType = iota
// bpOpen marks an opening paired bracket.
bpOpen
// bpClose marks a closing paired bracket.
bpClose
)
// bracketPair holds a pair of index values for the opening and closing
// bracket location of a bracket pair.
type bracketPair struct {
	opener int
	closer int
}

// String formats the pair as "(opener, closer)".
func (b *bracketPair) String() string {
	return fmt.Sprintf("(%v, %v)", b.opener, b.closer)
}

// bracketPairs is a slice of bracketPairs with a sort.Interface implementation
// that orders the pairs by the position of their opening bracket.
type bracketPairs []bracketPair

// Len returns the number of pairs.
func (b bracketPairs) Len() int {
	return len(b)
}

// Swap exchanges the pairs at positions i and j.
func (b bracketPairs) Swap(i, j int) {
	tmp := b[i]
	b[i] = b[j]
	b[j] = tmp
}

// Less reports whether the opener of pair i precedes the opener of pair j.
func (b bracketPairs) Less(i, j int) bool {
	left, right := b[i].opener, b[j].opener
	return left < right
}
// resolvePairedBrackets runs the paired bracket part of the UBA algorithm on
// the isolating run sequence s.
//
// From s it takes, for each rune, the index into the original string, the
// class, the bracket type (pairTypes) and the bracket identifier (pairValues),
// as well as the direction type for the start-of-sequence and the embedding
// level. An odd level gives embedding direction R, an even level L.
//
// The identifiers for bracket types are the rune of the canonicalized opening
// bracket for brackets (open or close) or 0 for runes that are not brackets.
func resolvePairedBrackets(s *isolatingRunSequence) {
	embedding := L
	if s.level&1 != 0 {
		embedding = R
	}

	pairer := bracketPairer{
		sos:              s.sos,
		openers:          list.New(),
		codesIsolatedRun: s.types,
		indexes:          s.indexes,
	}
	pairer.locateBrackets(s.p.pairTypes, s.p.pairValues)
	pairer.resolveBrackets(embedding, s.p.initialTypes)
}
// bracketPairer holds the state used to locate and resolve the bracket pairs
// of one isolating run sequence; resolvePairedBrackets creates one per
// sequence.
type bracketPairer struct {
sos Class // direction corresponding to start of sequence
// The following is a restatement of BD 16 using non-algorithmic language.
//
// A bracket pair is a pair of characters consisting of an opening
// paired bracket and a closing paired bracket such that the
// Bidi_Paired_Bracket property value of the former equals the latter,
// subject to the following constraints.
// - both characters of a pair occur in the same isolating run sequence
// - the closing character of a pair follows the opening character
// - any bracket character can belong at most to one pair, the earliest possible one
// - any bracket character not part of a pair is treated like an ordinary character
// - pairs may nest properly, but their spans may not overlap otherwise
// Bracket characters with canonical decompositions are supposed to be
// treated as if they had been normalized, to allow normalized and non-
// normalized text to give the same result. In this implementation that step
// is pushed out to the caller. The caller has to ensure that the pairValue
// slices contain the rune of the opening bracket after normalization for
// any opening or closing bracket.
openers *list.List // list of positions for opening brackets
// bracket pair positions sorted by location of opening bracket
pairPositions bracketPairs
codesIsolatedRun []Class // directional bidi codes for an isolated run
indexes []int // array of index values into the original string
}
// matchOpener reports whether the characters at run positions opener and
// closer form a matching bracket pair, that is, whether they carry the same
// bracket identifier in pairValues.
func (p *bracketPairer) matchOpener(pairValues []rune, opener, closer int) bool {
	openValue := pairValues[p.indexes[opener]]
	closeValue := pairValues[p.indexes[closer]]
	return openValue == closeValue
}
// maxPairingDepth is the number of pending opening brackets at which
// locateBrackets gives up: when another opener is seen while this many are
// already waiting for a match, the search for further pairs stops.
const maxPairingDepth = 63
// locateBrackets locates matching bracket pairs according to BD16 and stores
// them in p.pairPositions, sorted by the position of the opening bracket.
//
// pairTypes and pairValues are indexed through p.indexes; only positions whose
// current type in p.codesIsolatedRun is ON are considered. If an opening
// bracket is seen while maxPairingDepth openers are already pending, the
// pending openers are discarded and the scan stops; pairs found up to that
// point are kept.
//
// This implementation uses a linked list instead of a stack, because, while
// elements are added at the front (like a push) they are not generally removed
// in atomic 'pop' operations, reducing the benefit of the stack archetype.
func (p *bracketPairer) locateBrackets(pairTypes []bracketType, pairValues []rune) {
	// Traverse the run; i is the position within the run and index the
	// corresponding index into the per-rune slices.
scan:
	for i, index := range p.indexes {
		// Skip anything that is not a bracket or no longer has type ON.
		if pairTypes[index] == bpNone || p.codesIsolatedRun[i] != ON {
			continue
		}
		switch pairTypes[index] {
		case bpOpen:
			// Check if the maximum pairing depth has been reached.
			if p.openers.Len() == maxPairingDepth {
				p.openers.Init()
				break scan
			}
			// Remember the opener location, most recent first.
			p.openers.PushFront(i)

		case bpClose:
			// See if there is a match, starting with the most recent opener.
			count := 0
			for elem := p.openers.Front(); elem != nil; elem = elem.Next() {
				count++
				opener := elem.Value.(int)
				if p.matchOpener(pairValues, opener, i) {
					// The opener matches: record the pair.
					p.pairPositions = append(p.pairPositions, bracketPair{opener, i})
					// Remove up to and including the matched opener.
					for ; count > 0; count-- {
						p.openers.Remove(p.openers.Front())
					}
					break
				}
			}
			// A closing bracket that matched no opener is simply ignored.
		}
	}

	// Pairs are appended in the order their closers are found. They are only
	// consumed after this function returns, so sorting once here (instead of
	// after every closing bracket) yields the same final order: every opener
	// belongs to at most one pair, so the sort keys are unique.
	sort.Sort(p.pairPositions)
}
// Bracket pairs within an isolating run sequence are processed as units so
// that both the opening and the closing paired bracket in a pair resolve to
// the same direction.
//
// N0. Process bracket pairs in an isolating run sequence sequentially in
// the logical order of the text positions of the opening paired brackets
// using the logic given below. Within this scope, bidirectional types EN
// and AN are treated as R.
//
// Identify the bracket pairs in the current isolating run sequence
// according to BD16. For each bracket-pair element in the list of pairs of
// text positions:
//
// a Inspect the bidirectional types of the characters enclosed within the
// bracket pair.
//
// b If any strong type (either L or R) matching the embedding direction is
// found, set the type for both brackets in the pair to match the embedding
// direction.
//
// o [ e ] o -> o e e e o
//
// o [ o e ] -> o e o e e
//
// o [ NI e ] -> o e NI e e
//
// c Otherwise, if a strong type (opposite the embedding direction) is
// found, test for adjacent strong types as follows: 1 First, check
// backwards before the opening paired bracket until the first strong type
// (L, R, or sos) is found. If that first preceding strong type is opposite
// the embedding direction, then set the type for both brackets in the pair
// to that type. 2 Otherwise, set the type for both brackets in the pair to
// the embedding direction.
//
// o [ o ] e -> o o o o e
//
// o [ o NI ] o -> o o o NI o o
//
// e [ o ] o -> e e o e o
//
// e [ o ] e -> e e o e e
//
// e ( o [ o ] NI ) e -> e e o o o o NI e e
//
// d Otherwise, do not set the type for the current bracket pair. Note that
// if the enclosed text contains no strong types the paired brackets will
// both resolve to the same level when resolved individually using rules N1
// and N2.
//
// e ( NI ) o -> e ( NI ) o
// getStrongTypeN0 reduces the directional code at run position index to the
// strong type that rule N0 works with: L stays L; R, AL and the number types
// EN and AN are reported as R; every other code is reported as ON.
//
// TODO: have separate type for "strong" directionality.
func (p *bracketPairer) getStrongTypeN0(index int) Class {
	code := p.codesIsolatedRun[index]
	if code == L {
		return L
	}
	// In the scope of N0, number types are treated as R.
	if code == R || code == AL || code == EN || code == AN {
		return R
	}
	return ON
}
// classifyPairContent reports the strong types contained inside a Bracket Pair,
// assuming the given embedding direction.
//
// It scans the positions strictly between loc.opener and loc.closer. As soon
// as a strong type equal to dirEmbed is seen, dirEmbed is returned. Otherwise
// it returns the last other strong type that was seen, or ON if the pair
// encloses no strong type at all.
//
// TODO: use separate type for "strong" directionality.
func (p *bracketPairer) classifyPairContent(loc bracketPair, dirEmbed Class) Class {
	found := ON // stays ON until a strong type shows up
	for pos := loc.opener + 1; pos < loc.closer; pos++ {
		switch strong := p.getStrongTypeN0(pos); strong {
		case ON:
			// Not a strong type; keep looking.
		case dirEmbed:
			// A type matching the embedding direction settles the question.
			return strong
		default:
			found = strong
		}
	}
	return found
}
// classBeforePair looks backwards from the opening bracket of loc for the
// nearest strong type and returns it (R or L). If no strong type precedes the
// pair within the run, it returns the start-of-sequence direction p.sos.
func (p *bracketPairer) classBeforePair(loc bracketPair) Class {
	pos := loc.opener
	for pos > 0 {
		pos--
		strong := p.getStrongTypeN0(pos)
		if strong != ON {
			return strong
		}
	}
	// No strong type found before the pair: fall back to sos.
	return p.sos
}
// assignBracketType implements rule N0 for a single bracket pair.
//
// The cases are tested in a different order than in the statement of the rule
// but yield the same results.
func (p *bracketPairer) assignBracketType(loc bracketPair, dirEmbed Class, initialTypes []Class) {
	// Rule "N0, a": inspect the contents of the pair.
	switch inside := p.classifyPairContent(loc, dirEmbed); inside {
	case ON:
		// Case "d": no strong type enclosed, leave the brackets alone.
		return

	case dirEmbed:
		// Case "b": a strong type matching the embedding direction was found.
		p.setBracketsToType(loc, dirEmbed, initialTypes)

	default:
		// Case "c": only the opposite strong type was found. Use the strong
		// type preceding the pair if it is opposite too (c.1), otherwise the
		// embedding direction (c.2).
		resolved := dirEmbed
		if before := p.classBeforePair(loc); before != dirEmbed && before != ON {
			resolved = before
		}
		p.setBracketsToType(loc, resolved, initialTypes)
	}
}
// setBracketsToType assigns dirPair to both brackets of loc. It also assigns
// dirPair to the unbroken run of positions directly following each bracket
// whose initial type is NSM; the run after the opener is limited by the
// closer, the run after the closer by the end of the sequence.
func (p *bracketPairer) setBracketsToType(loc bracketPair, dirPair Class, initialTypes []Class) {
	p.codesIsolatedRun[loc.opener] = dirPair
	p.codesIsolatedRun[loc.closer] = dirPair

	// followingMarks retypes positions from start up to (not including) limit
	// for as long as their initial type is NSM.
	followingMarks := func(start, limit int) {
		for pos := start; pos < limit && initialTypes[p.indexes[pos]] == NSM; pos++ {
			p.codesIsolatedRun[pos] = dirPair
		}
	}
	followingMarks(loc.opener+1, loc.closer)
	followingMarks(loc.closer+1, len(p.indexes))
}
// resolveBrackets applies rule N0 to every located bracket pair, in the order
// in which the pairs are stored in p.pairPositions.
func (p *bracketPairer) resolveBrackets(dirEmbed Class, initialTypes []Class) {
	for k := 0; k < len(p.pairPositions); k++ {
		p.assignBracketType(p.pairPositions[k], dirEmbed, initialTypes)
	}
}
This diff is collapsed.
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bidi
import "unicode/utf8"
// Properties provides access to BiDi properties of runes.
type Properties struct {
// entry is the trie value looked up for the rune: the low four bits hold
// the class and the upper four bits the bracket information.
entry uint8
// last is the final byte of the rune's UTF-8 encoding. The lookup functions
// record it for three-byte encodings only; Class uses its low four bits to
// tell the individual control characters apart.
last uint8
}
// trie is the package-level BiDi trie whose lookupValue method Lookup and
// LookupString use for multi-byte encodings.
var trie = newBidiTrie(0)
// TODO: using this for bidirule reduces the running time by about 5%. Consider
// if this is worth exposing or if we can find a way to speed up the Class
// method.
//
// // CompactClass is like Class, but maps all of the BiDi control classes
// // (LRO, RLO, LRE, RLE, PDF, LRI, RLI, FSI, PDI) to the class Control.
// func (p Properties) CompactClass() Class {
// return Class(p.entry & 0x0F)
// }
// Class returns the Bidi class for p.
//
// The class is stored in the low four bits of the trie entry. When that value
// is Control, the specific control class is looked up in controlByteToClass
// using the low four bits of the last encoded byte.
func (p Properties) Class() Class {
	if c := Class(p.entry & 0x0F); c != Control {
		return c
	}
	return controlByteToClass[p.last&0xF]
}
// IsBracket reports whether the rune is a bracket, which is the case when any
// of the upper four bits of its trie entry is set.
func (p Properties) IsBracket() bool {
	const bracketBits = 0xF0
	return p.entry&bracketBits != 0
}
// IsOpeningBracket reports whether the rune is an opening bracket.
// IsBracket must return true.
func (p Properties) IsOpeningBracket() bool {
	openBit := p.entry & openMask
	return openBit != 0
}
// reverseBracket returns r XORed with the mask that the trie entry of p
// selects from xorMasks.
//
// TODO: find a better API and expose.
func (p Properties) reverseBracket(r rune) rune {
	mask := xorMasks[p.entry>>xorMaskShift]
	return mask ^ r
}
// controlByteToClass maps the low four bits of the last UTF-8 byte of a BiDi
// control character to the class of that character. Indexes that are not
// listed keep the zero value of Class.
var controlByteToClass = [16]Class{
	0x6: LRI, // U+2066 LeftToRightIsolate
	0x7: RLI, // U+2067 RightToLeftIsolate
	0x8: FSI, // U+2068 FirstStrongIsolate
	0x9: PDI, // U+2069 PopDirectionalIsolate
	0xA: LRE, // U+202A LeftToRightEmbedding
	0xB: RLE, // U+202B RightToLeftEmbedding
	0xC: PDF, // U+202C PopDirectionalFormat
	0xD: LRO, // U+202D LeftToRightOverride
	0xE: RLO, // U+202E RightToLeftOverride
}
// LookupRune returns properties for r together with the size reported by
// Lookup for the UTF-8 encoding of r.
func LookupRune(r rune) (p Properties, size int) {
	var encoded [utf8.UTFMax]byte
	width := utf8.EncodeRune(encoded[:], r)
	p, size = Lookup(encoded[:width])
	return p, size
}
// TODO: these lookup methods are based on the generated trie code. The returned
// sizes have slightly different semantics from the generated code, in that it
// always returns size==1 for an illegal UTF-8 byte (instead of the length
// of the maximum invalid subsequence). Most Transformers, like unicode/norm,
// leave invalid UTF-8 untouched, in which case it has performance benefits to
// do so (without changing the semantics). Bidi requires the semantics used here
// for the bidirule implementation to be compatible with the Go semantics.
// They ultimately should perhaps be adopted by all trie implementations, for
// convenience sake.
// This unrolled code also boosts performance of the secure/bidirule package by
// about 30%.
// So, to remove this code:
// - add option to trie generator to define return type.
// - always return 1 byte size for ill-formed UTF-8 runes.
// Lookup returns properties for the first rune in s and the width in bytes of
// its encoding. The size will be 0 if s does not hold enough bytes to complete
// the encoding; this includes an empty s. For an ill-formed sequence it
// returns the zero Properties and a size of 1.
func Lookup(s []byte) (p Properties, sz int) {
	if len(s) == 0 {
		// Nothing to decode. Without this guard s[0] below would panic.
		return Properties{}, 0
	}
	c0 := s[0]
	switch {
	case c0 < 0x80: // is ASCII
		return Properties{entry: bidiValues[c0]}, 1
	case c0 < 0xC2:
		// 0x80..0xC1 cannot start a well-formed UTF-8 sequence.
		return Properties{}, 1
	case c0 < 0xE0: // 2-byte UTF-8
		if len(s) < 2 {
			return Properties{}, 0
		}
		i := bidiIndex[c0]
		c1 := s[1]
		if c1 < 0x80 || 0xC0 <= c1 {
			// Not a continuation byte.
			return Properties{}, 1
		}
		return Properties{entry: trie.lookupValue(uint32(i), c1)}, 2
	case c0 < 0xF0: // 3-byte UTF-8
		if len(s) < 3 {
			return Properties{}, 0
		}
		i := bidiIndex[c0]
		c1 := s[1]
		if c1 < 0x80 || 0xC0 <= c1 {
			return Properties{}, 1
		}
		o := uint32(i)<<6 + uint32(c1)
		i = bidiIndex[o]
		c2 := s[2]
		if c2 < 0x80 || 0xC0 <= c2 {
			return Properties{}, 1
		}
		// The last byte is kept so that Class can tell control characters
		// apart.
		return Properties{entry: trie.lookupValue(uint32(i), c2), last: c2}, 3
	case c0 < 0xF8: // 4-byte UTF-8
		if len(s) < 4 {
			return Properties{}, 0
		}
		i := bidiIndex[c0]
		c1 := s[1]
		if c1 < 0x80 || 0xC0 <= c1 {
			return Properties{}, 1
		}
		o := uint32(i)<<6 + uint32(c1)
		i = bidiIndex[o]
		c2 := s[2]
		if c2 < 0x80 || 0xC0 <= c2 {
			return Properties{}, 1
		}
		o = uint32(i)<<6 + uint32(c2)
		i = bidiIndex[o]
		c3 := s[3]
		if c3 < 0x80 || 0xC0 <= c3 {
			return Properties{}, 1
		}
		return Properties{entry: trie.lookupValue(uint32(i), c3)}, 4
	}
	// Illegal rune
	return Properties{}, 1
}
// LookupString returns properties for the first rune in s and the width in
// bytes of its encoding. The size will be 0 if s does not hold enough bytes to
// complete the encoding; this includes an empty s. For an ill-formed sequence
// it returns the zero Properties and a size of 1.
func LookupString(s string) (p Properties, sz int) {
	if len(s) == 0 {
		// Nothing to decode. Without this guard s[0] below would panic.
		return Properties{}, 0
	}
	c0 := s[0]
	switch {
	case c0 < 0x80: // is ASCII
		return Properties{entry: bidiValues[c0]}, 1
	case c0 < 0xC2:
		// 0x80..0xC1 cannot start a well-formed UTF-8 sequence.
		return Properties{}, 1
	case c0 < 0xE0: // 2-byte UTF-8
		if len(s) < 2 {
			return Properties{}, 0
		}
		i := bidiIndex[c0]
		c1 := s[1]
		if c1 < 0x80 || 0xC0 <= c1 {
			// Not a continuation byte.
			return Properties{}, 1
		}
		return Properties{entry: trie.lookupValue(uint32(i), c1)}, 2
	case c0 < 0xF0: // 3-byte UTF-8
		if len(s) < 3 {
			return Properties{}, 0
		}
		i := bidiIndex[c0]
		c1 := s[1]
		if c1 < 0x80 || 0xC0 <= c1 {
			return Properties{}, 1
		}
		o := uint32(i)<<6 + uint32(c1)
		i = bidiIndex[o]
		c2 := s[2]
		if c2 < 0x80 || 0xC0 <= c2 {
			return Properties{}, 1
		}
		// The last byte is kept so that Class can tell control characters
		// apart.
		return Properties{entry: trie.lookupValue(uint32(i), c2), last: c2}, 3
	case c0 < 0xF8: // 4-byte UTF-8
		if len(s) < 4 {
			return Properties{}, 0
		}
		i := bidiIndex[c0]
		c1 := s[1]
		if c1 < 0x80 || 0xC0 <= c1 {
			return Properties{}, 1
		}
		o := uint32(i)<<6 + uint32(c1)
		i = bidiIndex[o]
		c2 := s[2]
		if c2 < 0x80 || 0xC0 <= c2 {
			return Properties{}, 1
		}
		o = uint32(i)<<6 + uint32(c2)
		i = bidiIndex[o]
		c3 := s[3]
		if c3 < 0x80 || 0xC0 <= c3 {
			return Properties{}, 1
		}
		return Properties{entry: trie.lookupValue(uint32(i), c3)}, 4
	}
	// Illegal rune
	return Properties{}, 1
}
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
package bidi
// Class is the Unicode BiDi class. Each rune has a single class.
type Class uint
// The BiDi classes. The values are assigned with iota, so the order of the
// entries is significant.
const (
L Class = iota // LeftToRight
R // RightToLeft
EN // EuropeanNumber
ES // EuropeanSeparator
ET // EuropeanTerminator
AN // ArabicNumber
CS // CommonSeparator
B // ParagraphSeparator
S // SegmentSeparator
WS // WhiteSpace
ON // OtherNeutral
BN // BoundaryNeutral
NSM // NonspacingMark
AL // ArabicLetter
Control // Control LRO - PDI
// numClass equals the number of classes declared above it.
numClass
// The individual control classes. Properties.Class reports one of these in
// place of Control.
LRO // LeftToRightOverride
RLO // RightToLeftOverride
LRE // LeftToRightEmbedding
RLE // RightToLeftEmbedding
PDF // PopDirectionalFormat
LRI // LeftToRightIsolate
RLI // RightToLeftIsolate
FSI // FirstStrongIsolate
PDI // PopDirectionalIsolate
// unknownClass has all bits set, so it differs from every class above.
unknownClass = ^Class(0)
)
// controlToClass maps the code point of each BiDi control character to its
// individual control class.
var controlToClass = map[rune]Class{
	0x202A: LRE, // LeftToRightEmbedding
	0x202B: RLE, // RightToLeftEmbedding
	0x202C: PDF, // PopDirectionalFormat
	0x202D: LRO, // LeftToRightOverride
	0x202E: RLO, // RightToLeftOverride
	0x2066: LRI, // LeftToRightIsolate
	0x2067: RLI, // RightToLeftIsolate
	0x2068: FSI, // FirstStrongIsolate
	0x2069: PDI, // PopDirectionalIsolate
}
// A trie entry has the following bits:
// 7..5 XOR mask for brackets
// 4 1: Bracket open, 0: Bracket close
// 3..0 Class type
const (
// openMask selects bit 4 of a trie entry, the "bracket open" flag.
openMask = 0x10
// xorMaskShift is the right shift that moves bits 7..5 of a trie entry, the
// XOR mask for brackets, down to the low bits.
xorMaskShift = 5
)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment