// Copyright 2015 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. //go:generate go run ../collate/maketables.go -cldr=23 -unicode=6.2.0 -types=search,searchjl -package=search // Package search provides language-specific search and string matching. // // Natural language matching can be intricate. For example, Danish will insist // "Århus" and "Aarhus" are the same name and Turkish will match I to ı (note // the lack of a dot) in a case-insensitive match. This package handles such // language-specific details. // // Text passed to any of the calls in this message does not need to be // normalized. package search // import "golang.org/x/text/search" import ( "strings" "golang.org/x/text/internal/colltab" "golang.org/x/text/language" ) // An Option configures a Matcher. type Option func(*Matcher) var ( // WholeWord restricts matches to complete words. The default is to match at // the character level. WholeWord Option = nil // Exact requires that two strings are their exact equivalent. For example // å would not match aa in Danish. It overrides any of the ignore options. Exact Option = nil // Loose causes case, diacritics and width to be ignored. Loose Option = loose // IgnoreCase enables case-insensitive search. IgnoreCase Option = ignoreCase // IgnoreDiacritics causes diacritics to be ignored ("ö" == "o"). IgnoreDiacritics Option = ignoreDiacritics // IgnoreWidth equates narrow with wide variants. IgnoreWidth Option = ignoreWidth ) func ignoreDiacritics(m *Matcher) { m.ignoreDiacritics = true } func ignoreCase(m *Matcher) { m.ignoreCase = true } func ignoreWidth(m *Matcher) { m.ignoreWidth = true } func loose(m *Matcher) { ignoreDiacritics(m) ignoreCase(m) ignoreWidth(m) } var ( // Supported lists the languages for which search differs from its parent. Supported language.Coverage tags []language.Tag ) func init() { ids := strings.Split(availableLocales, ",") tags = make([]language.Tag, len(ids)) for i, s := range ids { tags[i] = language.Raw.MustParse(s) } Supported = language.NewCoverage(tags) } // New returns a new Matcher for the given language and options. func New(t language.Tag, opts ...Option) *Matcher { m := &Matcher{ w: getTable(locales[colltab.MatchLang(t, tags)]), } for _, f := range opts { f(m) } return m } // A Matcher implements language-specific string matching. type Matcher struct { w colltab.Weighter ignoreCase bool ignoreWidth bool ignoreDiacritics bool } // An IndexOption specifies how the Index methods of Pattern or Matcher should // match the input. type IndexOption byte const ( // Anchor restricts the search to the start (or end for Backwards) of the // text. Anchor IndexOption = 1 << iota // Backwards starts the search from the end of the text. Backwards anchorBackwards = Anchor | Backwards ) // Index reports the start and end position of the first occurrence of pat in b // or -1, -1 if pat is not present. func (m *Matcher) Index(b, pat []byte, opts ...IndexOption) (start, end int) { // TODO: implement optimized version that does not use a pattern. return m.Compile(pat).Index(b, opts...) } // IndexString reports the start and end position of the first occurrence of pat // in s or -1, -1 if pat is not present. func (m *Matcher) IndexString(s, pat string, opts ...IndexOption) (start, end int) { // TODO: implement optimized version that does not use a pattern. return m.CompileString(pat).IndexString(s, opts...) } // Equal reports whether a and b are equivalent. func (m *Matcher) Equal(a, b []byte) bool { _, end := m.Index(a, b, Anchor) return end == len(a) } // EqualString reports whether a and b are equivalent. func (m *Matcher) EqualString(a, b string) bool { _, end := m.IndexString(a, b, Anchor) return end == len(a) } // Compile compiles and returns a pattern that can be used for faster searching. func (m *Matcher) Compile(b []byte) *Pattern { p := &Pattern{m: m} iter := colltab.Iter{Weighter: m.w} for iter.SetInput(b); iter.Next(); { } p.ce = iter.Elems p.deleteEmptyElements() return p } // CompileString compiles and returns a pattern that can be used for faster // searching. func (m *Matcher) CompileString(s string) *Pattern { p := &Pattern{m: m} iter := colltab.Iter{Weighter: m.w} for iter.SetInputString(s); iter.Next(); { } p.ce = iter.Elems p.deleteEmptyElements() return p } // A Pattern is a compiled search string. It is safe for concurrent use. type Pattern struct { m *Matcher ce []colltab.Elem } // Design note (TODO remove): // The cost of retrieving collation elements for each rune, which is used for // search as well, is not trivial. Also, algorithms like Boyer-Moore and // Sunday require some additional precomputing. // Index reports the start and end position of the first occurrence of p in b // or -1, -1 if p is not present. func (p *Pattern) Index(b []byte, opts ...IndexOption) (start, end int) { // Pick a large enough buffer such that we likely do not need to allocate // and small enough to not cause too much overhead initializing. var buf [8]colltab.Elem it := &colltab.Iter{ Weighter: p.m.w, Elems: buf[:0], } it.SetInput(b) var optMask IndexOption for _, o := range opts { optMask |= o } switch optMask { case 0: return p.forwardSearch(it) case Anchor: return p.anchoredForwardSearch(it) case Backwards, anchorBackwards: panic("TODO: implement") default: panic("unrecognized option") } } // IndexString reports the start and end position of the first occurrence of p // in s or -1, -1 if p is not present. func (p *Pattern) IndexString(s string, opts ...IndexOption) (start, end int) { // Pick a large enough buffer such that we likely do not need to allocate // and small enough to not cause too much overhead initializing. var buf [8]colltab.Elem it := &colltab.Iter{ Weighter: p.m.w, Elems: buf[:0], } it.SetInputString(s) var optMask IndexOption for _, o := range opts { optMask |= o } switch optMask { case 0: return p.forwardSearch(it) case Anchor: return p.anchoredForwardSearch(it) case Backwards, anchorBackwards: panic("TODO: implement") default: panic("unrecognized option") } } // TODO: // - Maybe IndexAll methods (probably not necessary). // - Some way to match patterns in a Reader (a bit tricky). // - Some fold transformer that folds text to comparable text, based on the // search options. This is a common technique, though very different from the // collation-based design of this package. It has a somewhat different use // case, so probably makes sense to support both. Should probably be in a // different package, though, as it uses completely different kind of tables // (based on norm, cases, width and range tables.)