grafana/pkg/services/searchV2/ngram.go
Alexander Emelin 0975ea4df8
Search: Filter punctuation and tokenize camel case (#51165)
Co-authored-by: Ryan McKinley <ryantxu@gmail.com>
2022-06-30 16:30:44 -07:00

48 lines
1.2 KiB
Go

package searchV2
import (
"strings"
"github.com/blugelabs/bluge/analysis"
"github.com/blugelabs/bluge/analysis/token"
"github.com/blugelabs/bluge/analysis/tokenizer"
)
// punctuationReplacer maps every ASCII punctuation character to a single
// space. It is populated by init.
var punctuationReplacer *strings.Replacer

// init assembles the old/new argument pairs expected by strings.NewReplacer,
// one pair per punctuation character, and stores the resulting replacer.
func init() {
	const symbols = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"

	pairs := make([]string, 2*len(symbols))
	for i := 0; i < len(symbols); i++ {
		// Every symbol is a single-byte ASCII character, so slicing by byte
		// index yields the same one-character strings as ranging over runes.
		pairs[2*i] = symbols[i : i+1]
		pairs[2*i+1] = " "
	}
	punctuationReplacer = strings.NewReplacer(pairs...)
}
// punctuationCharFilter is a stateless character filter that applies
// punctuationReplacer to raw input.
type punctuationCharFilter struct{}

// Filter converts input to a string, runs it through punctuationReplacer and
// returns the outcome as a freshly allocated byte slice. The input slice
// itself is not modified.
func (f *punctuationCharFilter) Filter(input []byte) []byte {
	cleaned := punctuationReplacer.Replace(string(input))
	return []byte(cleaned)
}
// ngramEdgeFilterMaxLength is the upper length bound handed to the edge
// n-gram token filter used by ngramIndexAnalyzer (the lower bound there is 1).
const ngramEdgeFilterMaxLength = 7
// ngramIndexAnalyzer is an analysis chain that:
//  1. passes the input through punctuationCharFilter,
//  2. splits the result with the whitespace tokenizer,
//  3. applies the camel-case and lower-case token filters, in that order,
//  4. applies a front-anchored edge n-gram filter with bounds 1 and
//     ngramEdgeFilterMaxLength.
//
// NOTE(review): the name suggests this is the index-time counterpart of
// ngramQueryAnalyzer; confirm against the callers.
var ngramIndexAnalyzer = &analysis.Analyzer{
	CharFilters: []analysis.CharFilter{
		new(punctuationCharFilter),
	},
	Tokenizer: tokenizer.NewWhitespaceTokenizer(),
	TokenFilters: []analysis.TokenFilter{
		token.NewCamelCaseFilter(),
		token.NewLowerCaseFilter(),
		token.NewEdgeNgramFilter(
			token.FRONT,
			1,
			ngramEdgeFilterMaxLength,
		),
	},
}
// ngramQueryAnalyzer is an analysis chain that passes the input through
// punctuationCharFilter, splits it with the whitespace tokenizer, and then
// applies the camel-case and lower-case token filters, in that order.
// Unlike ngramIndexAnalyzer it has no edge n-gram filter, so tokens are kept
// whole.
//
// NOTE(review): the name suggests this is meant for query text; confirm
// against the callers.
var ngramQueryAnalyzer = &analysis.Analyzer{
	CharFilters: []analysis.CharFilter{
		new(punctuationCharFilter),
	},
	Tokenizer: tokenizer.NewWhitespaceTokenizer(),
	TokenFilters: []analysis.TokenFilter{
		token.NewCamelCaseFilter(),
		token.NewLowerCaseFilter(),
	},
}