// Copyright 2012 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package main // import "golang.org/x/text/collate/tools/colcmp" import ( "bytes" "flag" "fmt" "io" "log" "os" "runtime/pprof" "sort" "strconv" "strings" "text/template" "time" "golang.org/x/text/unicode/norm" ) var ( doNorm = flag.Bool("norm", false, "normalize input strings") cases = flag.Bool("case", false, "generate case variants") verbose = flag.Bool("verbose", false, "print results") debug = flag.Bool("debug", false, "output debug information") locales = flag.String("locale", "en_US", "the locale to use. May be a comma-separated list for some commands.") col = flag.String("col", "go", "collator to test") gold = flag.String("gold", "go", "collator used as the gold standard") usecmp = flag.Bool("usecmp", false, `use comparison instead of sort keys when sorting. Must be "test", "gold" or "both"`) cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file") exclude = flag.String("exclude", "", "exclude errors that contain any of the characters") limit = flag.Int("limit", 5000000, "maximum number of samples to generate for one run") ) func failOnError(err error) { if err != nil { log.Panic(err) } } // Test holds test data for testing a locale-collator pair. // Test also provides functionality that is commonly used by the various commands. type Test struct { ctxt *Context Name string Locale string ColName string Col Collator UseCompare bool Input []Input Duration time.Duration start time.Time msg string count int } func (t *Test) clear() { t.Col = nil t.Input = nil } const ( msgGeneratingInput = "generating input" msgGeneratingKeys = "generating keys" msgSorting = "sorting" ) var lastLen = 0 func (t *Test) SetStatus(msg string) { if *debug || *verbose { fmt.Printf("%s: %s...\n", t.Name, msg) } else if t.ctxt.out != nil { fmt.Fprint(t.ctxt.out, strings.Repeat(" ", lastLen)) fmt.Fprint(t.ctxt.out, strings.Repeat("\b", lastLen)) fmt.Fprint(t.ctxt.out, msg, "...") lastLen = len(msg) + 3 fmt.Fprint(t.ctxt.out, strings.Repeat("\b", lastLen)) } } // Start is used by commands to signal the start of an operation. func (t *Test) Start(msg string) { t.SetStatus(msg) t.count = 0 t.msg = msg t.start = time.Now() } // Stop is used by commands to signal the end of an operation. func (t *Test) Stop() (time.Duration, int) { d := time.Now().Sub(t.start) t.Duration += d if *debug || *verbose { fmt.Printf("%s: %s done. (%.3fs /%dK ops)\n", t.Name, t.msg, d.Seconds(), t.count/1000) } return d, t.count } // generateKeys generates sort keys for all the inputs. func (t *Test) generateKeys() { for i, s := range t.Input { b := t.Col.Key(s) t.Input[i].key = b if *debug { fmt.Printf("%s (%X): %X\n", string(s.UTF8), s.UTF16, b) } } } // Sort sorts the inputs. It generates sort keys if this is required by the // chosen sort method. func (t *Test) Sort() (tkey, tsort time.Duration, nkey, nsort int) { if *cpuprofile != "" { f, err := os.Create(*cpuprofile) failOnError(err) pprof.StartCPUProfile(f) defer pprof.StopCPUProfile() } if t.UseCompare || t.Col.Key(t.Input[0]) == nil { t.Start(msgSorting) sort.Sort(&testCompare{*t}) tsort, nsort = t.Stop() } else { t.Start(msgGeneratingKeys) t.generateKeys() t.count = len(t.Input) tkey, nkey = t.Stop() t.Start(msgSorting) sort.Sort(t) tsort, nsort = t.Stop() } return } func (t *Test) Swap(a, b int) { t.Input[a], t.Input[b] = t.Input[b], t.Input[a] } func (t *Test) Less(a, b int) bool { t.count++ return bytes.Compare(t.Input[a].key, t.Input[b].key) == -1 } func (t Test) Len() int { return len(t.Input) } type testCompare struct { Test } func (t *testCompare) Less(a, b int) bool { t.count++ return t.Col.Compare(t.Input[a], t.Input[b]) == -1 } type testRestore struct { Test } func (t *testRestore) Less(a, b int) bool { return t.Input[a].index < t.Input[b].index } // GenerateInput generates input phrases for the locale tested by t. func (t *Test) GenerateInput() { t.Input = nil if t.ctxt.lastLocale != t.Locale { gen := phraseGenerator{} gen.init(t.Locale) t.SetStatus(msgGeneratingInput) t.ctxt.lastInput = nil // allow the previous value to be garbage collected. t.Input = gen.generate(*doNorm) t.ctxt.lastInput = t.Input t.ctxt.lastLocale = t.Locale } else { t.Input = t.ctxt.lastInput for i := range t.Input { t.Input[i].key = nil } sort.Sort(&testRestore{*t}) } } // Context holds all tests and settings translated from command line options. type Context struct { test []*Test last *Test lastLocale string lastInput []Input out io.Writer } func (ts *Context) Printf(format string, a ...interface{}) { ts.assertBuf() fmt.Fprintf(ts.out, format, a...) } func (ts *Context) Print(a ...interface{}) { ts.assertBuf() fmt.Fprint(ts.out, a...) } // assertBuf sets up an io.Writer for output, if it doesn't already exist. // In debug and verbose mode, output is buffered so that the regular output // will not interfere with the additional output. Otherwise, output is // written directly to stdout for a more responsive feel. func (ts *Context) assertBuf() { if ts.out != nil { return } if *debug || *verbose { ts.out = &bytes.Buffer{} } else { ts.out = os.Stdout } } // flush flushes the contents of ts.out to stdout, if it is not stdout already. func (ts *Context) flush() { if ts.out != nil { if _, ok := ts.out.(io.ReadCloser); !ok { io.Copy(os.Stdout, ts.out.(io.Reader)) } } } // parseTests creates all tests from command lines and returns // a Context to hold them. func parseTests() *Context { ctxt := &Context{} colls := strings.Split(*col, ",") for _, loc := range strings.Split(*locales, ",") { loc = strings.TrimSpace(loc) for _, name := range colls { name = strings.TrimSpace(name) col := getCollator(name, loc) ctxt.test = append(ctxt.test, &Test{ ctxt: ctxt, Locale: loc, ColName: name, UseCompare: *usecmp, Col: col, }) } } return ctxt } func (c *Context) Len() int { return len(c.test) } func (c *Context) Test(i int) *Test { if c.last != nil { c.last.clear() } c.last = c.test[i] return c.last } func parseInput(args []string) []Input { input := []Input{} for _, s := range args { rs := []rune{} for len(s) > 0 { var r rune r, _, s, _ = strconv.UnquoteChar(s, '\'') rs = append(rs, r) } s = string(rs) if *doNorm { s = norm.NFD.String(s) } input = append(input, makeInputString(s)) } return input } // A Command is an implementation of a colcmp command. type Command struct { Run func(cmd *Context, args []string) Usage string Short string Long string } func (cmd Command) Name() string { return strings.SplitN(cmd.Usage, " ", 2)[0] } var commands = []*Command{ cmdSort, cmdBench, cmdRegress, } const sortHelp = ` Sort sorts a given list of strings. Strings are separated by whitespace. ` var cmdSort = &Command{ Run: runSort, Usage: "sort *", Short: "sort a given list of strings", Long: sortHelp, } func runSort(ctxt *Context, args []string) { input := parseInput(args) if len(input) == 0 { log.Fatalf("Nothing to sort.") } if ctxt.Len() > 1 { ctxt.Print("COLL LOCALE RESULT\n") } for i := 0; i < ctxt.Len(); i++ { t := ctxt.Test(i) t.Input = append(t.Input, input...) t.Sort() if ctxt.Len() > 1 { ctxt.Printf("%-5s %-5s ", t.ColName, t.Locale) } for _, s := range t.Input { ctxt.Print(string(s.UTF8), " ") } ctxt.Print("\n") } } const benchHelp = ` Bench runs a benchmark for the given list of collator implementations. If no collator implementations are given, the go collator will be used. ` var cmdBench = &Command{ Run: runBench, Usage: "bench", Short: "benchmark a given list of collator implementations", Long: benchHelp, } func runBench(ctxt *Context, args []string) { ctxt.Printf("%-7s %-5s %-6s %-24s %-24s %-5s %s\n", "LOCALE", "COLL", "N", "KEYS", "SORT", "AVGLN", "TOTAL") for i := 0; i < ctxt.Len(); i++ { t := ctxt.Test(i) ctxt.Printf("%-7s %-5s ", t.Locale, t.ColName) t.GenerateInput() ctxt.Printf("%-6s ", fmt.Sprintf("%dK", t.Len()/1000)) tkey, tsort, nkey, nsort := t.Sort() p := func(dur time.Duration, n int) { s := "" if dur > 0 { s = fmt.Sprintf("%6.3fs ", dur.Seconds()) if n > 0 { s += fmt.Sprintf("%15s", fmt.Sprintf("(%4.2f ns/op)", float64(dur)/float64(n))) } } ctxt.Printf("%-24s ", s) } p(tkey, nkey) p(tsort, nsort) total := 0 for _, s := range t.Input { total += len(s.key) } ctxt.Printf("%-5d ", total/t.Len()) ctxt.Printf("%6.3fs\n", t.Duration.Seconds()) if *debug { for _, s := range t.Input { fmt.Print(string(s.UTF8), " ") } fmt.Println() } } } const regressHelp = ` Regress runs a monkey test by comparing the results of randomly generated tests between two implementations of a collator. The user may optionally pass a list of strings to regress against instead of the default test set. ` var cmdRegress = &Command{ Run: runRegress, Usage: "regress -gold= -test= [string]*", Short: "run a monkey test between two collators", Long: regressHelp, } const failedKeyCompare = ` %s:%d: incorrect comparison result for input: a: %q (%.4X) key: %s b: %q (%.4X) key: %s Compare(a, b) = %d; want %d. gold keys: a: %s b: %s ` const failedCompare = ` %s:%d: incorrect comparison result for input: a: %q (%.4X) b: %q (%.4X) Compare(a, b) = %d; want %d. ` func keyStr(b []byte) string { buf := &bytes.Buffer{} for _, v := range b { fmt.Fprintf(buf, "%.2X ", v) } return buf.String() } func runRegress(ctxt *Context, args []string) { input := parseInput(args) for i := 0; i < ctxt.Len(); i++ { t := ctxt.Test(i) if len(input) > 0 { t.Input = append(t.Input, input...) } else { t.GenerateInput() } t.Sort() count := 0 gold := getCollator(*gold, t.Locale) for i := 1; i < len(t.Input); i++ { ia := t.Input[i-1] ib := t.Input[i] if bytes.IndexAny(ib.UTF8, *exclude) != -1 { i++ continue } if bytes.IndexAny(ia.UTF8, *exclude) != -1 { continue } goldCmp := gold.Compare(ia, ib) if cmp := bytes.Compare(ia.key, ib.key); cmp != goldCmp { count++ a := string(ia.UTF8) b := string(ib.UTF8) fmt.Printf(failedKeyCompare, t.Locale, i-1, a, []rune(a), keyStr(ia.key), b, []rune(b), keyStr(ib.key), cmp, goldCmp, keyStr(gold.Key(ia)), keyStr(gold.Key(ib))) } else if cmp := t.Col.Compare(ia, ib); cmp != goldCmp { count++ a := string(ia.UTF8) b := string(ib.UTF8) fmt.Printf(failedCompare, t.Locale, i-1, a, []rune(a), b, []rune(b), cmp, goldCmp) } } if count > 0 { ctxt.Printf("Found %d inconsistencies in %d entries.\n", count, t.Len()-1) } } } const helpTemplate = ` colcmp is a tool for testing and benchmarking collation Usage: colcmp command [arguments] The commands are: {{range .}} {{.Name | printf "%-11s"}} {{.Short}}{{end}} Use "col help [topic]" for more information about that topic. ` const detailedHelpTemplate = ` Usage: colcmp {{.Usage}} {{.Long | trim}} ` func runHelp(args []string) { t := template.New("help") t.Funcs(template.FuncMap{"trim": strings.TrimSpace}) if len(args) < 1 { template.Must(t.Parse(helpTemplate)) failOnError(t.Execute(os.Stderr, &commands)) } else { for _, cmd := range commands { if cmd.Name() == args[0] { template.Must(t.Parse(detailedHelpTemplate)) failOnError(t.Execute(os.Stderr, cmd)) os.Exit(0) } } log.Fatalf("Unknown command %q. Run 'colcmp help'.", args[0]) } os.Exit(0) } func main() { flag.Parse() log.SetFlags(0) ctxt := parseTests() if flag.NArg() < 1 { runHelp(nil) } args := flag.Args()[1:] if flag.Arg(0) == "help" { runHelp(args) } for _, cmd := range commands { if cmd.Name() == flag.Arg(0) { cmd.Run(ctxt, args) ctxt.flush() return } } runHelp(flag.Args()) }