diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0b46bb5 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +shlex.test diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..4a17268 --- /dev/null +++ b/LICENSE @@ -0,0 +1,20 @@ +Copyright (c) anmitsu + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..c4ffe72 --- /dev/null +++ b/README.md @@ -0,0 +1,38 @@ +# go-shlex + +go-shlex is a Go library that provides a simple lexical analyzer which +splits text into words the way a Unix shell does.
+ +## Install + + go get -u "github.com/anmitsu/go-shlex" + +## Usage + +```go +package main + +import ( + "fmt" + "log" + + "github.com/anmitsu/go-shlex" +) + +func main() { + cmd := `cp -Rdp "file name" 'file name2' dir\ name` + words, err := shlex.Split(cmd, true) + if err != nil { + log.Fatal(err) + } + + for _, w := range words { + fmt.Println(w) + } +} +``` + +## Documentation + +http://godoc.org/github.com/anmitsu/go-shlex + diff --git a/example_test.go b/example_test.go new file mode 100644 index 0000000..6533f5e --- /dev/null +++ b/example_test.go @@ -0,0 +1,100 @@ +package shlex_test + +import ( + "fmt" + "log" + + "github.com/anmitsu/go-shlex" + flynn_shlex "github.com/flynn/go-shlex" +) + +func ExampleSplit() { + cmd := `cp -Rdp "file name" 'file name2' dir\ name` + + // Split of cmd with POSIX mode. + words1, err := shlex.Split(cmd, true) + if err != nil { + log.Fatal(err) + } + // Split of cmd with Non-POSIX mode. + words2, err := shlex.Split(cmd, false) + if err != nil { + log.Fatal(err) + } + + fmt.Println("Source command:") + fmt.Println(`cp -Rdp "file name" 'file name2' dir\ name`) + fmt.Println() + + fmt.Println("POSIX mode:") + for _, word := range words1 { + fmt.Println(word) + } + fmt.Println() + fmt.Println("Non-POSIX mode:") + for _, word := range words2 { + fmt.Println(word) + } + + // Output: + // Source command: + // cp -Rdp "file name" 'file name2' dir\ name + // + // POSIX mode: + // cp + // -Rdp + // file name + // file name2 + // dir name + // + // Non-POSIX mode: + // cp + // -Rdp + // "file name" + // 'file name2' + // dir\ + // name +} + +func ExampleSplit_compareFlynn() { + cmd := `English and 日本語` + + // Split for github.com/flynn/go-shlex imported as flynn_shlex + words_flynn, err1 := flynn_shlex.Split(cmd) + + // Split for github.com/anmitsu/go-shlex + words_anmitsu, err2 := shlex.Split(cmd, true) + + fmt.Println("Source string:") + fmt.Println(cmd) + fmt.Println() + + fmt.Println("Result of github.com/flynn/go-shlex:") + for 
// Errors reported when the input ends in the middle of a construct. The
// message texts are kept as they are because callers may display them.
var (
	// ErrNoClosing is returned when the input ends inside a quotation.
	ErrNoClosing = errors.New("No closing quotation")
	// ErrNoEscaped is returned when the input ends directly after an
	// escape character.
	ErrNoEscaped = errors.New("No escaped character")
)

// Tokenizer is the interface that classifies a token according to
// words, whitespaces, quotations, escapes and escaped quotations.
type Tokenizer interface {
	// IsWord reports whether r can be part of a word.
	IsWord(rune) bool
	// IsWhitespace reports whether r separates tokens.
	IsWhitespace(rune) bool
	// IsQuote reports whether r opens (and closes) a quotation.
	IsQuote(rune) bool
	// IsEscape reports whether r is the escape character.
	IsEscape(rune) bool
	// IsEscapedQuote reports whether r is a quotation character inside
	// which the escape character is honoured in POSIX mode.
	IsEscapedQuote(rune) bool
}

// DefaultTokenizer implements a simple tokenizer like Unix shell.
type DefaultTokenizer struct{}

// Compile-time check that *DefaultTokenizer satisfies Tokenizer.
var _ Tokenizer = (*DefaultTokenizer)(nil)

// IsWord reports whether r is an underscore, a letter or a number.
func (t *DefaultTokenizer) IsWord(r rune) bool {
	return r == '_' || unicode.IsLetter(r) || unicode.IsNumber(r)
}

// IsQuote reports whether r is a single or a double quote.
func (t *DefaultTokenizer) IsQuote(r rune) bool {
	return r == '\'' || r == '"'
}

// IsWhitespace reports whether r is a Unicode space character.
func (t *DefaultTokenizer) IsWhitespace(r rune) bool {
	return unicode.IsSpace(r)
}

// IsEscape reports whether r is a backslash.
func (t *DefaultTokenizer) IsEscape(r rune) bool {
	return r == '\\'
}

// IsEscapedQuote reports whether r is a double quote, the only quotation
// in which this tokenizer lets a backslash escape the next character.
func (t *DefaultTokenizer) IsEscapedQuote(r rune) bool {
	return r == '"'
}

// Lexer represents a lexical analyzer.
type Lexer struct {
	reader    *bufio.Reader
	tokenizer Tokenizer
	// posix selects POSIX rules: quotes are removed from tokens and
	// escape characters are interpreted.
	posix bool
	// whitespacesplit makes whitespace the only token separator. When
	// false, a rune that is neither word, quote nor whitespace becomes
	// a token of its own.
	whitespacesplit bool
}

// NewLexer creates a new Lexer reading from io.Reader. This Lexer
// has a DefaultTokenizer according to posix and whitespacesplit
// rules.
func NewLexer(r io.Reader, posix, whitespacesplit bool) *Lexer {
	return &Lexer{
		reader:          bufio.NewReader(r),
		tokenizer:       &DefaultTokenizer{},
		posix:           posix,
		whitespacesplit: whitespacesplit,
	}
}

// NewLexerString creates a new Lexer reading from a string. This
// Lexer has a DefaultTokenizer according to posix and whitespacesplit
// rules.
func NewLexerString(s string, posix, whitespacesplit bool) *Lexer {
	return NewLexer(strings.NewReader(s), posix, whitespacesplit)
}

// Split splits a string according to posix or non-posix rules. Tokens
// are separated by whitespace only.
func Split(s string, posix bool) ([]string, error) {
	return NewLexerString(s, posix, true).Split()
}

// SetTokenizer sets a Tokenizer.
func (l *Lexer) SetTokenizer(t Tokenizer) {
	l.tokenizer = t
}

// Split reads tokens until the end of the input and returns them.
//
// The returned slice is never nil. When reading stops because of an
// error (ErrNoClosing, ErrNoEscaped or a read error), the tokens gathered
// so far, including the non-empty text of the unfinished token, are
// returned together with that error.
//
// In POSIX mode an empty quotation such as "" or '' yields an empty
// token.
func (l *Lexer) Split() ([]string, error) {
	result := make([]string, 0)
	for {
		token, ok, err := l.readToken()
		if ok {
			result = append(result, token)
		}
		if err == io.EOF {
			return result, nil
		}
		if err != nil {
			return result, err
		}
	}
}

// readToken scans the next token from the reader.
//
// It returns the token text, whether that text is a token that must be
// emitted, and an error. The flag is needed because an empty string is a
// legitimate token in POSIX mode when it was written as an empty
// quotation, whereas an empty string produced by skipping whitespace or
// by reaching the end of the input is not a token. The error is io.EOF at
// the end of the input, ErrNoClosing or ErrNoEscaped when reading fails
// inside a quotation or right after an escape character, or the read
// error otherwise.
func (l *Lexer) readToken() (string, bool, error) {
	t := l.tokenizer
	var token strings.Builder
	quoted := false
	// state is ' ' before a token starts, 'a' inside a word, or the
	// quote/escape rune that is currently open.
	state := ' '
	// escapedstate is the state to return to after an escaped rune.
	escapedstate := ' '

	// emit reports whether the text gathered so far forms a token: any
	// non-empty text, or an empty quotation in POSIX mode.
	emit := func() bool {
		return token.Len() > 0 || (l.posix && quoted)
	}

scanning:
	for {
		next, _, err := l.reader.ReadRune()
		if err != nil {
			switch {
			case t.IsQuote(state):
				return token.String(), token.Len() > 0, ErrNoClosing
			case t.IsEscape(state):
				return token.String(), token.Len() > 0, ErrNoEscaped
			case err == io.EOF:
				return token.String(), emit(), err
			default:
				return token.String(), token.Len() > 0, err
			}
		}

		switch {
		case t.IsWhitespace(state):
			// Nothing has been gathered yet in this state.
			switch {
			case t.IsWhitespace(next):
				break scanning
			case l.posix && t.IsEscape(next):
				escapedstate = 'a'
				state = next
			case t.IsWord(next):
				token.WriteRune(next)
				state = 'a'
			case t.IsQuote(next):
				if !l.posix {
					token.WriteRune(next)
				}
				state = next
			default:
				token.WriteRune(next)
				if !l.whitespacesplit {
					// Punctuation is a token of its own.
					break scanning
				}
				state = 'a'
			}
		case t.IsQuote(state):
			quoted = true
			switch {
			case next == state:
				if !l.posix {
					token.WriteRune(next)
					break scanning
				}
				state = 'a'
			case l.posix && t.IsEscape(next) && t.IsEscapedQuote(state):
				escapedstate = state
				state = next
			default:
				token.WriteRune(next)
			}
		case t.IsEscape(state):
			// Inside a quotation the escape rune itself is kept unless
			// it escapes the escape rune or the enclosing quote.
			if t.IsQuote(escapedstate) && next != state && next != escapedstate {
				token.WriteRune(state)
			}
			token.WriteRune(next)
			state = escapedstate
		case t.IsWord(state):
			switch {
			case t.IsWhitespace(next):
				if emit() {
					break scanning
				}
			case l.posix && t.IsQuote(next):
				state = next
			case l.posix && t.IsEscape(next):
				escapedstate = 'a'
				state = next
			case t.IsWord(next) || t.IsQuote(next):
				token.WriteRune(next)
			default:
				if l.whitespacesplit {
					token.WriteRune(next)
				} else if emit() {
					// Push the punctuation back so the next call reads
					// it as its own token. UnreadRune cannot fail here
					// because the preceding operation was ReadRune.
					_ = l.reader.UnreadRune()
					break scanning
				}
			}
		}
	}
	return token.String(), emit(), nil
}
`\`, + "e", + "Character", + "not", + "in", + "quotes", + }, + nil, + }, + {`Escaped "\e" Character in double quotes`, + []string{ + "Escaped", + `"\e"`, + "Character", + "in", + "double", + "quotes", + }, + nil, + }, + {`Escaped '\e' Character in single quotes`, + []string{ + "Escaped", + `'\e'`, + "Character", + "in", + "single", + "quotes", + }, + nil, + }, + {`Escaped '\'' \"\'\" single quote`, + []string{ + "Escaped", + `'\'`, + `' \"\'`, + `\`, + `" single quote`, + }, + ErrNoClosing, + }, + {`Escaped "\"" \'\"\' double quote`, + []string{ + "Escaped", + `"\"`, + `" \'\"`, + `\`, + `' double quote`, + }, + ErrNoClosing, + }, + {`"'Strip extra layer of quotes'"`, + []string{`"'Strip extra layer of quotes'"`}, + nil, + }, +} + +var dataposix = []struct { + in string + out []string + err error +}{ + {`This string has an embedded apostrophe, doesn't it?`, + []string{ + "This", + "string", + "has", + "an", + "embedded", + "apostrophe", + ",", + "doesnt it?", + }, + ErrNoClosing, + }, + {"This string has embedded \"double quotes\" and 'single quotes' in it,\nand even \"a 'nested example'\".\n", + []string{ + "This", + "string", + "has", + "embedded", + `double quotes`, + "and", + `single quotes`, + "in", + "it", + ",", + "and", + "even", + `a 'nested example'`, + ".", + }, + nil, + }, + {`Hello world!, こんにちは 世界!`, + []string{ + "Hello", + "world", + "!", + ",", + "こんにちは", + "世界", + "!", + }, + nil, + }, + {`Do"Not"Separate`, + []string{`DoNotSeparate`}, + nil, + }, + {`"Do"Separate`, + []string{"DoSeparate"}, + nil, + }, + {`Escaped \e Character not in quotes`, + []string{ + "Escaped", + "e", + "Character", + "not", + "in", + "quotes", + }, + nil, + }, + {`Escaped "\e" Character in double quotes`, + []string{ + "Escaped", + `\e`, + "Character", + "in", + "double", + "quotes", + }, + nil, + }, + {`Escaped '\e' Character in single quotes`, + []string{ + "Escaped", + `\e`, + "Character", + "in", + "single", + "quotes", + }, + nil, + }, + {`Escaped '\'' \"\'\" single 
quote`, + []string{ + "Escaped", + `\ \"\"`, + "single", + "quote", + }, + nil, + }, + {`Escaped "\"" \'\"\' double quote`, + []string{ + "Escaped", + `"`, + `'"'`, + "double", + "quote", + }, + nil, + }, + {`"'Strip extra layer of quotes'"`, + []string{`'Strip extra layer of quotes'`}, + nil, + }, +} + +func TestSplitNonPOSIX(t *testing.T) { + testSplit(t, false) +} + +func TestSplitPOSIX(t *testing.T) { + testSplit(t, true) +} + +func testSplit(t *testing.T, posix bool) { + var data []struct { + in string + out []string + err error + } + if posix { + data = dataposix + } else { + data = datanonposix + } + + for _, d := range data { + t.Logf("Spliting: `%s'", d.in) + + result, err := NewLexerString(d.in, posix, false).Split() + + // check closing and escaped error + if err != d.err { + printToken(result) + t.Fatalf("Error expected: `%v', but result catched: `%v'", + d.err, err) + } + + // check splited number + if len(result) != len(d.out) { + printToken(result) + t.Fatalf("Split expeced: `%d', but result founds: `%d'", + len(d.out), len(result)) + } + + // check words + for j, out := range d.out { + if result[j] != out { + printToken(result) + t.Fatalf("Word expeced: `%s', but result founds: `%s' in %d", + out, result[j], j) + } + } + t.Log("ok") + } +} + +func printToken(s []string) { + for _, token := range s { + fmt.Println(token) + } +}