// libchess/pkg/pgn/lexer.go

package pgn

import (
	"bufio"
	"bytes"
	"fmt"
)

// EOF signals the end of the input stream.
const EOF = -1

// TokenType identifies the type of a token.
type TokenType uint64

// The following token types exist:
const (
	TokenInvalid TokenType = iota
	TokenEOF
	TokenDiv
	TokenNewline
	TokenWhitespace
	TokenComment
	TokenString
	TokenBracketLeft
	TokenBracketRight
	TokenParenthesisLeft
	TokenParenthesisRight
	TokenAngleLeft
	TokenAngleRight
	TokenSymbol
	TokenEscapeMechanism
)

var tokenName = map[TokenType]string{
	TokenInvalid:          "INVALID",
	TokenEOF:              "EOF",
	TokenDiv:              "Div",
	TokenNewline:          "Newline",
	TokenWhitespace:       "Whitespace",
	TokenComment:          "Comment",
	TokenString:           "String",
	TokenBracketLeft:      "BracketLeft",
	TokenBracketRight:     "BracketRight",
	TokenParenthesisLeft:  "ParenthesisLeft",
	TokenParenthesisRight: "ParenthesisRight",
	TokenAngleLeft:        "AngleLeft",
	TokenAngleRight:       "AngleRight",
	TokenSymbol:           "Symbol",
	TokenEscapeMechanism:  "EscapeMechanism",
}

// Token represents a single PGN token together with its position in the input.
type Token struct {
	Line  int // 1-based line number
	Col   int // 1-based column of the token's first character
	Type  TokenType
	Value string
}

func (t Token) String() string {
	return fmt.Sprintf(
		"<Token%s(Line: %d, Col: %d, Value: %q)>",
		tokenName[t.Type],
		t.Line,
		t.Col,
		t.Value,
	)
}
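
// Illustrative output for a tag-value token (positions assume the input from
// the usage sketch near All below):
//
//	<TokenString(Line: 1, Col: 8, Value: "F/S Return Match")>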

// LexFn is a lexer state function: it consumes input, emits tokens, and
// returns the next state, or nil to stop the lexer.
type LexFn func(*Lexer) LexFn

// Lexer implements a PGN tokenizer.
type Lexer struct {
	input  *bufio.Reader
	output chan *Token
	err    chan error
	line   int // current 1-based line
	start  int // column at which the current token started
	pos    int // column of the next rune to be read
}

// NewLexer returns an initialized Lexer and starts the tokenizer in a
// background goroutine.
func NewLexer(input *bufio.Reader) *Lexer {
	l := &Lexer{
		input:  input,
		output: make(chan *Token, 1),
		err:    make(chan error, 1),
		line:   1,
		start:  1,
		pos:    1,
	}
	go l.run()
	return l
}

// run drives the state machine until a state function returns nil, then
// closes the output channels. It runs in its own goroutine, started by
// NewLexer.
func (l *Lexer) run() {
	defer close(l.output)
	defer close(l.err)
	for fn := lexMain; fn != nil; {
		fn = fn(l)
	}
}

// Next returns the next token from the input; the final token has type
// TokenEOF. It blocks until a token or an error is available. Once the lexer
// has shut down and its channels are closed, Next returns a nil Token and a
// nil error.
func (l *Lexer) Next() (*Token, error) {
	select {
	case err := <-l.err:
		return nil, err
	case t := <-l.output:
		return t, nil
	}
}

// All consumes the entire input and returns every token up to, but not
// including, the terminating TokenEOF.
func (l *Lexer) All() ([]*Token, error) {
	out := []*Token{}
	for {
		t, err := l.Next()
		if err != nil {
			return out, err
		}
		if t == nil || t.Type == TokenEOF {
			return out, nil
		}
		out = append(out, t)
	}
}
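
// A minimal usage sketch (the input and the token listing are illustrative):
//
//	r := bufio.NewReader(strings.NewReader(`[Event "F/S Return Match"]`))
//	tokens, err := NewLexer(r).All()
//	if err != nil {
//		// handle the lexing error
//	}
//	// tokens: BracketLeft, Symbol("Event"), Whitespace,
//	// String("F/S Return Match"), BracketRight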

// next reads and returns the next rune from the input, advancing the column
// counter. It returns EOF once the input is exhausted.
func (l *Lexer) next() rune {
	r, _, err := l.input.ReadRune()
	if err != nil {
		return EOF
	}
	l.pos++
	return r
}

// undo pushes the most recently read rune back onto the input and rewinds
// the column counter. It must only be called after a successful next.
func (l *Lexer) undo() {
	// UnreadRune cannot fail here: undo is only called directly after a
	// successful ReadRune.
	l.input.UnreadRune()
	l.pos--
}

// peek returns the next rune without consuming it.
func (l *Lexer) peek() rune {
	r := l.next()
	if r != EOF {
		l.undo()
	}
	return r
}

// newToken creates a token of the given type at the current token start
// position.
func (l *Lexer) newToken(t TokenType, v string) *Token {
	return &Token{
		Line:  l.line,
		Col:   l.start,
		Type:  t,
		Value: v,
	}
}

// emit hands a token to the consumer and marks the start of the next token.
func (l *Lexer) emit(t *Token) {
	l.output <- t
	l.start = l.pos
}

// emitUnexpected reports an unexpected character as an error and stops the
// state machine by returning nil.
func (l *Lexer) emitUnexpected(r rune) LexFn {
	l.err <- fmt.Errorf(
		"unexpected character in line %d at col %d: %q",
		l.line,
		l.pos,
		r,
	)
	return nil
}

////////////////
//// LEXERS ////
////////////////

// lexMain is the entry state: it dispatches on the next rune to the
// appropriate specialized state function.
func lexMain(l *Lexer) LexFn {
	for {
		r := l.next()
		switch r {
		case EOF:
			l.emit(l.newToken(TokenEOF, "EOF"))
			return nil
		case '\n':
			return lexNewline
		case ' ':
			return lexWhitespace
		case '%':
			// The escape mechanism is only valid as the first character of a line.
			if l.pos == 2 {
				return lexEscape
			}
			return l.emitUnexpected(r)
		case ';':
			return lexCommentUntilNewline
		case '{':
			return lexComment
		case '[':
			l.emit(l.newToken(TokenBracketLeft, "["))
		case ']':
			l.emit(l.newToken(TokenBracketRight, "]"))
		case '(':
			l.emit(l.newToken(TokenParenthesisLeft, "("))
		case ')':
			l.emit(l.newToken(TokenParenthesisRight, ")"))
		case '<':
			l.emit(l.newToken(TokenAngleLeft, "<"))
		case '>':
			l.emit(l.newToken(TokenAngleRight, ">"))
		case '"':
			return lexString
		default:
			l.undo()
			return lexSymbol
		}
	}
}

// lexNewline collapses a run of newlines into a single Newline token and
// resets the position counters for the following line.
func lexNewline(l *Lexer) LexFn {
	out := bytes.NewBuffer(make([]byte, 0, 255))
	out.WriteRune('\n')
	for {
		switch r := l.next(); r {
		case '\n':
			out.WriteRune('\n')
		default:
			if r != EOF {
				l.undo()
			}
			l.emit(l.newToken(TokenNewline, out.String()))
			l.line += out.Len()
			l.start = 1
			l.pos = 1
			return lexMain
		}
	}
}

// lexWhitespace collapses a run of spaces into a single Whitespace token.
func lexWhitespace(l *Lexer) LexFn {
	out := bytes.NewBuffer(make([]byte, 0, 255))
	out.WriteRune(' ')
	for {
		switch r := l.next(); r {
		case ' ':
			out.WriteRune(' ')
		default:
			if r != EOF {
				l.undo()
			}
			l.emit(l.newToken(TokenWhitespace, out.String()))
			return lexMain
		}
	}
}

// lexEscape discards the remainder of the current line: a '%' as the first
// character of a line hides that line from the lexer (PGN escape mechanism).
func lexEscape(l *Lexer) LexFn {
	for {
		switch l.next() {
		case EOF:
			return lexMain
		case '\n':
			// Leave the newline for lexNewline so line accounting stays correct.
			l.undo()
			return lexMain
		}
	}
}

// lexCommentUntilNewline scans a ';' comment, which runs to the end of the
// line.
func lexCommentUntilNewline(l *Lexer) LexFn {
	out := bytes.NewBuffer(make([]byte, 0, 8192))
	for {
		r := l.next()
		switch r {
		case EOF, '\n':
			if r == '\n' {
				// Leave the newline for lexNewline so line accounting stays correct.
				l.undo()
			}
			if out.Len() > 0 {
				l.emit(l.newToken(TokenComment, out.String()))
			}
			return lexMain
		default:
			// bytes.Buffer.WriteRune never returns a non-nil error.
			out.WriteRune(r)
		}
	}
}

// lexComment scans a brace comment ("{...}"); a backslash escapes the next
// character. The surrounding braces are not part of the token value.
func lexComment(l *Lexer) LexFn {
	out := bytes.NewBuffer(make([]byte, 0, 8192))
	for {
		r := l.next()
		switch r {
		case EOF, '}':
			l.emit(l.newToken(TokenComment, out.String()))
			return lexMain
		case '\\':
			out.WriteRune(l.next())
		case '\n':
			// Track newlines inside multi-line comments so the positions of
			// subsequent tokens stay correct.
			out.WriteRune('\n')
			l.line++
			l.pos = 1
		default:
			out.WriteRune(r)
		}
	}
}

// lexString scans a quoted string token; a backslash escapes the next
// character. The surrounding quotes are not part of the token value, and an
// unterminated string is an error.
func lexString(l *Lexer) LexFn {
	out := bytes.NewBuffer(make([]byte, 0, 4096))
	for {
		r := l.next()
		switch r {
		case EOF:
			return l.emitUnexpected(r)
		case '\\':
			out.WriteRune(l.next())
		case '"':
			l.emit(l.newToken(TokenString, out.String()))
			return lexMain
		default:
			out.WriteRune(r)
		}
	}
}

// lexSymbol scans a run of characters up to the next delimiter and emits it
// as a Symbol token.
func lexSymbol(l *Lexer) LexFn {
	out := bytes.NewBuffer(make([]byte, 0, 255))
	for {
		r := l.next()
		switch r {
		case EOF:
			l.emit(l.newToken(TokenSymbol, out.String()))
			return lexMain
		case '\n', ' ', '"', '[', ']', '(', ')', '<', '>', '{', ';':
			// Delimiters recognized by lexMain terminate a symbol.
			l.undo()
			l.emit(l.newToken(TokenSymbol, out.String()))
			return lexMain
		default:
			out.WriteRune(r)
		}
	}
}