libchess/pkg/pgn/move/lexer.go

package move

import (
	"bufio"
	"bytes"
	"fmt"
	"strings"

	"code.c-base.org/gochess/libchess/pkg/board"
)

// tokenFactory hands out fresh Token values. A background goroutine keeps the
// buffered channel topped up so newToken never has to allocate inline.
var tokenFactory = make(chan *Token, 128)

func init() {
	go func() {
		for {
			tokenFactory <- &Token{}
		}
	}()
}

// TokenType defines the type of a token.
type TokenType uint8

// The following TokenTypes exist.
const (
	TokenError   TokenType = iota // unexpected input
	TokenEOF                      // end of the move
	TokenPiece                    // piece letter: K, Q, B, N or R
	TokenFile                     // file letter: a-h
	TokenRank                     // rank digit: 1-8
	TokenCapture                  // capture marker: x
	TokenSquare                   // combined file and rank, e.g. e4
	TokenCheck                    // check marker: +
	TokenMate                     // mate marker: #
	TokenCastles                  // castling notation: O-O or O-O-O
)
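
// For example, the SAN move "Nxe4+" lexes into the token stream
// TokenPiece "N", TokenCapture "x", TokenSquare "e4", TokenCheck "+",
// TokenEOF.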

// eof signals the end of a move.
const eof = -1

// Token represents a move token.
type Token struct {
	Pos   int       // character column of this token
	Type  TokenType // type (see above)
	Value string    // literal value
}

// Lexer implements a lexer for tokenizing PGN formatted moves.
type Lexer struct {
	input  *bufio.Reader // buffered io for streaming the input
	tokens chan Token    // output channel
	start  int           // starting position of the current token
	pos    int           // current scanning position
	buf    *Token        // buffered TokenFile, pending a possible TokenRank
}

// NewLexer returns an initialized Lexer and starts scanning the input in a
// background goroutine.
func NewLexer(input string) *Lexer {
	l := &Lexer{
		input:  bufio.NewReader(strings.NewReader(input)),
		start:  1,
		pos:    1,
		tokens: make(chan Token, 1),
	}
	go l.scan()
	return l
}

// NextToken returns the next token from the input string.
func (l *Lexer) NextToken() Token {
	return <-l.tokens
}
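
// A minimal usage sketch: drain the lexer until TokenEOF or TokenError.
//
//	l := NewLexer("Nxe4+")
//	for {
//		t := l.NextToken()
//		fmt.Printf("%v %q\n", t.Type, t.Value)
//		if t.Type == TokenEOF || t.Type == TokenError {
//			break
//		}
//	}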

// emit sends the given token to the output channel.
//
// A token of type TokenFile ([a-h]) is buffered and compared to the next
// token: if that one is of type TokenRank ([1-8]), the two are combined into
// a single token of type TokenSquare.
func (l *Lexer) emit(t Token) {
	if l.buf == nil {
		// Buffer a TokenFile so it can be combined with a following TokenRank.
		if t.Type == TokenFile {
			l.buf = &t
			return
		}
		l.tokens <- t
		l.start = l.pos
		return
	}
	// Grab the buffered token.
	prev := l.buf
	l.buf = nil
	switch t.Type {
	case TokenRank:
		// TokenFile followed by TokenRank combines into TokenSquare.
		strSq := fmt.Sprintf("%s%s", prev.Value, t.Value)
		if _, ok := board.StrToSquareMap[strSq]; ok {
			l.tokens <- Token{
				Pos:   prev.Pos,
				Type:  TokenSquare,
				Value: strSq,
			}
		} else {
			// Technically unreachable, but handled anyway, just in case.
			l.tokens <- *prev
			l.tokens <- t
		}
	case TokenFile:
		// Two files in a row (e.g. the disambiguating file in "Rae1"): emit
		// the buffered one and buffer the new one.
		l.tokens <- *prev
		l.buf = &t
	default:
		// Any other token (e.g. the capture in "exd4"): emit both, in order,
		// instead of silently dropping them.
		l.tokens <- *prev
		l.tokens <- t
	}
	l.start = l.pos
}
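
// Buffering trace for "exd4": "e" (TokenFile) is held back; when "x"
// (TokenCapture) arrives, both are emitted in order; "d" is then buffered and
// combined with "4" into TokenSquare "d4".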

// next reads the next rune from the buffered input stream, returning eof once
// the input is exhausted.
func (l *Lexer) next() rune {
	r, _, err := l.input.ReadRune()
	if err != nil {
		return eof
	}
	l.pos++
	return r
}

// undo puts the most recently read rune back onto the input stream.
func (l *Lexer) undo() {
	// A successful ReadRune always precedes undo, so UnreadRune cannot fail.
	_ = l.input.UnreadRune()
	l.pos--
}

// newToken is a helper for easily initializing Tokens with the correct values.
func (l *Lexer) newToken(tokType TokenType, v string) Token {
	t := <-tokenFactory
	t.Pos = l.start
	t.Type = tokType
	t.Value = v
	return *t
}

// scan scans for tokens and emits them to the output channel until the end of
// the input stream is reached.
func (l *Lexer) scan() {
	defer close(l.tokens)
	for {
		r := l.next()
		switch r {
		case eof:
			l.emit(l.newToken(TokenEOF, "eof"))
			return
		case 'O', '0':
			l.undo()
			m := lexCastles(l)
			if m == "" {
				l.emit(l.newToken(TokenError, "invalid castling notation"))
				return
			}
			l.emit(l.newToken(TokenCastles, m))
		case 'K', 'Q', 'B', 'N', 'R':
			l.emit(l.newToken(TokenPiece, string(r)))
		case 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h':
			l.emit(l.newToken(TokenFile, string(r)))
		case '1', '2', '3', '4', '5', '6', '7', '8':
			l.emit(l.newToken(TokenRank, string(r)))
		case '+':
			l.emit(l.newToken(TokenCheck, string(r)))
		case '#':
			l.emit(l.newToken(TokenMate, string(r)))
		case 'x':
			l.emit(l.newToken(TokenCapture, string(r)))
		case '=':
			// Promotion marker: skip it; the promoted piece that follows is
			// emitted as a regular TokenPiece.
		default:
			l.emit(l.newToken(TokenError, string(r)))
			return
		}
	}
}

// lexCastles consumes castling notation and returns it normalized to the
// letter form: "O-O" for kingside, "O-O-O" for queenside. The zero-based
// variants "0-0" and "0-0-0" are accepted as input. It returns "" if the
// consumed input is not valid castling notation.
func lexCastles(l *Lexer) string {
	var out bytes.Buffer
	for out.Len() < 5 {
		r := l.next()
		if r != 'O' && r != '0' && r != '-' {
			// Not part of the castling notation; hand the rune back to scan.
			if r != eof {
				l.undo()
			}
			break
		}
		out.WriteRune(r)
	}
	switch out.String() {
	case "O-O", "0-0":
		return "O-O"
	case "O-O-O", "0-0-0":
		return "O-O-O"
	default:
		return ""
	}
}
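
// A quick sanity trace (this would normally live in lexer_test.go; shown here
// as an illustrative sketch):
//
//	l := NewLexer("O-O-O#")
//	l.NextToken() // TokenCastles "O-O-O"
//	l.NextToken() // TokenMate "#"
//	l.NextToken() // TokenEOF "eof"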