Frames

Lexical scanning in Go templates

0
1
2
3
4
5
6
7
1// Copyright 2011 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package parse
6
7import (
8 "fmt"
9 "strings"
10 "unicode"
11 "unicode/utf8"
12)
13
14// item represents a token or text string returned from the scanner.
// Items are the values the lexer sends on its items channel (see emit and
// errorf) and that nextItem hands to the caller.
15type item struct {
16 typ itemType // The type of this item.
17 pos Pos // The starting position, in bytes, of this item in the input string.
18 val string // The value of this item; for itemError it is the error text.
19 line int // The line number at the start of this item.
20}
21
22func (i item) String() string {
23 switch {
24 case i.typ == itemEOF:
25 return "EOF"
26 case i.typ == itemError:
27 return i.val
28 case i.typ > itemKeyword:
29 return fmt.Sprintf("<%s>", i.val)
30 case len(i.val) > 10:
31 return fmt.Sprintf("%.10q...", i.val)
32 }
33 return fmt.Sprintf("%q", i.val)
34}
35
36// itemType identifies the type of lex items.
37type itemType int
38
// The order of these constants is significant: item.String and lexIdentifier
// treat every value greater than itemKeyword as a keyword, so keyword types
// must stay after itemKeyword and all other types before it.
39const (
40 itemError itemType = iota // error occurred; value is text of error
41 itemBool // boolean constant
42 itemChar // printable ASCII character; grab bag for comma etc.
43 itemCharConstant // character constant
44 itemComplex // complex constant (1+2i); imaginary is just a number
45 itemAssign // equals ('=') introducing an assignment
46 itemDeclare // colon-equals (':=') introducing a declaration
47 itemEOF
48 itemField // alphanumeric identifier starting with '.'
49 itemIdentifier // alphanumeric identifier not starting with '.'
50 itemLeftDelim // left action delimiter
51 itemLeftParen // '(' inside action
52 itemNumber // simple number, including imaginary
53 itemPipe // pipe symbol
54 itemRawString // raw quoted string (includes quotes)
55 itemRightDelim // right action delimiter
56 itemRightParen // ')' inside action
57 itemSpace // run of spaces separating arguments
58 itemString // quoted string (includes quotes)
59 itemText // plain text
60 itemVariable // variable starting with '$', such as '$' or '$1' or '$hello'
61 // Keywords appear after all the rest.
62 itemKeyword // used only to delimit the keywords
63 itemBlock // block keyword
64 itemDot // the cursor, spelled '.'
65 itemDefine // define keyword
66 itemElse // else keyword
67 itemEnd // end keyword
68 itemIf // if keyword
69 itemNil // the untyped nil constant, easiest to treat as a keyword
70 itemRange // range keyword
71 itemTemplate // template keyword
72 itemWith // with keyword
73)
74
75var key = map[string]itemType{
76 ".": itemDot,
77 "block": itemBlock,
78 "define": itemDefine,
79 "else": itemElse,
80 "end": itemEnd,
81 "if": itemIf,
82 "range": itemRange,
83 "nil": itemNil,
84 "template": itemTemplate,
85 "with": itemWith,
86}
87
// eof is the sentinel that next returns once the read position has reached
// the end of the input.
88const eof = -1
89
90// Trimming spaces.
91// If the action begins "{{- " rather than "{{", then all space/tab/newlines
92// preceding the action are trimmed; conversely if it ends " -}}" the
93// leading spaces are trimmed. This is done entirely in the lexer; the
94// parser never sees it happen. We require an ASCII space to be
95// present to avoid ambiguity with things like "{{-3}}". It reads
96// better with the space present anyway. For simplicity, only ASCII
97// space does the job.
98const (
99 spaceChars = " \t\r\n" // These are the space characters defined by Go itself.
100 leftTrimMarker = "- " // Attached to left delimiter, trims trailing spaces from preceding text.
101 rightTrimMarker = " -" // Attached to right delimiter, trims leading spaces from following text.
102 trimMarkerLen = Pos(len(leftTrimMarker)) // Byte length of a trim marker; both markers have the same length.
103)
104
105// stateFn represents the state of the scanner as a function that returns the next state.
// A nil stateFn ends the loop in run, which then closes the items channel.
106type stateFn func(*lexer) stateFn
107
108// lexer holds the state of the scanner.
109type lexer struct {
110 name string // the name of the input; used only for error reports
111 input string // the string being scanned
112 leftDelim string // start of action
113 rightDelim string // end of action
114 trimRightDelim string // end of action with trim marker (lex sets it to rightTrimMarker + rightDelim)
115 pos Pos // current position in the input
116 start Pos // start position of this item
117 width Pos // width of last rune read from input; backup subtracts it from pos
118 items chan item // channel of scanned items (created unbuffered by lex, closed by run)
119 parenDepth int // nesting depth of ( ) exprs
120 line int // 1+number of newlines seen
121 startLine int // start line of this item
122}
123
124// next returns the next rune in the input.
125func (l *lexer) next() rune {
126 if int(l.pos) >= len(l.input) {
127 l.width = 0
128 return eof
129 }
130 r, w := utf8.DecodeRuneInString(l.input[l.pos:])
131 l.width = Pos(w)
132 l.pos += l.width
133 if r == '\n' {
134 l.line++
135 }
136 return r
137}
138
139// peek returns but does not consume the next rune in the input.
140func (l *lexer) peek() rune {
141 r := l.next()
142 l.backup()
143 return r
144}
145
146// backup steps back one rune. Can only be called once per call of next.
147func (l *lexer) backup() {
148 l.pos -= l.width
149 // Correct newline count.
150 if l.width == 1 && l.input[l.pos] == '\n' {
151 l.line--
152 }
153}
154
155// emit passes an item back to the client.
156func (l *lexer) emit(t itemType) {
157 l.items <- item{t, l.start, l.input[l.start:l.pos], l.startLine}
158 l.start = l.pos
159 l.startLine = l.line
160}
161
162// ignore skips over the pending input before this point.
163func (l *lexer) ignore() {
164 l.line += strings.Count(l.input[l.start:l.pos], "\n")
165 l.start = l.pos
166 l.startLine = l.line
167}
168
169// accept consumes the next rune if it's from the valid set.
170func (l *lexer) accept(valid string) bool {
171 if strings.ContainsRune(valid, l.next()) {
172 return true
173 }
174 l.backup()
175 return false
176}
177
178// acceptRun consumes a run of runes from the valid set.
179func (l *lexer) acceptRun(valid string) {
180 for strings.ContainsRune(valid, l.next()) {
181 }
182 l.backup()
183}
184
185// errorf returns an error token and terminates the scan by passing
186// back a nil pointer that will be the next state, terminating l.nextItem.
187func (l *lexer) errorf(format string, args ...interface{}) stateFn {
188 l.items <- item{itemError, l.start, fmt.Sprintf(format, args...), l.startLine}
189 return nil
190}
191
192// nextItem returns the next item from the input.
193// Called by the parser, not in the lexing goroutine.
194func (l *lexer) nextItem() item {
195 return <-l.items
196}
197
198// drain drains the output so the lexing goroutine will exit.
199// Called by the parser, not in the lexing goroutine.
200func (l *lexer) drain() {
201 for range l.items {
202 }
203}
204
205// lex creates a new scanner for the input string.
206func lex(name, input, left, right string) *lexer {
207 if left == "" {
208 left = leftDelim
209 }
210 if right == "" {
211 right = rightDelim
212 }
213 l := &lexer{
214 name: name,
215 input: input,
216 leftDelim: left,
217 rightDelim: right,
218 trimRightDelim: rightTrimMarker + right,
219 items: make(chan item),
220 line: 1,
221 startLine: 1,
222 }
223 go l.run()
224 return l
225}
226
227// run runs the state machine for the lexer.
228func (l *lexer) run() {
229 for state := lexText; state != nil; {
230 state = state(l)
231 }
232 close(l.items)
233}
234
235// state functions
236
// leftDelim and rightDelim are the defaults that lex substitutes when it is
// given empty delimiter strings.
237const (
238 leftDelim = "{{"
239 rightDelim = "}}"
240 leftComment = "/*"
241 rightComment = "*/"
242)
243
244// lexText scans until an opening action delimiter, "{{".
245func lexText(l *lexer) stateFn {
246 l.width = 0
247 if x := strings.Index(l.input[l.pos:], l.leftDelim); x >= 0 {
248 ldn := Pos(len(l.leftDelim))
249 l.pos += Pos(x)
250 trimLength := Pos(0)
251 if strings.HasPrefix(l.input[l.pos+ldn:], leftTrimMarker) {
252 trimLength = rightTrimLength(l.input[l.start:l.pos])
253 }
254 l.pos -= trimLength
255 if l.pos > l.start {
256 l.line += strings.Count(l.input[l.start:l.pos], "\n")
257 l.emit(itemText)
258 }
259 l.pos += trimLength
260 l.ignore()
261 return lexLeftDelim
262 }
263 l.pos = Pos(len(l.input))
264 // Correctly reached EOF.
265 if l.pos > l.start {
266 l.line += strings.Count(l.input[l.start:l.pos], "\n")
267 l.emit(itemText)
268 }
269 l.emit(itemEOF)
270 return nil
271}
272
273// rightTrimLength returns the length of the spaces at the end of the string.
274func rightTrimLength(s string) Pos {
275 return Pos(len(s) - len(strings.TrimRight(s, spaceChars)))
276}
277
278// atRightDelim reports whether the lexer is at a right delimiter, possibly preceded by a trim marker.
279func (l *lexer) atRightDelim() (delim, trimSpaces bool) {
280 if strings.HasPrefix(l.input[l.pos:], l.trimRightDelim) { // With trim marker.
281 return true, true
282 }
283 if strings.HasPrefix(l.input[l.pos:], l.rightDelim) { // Without trim marker.
284 return true, false
285 }
286 return false, false
287}
288
289// leftTrimLength returns the length of the spaces at the beginning of the string.
290func leftTrimLength(s string) Pos {
291 return Pos(len(s) - len(strings.TrimLeft(s, spaceChars)))
292}
293
294// lexLeftDelim scans the left delimiter, which is known to be present, possibly with a trim marker.
295func lexLeftDelim(l *lexer) stateFn {
296 l.pos += Pos(len(l.leftDelim))
297 trimSpace := strings.HasPrefix(l.input[l.pos:], leftTrimMarker)
298 afterMarker := Pos(0)
299 if trimSpace {
300 afterMarker = trimMarkerLen
301 }
302 if strings.HasPrefix(l.input[l.pos+afterMarker:], leftComment) {
303 l.pos += afterMarker
304 l.ignore()
305 return lexComment
306 }
307 l.emit(itemLeftDelim)
308 l.pos += afterMarker
309 l.ignore()
310 l.parenDepth = 0
311 return lexInsideAction
312}
313
314// lexComment scans a comment. The left comment marker is known to be present.
315func lexComment(l *lexer) stateFn {
316 l.pos += Pos(len(leftComment))
317 i := strings.Index(l.input[l.pos:], rightComment)
318 if i < 0 {
319 return l.errorf("unclosed comment")
320 }
321 l.pos += Pos(i + len(rightComment))
322 delim, trimSpace := l.atRightDelim()
323 if !delim {
324 return l.errorf("comment ends before closing delimiter")
325 }
326 if trimSpace {
327 l.pos += trimMarkerLen
328 }
329 l.pos += Pos(len(l.rightDelim))
330 if trimSpace {
331 l.pos += leftTrimLength(l.input[l.pos:])
332 }
333 l.ignore()
334 return lexText
335}
336
337// lexRightDelim scans the right delimiter, which is known to be present, possibly with a trim marker.
338func lexRightDelim(l *lexer) stateFn {
339 trimSpace := strings.HasPrefix(l.input[l.pos:], rightTrimMarker)
340 if trimSpace {
341 l.pos += trimMarkerLen
342 l.ignore()
343 }
344 l.pos += Pos(len(l.rightDelim))
345 l.emit(itemRightDelim)
346 if trimSpace {
347 l.pos += leftTrimLength(l.input[l.pos:])
348 l.ignore()
349 }
350 return lexText
351}
352
353// lexInsideAction scans the elements inside action delimiters.
354func lexInsideAction(l *lexer) stateFn {
355 // Either number, quoted string, or identifier.
356 // Spaces separate arguments; runs of spaces turn into itemSpace.
357 // Pipe symbols separate and are emitted.
358 delim, _ := l.atRightDelim()
359 if delim {
360 if l.parenDepth == 0 {
361 return lexRightDelim
362 }
363 return l.errorf("unclosed left paren")
364 }
365 switch r := l.next(); {
366 case r == eof || isEndOfLine(r):
367 return l.errorf("unclosed action")
368 case isSpace(r):
369 l.backup() // Put space back in case we have " -}}".
370 return lexSpace
371 case r == '=':
372 l.emit(itemAssign)
373 case r == ':':
374 if l.next() != '=' {
375 return l.errorf("expected :=")
376 }
377 l.emit(itemDeclare)
378 case r == '|':
379 l.emit(itemPipe)
380 case r == '"':
381 return lexQuote
382 case r == '`':
383 return lexRawQuote
384 case r == '$':
385 return lexVariable
386 case r == '\'':
387 return lexChar
388 case r == '.':
389 // special look-ahead for ".field" so we don't break l.backup().
390 if l.pos < Pos(len(l.input)) {
391 r := l.input[l.pos]
392 if r < '0' || '9' < r {
393 return lexField
394 }
395 }
396 fallthrough // '.' can start a number.
397 case r == '+' || r == '-' || ('0' <= r && r <= '9'):
398 l.backup()
399 return lexNumber
400 case isAlphaNumeric(r):
401 l.backup()
402 return lexIdentifier
403 case r == '(':
404 l.emit(itemLeftParen)
405 l.parenDepth++
406 case r == ')':
407 l.emit(itemRightParen)
408 l.parenDepth--
409 if l.parenDepth < 0 {
410 return l.errorf("unexpected right paren %#U", r)
411 }
412 case r <= unicode.MaxASCII && unicode.IsPrint(r):
413 l.emit(itemChar)
414 return lexInsideAction
415 default:
416 return l.errorf("unrecognized character in action: %#U", r)
417 }
418 return lexInsideAction
419}
420
421// lexSpace scans a run of space characters.
422// We have not consumed the first space, which is known to be present.
423// Take care if there is a trim-marked right delimiter, which starts with a space.
424func lexSpace(l *lexer) stateFn {
425 var r rune
426 var numSpaces int
427 for {
428 r = l.peek()
429 if !isSpace(r) {
430 break
431 }
432 l.next()
433 numSpaces++
434 }
435 // Be careful about a trim-marked closing delimiter, which has a minus
436 // after a space. We know there is a space, so check for the '-' that might follow.
437 if strings.HasPrefix(l.input[l.pos-1:], l.trimRightDelim) {
438 l.backup() // Before the space.
439 if numSpaces == 1 {
440 return lexRightDelim // On the delim, so go right to that.
441 }
442 }
443 l.emit(itemSpace)
444 return lexInsideAction
445}
446
447// lexIdentifier scans an alphanumeric.
448func lexIdentifier(l *lexer) stateFn {
449Loop:
450 for {
451 switch r := l.next(); {
452 case isAlphaNumeric(r):
453 // absorb.
454 default:
455 l.backup()
456 word := l.input[l.start:l.pos]
457 if !l.atTerminator() {
458 return l.errorf("bad character %#U", r)
459 }
460 switch {
461 case key[word] > itemKeyword:
462 l.emit(key[word])
463 case word[0] == '.':
464 l.emit(itemField)
465 case word == "true", word == "false":
466 l.emit(itemBool)
467 default:
468 l.emit(itemIdentifier)
469 }
470 break Loop
471 }
472 }
473 return lexInsideAction
474}
475
476// lexField scans a field: .Alphanumeric.
477// The . has been scanned.
478func lexField(l *lexer) stateFn {
479 return lexFieldOrVariable(l, itemField)
480}
481
482// lexVariable scans a Variable: $Alphanumeric.
483// The $ has been scanned.
484func lexVariable(l *lexer) stateFn {
485 if l.atTerminator() { // Nothing interesting follows -> "$".
486 l.emit(itemVariable)
487 return lexInsideAction
488 }
489 return lexFieldOrVariable(l, itemVariable)
490}
491
492// lexVariable scans a field or variable: [.$]Alphanumeric.
493// The . or $ has been scanned.
494func lexFieldOrVariable(l *lexer, typ itemType) stateFn {
495 if l.atTerminator() { // Nothing interesting follows -> "." or "$".
496 if typ == itemVariable {
497 l.emit(itemVariable)
498 } else {
499 l.emit(itemDot)
500 }
501 return lexInsideAction
502 }
503 var r rune
504 for {
505 r = l.next()
506 if !isAlphaNumeric(r) {
507 l.backup()
508 break
509 }
510 }
511 if !l.atTerminator() {
512 return l.errorf("bad character %#U", r)
513 }
514 l.emit(typ)
515 return lexInsideAction
516}
517
518// atTerminator reports whether the input is at valid termination character to
519// appear after an identifier. Breaks .X.Y into two pieces. Also catches cases
520// like "$x+2" not being acceptable without a space, in case we decide one
521// day to implement arithmetic.
522func (l *lexer) atTerminator() bool {
523 r := l.peek()
524 if isSpace(r) || isEndOfLine(r) {
525 return true
526 }
527 switch r {
528 case eof, '.', ',', '|', ':', ')', '(':
529 return true
530 }
531 // Does r start the delimiter? This can be ambiguous (with delim=="//", $x/2 will
532 // succeed but should fail) but only in extremely rare cases caused by willfully
533 // bad choice of delimiter.
534 if rd, _ := utf8.DecodeRuneInString(l.rightDelim); rd == r {
535 return true
536 }
537 return false
538}
539
540// lexChar scans a character constant. The initial quote is already
541// scanned. Syntax checking is done by the parser.
542func lexChar(l *lexer) stateFn {
543Loop:
544 for {
545 switch l.next() {
546 case '\\':
547 if r := l.next(); r != eof && r != '\n' {
548 break
549 }
550 fallthrough
551 case eof, '\n':
552 return l.errorf("unterminated character constant")
553 case '\'':
554 break Loop
555 }
556 }
557 l.emit(itemCharConstant)
558 return lexInsideAction
559}
560
561// lexNumber scans a number: decimal, octal, hex, float, or imaginary. This
562// isn't a perfect number scanner - for instance it accepts "." and "0x0.2"
563// and "089" - but when it's wrong the input is invalid and the parser (via
564// strconv) will notice.
565func lexNumber(l *lexer) stateFn {
566 if !l.scanNumber() {
567 return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
568 }
569 if sign := l.peek(); sign == '+' || sign == '-' {
570 // Complex: 1+2i. No spaces, must end in 'i'.
571 if !l.scanNumber() || l.input[l.pos-1] != 'i' {
572 return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
573 }
574 l.emit(itemComplex)
575 } else {
576 l.emit(itemNumber)
577 }
578 return lexInsideAction
579}
580
581func (l *lexer) scanNumber() bool {
582 // Optional leading sign.
583 l.accept("+-")
584 // Is it hex?
585 digits := "0123456789_"
586 if l.accept("0") {
587 // Note: Leading 0 does not mean octal in floats.
588 if l.accept("xX") {
589 digits = "0123456789abcdefABCDEF_"
590 } else if l.accept("oO") {
591 digits = "01234567_"
592 } else if l.accept("bB") {
593 digits = "01_"
594 }
595 }
596 l.acceptRun(digits)
597 if l.accept(".") {
598 l.acceptRun(digits)
599 }
600 if len(digits) == 10+1 && l.accept("eE") {
601 l.accept("+-")
602 l.acceptRun("0123456789_")
603 }
604 if len(digits) == 16+6+1 && l.accept("pP") {
605 l.accept("+-")
606 l.acceptRun("0123456789_")
607 }
608 // Is it imaginary?
609 l.accept("i")
610 // Next thing mustn't be alphanumeric.
611 if isAlphaNumeric(l.peek()) {
612 l.next()
613 return false
614 }
615 return true
616}
617
618// lexQuote scans a quoted string.
619func lexQuote(l *lexer) stateFn {
620Loop:
621 for {
622 switch l.next() {
623 case '\\':
624 if r := l.next(); r != eof && r != '\n' {
625 break
626 }
627 fallthrough
628 case eof, '\n':
629 return l.errorf("unterminated quoted string")
630 case '"':
631 break Loop
632 }
633 }
634 l.emit(itemString)
635 return lexInsideAction
636}
637
638// lexRawQuote scans a raw quoted string.
639func lexRawQuote(l *lexer) stateFn {
640Loop:
641 for {
642 switch l.next() {
643 case eof:
644 return l.errorf("unterminated raw quoted string")
645 case '`':
646 break Loop
647 }
648 }
649 l.emit(itemRawString)
650 return lexInsideAction
651}
652
// isSpace reports whether r is a space or a tab.
func isSpace(r rune) bool {
	switch r {
	case ' ', '\t':
		return true
	}
	return false
}
657
// isEndOfLine reports whether r is a carriage return or a line feed.
func isEndOfLine(r rune) bool {
	switch r {
	case '\r', '\n':
		return true
	}
	return false
}
662
// isAlphaNumeric reports whether r is an underscore, a Unicode letter, or a
// Unicode digit.
func isAlphaNumeric(r rune) bool {
	if r == '_' {
		return true
	}
	return unicode.IsLetter(r) || unicode.IsDigit(r)
}
667
The lexer behind Go's template parser is an interesting piece of code because it does not follow the typical lex-style implementation. Rob Pike gave an excellent talk about its design, linked here. This story steps through the code to show how the lexer is implemented.