...

Source file src/cmd/compile/internal/syntax/scanner.go

Documentation: cmd/compile/internal/syntax

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // This file implements scanner, a lexical tokenizer for
     6  // Go source. After initialization, consecutive calls of
     7  // next advance the scanner one token at a time.
     8  //
     9  // This file, source.go, tokens.go, and token_string.go are self-contained
    10  // (`go tool compile scanner.go source.go tokens.go token_string.go` compiles)
    11  // and thus could be made into their own package.
    12  
    13  package syntax
    14  
    15  import (
    16  	"fmt"
    17  	"io"
    18  	"unicode"
    19  	"unicode/utf8"
    20  )
    21  
    22  // The mode flags below control which comments are reported
    23  // by calling the error handler. If no flag is set, comments
    24  // are ignored.
    25  const (
    26  	comments   uint = 1 << iota // call handler for all comments
    27  	directives                  // call handler for directives only
    28  )
    29  
    30  type scanner struct {
    31  	source
    32  	mode   uint
    33  	nlsemi bool // if set '\n' and EOF translate to ';'
    34  
    35  	// current token, valid after calling next()
    36  	line, col uint
    37  	blank     bool // line is blank up to col
    38  	tok       token
    39  	lit       string   // valid if tok is _Name, _Literal, or _Semi ("semicolon", "newline", or "EOF"); may be malformed if bad is true
    40  	bad       bool     // valid if tok is _Literal, true if a syntax error occurred, lit may be malformed
    41  	kind      LitKind  // valid if tok is _Literal
    42  	op        Operator // valid if tok is _Operator, _Star, _AssignOp, or _IncOp
    43  	prec      int      // valid if tok is _Operator, _Star, _AssignOp, or _IncOp
    44  }
    45  
    46  func (s *scanner) init(src io.Reader, errh func(line, col uint, msg string), mode uint) {
    47  	s.source.init(src, errh)
    48  	s.mode = mode
    49  	s.nlsemi = false
    50  }
    51  
    52  // errorf reports an error at the most recently read character position.
    53  func (s *scanner) errorf(format string, args ...interface{}) {
    54  	s.error(fmt.Sprintf(format, args...))
    55  }
    56  
    57  // errorAtf reports an error at a byte column offset relative to the current token start.
    58  func (s *scanner) errorAtf(offset int, format string, args ...interface{}) {
    59  	s.errh(s.line, s.col+uint(offset), fmt.Sprintf(format, args...))
    60  }
    61  
    62  // setLit sets the scanner state for a recognized _Literal token.
    63  func (s *scanner) setLit(kind LitKind, ok bool) {
    64  	s.nlsemi = true
    65  	s.tok = _Literal
    66  	s.lit = string(s.segment())
    67  	s.bad = !ok
    68  	s.kind = kind
    69  }
    70  
    71  // next advances the scanner by reading the next token.
    72  //
    73  // If a read, source encoding, or lexical error occurs, next calls
    74  // the installed error handler with the respective error position
    75  // and message. The error message is guaranteed to be non-empty and
    76  // never starts with a '/'. The error handler must exist.
    77  //
    78  // If the scanner mode includes the comments flag and a comment
    79  // (including comments containing directives) is encountered, the
    80  // error handler is also called with each comment position and text
    81  // (including opening /* or // and closing */, but without a newline
    82  // at the end of line comments). Comment text always starts with a /
    83  // which can be used to distinguish these handler calls from errors.
    84  //
    85  // If the scanner mode includes the directives (but not the comments)
    86  // flag, only comments containing a //line, /*line, or //go: directive
    87  // are reported, in the same way as regular comments.
    88  func (s *scanner) next() {
    89  	nlsemi := s.nlsemi
    90  	s.nlsemi = false
    91  
    92  redo:
    93  	// skip white space
    94  	s.stop()
    95  	startLine, startCol := s.pos()
    96  	for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !nlsemi || s.ch == '\r' {
    97  		s.nextch()
    98  	}
    99  
   100  	// token start
   101  	s.line, s.col = s.pos()
   102  	s.blank = s.line > startLine || startCol == colbase
   103  	s.start()
   104  	if isLetter(s.ch) || s.ch >= utf8.RuneSelf && s.atIdentChar(true) {
   105  		s.nextch()
   106  		s.ident()
   107  		return
   108  	}
   109  
   110  	switch s.ch {
   111  	case -1:
   112  		if nlsemi {
   113  			s.lit = "EOF"
   114  			s.tok = _Semi
   115  			break
   116  		}
   117  		s.tok = _EOF
   118  
   119  	case '\n':
   120  		s.nextch()
   121  		s.lit = "newline"
   122  		s.tok = _Semi
   123  
   124  	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
   125  		s.number(false)
   126  
   127  	case '"':
   128  		s.stdString()
   129  
   130  	case '`':
   131  		s.rawString()
   132  
   133  	case '\'':
   134  		s.rune()
   135  
   136  	case '(':
   137  		s.nextch()
   138  		s.tok = _Lparen
   139  
   140  	case '[':
   141  		s.nextch()
   142  		s.tok = _Lbrack
   143  
   144  	case '{':
   145  		s.nextch()
   146  		s.tok = _Lbrace
   147  
   148  	case ',':
   149  		s.nextch()
   150  		s.tok = _Comma
   151  
   152  	case ';':
   153  		s.nextch()
   154  		s.lit = "semicolon"
   155  		s.tok = _Semi
   156  
   157  	case ')':
   158  		s.nextch()
   159  		s.nlsemi = true
   160  		s.tok = _Rparen
   161  
   162  	case ']':
   163  		s.nextch()
   164  		s.nlsemi = true
   165  		s.tok = _Rbrack
   166  
   167  	case '}':
   168  		s.nextch()
   169  		s.nlsemi = true
   170  		s.tok = _Rbrace
   171  
   172  	case ':':
   173  		s.nextch()
   174  		if s.ch == '=' {
   175  			s.nextch()
   176  			s.tok = _Define
   177  			break
   178  		}
   179  		s.tok = _Colon
   180  
   181  	case '.':
   182  		s.nextch()
   183  		if isDecimal(s.ch) {
   184  			s.number(true)
   185  			break
   186  		}
   187  		if s.ch == '.' {
   188  			s.nextch()
   189  			if s.ch == '.' {
   190  				s.nextch()
   191  				s.tok = _DotDotDot
   192  				break
   193  			}
   194  			s.rewind() // now s.ch holds 1st '.'
   195  			s.nextch() // consume 1st '.' again
   196  		}
   197  		s.tok = _Dot
   198  
   199  	case '+':
   200  		s.nextch()
   201  		s.op, s.prec = Add, precAdd
   202  		if s.ch != '+' {
   203  			goto assignop
   204  		}
   205  		s.nextch()
   206  		s.nlsemi = true
   207  		s.tok = _IncOp
   208  
   209  	case '-':
   210  		s.nextch()
   211  		s.op, s.prec = Sub, precAdd
   212  		if s.ch != '-' {
   213  			goto assignop
   214  		}
   215  		s.nextch()
   216  		s.nlsemi = true
   217  		s.tok = _IncOp
   218  
   219  	case '*':
   220  		s.nextch()
   221  		s.op, s.prec = Mul, precMul
   222  		// don't goto assignop - want _Star token
   223  		if s.ch == '=' {
   224  			s.nextch()
   225  			s.tok = _AssignOp
   226  			break
   227  		}
   228  		s.tok = _Star
   229  
   230  	case '/':
   231  		s.nextch()
   232  		if s.ch == '/' {
   233  			s.nextch()
   234  			s.lineComment()
   235  			goto redo
   236  		}
   237  		if s.ch == '*' {
   238  			s.nextch()
   239  			s.fullComment()
   240  			if line, _ := s.pos(); line > s.line && nlsemi {
   241  				// A multi-line comment acts like a newline;
   242  				// it translates to a ';' if nlsemi is set.
   243  				s.lit = "newline"
   244  				s.tok = _Semi
   245  				break
   246  			}
   247  			goto redo
   248  		}
   249  		s.op, s.prec = Div, precMul
   250  		goto assignop
   251  
   252  	case '%':
   253  		s.nextch()
   254  		s.op, s.prec = Rem, precMul
   255  		goto assignop
   256  
   257  	case '&':
   258  		s.nextch()
   259  		if s.ch == '&' {
   260  			s.nextch()
   261  			s.op, s.prec = AndAnd, precAndAnd
   262  			s.tok = _Operator
   263  			break
   264  		}
   265  		s.op, s.prec = And, precMul
   266  		if s.ch == '^' {
   267  			s.nextch()
   268  			s.op = AndNot
   269  		}
   270  		goto assignop
   271  
   272  	case '|':
   273  		s.nextch()
   274  		if s.ch == '|' {
   275  			s.nextch()
   276  			s.op, s.prec = OrOr, precOrOr
   277  			s.tok = _Operator
   278  			break
   279  		}
   280  		s.op, s.prec = Or, precAdd
   281  		goto assignop
   282  
   283  	case '^':
   284  		s.nextch()
   285  		s.op, s.prec = Xor, precAdd
   286  		goto assignop
   287  
   288  	case '<':
   289  		s.nextch()
   290  		if s.ch == '=' {
   291  			s.nextch()
   292  			s.op, s.prec = Leq, precCmp
   293  			s.tok = _Operator
   294  			break
   295  		}
   296  		if s.ch == '<' {
   297  			s.nextch()
   298  			s.op, s.prec = Shl, precMul
   299  			goto assignop
   300  		}
   301  		if s.ch == '-' {
   302  			s.nextch()
   303  			s.tok = _Arrow
   304  			break
   305  		}
   306  		s.op, s.prec = Lss, precCmp
   307  		s.tok = _Operator
   308  
   309  	case '>':
   310  		s.nextch()
   311  		if s.ch == '=' {
   312  			s.nextch()
   313  			s.op, s.prec = Geq, precCmp
   314  			s.tok = _Operator
   315  			break
   316  		}
   317  		if s.ch == '>' {
   318  			s.nextch()
   319  			s.op, s.prec = Shr, precMul
   320  			goto assignop
   321  		}
   322  		s.op, s.prec = Gtr, precCmp
   323  		s.tok = _Operator
   324  
   325  	case '=':
   326  		s.nextch()
   327  		if s.ch == '=' {
   328  			s.nextch()
   329  			s.op, s.prec = Eql, precCmp
   330  			s.tok = _Operator
   331  			break
   332  		}
   333  		s.tok = _Assign
   334  
   335  	case '!':
   336  		s.nextch()
   337  		if s.ch == '=' {
   338  			s.nextch()
   339  			s.op, s.prec = Neq, precCmp
   340  			s.tok = _Operator
   341  			break
   342  		}
   343  		s.op, s.prec = Not, 0
   344  		s.tok = _Operator
   345  
   346  	case '~':
   347  		s.nextch()
   348  		s.op, s.prec = Tilde, 0
   349  		s.tok = _Operator
   350  
   351  	default:
   352  		s.errorf("invalid character %#U", s.ch)
   353  		s.nextch()
   354  		goto redo
   355  	}
   356  
   357  	return
   358  
   359  assignop:
   360  	if s.ch == '=' {
   361  		s.nextch()
   362  		s.tok = _AssignOp
   363  		return
   364  	}
   365  	s.tok = _Operator
   366  }
   367  
   368  func (s *scanner) ident() {
   369  	// accelerate common case (7bit ASCII)
   370  	for isLetter(s.ch) || isDecimal(s.ch) {
   371  		s.nextch()
   372  	}
   373  
   374  	// general case
   375  	if s.ch >= utf8.RuneSelf {
   376  		for s.atIdentChar(false) {
   377  			s.nextch()
   378  		}
   379  	}
   380  
   381  	// possibly a keyword
   382  	lit := s.segment()
   383  	if len(lit) >= 2 {
   384  		if tok := keywordMap[hash(lit)]; tok != 0 && tokStrFast(tok) == string(lit) {
   385  			s.nlsemi = contains(1<<_Break|1<<_Continue|1<<_Fallthrough|1<<_Return, tok)
   386  			s.tok = tok
   387  			return
   388  		}
   389  	}
   390  
   391  	s.nlsemi = true
   392  	s.lit = string(lit)
   393  	s.tok = _Name
   394  }
   395  
   396  // tokStrFast is a faster version of token.String, which assumes that tok
   397  // is one of the valid tokens - and can thus skip bounds checks.
   398  func tokStrFast(tok token) string {
   399  	return _token_name[_token_index[tok-1]:_token_index[tok]]
   400  }
   401  
   402  func (s *scanner) atIdentChar(first bool) bool {
   403  	switch {
   404  	case unicode.IsLetter(s.ch) || s.ch == '_':
   405  		// ok
   406  	case unicode.IsDigit(s.ch):
   407  		if first {
   408  			s.errorf("identifier cannot begin with digit %#U", s.ch)
   409  		}
   410  	case s.ch >= utf8.RuneSelf:
   411  		s.errorf("invalid character %#U in identifier", s.ch)
   412  	default:
   413  		return false
   414  	}
   415  	return true
   416  }
   417  
   418  // hash is a perfect hash function for keywords.
   419  // It assumes that s has at least length 2.
   420  func hash(s []byte) uint {
   421  	return (uint(s[0])<<4 ^ uint(s[1]) + uint(len(s))) & uint(len(keywordMap)-1)
   422  }
   423  
   424  var keywordMap [1 << 6]token // size must be power of two
   425  
   426  func init() {
   427  	// populate keywordMap
   428  	for tok := _Break; tok <= _Var; tok++ {
   429  		h := hash([]byte(tok.String()))
   430  		if keywordMap[h] != 0 {
   431  			panic("imperfect hash")
   432  		}
   433  		keywordMap[h] = tok
   434  	}
   435  }
   436  
   437  func lower(ch rune) rune     { return ('a' - 'A') | ch } // returns lower-case ch iff ch is ASCII letter
   438  func isLetter(ch rune) bool  { return 'a' <= lower(ch) && lower(ch) <= 'z' || ch == '_' }
   439  func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' }
   440  func isHex(ch rune) bool     { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f' }
   441  
   442  // digits accepts the sequence { digit | '_' }.
   443  // If base <= 10, digits accepts any decimal digit but records
   444  // the index (relative to the literal start) of a digit >= base
   445  // in *invalid, if *invalid < 0.
   446  // digits returns a bitset describing whether the sequence contained
   447  // digits (bit 0 is set), or separators '_' (bit 1 is set).
   448  func (s *scanner) digits(base int, invalid *int) (digsep int) {
   449  	if base <= 10 {
   450  		max := rune('0' + base)
   451  		for isDecimal(s.ch) || s.ch == '_' {
   452  			ds := 1
   453  			if s.ch == '_' {
   454  				ds = 2
   455  			} else if s.ch >= max && *invalid < 0 {
   456  				_, col := s.pos()
   457  				*invalid = int(col - s.col) // record invalid rune index
   458  			}
   459  			digsep |= ds
   460  			s.nextch()
   461  		}
   462  	} else {
   463  		for isHex(s.ch) || s.ch == '_' {
   464  			ds := 1
   465  			if s.ch == '_' {
   466  				ds = 2
   467  			}
   468  			digsep |= ds
   469  			s.nextch()
   470  		}
   471  	}
   472  	return
   473  }
   474  
   475  func (s *scanner) number(seenPoint bool) {
   476  	ok := true
   477  	kind := IntLit
   478  	base := 10        // number base
   479  	prefix := rune(0) // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b'
   480  	digsep := 0       // bit 0: digit present, bit 1: '_' present
   481  	invalid := -1     // index of invalid digit in literal, or < 0
   482  
   483  	// integer part
   484  	if !seenPoint {
   485  		if s.ch == '0' {
   486  			s.nextch()
   487  			switch lower(s.ch) {
   488  			case 'x':
   489  				s.nextch()
   490  				base, prefix = 16, 'x'
   491  			case 'o':
   492  				s.nextch()
   493  				base, prefix = 8, 'o'
   494  			case 'b':
   495  				s.nextch()
   496  				base, prefix = 2, 'b'
   497  			default:
   498  				base, prefix = 8, '0'
   499  				digsep = 1 // leading 0
   500  			}
   501  		}
   502  		digsep |= s.digits(base, &invalid)
   503  		if s.ch == '.' {
   504  			if prefix == 'o' || prefix == 'b' {
   505  				s.errorf("invalid radix point in %s literal", baseName(base))
   506  				ok = false
   507  			}
   508  			s.nextch()
   509  			seenPoint = true
   510  		}
   511  	}
   512  
   513  	// fractional part
   514  	if seenPoint {
   515  		kind = FloatLit
   516  		digsep |= s.digits(base, &invalid)
   517  	}
   518  
   519  	if digsep&1 == 0 && ok {
   520  		s.errorf("%s literal has no digits", baseName(base))
   521  		ok = false
   522  	}
   523  
   524  	// exponent
   525  	if e := lower(s.ch); e == 'e' || e == 'p' {
   526  		if ok {
   527  			switch {
   528  			case e == 'e' && prefix != 0 && prefix != '0':
   529  				s.errorf("%q exponent requires decimal mantissa", s.ch)
   530  				ok = false
   531  			case e == 'p' && prefix != 'x':
   532  				s.errorf("%q exponent requires hexadecimal mantissa", s.ch)
   533  				ok = false
   534  			}
   535  		}
   536  		s.nextch()
   537  		kind = FloatLit
   538  		if s.ch == '+' || s.ch == '-' {
   539  			s.nextch()
   540  		}
   541  		digsep = s.digits(10, nil) | digsep&2 // don't lose sep bit
   542  		if digsep&1 == 0 && ok {
   543  			s.errorf("exponent has no digits")
   544  			ok = false
   545  		}
   546  	} else if prefix == 'x' && kind == FloatLit && ok {
   547  		s.errorf("hexadecimal mantissa requires a 'p' exponent")
   548  		ok = false
   549  	}
   550  
   551  	// suffix 'i'
   552  	if s.ch == 'i' {
   553  		kind = ImagLit
   554  		s.nextch()
   555  	}
   556  
   557  	s.setLit(kind, ok) // do this now so we can use s.lit below
   558  
   559  	if kind == IntLit && invalid >= 0 && ok {
   560  		s.errorAtf(invalid, "invalid digit %q in %s literal", s.lit[invalid], baseName(base))
   561  		ok = false
   562  	}
   563  
   564  	if digsep&2 != 0 && ok {
   565  		if i := invalidSep(s.lit); i >= 0 {
   566  			s.errorAtf(i, "'_' must separate successive digits")
   567  			ok = false
   568  		}
   569  	}
   570  
   571  	s.bad = !ok // correct s.bad
   572  }
   573  
   574  func baseName(base int) string {
   575  	switch base {
   576  	case 2:
   577  		return "binary"
   578  	case 8:
   579  		return "octal"
   580  	case 10:
   581  		return "decimal"
   582  	case 16:
   583  		return "hexadecimal"
   584  	}
   585  	panic("invalid base")
   586  }
   587  
   588  // invalidSep returns the index of the first invalid separator in x, or -1.
   589  func invalidSep(x string) int {
   590  	x1 := ' ' // prefix char, we only care if it's 'x'
   591  	d := '.'  // digit, one of '_', '0' (a digit), or '.' (anything else)
   592  	i := 0
   593  
   594  	// a prefix counts as a digit
   595  	if len(x) >= 2 && x[0] == '0' {
   596  		x1 = lower(rune(x[1]))
   597  		if x1 == 'x' || x1 == 'o' || x1 == 'b' {
   598  			d = '0'
   599  			i = 2
   600  		}
   601  	}
   602  
   603  	// mantissa and exponent
   604  	for ; i < len(x); i++ {
   605  		p := d // previous digit
   606  		d = rune(x[i])
   607  		switch {
   608  		case d == '_':
   609  			if p != '0' {
   610  				return i
   611  			}
   612  		case isDecimal(d) || x1 == 'x' && isHex(d):
   613  			d = '0'
   614  		default:
   615  			if p == '_' {
   616  				return i - 1
   617  			}
   618  			d = '.'
   619  		}
   620  	}
   621  	if d == '_' {
   622  		return len(x) - 1
   623  	}
   624  
   625  	return -1
   626  }
   627  
   628  func (s *scanner) rune() {
   629  	ok := true
   630  	s.nextch()
   631  
   632  	n := 0
   633  	for ; ; n++ {
   634  		if s.ch == '\'' {
   635  			if ok {
   636  				if n == 0 {
   637  					s.errorf("empty rune literal or unescaped '")
   638  					ok = false
   639  				} else if n != 1 {
   640  					s.errorAtf(0, "more than one character in rune literal")
   641  					ok = false
   642  				}
   643  			}
   644  			s.nextch()
   645  			break
   646  		}
   647  		if s.ch == '\\' {
   648  			s.nextch()
   649  			if !s.escape('\'') {
   650  				ok = false
   651  			}
   652  			continue
   653  		}
   654  		if s.ch == '\n' {
   655  			if ok {
   656  				s.errorf("newline in rune literal")
   657  				ok = false
   658  			}
   659  			break
   660  		}
   661  		if s.ch < 0 {
   662  			if ok {
   663  				s.errorAtf(0, "rune literal not terminated")
   664  				ok = false
   665  			}
   666  			break
   667  		}
   668  		s.nextch()
   669  	}
   670  
   671  	s.setLit(RuneLit, ok)
   672  }
   673  
   674  func (s *scanner) stdString() {
   675  	ok := true
   676  	s.nextch()
   677  
   678  	for {
   679  		if s.ch == '"' {
   680  			s.nextch()
   681  			break
   682  		}
   683  		if s.ch == '\\' {
   684  			s.nextch()
   685  			if !s.escape('"') {
   686  				ok = false
   687  			}
   688  			continue
   689  		}
   690  		if s.ch == '\n' {
   691  			s.errorf("newline in string")
   692  			ok = false
   693  			break
   694  		}
   695  		if s.ch < 0 {
   696  			s.errorAtf(0, "string not terminated")
   697  			ok = false
   698  			break
   699  		}
   700  		s.nextch()
   701  	}
   702  
   703  	s.setLit(StringLit, ok)
   704  }
   705  
   706  func (s *scanner) rawString() {
   707  	ok := true
   708  	s.nextch()
   709  
   710  	for {
   711  		if s.ch == '`' {
   712  			s.nextch()
   713  			break
   714  		}
   715  		if s.ch < 0 {
   716  			s.errorAtf(0, "string not terminated")
   717  			ok = false
   718  			break
   719  		}
   720  		s.nextch()
   721  	}
   722  	// We leave CRs in the string since they are part of the
   723  	// literal (even though they are not part of the literal
   724  	// value).
   725  
   726  	s.setLit(StringLit, ok)
   727  }
   728  
   729  func (s *scanner) comment(text string) {
   730  	s.errorAtf(0, "%s", text)
   731  }
   732  
   733  func (s *scanner) skipLine() {
   734  	// don't consume '\n' - needed for nlsemi logic
   735  	for s.ch >= 0 && s.ch != '\n' {
   736  		s.nextch()
   737  	}
   738  }
   739  
   740  func (s *scanner) lineComment() {
   741  	// opening has already been consumed
   742  
   743  	if s.mode&comments != 0 {
   744  		s.skipLine()
   745  		s.comment(string(s.segment()))
   746  		return
   747  	}
   748  
   749  	// are we saving directives? or is this definitely not a directive?
   750  	if s.mode&directives == 0 || (s.ch != 'g' && s.ch != 'l') {
   751  		s.stop()
   752  		s.skipLine()
   753  		return
   754  	}
   755  
   756  	// recognize go: or line directives
   757  	prefix := "go:"
   758  	if s.ch == 'l' {
   759  		prefix = "line "
   760  	}
   761  	for _, m := range prefix {
   762  		if s.ch != m {
   763  			s.stop()
   764  			s.skipLine()
   765  			return
   766  		}
   767  		s.nextch()
   768  	}
   769  
   770  	// directive text
   771  	s.skipLine()
   772  	s.comment(string(s.segment()))
   773  }
   774  
   775  func (s *scanner) skipComment() bool {
   776  	for s.ch >= 0 {
   777  		for s.ch == '*' {
   778  			s.nextch()
   779  			if s.ch == '/' {
   780  				s.nextch()
   781  				return true
   782  			}
   783  		}
   784  		s.nextch()
   785  	}
   786  	s.errorAtf(0, "comment not terminated")
   787  	return false
   788  }
   789  
   790  func (s *scanner) fullComment() {
   791  	/* opening has already been consumed */
   792  
   793  	if s.mode&comments != 0 {
   794  		if s.skipComment() {
   795  			s.comment(string(s.segment()))
   796  		}
   797  		return
   798  	}
   799  
   800  	if s.mode&directives == 0 || s.ch != 'l' {
   801  		s.stop()
   802  		s.skipComment()
   803  		return
   804  	}
   805  
   806  	// recognize line directive
   807  	const prefix = "line "
   808  	for _, m := range prefix {
   809  		if s.ch != m {
   810  			s.stop()
   811  			s.skipComment()
   812  			return
   813  		}
   814  		s.nextch()
   815  	}
   816  
   817  	// directive text
   818  	if s.skipComment() {
   819  		s.comment(string(s.segment()))
   820  	}
   821  }
   822  
   823  func (s *scanner) escape(quote rune) bool {
   824  	var n int
   825  	var base, max uint32
   826  
   827  	switch s.ch {
   828  	case quote, 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\':
   829  		s.nextch()
   830  		return true
   831  	case '0', '1', '2', '3', '4', '5', '6', '7':
   832  		n, base, max = 3, 8, 255
   833  	case 'x':
   834  		s.nextch()
   835  		n, base, max = 2, 16, 255
   836  	case 'u':
   837  		s.nextch()
   838  		n, base, max = 4, 16, unicode.MaxRune
   839  	case 'U':
   840  		s.nextch()
   841  		n, base, max = 8, 16, unicode.MaxRune
   842  	default:
   843  		if s.ch < 0 {
   844  			return true // complain in caller about EOF
   845  		}
   846  		s.errorf("unknown escape")
   847  		return false
   848  	}
   849  
   850  	var x uint32
   851  	for i := n; i > 0; i-- {
   852  		if s.ch < 0 {
   853  			return true // complain in caller about EOF
   854  		}
   855  		d := base
   856  		if isDecimal(s.ch) {
   857  			d = uint32(s.ch) - '0'
   858  		} else if 'a' <= lower(s.ch) && lower(s.ch) <= 'f' {
   859  			d = uint32(lower(s.ch)) - 'a' + 10
   860  		}
   861  		if d >= base {
   862  			s.errorf("invalid character %q in %s escape", s.ch, baseName(int(base)))
   863  			return false
   864  		}
   865  		// d < base
   866  		x = x*base + d
   867  		s.nextch()
   868  	}
   869  
   870  	if x > max && base == 8 {
   871  		s.errorf("octal escape value %d > 255", x)
   872  		return false
   873  	}
   874  
   875  	if x > max || 0xD800 <= x && x < 0xE000 /* surrogate range */ {
   876  		s.errorf("escape is invalid Unicode code point %#U", x)
   877  		return false
   878  	}
   879  
   880  	return true
   881  }
   882  

View as plain text