link.go

Documentation: cmd/vendor/rsc.io/markdown

     1  // Copyright 2021 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package markdown
     6  
     7  import (
     8  	"bytes"
     9  	"fmt"
    10  	"strings"
    11  	"unicode/utf8"
    12  
    13  	"golang.org/x/text/cases"
    14  )
    15  
    16  func parseLinkRefDef(p buildState, s string) (int, bool) {
    17  	// “A link reference definition consists of a link label,
    18  	// optionally preceded by up to three spaces of indentation,
    19  	// followed by a colon (:),
    20  	// optional spaces or tabs (including up to one line ending),
    21  	// a link destination,
    22  	// optional spaces or tabs (including up to one line ending),
    23  	// and an optional link title,
    24  	// which if it is present must be separated from the link destination
    25  	// by spaces or tabs. No further character may occur.”
    26  	i := skipSpace(s, 0)
    27  	label, i, ok := parseLinkLabel(p.(*parseState), s, i)
    28  	if !ok || i >= len(s) || s[i] != ':' {
    29  		return 0, false
    30  	}
    31  	i = skipSpace(s, i+1)
    32  	suf := s[i:]
    33  	dest, i, ok := parseLinkDest(s, i)
    34  	if !ok {
    35  		if suf != "" && suf[0] == '<' {
    36  			// Goldmark treats <<> as a link definition.
    37  			p.(*parseState).corner = true
    38  		}
    39  		return 0, false
    40  	}
    41  	moved := false
    42  	for i < len(s) && (s[i] == ' ' || s[i] == '\t') {
    43  		moved = true
    44  		i++
    45  	}
    46  
    47  	// Take title if present and doesn't break parse.
    48  	j := i
    49  	if j >= len(s) || s[j] == '\n' {
    50  		moved = true
    51  		if j < len(s) {
    52  			j++
    53  		}
    54  	}
    55  
    56  	var title string
    57  	var titleChar byte
    58  	var corner bool
    59  	if moved {
    60  		for j < len(s) && (s[j] == ' ' || s[j] == '\t') {
    61  			j++
    62  		}
    63  		if t, c, j, ok := parseLinkTitle(s, j); ok {
    64  			for j < len(s) && (s[j] == ' ' || s[j] == '\t') {
    65  				j++
    66  			}
    67  			if j >= len(s) || s[j] == '\n' {
    68  				i = j
    69  				if t == "" {
    70  					// Goldmark adds title="" in this case.
    71  					// We do not, nor does the Dingus.
    72  					corner = true
    73  				}
    74  				title = t
    75  				titleChar = c
    76  			}
    77  		}
    78  	}
    79  
    80  	// Must end line. Already trimmed spaces.
    81  	if i < len(s) && s[i] != '\n' {
    82  		return 0, false
    83  	}
    84  	if i < len(s) {
    85  		i++
    86  	}
    87  
    88  	label = normalizeLabel(label)
    89  	if p.link(label) == nil {
    90  		p.defineLink(label, &Link{URL: dest, Title: title, TitleChar: titleChar, corner: corner})
    91  	}
    92  	return i, true
    93  }
    94  
    95  func parseLinkTitle(s string, i int) (title string, char byte, next int, found bool) {
    96  	if i < len(s) && (s[i] == '"' || s[i] == '\'' || s[i] == '(') {
    97  		want := s[i]
    98  		if want == '(' {
    99  			want = ')'
   100  		}
   101  		j := i + 1
   102  		for ; j < len(s); j++ {
   103  			if s[j] == want {
   104  				title := s[i+1 : j]
   105  				// TODO: Validate title?
   106  				return mdUnescaper.Replace(title), want, j + 1, true
   107  			}
   108  			if s[j] == '(' && want == ')' {
   109  				break
   110  			}
   111  			if s[j] == '\\' && j+1 < len(s) {
   112  				j++
   113  			}
   114  		}
   115  	}
   116  	return "", 0, 0, false
   117  }
   118  
   119  func parseLinkLabel(p *parseState, s string, i int) (string, int, bool) {
   120  	// “A link label begins with a left bracket ([) and ends with
   121  	// the first right bracket (]) that is not backslash-escaped.
   122  	// Between these brackets there must be at least one character
   123  	// that is not a space, tab, or line ending.
   124  	// Unescaped square bracket characters are not allowed
   125  	// inside the opening and closing square brackets of link labels.
   126  	// A link label can have at most 999 characters inside the square brackets.”
   127  	if i >= len(s) || s[i] != '[' {
   128  		return "", 0, false
   129  	}
   130  	j := i + 1
   131  	for ; j < len(s); j++ {
   132  		if s[j] == ']' {
   133  			if j-(i+1) > 999 {
   134  				// Goldmark does not apply 999 limit.
   135  				p.corner = true
   136  				break
   137  			}
   138  			if label := trimSpaceTabNewline(s[i+1 : j]); label != "" {
   139  				// Note: CommonMark Dingus does not escape.
   140  				return label, j + 1, true
   141  			}
   142  			break
   143  		}
   144  		if s[j] == '[' {
   145  			break
   146  		}
   147  		if s[j] == '\\' && j+1 < len(s) {
   148  			j++
   149  		}
   150  	}
   151  	return "", 0, false
   152  }
   153  
   154  func normalizeLabel(s string) string {
   155  	if strings.Contains(s, "[") || strings.Contains(s, "]") {
   156  		// Labels cannot have [ ] so avoid the work of translating.
   157  		// This is especially important for pathlogical cases like
   158  		// [[[[[[[[[[a]]]]]]]]]] which would otherwise generate quadratic
   159  		// amounts of garbage.
   160  		return ""
   161  	}
   162  
   163  	// “To normalize a label, strip off the opening and closing brackets,
   164  	// perform the Unicode case fold, strip leading and trailing spaces, tabs, and line endings,
   165  	// and collapse consecutive internal spaces, tabs, and line endings to a single space.”
   166  	s = trimSpaceTabNewline(s)
   167  	var b strings.Builder
   168  	space := false
   169  	hi := false
   170  	for i := 0; i < len(s); i++ {
   171  		c := s[i]
   172  		switch c {
   173  		case ' ', '\t', '\n':
   174  			space = true
   175  			continue
   176  		default:
   177  			if space {
   178  				b.WriteByte(' ')
   179  				space = false
   180  			}
   181  			if 'A' <= c && c <= 'Z' {
   182  				c += 'a' - 'A'
   183  			}
   184  			if c >= 0x80 {
   185  				hi = true
   186  			}
   187  			b.WriteByte(c)
   188  		}
   189  	}
   190  	s = b.String()
   191  	if hi {
   192  		s = cases.Fold().String(s)
   193  	}
   194  	return s
   195  }
   196  
   197  func parseLinkDest(s string, i int) (string, int, bool) {
   198  	if i >= len(s) {
   199  		return "", 0, false
   200  	}
   201  
   202  	// “A sequence of zero or more characters between an opening < and a closing >
   203  	// that contains no line endings or unescaped < or > characters,”
   204  	if s[i] == '<' {
   205  		for j := i + 1; ; j++ {
   206  			if j >= len(s) || s[j] == '\n' || s[j] == '<' {
   207  				return "", 0, false
   208  			}
   209  			if s[j] == '>' {
   210  				// TODO unescape?
   211  				return mdUnescape(s[i+1 : j]), j + 1, true
   212  			}
   213  			if s[j] == '\\' {
   214  				j++
   215  			}
   216  		}
   217  	}
   218  
   219  	// “or a nonempty sequence of characters that does not start with <,
   220  	// does not include ASCII control characters or space character,
   221  	// and includes parentheses only if (a) they are backslash-escaped
   222  	// or (b) they are part of a balanced pair of unescaped parentheses.
   223  	depth := 0
   224  	j := i
   225  Loop:
   226  	for ; j < len(s); j++ {
   227  		switch s[j] {
   228  		case '(':
   229  			depth++
   230  			if depth > 32 {
   231  				// Avoid quadratic inputs by stopping if too deep.
   232  				// This is the same depth that cmark-gfm uses.
   233  				return "", 0, false
   234  			}
   235  		case ')':
   236  			if depth == 0 {
   237  				break Loop
   238  			}
   239  			depth--
   240  		case '\\':
   241  			if j+1 < len(s) {
   242  				if s[j+1] == ' ' || s[j+1] == '\t' {
   243  					return "", 0, false
   244  				}
   245  				j++
   246  			}
   247  		case ' ', '\t', '\n':
   248  			break Loop
   249  		}
   250  	}
   251  
   252  	dest := s[i:j]
   253  	// TODO: Validate dest?
   254  	// TODO: Unescape?
   255  	// NOTE: CommonMark Dingus does not reject control characters.
   256  	return mdUnescape(dest), j, true
   257  }
   258  
   259  func parseAutoLinkURI(s string, i int) (Inline, int, bool) {
   260  	// CommonMark 0.30:
   261  	//
   262  	//	For purposes of this spec, a scheme is any sequence of 2–32 characters
   263  	//	beginning with an ASCII letter and followed by any combination of
   264  	//	ASCII letters, digits, or the symbols plus (”+”), period (”.”), or
   265  	//	hyphen (”-”).
   266  	//
   267  	//	An absolute URI, for these purposes, consists of a scheme followed by
   268  	//	a colon (:) followed by zero or more characters other ASCII control
   269  	//	characters, space, <, and >. If the URI includes these characters,
   270  	//	they must be percent-encoded (e.g. %20 for a space).
   271  
   272  	j := i
   273  	if j+1 >= len(s) || s[j] != '<' || !isLetter(s[j+1]) {
   274  		return nil, 0, false
   275  	}
   276  	j++
   277  	for j < len(s) && isScheme(s[j]) && j-(i+1) <= 32 {
   278  		j++
   279  	}
   280  	if j-(i+1) < 2 || j-(i+1) > 32 || j >= len(s) || s[j] != ':' {
   281  		return nil, 0, false
   282  	}
   283  	j++
   284  	for j < len(s) && isURL(s[j]) {
   285  		j++
   286  	}
   287  	if j >= len(s) || s[j] != '>' {
   288  		return nil, 0, false
   289  	}
   290  	link := s[i+1 : j]
   291  	// link = mdUnescaper.Replace(link)
   292  	return &AutoLink{link, link}, j + 1, true
   293  }
   294  
   295  func parseAutoLinkEmail(s string, i int) (Inline, int, bool) {
   296  	// CommonMark 0.30:
   297  	//
   298  	//	An email address, for these purposes, is anything that matches
   299  	//	the non-normative regex from the HTML5 spec:
   300  	//
   301  	//	/^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$/
   302  
   303  	j := i
   304  	if j+1 >= len(s) || s[j] != '<' || !isUser(s[j+1]) {
   305  		return nil, 0, false
   306  	}
   307  	j++
   308  	for j < len(s) && isUser(s[j]) {
   309  		j++
   310  	}
   311  	if j >= len(s) || s[j] != '@' {
   312  		return nil, 0, false
   313  	}
   314  	for {
   315  		j++
   316  		n, ok := skipDomainElem(s[j:])
   317  		if !ok {
   318  			return nil, 0, false
   319  		}
   320  		j += n
   321  		if j >= len(s) || s[j] != '.' && s[j] != '>' {
   322  			return nil, 0, false
   323  		}
   324  		if s[j] == '>' {
   325  			break
   326  		}
   327  	}
   328  	email := s[i+1 : j]
   329  	return &AutoLink{email, "mailto:" + email}, j + 1, true
   330  }
   331  
   332  func isUser(c byte) bool {
   333  	if isLetterDigit(c) {
   334  		return true
   335  	}
   336  	s := ".!#$%&'*+/=?^_`{|}~-"
   337  	for i := 0; i < len(s); i++ {
   338  		if c == s[i] {
   339  			return true
   340  		}
   341  	}
   342  	return false
   343  }
   344  
   345  func isHexDigit(c byte) bool {
   346  	return 'A' <= c && c <= 'F' || 'a' <= c && c <= 'f' || '0' <= c && c <= '9'
   347  }
   348  
   349  func isDigit(c byte) bool {
   350  	return '0' <= c && c <= '9'
   351  }
   352  
   353  func skipDomainElem(s string) (int, bool) {
   354  	// String of LDH, up to 63 in length, with LetterDigit
   355  	// at both ends (1-letter/digit names are OK).
   356  	// Aka /[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?/.
   357  	if len(s) < 1 || !isLetterDigit(s[0]) {
   358  		return 0, false
   359  	}
   360  	i := 1
   361  	for i < len(s) && isLDH(s[i]) && i <= 63 {
   362  		i++
   363  	}
   364  	if i > 63 || !isLetterDigit(s[i-1]) {
   365  		return 0, false
   366  	}
   367  	return i, true
   368  }
   369  
   370  func isScheme(c byte) bool {
   371  	return isLetterDigit(c) || c == '+' || c == '.' || c == '-'
   372  }
   373  
   374  func isURL(c byte) bool {
   375  	return c > ' ' && c != '<' && c != '>'
   376  }
   377  
   378  type AutoLink struct {
   379  	Text string
   380  	URL  string
   381  }
   382  
   383  func (*AutoLink) Inline() {}
   384  
   385  func (x *AutoLink) PrintHTML(buf *bytes.Buffer) {
   386  	fmt.Fprintf(buf, "<a href=\"%s\">%s</a>", htmlLinkEscaper.Replace(x.URL), htmlEscaper.Replace(x.Text))
   387  }
   388  
   389  func (x *AutoLink) printMarkdown(buf *bytes.Buffer) {
   390  	fmt.Fprintf(buf, "<%s>", x.Text)
   391  }
   392  
   393  func (x *AutoLink) PrintText(buf *bytes.Buffer) {
   394  	fmt.Fprintf(buf, "%s", htmlEscaper.Replace(x.Text))
   395  }
   396  
   397  type Link struct {
   398  	Inner     []Inline
   399  	URL       string
   400  	Title     string
   401  	TitleChar byte // ', " or )
   402  	corner    bool
   403  }
   404  
   405  func (*Link) Inline() {}
   406  
   407  func (x *Link) PrintHTML(buf *bytes.Buffer) {
   408  	fmt.Fprintf(buf, "<a href=\"%s\"", htmlLinkEscaper.Replace(x.URL))
   409  	if x.Title != "" {
   410  		fmt.Fprintf(buf, " title=\"%s\"", htmlQuoteEscaper.Replace(x.Title))
   411  	}
   412  	buf.WriteString(">")
   413  	for _, c := range x.Inner {
   414  		c.PrintHTML(buf)
   415  	}
   416  	buf.WriteString("</a>")
   417  }
   418  
   419  func (x *Link) printMarkdown(buf *bytes.Buffer) {
   420  	buf.WriteByte('[')
   421  	x.printRemainingMarkdown(buf)
   422  }
   423  
   424  func (x *Link) printRemainingMarkdown(buf *bytes.Buffer) {
   425  	for _, c := range x.Inner {
   426  		c.printMarkdown(buf)
   427  	}
   428  	buf.WriteString("](")
   429  	buf.WriteString(x.URL)
   430  	printLinkTitleMarkdown(buf, x.Title, x.TitleChar)
   431  	buf.WriteByte(')')
   432  }
   433  
   434  func printLinkTitleMarkdown(buf *bytes.Buffer, title string, titleChar byte) {
   435  	if title == "" {
   436  		return
   437  	}
   438  	closeChar := titleChar
   439  	openChar := closeChar
   440  	if openChar == ')' {
   441  		openChar = '('
   442  	}
   443  	fmt.Fprintf(buf, " %c%s%c", openChar, title /*TODO(jba): escape*/, closeChar)
   444  }
   445  
   446  func (x *Link) PrintText(buf *bytes.Buffer) {
   447  	for _, c := range x.Inner {
   448  		c.PrintText(buf)
   449  	}
   450  }
   451  
   452  type Image struct {
   453  	Inner     []Inline
   454  	URL       string
   455  	Title     string
   456  	TitleChar byte
   457  	corner    bool
   458  }
   459  
   460  func (*Image) Inline() {}
   461  
   462  func (x *Image) PrintHTML(buf *bytes.Buffer) {
   463  	fmt.Fprintf(buf, "<img src=\"%s\"", htmlLinkEscaper.Replace(x.URL))
   464  	fmt.Fprintf(buf, " alt=\"")
   465  	i := buf.Len()
   466  	for _, c := range x.Inner {
   467  		c.PrintText(buf)
   468  	}
   469  	// GitHub and Goldmark both rewrite \n to space
   470  	// but the Dingus does not.
   471  	// The spec says title can be split across lines but not
   472  	// what happens at that point.
   473  	out := buf.Bytes()
   474  	for ; i < len(out); i++ {
   475  		if out[i] == '\n' {
   476  			out[i] = ' '
   477  		}
   478  	}
   479  	fmt.Fprintf(buf, "\"")
   480  	if x.Title != "" {
   481  		fmt.Fprintf(buf, " title=\"%s\"", htmlQuoteEscaper.Replace(x.Title))
   482  	}
   483  	buf.WriteString(" />")
   484  }
   485  
   486  func (x *Image) printMarkdown(buf *bytes.Buffer) {
   487  	buf.WriteString("![")
   488  	(*Link)(x).printRemainingMarkdown(buf)
   489  }
   490  
   491  func (x *Image) PrintText(buf *bytes.Buffer) {
   492  	for _, c := range x.Inner {
   493  		c.PrintText(buf)
   494  	}
   495  }
   496  
   497  // GitHub Flavored Markdown autolinks extension
   498  // https://github.github.com/gfm/#autolinks-extension-
   499  
   500  // autoLinkMore rewrites any extended autolinks in the body
   501  // and returns the result.
   502  //
   503  // body is a list of Plain, Emph, Strong, and Del nodes.
   504  // Two Plains only appear consecutively when one is a
   505  // potential emphasis marker that ended up being plain after all, like "_" or "**".
   506  // There are no Link nodes.
   507  //
   508  // The GitHub “spec” declares that “autolinks can only come at the
   509  // beginning of a line, after whitespace, or any of the delimiting
   510  // characters *, _, ~, and (”. However, the GitHub web site does not
   511  // enforce this rule: text like "$abc@def.ghi is my email" links the
   512  // text following the $ as an email address. It appears the actual rule
   513  // is that autolinks cannot come after ASCII letters, although they can
   514  // come after numbers or Unicode letters.
   515  // Since the only point of implementing GitHub Flavored Markdown
   516  // is to match GitHub's behavior, we do what they do, not what they say,
   517  // at least for now.
   518  func (p *parseState) autoLinkText(list []Inline) []Inline {
   519  	if !p.AutoLinkText {
   520  		return list
   521  	}
   522  
   523  	var out []Inline // allocated lazily when we first change list
   524  	for i, x := range list {
   525  		switch x := x.(type) {
   526  		case *Plain:
   527  			if rewrite := p.autoLinkPlain(x.Text); rewrite != nil {
   528  				if out == nil {
   529  					out = append(out, list[:i]...)
   530  				}
   531  				out = append(out, rewrite...)
   532  				continue
   533  			}
   534  		case *Strong:
   535  			x.Inner = p.autoLinkText(x.Inner)
   536  		case *Del:
   537  			x.Inner = p.autoLinkText(x.Inner)
   538  		case *Emph:
   539  			x.Inner = p.autoLinkText(x.Inner)
   540  		}
   541  		if out != nil {
   542  			out = append(out, x)
   543  		}
   544  	}
   545  	if out == nil {
   546  		return list
   547  	}
   548  	return out
   549  }
   550  
   551  func (p *parseState) autoLinkPlain(s string) []Inline {
   552  	vd := &validDomainChecker{s: s}
   553  	var out []Inline
   554  Restart:
   555  	for i := 0; i < len(s); i++ {
   556  		c := s[i]
   557  		if c == '@' {
   558  			if before, link, after, ok := p.parseAutoEmail(s, i); ok {
   559  				if before != "" {
   560  					out = append(out, &Plain{Text: before})
   561  				}
   562  				out = append(out, link)
   563  				vd.skip(len(s) - len(after))
   564  				s = after
   565  				goto Restart
   566  			}
   567  		}
   568  
   569  		if (c == 'h' || c == 'm' || c == 'x' || c == 'w') && (i == 0 || !isLetter(s[i-1])) {
   570  			if link, after, ok := p.parseAutoProto(s, i, vd); ok {
   571  				if i > 0 {
   572  					out = append(out, &Plain{Text: s[:i]})
   573  				}
   574  				out = append(out, link)
   575  				vd.skip(len(s) - len(after))
   576  				s = after
   577  				goto Restart
   578  			}
   579  		}
   580  	}
   581  	if out == nil {
   582  		return nil
   583  	}
   584  	out = append(out, &Plain{Text: s})
   585  	return out
   586  }
   587  
   588  func (p *parseState) parseAutoProto(s string, i int, vd *validDomainChecker) (link *Link, after string, found bool) {
   589  	if s == "" {
   590  		return
   591  	}
   592  	switch s[i] {
   593  	case 'h':
   594  		var n int
   595  		if strings.HasPrefix(s[i:], "https://") {
   596  			n = len("https://")
   597  		} else if strings.HasPrefix(s[i:], "http://") {
   598  			n = len("http://")
   599  		} else {
   600  			return
   601  		}
   602  		return p.parseAutoHTTP(s[i:i+n], s, i, i+n, i+n+1, vd)
   603  	case 'w':
   604  		if !strings.HasPrefix(s[i:], "www.") {
   605  			return
   606  		}
   607  		// GitHub Flavored Markdown says to use http://,
   608  		// but it's not 1985 anymore. We live in the https:// future
   609  		// (unless the parser is explicitly configured otherwise).
   610  		// People who really care in their docs can write http:// themselves.
   611  		scheme := "https://"
   612  		if p.AutoLinkAssumeHTTP {
   613  			scheme = "http://"
   614  		}
   615  		return p.parseAutoHTTP(scheme, s, i, i, i+3, vd)
   616  	case 'm':
   617  		if !strings.HasPrefix(s[i:], "mailto:") {
   618  			return
   619  		}
   620  		return p.parseAutoMailto(s, i)
   621  	case 'x':
   622  		if !strings.HasPrefix(s[i:], "xmpp:") {
   623  			return
   624  		}
   625  		return p.parseAutoXmpp(s, i)
   626  	}
   627  	return
   628  }
   629  
   630  // parseAutoWWW parses an extended www autolink.
   631  // https://github.github.com/gfm/#extended-www-autolink
   632  func (p *parseState) parseAutoHTTP(scheme, s string, textstart, start, min int, vd *validDomainChecker) (link *Link, after string, found bool) {
   633  	n, ok := vd.parseValidDomain(start)
   634  	if !ok {
   635  		return
   636  	}
   637  	i := start + n
   638  	domEnd := i
   639  
   640  	// “After a valid domain, zero or more non-space non-< characters may follow.”
   641  	paren := 0
   642  	for i < len(s) {
   643  		r, n := utf8.DecodeRuneInString(s[i:])
   644  		if isUnicodeSpace(r) || r == '<' {
   645  			break
   646  		}
   647  		if r == '(' {
   648  			paren++
   649  		}
   650  		if r == ')' {
   651  			paren--
   652  		}
   653  		i += n
   654  	}
   655  
   656  	// https://github.github.com/gfm/#extended-autolink-path-validation
   657  Trim:
   658  	for i > min {
   659  		switch s[i-1] {
   660  		case '?', '!', '.', ',', ':', '@', '_', '~':
   661  			// Trim certain trailing punctuation.
   662  			i--
   663  			continue Trim
   664  
   665  		case ')':
   666  			// Trim trailing unmatched (by count only) parens.
   667  			if paren < 0 {
   668  				for s[i-1] == ')' && paren < 0 {
   669  					paren++
   670  					i--
   671  				}
   672  				continue Trim
   673  			}
   674  
   675  		case ';':
   676  			// Trim entity reference.
   677  			// After doing the work of the scan, we either cut that part off the string
   678  			// or we stop the trimming entirely, so there's no chance of repeating
   679  			// the scan on a future iteration and going accidentally quadratic.
   680  			// Even though the Markdown spec already requires having a complete
   681  			// list of all the HTML entities, the GitHub definition here just requires
   682  			// "looks like" an entity, meaning its an ampersand, letters/digits, and semicolon.
   683  			for j := i - 2; j > start; j-- {
   684  				if j < i-2 && s[j] == '&' {
   685  					i = j
   686  					continue Trim
   687  				}
   688  				if !isLetterDigit(s[j]) {
   689  					break Trim
   690  				}
   691  			}
   692  		}
   693  		break Trim
   694  	}
   695  
   696  	// According to the literal text of the GitHub Flavored Markdown spec
   697  	// and the actual behavior on GitHub,
   698  	// www.example.com$foo turns into <a href="https://www.example.com$foo">,
   699  	// but that makes the character restrictions in the valid-domain check
   700  	// almost meaningless. So we insist that when all is said and done,
   701  	// if the domain is followed by anything, that thing must be a slash,
   702  	// even though GitHub is not that picky.
   703  	// People might complain about www.example.com:1234 not working,
   704  	// but if you want to get fancy with that kind of thing, just write http:// in front.
   705  	if textstart == start && i > domEnd && s[domEnd] != '/' {
   706  		i = domEnd
   707  	}
   708  
   709  	if i < min {
   710  		return
   711  	}
   712  
   713  	link = &Link{
   714  		Inner: []Inline{&Plain{Text: s[textstart:i]}},
   715  		URL:   scheme + s[start:i],
   716  	}
   717  	return link, s[i:], true
   718  }
   719  
   720  type validDomainChecker struct {
   721  	s   string
   722  	cut int // before this index, no valid domains
   723  }
   724  
   725  func (v *validDomainChecker) skip(i int) {
   726  	v.s = v.s[i:]
   727  	v.cut -= i
   728  }
   729  
   730  // parseValidDomain parses a valid domain.
   731  // https://github.github.com/gfm/#valid-domain
   732  //
   733  // If s starts with a valid domain, parseValidDomain returns
   734  // the length of that domain and true. If s does not start with
   735  // a valid domain, parseValidDomain returns n, false,
   736  // where n is the length of a prefix guaranteed not to be acceptable
   737  // to any future call to parseValidDomain.
   738  //
   739  // “A valid domain consists of segments of alphanumeric characters,
   740  // underscores (_) and hyphens (-) separated by periods (.).
   741  // There must be at least one period, and no underscores may be
   742  // present in the last two segments of the domain.”
   743  //
   744  // The spec does not spell out whether segments can be empty.
   745  // Empirically, in GitHub's implementation they can.
   746  func (v *validDomainChecker) parseValidDomain(start int) (n int, found bool) {
   747  	if start < v.cut {
   748  		return 0, false
   749  	}
   750  	i := start
   751  	dots := 0
   752  	for ; i < len(v.s); i++ {
   753  		c := v.s[i]
   754  		if c == '_' {
   755  			dots = -2
   756  			continue
   757  		}
   758  		if c == '.' {
   759  			dots++
   760  			continue
   761  		}
   762  		if !isLDH(c) {
   763  			break
   764  		}
   765  	}
   766  	if dots >= 0 && i > start {
   767  		return i - start, true
   768  	}
   769  	v.cut = i
   770  	return 0, false
   771  }
   772  
   773  func (p *parseState) parseAutoEmail(s string, i int) (before string, link *Link, after string, ok bool) {
   774  	if s[i] != '@' {
   775  		return
   776  	}
   777  
   778  	// “One ore more characters which are alphanumeric, or ., -, _, or +.”
   779  	j := i
   780  	for j > 0 && (isLDH(s[j-1]) || s[j-1] == '_' || s[j-1] == '+' || s[j-1] == '.') {
   781  		j--
   782  	}
   783  	if i-j < 1 {
   784  		return
   785  	}
   786  
   787  	// “One or more characters which are alphanumeric, or - or _, separated by periods (.).
   788  	// There must be at least one period. The last character must not be one of - or _.”
   789  	dots := 0
   790  	k := i + 1
   791  	for k < len(s) && (isLDH(s[k]) || s[k] == '_' || s[k] == '.') {
   792  		if s[k] == '.' {
   793  			if s[k-1] == '.' {
   794  				// Empirically, .. stops the scan but foo@.bar is fine.
   795  				break
   796  			}
   797  			dots++
   798  		}
   799  		k++
   800  	}
   801  
   802  	// “., -, and _ can occur on both sides of the @, but only . may occur at the end
   803  	// of the email address, in which case it will not be considered part of the address”
   804  	if s[k-1] == '.' {
   805  		dots--
   806  		k--
   807  	}
   808  	if s[k-1] == '-' || s[k-1] == '_' {
   809  		return
   810  	}
   811  	if k-(i+1)-dots < 2 || dots < 1 {
   812  		return
   813  	}
   814  
   815  	link = &Link{
   816  		Inner: []Inline{&Plain{Text: s[j:k]}},
   817  		URL:   "mailto:" + s[j:k],
   818  	}
   819  	return s[:j], link, s[k:], true
   820  }
   821  
   822  func (p *parseState) parseAutoMailto(s string, i int) (link *Link, after string, ok bool) {
   823  	j := i + len("mailto:")
   824  	for j < len(s) && (isLDH(s[j]) || s[j] == '_' || s[j] == '+' || s[j] == '.') {
   825  		j++
   826  	}
   827  	if j >= len(s) || s[j] != '@' {
   828  		return
   829  	}
   830  	before, link, after, ok := p.parseAutoEmail(s[i:], j-i)
   831  	if before != "mailto:" || !ok {
   832  		return nil, "", false
   833  	}
   834  	link.Inner[0] = &Plain{Text: s[i : len(s)-len(after)]}
   835  	return link, after, true
   836  }
   837  
   838  func (p *parseState) parseAutoXmpp(s string, i int) (link *Link, after string, ok bool) {
   839  	j := i + len("xmpp:")
   840  	for j < len(s) && (isLDH(s[j]) || s[j] == '_' || s[j] == '+' || s[j] == '.') {
   841  		j++
   842  	}
   843  	if j >= len(s) || s[j] != '@' {
   844  		return
   845  	}
   846  	before, link, after, ok := p.parseAutoEmail(s[i:], j-i)
   847  	if before != "xmpp:" || !ok {
   848  		return nil, "", false
   849  	}
   850  	if after != "" && after[0] == '/' {
   851  		k := 1
   852  		for k < len(after) && (isLetterDigit(after[k]) || after[k] == '@' || after[k] == '.') {
   853  			k++
   854  		}
   855  		after = after[k:]
   856  	}
   857  	url := s[i : len(s)-len(after)]
   858  	link.Inner[0] = &Plain{Text: url}
   859  	link.URL = url
   860  	return link, after, true
   861  }
   862
View as plain text