# The CoffeeScript Lexer. Uses a series of token-matching regexes to attempt
# matches against the beginning of the source code. When a match is found,
# a token is produced, we consume the match, and start again. Tokens recorded
# through `Lexer::token` are in the form:
#
#     [tag, value, tokenLine, tokenColumn]
#
# A few tokens are pushed onto the stream directly (see `heregexToken` and
# `interpolateString`) and carry fewer fields. The stream is what gets handed
# to the Rewriter and then to [Jison](http://github.com/zaach/jison).

{Rewriter, INVERSES} = require './rewriter'

# Import the helpers we need.
{count, starts, compact, last} = require './helpers'

# The Lexer Class
# ---------------

# The Lexer class reads a stream of CoffeeScript and divvies it up into tagged
# tokens. Some potential ambiguity in the grammar has been avoided by
# pushing some extra smarts into the Lexer.
exports.Lexer = class Lexer

  # Advance the running line counter `@line` by `delta`. `label` names the
  # calling tokenizer; it is accepted for call-site readability only and is
  # not used by the body.
  bumpLine: (label, delta) ->
    @line += delta

  # **tokenize** is the Lexer's main method. Scan by attempting to match tokens
  # one at a time, using a regular expression anchored at the start of the
  # remaining code, or a custom recursive token-matching method
  # (for interpolations). When the next token has been recorded, we move forward
  # within the code past the token, and begin again.
  #
  # Each tokenizing method is responsible for returning the number of characters
  # it has consumed.
  #
  # Before returning the token stream, run it through the [Rewriter](rewriter.html)
  # unless explicitly asked not to (`opts.rewrite is off`). `opts.line` seeds
  # the running line counter.
  tokenize: (code, opts = {}) ->
    code = "\n#{code}" if WHITESPACE.test code
    code = code.replace(/\r/g, '').replace TRAILING_SPACES, ''

    @code      = code            # The remainder of the source code.
    @line      = opts.line or 0  # The current line.
    @startLine = @line
    @indent    = 0               # The current indentation level.
    @indebt    = 0               # The over-indentation at the current level.
    @outdebt   = 0               # The under-outdentation at the current level.
    @indents   = []              # The stack of all current indentation levels.
    @ends      = []              # The stack for pairing up tokens.
    @tokens    = []              # Stream of parsed tokens.

    # At every position, run through this list of attempted matches,
    # short-circuiting if any of them succeed. Their order determines precedence:
    # `@literalToken` is the fallback catch-all.
    #
    # `@tokenLine` is the number of newlines before position `i` and
    # `@tokenColumn` is the 1-based distance from the last newline before `i`.
    # Both are maintained incrementally from the text just consumed instead of
    # re-scanning the whole lexed prefix for every token.
    i           = 0
    linesSeen   = 0
    lastNewline = -1
    while @chunk = code[i..]
      @tokenLine   = linesSeen
      @tokenColumn = i - lastNewline
      consumed = @identifierToken() or
                 @commentToken()    or
                 @whitespaceToken() or
                 @lineToken()       or
                 @heredocToken()    or
                 @stringToken()     or
                 @numberToken()     or
                 @regexToken()      or
                 @jsToken()         or
                 @literalToken()
      lexed = code[i...i + consumed]
      if (offset = lexed.lastIndexOf '\n') >= 0
        linesSeen  += count lexed, '\n'
        lastNewline = i + offset
      i += consumed

    @closeIndentation()
    @error "missing #{tag}" if tag = @ends.pop()
    return @tokens if opts.rewrite is off
    (new Rewriter).rewrite @tokens

  # Tokenizers
  # ----------

  # Matches identifying literals: variables, keywords, method names, etc.
  # Check to ensure that JavaScript reserved words aren't being used as
  # identifiers. Because CoffeeScript reserves a handful of keywords that are
  # allowed in JavaScript, we're careful not to tag them as keywords when
  # referenced as property names here, so you can still do `jQuery.is()` even
  # though `is` means `===` otherwise.
  identifierToken: ->
    return 0 unless match = IDENTIFIER.exec @chunk
    [input, id, colon] = match

    if id is 'own' and @tag() is 'FOR'
      @token 'OWN', id
      return id.length

    # An identifier followed by a colon, or one that follows a property
    # accessor (or an unspaced `@`), is never treated as a keyword.
    prev = last @tokens
    afterAccessor = prev and
      (prev[0] in ['.', '?.', '::'] or not prev.spaced and prev[0] is '@')
    forcedIdentifier = colon or afterAccessor
    tag = 'IDENTIFIER'

    if not forcedIdentifier and (id in JS_KEYWORDS or id in COFFEE_KEYWORDS)
      tag = id.toUpperCase()
      if tag is 'WHEN' and @tag() in LINE_BREAK
        tag = 'LEADING_WHEN'
      else if tag is 'FOR'
        @seenFor = yes
      else if tag is 'UNLESS'
        tag = 'IF'
      else if tag in UNARY
        tag = 'UNARY'
      else if tag in RELATION
        if tag isnt 'INSTANCEOF' and @seenFor
          tag = 'FOR' + tag
          @seenFor = no
        else
          tag = 'RELATION'
          # Fold a preceding `!` (from `not`) into the relation itself.
          if @value() is '!'
            @tokens.pop()
            id = '!' + id

    if id in ['eval', 'arguments'].concat JS_FORBIDDEN
      if forcedIdentifier
        tag = 'IDENTIFIER'
        # Box the string so it can carry the `reserved` marker that
        # `literalToken` checks before rejecting an assignment.
        id = new String id
        id.reserved = yes
      else if id in RESERVED
        @error "reserved word \"#{id}\""

    unless forcedIdentifier
      id = COFFEE_ALIAS_MAP[id] if id in COFFEE_ALIASES
      tag = switch id
        when '!'                                  then 'UNARY'
        when '==', '!='                           then 'COMPARE'
        when '&&', '||'                           then 'LOGIC'
        when 'true', 'false', 'null', 'undefined' then 'BOOL'
        when 'break', 'continue'                  then 'STATEMENT'
        else tag

    @token tag, id
    @token ':', ':' if colon
    input.length

  # Matches numbers, including decimals, hex, binary and exponential notation.
  # Be careful not to interfere with ranges-in-progress.
  #
  # Binary literals are rewritten to their decimal value. The check is anchored
  # to the whole literal so that a hex literal which merely contains `0b`
  # (e.g. `0x0b1`) is left alone, and it is case-insensitive to agree with the
  # `i` flag on `NUMBER`, which also admits `0B…`.
  numberToken: ->
    return 0 unless match = NUMBER.exec @chunk
    number = match[0]
    lexedLength = number.length
    if binaryLiteral = /^0b([01]+)$/i.exec number
      number = (parseInt binaryLiteral[1], 2).toString()
    @token 'NUMBER', number
    lexedLength

  # Matches strings, including multi-line strings. Ensures that quotation marks
  # are balanced within the string's contents, and within nested interpolations.
  stringToken: ->
    switch @chunk.charAt 0
      when "'"
        return 0 unless match = SIMPLESTR.exec @chunk
        @token 'STRING', (string = match[0]).replace MULTILINER, '\\\n'
      when '"'
        return 0 unless string = @balancedString @chunk, '"'
        if 0 < string.indexOf '#{', 1
          @interpolateString string[1...-1]
        else
          @token 'STRING', @escapeLines string
      else
        return 0
    @bumpLine "stringToken", count string, '\n'
    string.length

  # Matches heredocs, adjusting indentation to the correct level, as heredocs
  # preserve whitespace, but ignore indentation to the left.
  heredocToken: ->
    return 0 unless match = HEREDOC.exec @chunk
    heredoc = match[0]
    quote = heredoc.charAt 0
    doc = @sanitizeHeredoc match[2], quote: quote, indent: null
    if quote is '"' and 0 <= doc.indexOf '#{'
      @interpolateString doc, heredoc: yes
    else
      @token 'STRING', @makeString doc, quote, yes
    @bumpLine "heredocToken", count heredoc, '\n'
    heredoc.length

  # Matches and consumes comments. Only block (`###`) comments produce tokens:
  # a `HERECOMMENT` followed by a `TERMINATOR`.
  commentToken: ->
    return 0 unless match = @chunk.match COMMENT
    [comment, here] = match
    if here
      @token 'HERECOMMENT', @sanitizeHeredoc(here,
        herecomment: true, indent: Array(@indent + 1).join(' '))
      @token 'TERMINATOR', '\n'
    @bumpLine "commentToken", count comment, '\n'
    comment.length

  # Matches JavaScript interpolated directly into the source via backticks.
  jsToken: ->
    return 0 unless @chunk.charAt(0) is '`' and match = JSTOKEN.exec @chunk
    @token 'JS', (script = match[0])[1...-1]
    script.length

  # Matches regular expression literals. Lexing regular expressions is difficult
  # to distinguish from division, so we borrow some basic heuristics from
  # JavaScript and Ruby.
  regexToken: ->
    return 0 if @chunk.charAt(0) isnt '/'
    if match = HEREGEX.exec @chunk
      length = @heregexToken match
      @bumpLine "regexToken", count match[0], '\n'
      return length

    # After these tokens a `/` is taken to be division, not a regex.
    prev = last @tokens
    return 0 if prev and (prev[0] in (if prev.spaced then NOT_REGEX else NOT_SPACED_REGEX))
    return 0 unless match = REGEX.exec @chunk
    [match, regex, flags] = match
    if regex[..1] is '/*' then @error 'regular expressions cannot begin with `*`'
    if regex is '//' then regex = '/(?:)/'
    @token 'REGEX', "#{regex}#{flags}"
    match.length

  # Matches multiline extended regular expressions. Without interpolation the
  # body is collapsed into a single `REGEX` token; with interpolation it is
  # emitted as a `RegExp(...)` call over concatenated string pieces.
  heregexToken: (match) ->
    [heregex, body, flags] = match
    if 0 > body.indexOf '#{'
      re = body.replace(HEREGEX_OMIT, '').replace(/\//g, '\\/')
      if re.match /^\*/ then @error 'regular expressions cannot begin with `*`'
      @token 'REGEX', "/#{ re or '(?:)' }/#{flags}"
      return heregex.length
    @token 'IDENTIFIER', 'RegExp'
    @tokens.push ['CALL_START', '(']
    tokens = []
    for [tag, value] in @interpolateString(body, regex: yes)
      if tag is 'TOKENS'
        tokens.push value...
      else
        continue unless value = value.replace HEREGEX_OMIT, ''
        value = value.replace /\\/g, '\\\\'
        tokens.push ['STRING', @makeString(value, '"', yes)]
      tokens.push ['+', '+']
    # Drop the trailing `+` left by the loop.
    tokens.pop()
    # Make sure the concatenation starts with a string.
    @tokens.push ['STRING', '""'], ['+', '+'] unless tokens[0]?[0] is 'STRING'
    @tokens.push tokens...
    @tokens.push [',', ','], ['STRING', '"' + flags + '"'] if flags
    @token ')', ')'
    heregex.length

  # Matches newlines, indents, and outdents, and determines which is which.
  # If we can detect that the current line is continued onto the next line,
  # then the newline is suppressed:
  #
  #     elements
  #       .each( ... )
  #       .map( ... )
  #
  # Keeps track of the level of indentation, because a single outdent token
  # can close multiple indents, so we need to know how far in we happen to be.
  lineToken: ->
    return 0 unless match = MULTI_DENT.exec @chunk
    indent = match[0]
    @bumpLine "lineToken", count indent, '\n'
    @seenFor = no
    # Width of the whitespace that follows the last newline of the match.
    size = indent.length - 1 - indent.lastIndexOf '\n'
    noNewlines = @unfinished()
    if size - @indebt is @indent
      if noNewlines then @suppressNewlines() else @newlineToken()
      return indent.length
    if size > @indent
      if noNewlines
        @indebt = size - @indent
        @suppressNewlines()
        return indent.length
      diff = size - @indent + @outdebt
      @token 'INDENT', diff
      @indents.push diff
      @ends.push 'OUTDENT'
      @outdebt = @indebt = 0
    else
      @indebt = 0
      @outdentToken @indent - size, noNewlines
    @indent = size
    indent.length

  # Record an outdent token or multiple tokens, if we happen to be moving back
  # inwards past several recorded indents.
  outdentToken: (moveOut, noNewlines) ->
    while moveOut > 0
      len = @indents.length - 1
      if @indents[len] is undefined
        moveOut = 0
      else if @indents[len] is @outdebt
        moveOut -= @outdebt
        @outdebt = 0
      else if @indents[len] < @outdebt
        @outdebt -= @indents[len]
        moveOut  -= @indents[len]
      else
        dent = @indents.pop() - @outdebt
        moveOut -= dent
        @outdebt = 0
        @pair 'OUTDENT'
        @token 'OUTDENT', dent
    @outdebt -= moveOut if dent
    @tokens.pop() while @value() is ';'
    @token 'TERMINATOR', '\n' unless @tag() is 'TERMINATOR' or noNewlines
    this

  # Matches and consumes non-meaningful whitespace. Tag the previous token
  # as being "spaced", because there are some cases where it makes a difference.
  # A leading newline only marks the previous token as `newLine` and consumes
  # nothing, so the next tokenizer in line gets to handle it.
  whitespaceToken: ->
    return 0 unless (match = WHITESPACE.exec @chunk) or @chunk.charAt(0) is '\n'
    prev = last @tokens
    prev[if match then 'spaced' else 'newLine'] = true if prev
    if match then match[0].length else 0

  # Generate a newline token. Consecutive newlines get merged together.
  # `@line` is stepped back by one while the token is recorded and restored
  # afterwards.
  newlineToken: ->
    @tokens.pop() while @value() is ';'
    @line -= 1
    @token 'TERMINATOR', '\n' unless @tag() is 'TERMINATOR'
    @line += 1
    this

  # Use a `\` at a line-ending to suppress the newline.
  # The slash is removed here once its job is done.
  suppressNewlines: ->
    @tokens.pop() if @value() is '\\'
    this

  # We treat all other single characters as a token. E.g.: `( ) , . !`
  # Multi-character operators are also literal tokens, so that Jison can assign
  # the proper order of operations. There are some symbols that we tag specially
  # here. `;` and newlines are both treated as a `TERMINATOR`, we distinguish
  # parentheses that indicate a method call from regular parentheses, and so on.
  literalToken: ->
    if match = OPERATOR.exec @chunk
      [value] = match
      @tagParameters() if CODE.test value
    else
      value = @chunk.charAt 0
    tag  = value
    prev = last @tokens
    if value is '=' and prev
      if not prev[1].reserved and prev[1] in JS_FORBIDDEN
        @error "reserved word \"#{@value()}\" can't be assigned"
      # `||` / `&&` directly followed by `=` become a single compound assignment.
      if prev[1] in ['||', '&&']
        prev[0] = 'COMPOUND_ASSIGN'
        prev[1] += '='
        return value.length
    if value is ';'
      @seenFor = no
      tag = 'TERMINATOR'
    else if value in MATH            then tag = 'MATH'
    else if value in COMPARE         then tag = 'COMPARE'
    else if value in COMPOUND_ASSIGN then tag = 'COMPOUND_ASSIGN'
    else if value in UNARY           then tag = 'UNARY'
    else if value in SHIFT           then tag = 'SHIFT'
    else if value in LOGIC or value is '?' and prev?.spaced then tag = 'LOGIC'
    else if prev and not prev.spaced
      if value is '(' and prev[0] in CALLABLE
        prev[0] = 'FUNC_EXIST' if prev[0] is '?'
        tag = 'CALL_START'
      else if value is '[' and prev[0] in INDEXABLE
        tag = 'INDEX_START'
        switch prev[0]
          when '?' then prev[0] = 'INDEX_SOAK'
    switch value
      when '(', '{', '[' then @ends.push INVERSES[value]
      when ')', '}', ']' then @pair value
    @token tag, value
    value.length

  # Token Manipulators
  # ------------------

  # Sanitize a heredoc or herecomment by
  # erasing all external indentation on the left-hand side.
  # `options.indent` is the indentation to strip (for heredocs it is computed
  # as the shortest non-empty indent when passed as `null`);
  # `options.herecomment` selects block-comment handling.
  sanitizeHeredoc: (doc, options) ->
    {indent, herecomment} = options
    if herecomment
      if HEREDOC_ILLEGAL.test doc
        @error "block comment cannot contain \"*/\", starting"
      return doc if doc.indexOf('\n') <= 0
    else
      while match = HEREDOC_INDENT.exec doc
        attempt = match[1]
        indent = attempt if indent is null or 0 < attempt.length < indent.length
    doc = doc.replace /// \n #{indent} ///g, '\n' if indent
    doc = doc.replace /^\n/, '' unless herecomment
    doc

  # A source of ambiguity in our grammar used to be parameter lists in function
  # definitions versus argument lists in function calls. Walk backwards, tagging
  # parameters specially in order to make things easier for the parser.
  tagParameters: ->
    return this if @tag() isnt ')'
    stack = []
    {tokens} = this
    i = tokens.length
    tokens[--i][0] = 'PARAM_END'
    while tok = tokens[--i]
      switch tok[0]
        when ')'
          stack.push tok
        when '(', 'CALL_START'
          if stack.length then stack.pop()
          else if tok[0] is '('
            tok[0] = 'PARAM_START'
            return this
          else return this
    this

  # Close up all remaining open blocks at the end of the file.
  closeIndentation: ->
    @outdentToken @indent

  # Matches a balanced group such as a single or double-quoted string. Pass in
  # a series of delimiters, all of which must be nested correctly within the
  # contents of the string. This method allows us to have strings within
  # interpolations within strings, ad infinitum.
  balancedString: (str, end) ->
    continueCount = 0
    stack = [end]
    for i in [1...str.length]
      # Skip characters that were already accounted for (escapes, regexes).
      if continueCount
        --continueCount
        continue
      switch letter = str.charAt i
        when '\\'
          ++continueCount
          continue
        when end
          stack.pop()
          unless stack.length
            return str[0..i]
          end = stack[stack.length - 1]
          continue
      if end is '}' and letter in ['"', "'"]
        stack.push end = letter
      else if end is '}' and letter is '/' and match = (HEREGEX.exec(str[i..]) or REGEX.exec(str[i..]))
        continueCount += match[0].length - 1
      else if end is '}' and letter is '{'
        stack.push end = '}'
      else if end is '"' and prev is '#' and letter is '{'
        stack.push end = '}'
      prev = letter
    @error "missing #{ stack.pop() }, starting"

  # Expand variables and expressions inside double-quoted strings using
  # Ruby-like notation for substitution of arbitrary expressions.
  #
  #     "Hello #{name.capitalize()}."
  #
  # If it encounters an interpolation, this method will recursively create a
  # new Lexer, tokenize the interpolated contents, and merge them into the
  # token stream. With `options.regex` set, the collected pieces are returned
  # without being added to the stream.
  interpolateString: (str, options = {}) ->
    {heredoc, regex} = options
    tokens = []
    pi = 0
    i  = -1
    while letter = str.charAt i += 1
      if letter is '\\'
        i += 1
        continue
      unless letter is '#' and str.charAt(i+1) is '{' and
             (expr = @balancedString str[i + 1..], '}')
        continue
      tokens.push ['NEOSTRING', str[pi...i]] if pi < i
      inner = expr[1...-1]
      if inner.length
        nested = new Lexer().tokenize inner, line: @line, rewrite: off
        nested.pop()
        nested.shift() if nested[0]?[0] is 'TERMINATOR'
        if len = nested.length
          if len > 1
            nested.unshift ['(', '(', @line]
            nested.push    [')', ')', @line]
          tokens.push ['TOKENS', nested]
      i += expr.length
      pi = i + 1
    tokens.push ['NEOSTRING', str[pi..]] if i > pi < str.length
    return tokens if regex
    return @token 'STRING', '""' unless tokens.length
    tokens.unshift ['', ''] unless tokens[0][0] is 'NEOSTRING'
    @token '(', '(' if interpolated = tokens.length > 1
    for [tag, value], i in tokens
      @token '+', '+' if i
      if tag is 'TOKENS'
        @tokens.push value...
      else
        @token 'STRING', @makeString value, '"', heredoc
    @token ')', ')' if interpolated
    tokens

  # Pairs up a closing token, ensuring that all listed pairs of tokens are
  # correctly balanced throughout the course of the token stream.
  pair: (tag) ->
    unless tag is wanted = last @ends
      @error "unmatched #{tag}" unless 'OUTDENT' is wanted
      # Auto-close INDENT to support syntax like this:
      #
      #     el.click((event) ->
      #       el.hide())
      #
      @indent -= size = last @indents
      @outdentToken size, true
      return @pair tag
    @ends.pop()

  # Helpers
  # -------

  # Add a token to the results, stamped with the position recorded for the
  # current chunk (`@tokenLine`, `@tokenColumn`), and sync `@startLine` to
  # `@line`.
  token: (tag, value) ->
    @tokens.push [tag, value, @tokenLine, @tokenColumn]
    @startLine = @line

  # Peek at a tag in the current token stream (or overwrite it when `tag` is
  # given).
  tag: (index, tag) ->
    (tok = last @tokens, index) and if tag then tok[0] = tag else tok[0]

  # Peek at a value in the current token stream (or overwrite it when `val` is
  # given).
  value: (index, val) ->
    (tok = last @tokens, index) and if val then tok[1] = val else tok[1]

  # Are we in the midst of an unfinished expression?
  unfinished: ->
    LINE_CONTINUER.test(@chunk) or
    @tag() in ['\\', '.', '?.', 'UNARY', 'MATH', '+', '-', 'SHIFT', 'RELATION',
               'COMPARE', 'LOGIC', 'THROW', 'EXTENDS']

  # Converts newlines for string literals.
  escapeLines: (str, heredoc) ->
    str.replace MULTILINER, if heredoc then '\\n' else ''

  # Constructs a string token by escaping quotes and newlines.
  makeString: (body, quote, heredoc) ->
    return quote + quote unless body
    body = body.replace /\\([\s\S])/g, (match, contents) ->
      if contents in ['\n', quote] then contents else match
    body = body.replace /// #{quote} ///g, '\\$&'
    quote + @escapeLines(body, heredoc) + quote

  # Throws a syntax error on the current `@line`.
  error: (message) ->
    throw SyntaxError "#{message} on line #{ @line + 1}"

# Constants
# ---------

# Keywords that CoffeeScript shares in common with JavaScript.
JS_KEYWORDS = [
  'true', 'false', 'null', 'this',
  'new', 'delete', 'typeof', 'in', 'instanceof',
  'return', 'throw', 'break', 'continue', 'debugger',
  'if', 'else', 'switch', 'for', 'while', 'do', 'try', 'catch', 'finally',
  'class', 'extends', 'super'
]

# Keywords that exist only in CoffeeScript.
COFFEE_KEYWORDS = [
  'undefined', 'then', 'unless', 'until'
  'loop', 'of', 'by', 'when'
]

# Word-style aliases and the operator or literal each one stands for.
COFFEE_ALIAS_MAP = {
  and:  '&&'
  or:   '||'
  is:   '=='
  isnt: '!='
  not:  '!'
  yes:  'true'
  no:   'false'
  on:   'true'
  off:  'false'
}

# Every alias is also a CoffeeScript keyword.
COFFEE_ALIASES  = (alias for alias of COFFEE_ALIAS_MAP)
COFFEE_KEYWORDS = COFFEE_KEYWORDS.concat(COFFEE_ALIASES)

# Words reserved by JavaScript but unused, or used internally by CoffeeScript.
# Encountering one of these is an error, so that the problem surfaces at
# compile time rather than as a JavaScript error at runtime.
RESERVED = [
  'case', 'default', 'function', 'var', 'void', 'with',
  'const', 'let', 'enum', 'export', 'import', 'native',
  '__hasProp', '__extends', '__slice', '__bind', '__indexOf'
]

# JavaScript keywords plus reserved words: none of these may be used as
# identifiers or properties.
JS_FORBIDDEN = JS_KEYWORDS.concat(RESERVED)

exports.RESERVED = RESERVED.concat JS_KEYWORDS, COFFEE_KEYWORDS

# Token matching regexes.
IDENTIFIER = /// ^
  ( [$A-Za-z_\x7f-\uffff][$\w\x7f-\uffff]* )
  ( [^\n\S]* : (?!:) )?  # Is this a property name?
///

NUMBER = ///
  ^ 0x[\da-f]+ |                # hex
  ^ 0b[01]+ |                   # binary
  ^ \d*\.?\d+ (?:e[+-]?\d+)?    # decimal
///i

HEREDOC = /// ^ ("""|''') ([\s\S]*?) (?:\n[^\n\S]*)? \1 ///

OPERATOR = /// ^ (
  ?: [-=]>             # function
   | [-+*/%<>&|^!?=]=  # compound assign / compare
   | >>>=?             # zero-fill right shift
   | ([-+:])\1         # doubles
   | ([&|<>])\2=?      # logic / shift
   | \?\.              # soak access
   | \.{2,3}           # range or splat
) ///

WHITESPACE = /^[^\n\S]+/

COMMENT = /^###([^#][\s\S]*?)(?:###[^\n\S]*|(?:###)?$)|^(?:\s*#(?!##[^#]).*)+/

CODE = /^[-=]>/

MULTI_DENT = /^(?:\n[^\n\S]*)+/

SIMPLESTR = /^'[^\\']*(?:\\.[^\\']*)*'/

JSTOKEN = /^`[^\\`]*(?:\\.[^\\`]*)*`/

# Regexes that match regex literals.
REGEX = /// ^
  (/ (?! [\s=] )   # disallow leading whitespace or equals signs
  [^ [ / \n \\ ]*  # every other thing
  (?:
    (?: \\[\s\S]   # anything escaped
      | \[         # character class
           [^ \] \n \\ ]*
           (?: \\[\s\S] [^ \] \n \\ ]* )*
         ]
    ) [^ [ / \n \\ ]*
  )*
  /) ([imgy]{0,4}) (?!\w)
///

HEREGEX = /// ^ /{3} ([\s\S]+?) /{3} ([imgy]{0,4}) (?!\w) ///

HEREGEX_OMIT = /\s+(?:#.*)?/g

# Regexes used to clean up token text.
MULTILINER = /\n/g

HEREDOC_INDENT = /\n+([^\n\S]*)/g

HEREDOC_ILLEGAL = /\*\//

LINE_CONTINUER = /// ^ \s* (?: , | \??\.(?![.\d]) | :: ) ///

TRAILING_SPACES = /\s+$/

# Compound assignment operators.
COMPOUND_ASSIGN = [
  '-=', '+=', '/=', '*=', '%=', '||=', '&&=',
  '?=', '<<=', '>>=', '>>>=', '&=', '^=', '|='
]

# Unary operators (symbols and upper-cased keyword tags).
UNARY = [
  '!', '~'
  'NEW', 'TYPEOF', 'DELETE', 'DO'
]

# Logical and bitwise operators, all tagged `LOGIC`.
LOGIC = ['&&', '||', '&', '|', '^']

# Bit-shift operators.
SHIFT = ['<<', '>>', '>>>']

# Comparison operators.
COMPARE = ['==', '!=', '<', '>', '<=', '>=']

# Multiplicative operators.
MATH = ['*', '/', '%']

# Relational keyword tags that may be negated with a `not` prefix.
RELATION = ['IN', 'OF', 'INSTANCEOF']

# Boolean-like literal tags.
BOOL = ['TRUE', 'FALSE', 'NULL', 'UNDEFINED']

# Tokens which a regular expression will never immediately follow, but which
# a division operator might.
#
# See: http://www.mozilla.org/js/language/js20-2002-04/rationale/syntax.html#regular-expressions
#
# Our list is shorter, due to sans-parentheses method calls.
NOT_REGEX = ['NUMBER', 'REGEX', 'BOOL', '++', '--', ']']

# When the previous token is not followed by whitespace, these additional
# tokens also force a division parse:
NOT_SPACED_REGEX = NOT_REGEX.concat [')', '}', 'THIS', 'IDENTIFIER', 'STRING']

# Tokens which could legitimately be invoked or indexed. An opening
# parenthesis or bracket following these tokens will be recorded as the start
# of a function invocation or indexing operation.
CALLABLE = [
  'IDENTIFIER', 'STRING', 'REGEX'
  ')', ']', '}', '?', '::', '@'
  'THIS', 'SUPER'
]

# Everything callable can also be indexed, as can numbers and booleans.
INDEXABLE = CALLABLE.concat ['NUMBER', 'BOOL']

# Tokens that, when immediately preceding a `WHEN`, indicate that the `WHEN`
# occurs at the start of a line. We disambiguate these from trailing whens to
# avoid an ambiguity in the grammar.
LINE_BREAK = [
  'INDENT'
  'OUTDENT'
  'TERMINATOR'
]