1 //===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // This class implements the lexer for assembly files.
12 //===----------------------------------------------------------------------===//
14 #include "llvm/MC/MCParser/AsmLexer.h"
15 #include "llvm/MC/MCAsmInfo.h"
16 #include "llvm/Support/MemoryBuffer.h"
17 #include "llvm/Support/SMLoc.h"
/// Construct a lexer bound to the target description \p _MAI.
/// The lexer starts out flagged as being at the start of a line.
24 AsmLexer::AsmLexer(const MCAsmInfo &_MAI) : MAI(_MAI) {
26 isAtStartOfLine = true;
// '@' is accepted inside identifiers only when the target's comment string
// does not itself begin with '@'.
27 AllowAtInIdentifier = !StringRef(MAI.getCommentString()).startswith("@");
/// Destructor for the lexer.
30 AsmLexer::~AsmLexer() {
/// setBuffer - Point the lexer at the text in \p Buf.
/// NOTE(review): the handling of \p ptr is not visible in this listing;
/// presumably it is an optional starting position inside the buffer - confirm.
33 void AsmLexer::setBuffer(StringRef Buf, const char *ptr) {
// Start lexing from the first character of the current buffer.
39 CurPtr = CurBuf.begin();
44 /// ReturnError - Set the error to the specified string at the specified
45 /// location. This is defined to always return AsmToken::Error.
/// The returned token carries an empty (zero-length) string that starts at
/// \p Loc.
46 AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) {
// Record the message and its source location on the lexer.
47 SetError(SMLoc::getFromPointer(Loc), Msg);
49 return AsmToken(AsmToken::Error, StringRef(Loc, 0));
/// getNextChar - Read the character at CurPtr and advance past it.
/// A nul byte is reported as 0 when it is not located at the end of the
/// current buffer; a nul at the end of the buffer is treated as end of file.
52 int AsmLexer::getNextChar() {
53 char CurChar = *CurPtr++;
// Go through unsigned char so an ordinary character is never returned as a
// negative int.
56 return (unsigned char)CurChar;
58 // A nul character in the stream is either the end of the current buffer or
59 // a random nul in the file. Disambiguate that here.
60 if (CurPtr - 1 != CurBuf.end())
61 return 0; // Just whitespace.
63 // Otherwise, return end of file.
64 --CurPtr; // Another call to lex will return EOF again.
69 /// LexFloatLiteral: [0-9]*[.][0-9]*([eE][+-]?[0-9]*)?
71 /// The leading integral digit sequence and dot should have already been
72 /// consumed, some or all of the fractional digit sequence *can* have been
/// Returns an AsmToken::Real whose text runs from TokStart to the current
/// position.
74 AsmToken AsmLexer::LexFloatLiteral() {
75 // Skip the fractional digit sequence.
76 while (isdigit(*CurPtr))
79 // Check for exponent; we intentionally accept a slightly wider set of
80 // literals here and rely on the upstream client to reject invalid ones (e.g.,
82 if (*CurPtr == 'e' || *CurPtr == 'E') {
// An optional sign may follow the exponent marker.
84 if (*CurPtr == '-' || *CurPtr == '+')
// Exponent digits.
86 while (isdigit(*CurPtr))
90 return AsmToken(AsmToken::Real,
91 StringRef(TokStart, CurPtr - TokStart));
94 /// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+
95 /// while making sure there are enough actual digits around for the constant to
98 /// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed
99 /// before we get here.
/// \param NoIntDigits true when the integer part contained no digits; combined
/// with the fraction check below so that at least one significand digit is
/// required.
/// \returns an AsmToken::Real on success, or the error token produced by
/// ReturnError for a malformed constant.
100 AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
101 assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') &&
102 "unexpected parse state in floating hex");
103 bool NoFracDigits = true;
105 // Skip the fractional part if there is one
106 if (*CurPtr == '.') {
// Remember where the fraction starts so an empty fraction can be detected.
109 const char *FracStart = CurPtr;
110 while (isxdigit(*CurPtr))
113 NoFracDigits = CurPtr == FracStart;
// A significand with neither integer nor fraction digits is rejected.
116 if (NoIntDigits && NoFracDigits)
117 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
118 "expected at least one significand digit");
120 // Make sure we do have some kind of proper exponent part
121 if (*CurPtr != 'p' && *CurPtr != 'P')
122 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
123 "expected exponent part 'p'");
// Optional exponent sign.
126 if (*CurPtr == '+' || *CurPtr == '-')
129 // N.b. exponent digits are *not* hex
130 const char *ExpStart = CurPtr;
131 while (isdigit(*CurPtr))
// The exponent must contain at least one decimal digit.
134 if (CurPtr == ExpStart)
135 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
136 "expected at least one exponent digit");
138 return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
141 /// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@?]*
/// IsIdentifierChar - Return true if \p c may continue an identifier:
/// an alphanumeric character, '_', '$', '.', '?', or '@' (the latter only
/// when \p AllowAt is true).
142 static bool IsIdentifierChar(char c, bool AllowAt) {
143 return isalnum(c) || c == '_' || c == '$' || c == '.' ||
144 (c == '@' && AllowAt) || c == '?';
/// LexIdentifier - Lex an identifier whose first character has already been
/// consumed (the code inspects CurPtr[-1]). A '.' followed by digits is handed
/// to LexFloatLiteral unless it turns out to be an identifier such as .1243foo;
/// a lone '.' yields a Dot token; everything else yields an Identifier token.
146 AsmToken AsmLexer::LexIdentifier() {
147 // Check for floating point literals.
148 if (CurPtr[-1] == '.' && isdigit(*CurPtr)) {
149 // Disambiguate a .1243foo identifier from a floating literal.
150 while (isdigit(*CurPtr))
// After the digits, an exponent marker or any non-identifier character means
// this is a floating-point literal.
152 if (*CurPtr == 'e' || *CurPtr == 'E' ||
153 !IsIdentifierChar(*CurPtr, AllowAtInIdentifier))
154 return LexFloatLiteral();
// Scan over the remaining identifier characters.
157 while (IsIdentifierChar(*CurPtr, AllowAtInIdentifier))
160 // Handle . as a special case.
161 if (CurPtr == TokStart+1 && TokStart[0] == '.')
162 return AsmToken(AsmToken::Dot, StringRef(TokStart, 1));
164 return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart));
167 /// LexSlash: Slash: /
168 /// C-Style Comment: /* ... */
/// A following '*' starts a C-style comment, a following '/' is handed to
/// LexLineComment, and anything else produces a one-character Slash token.
/// A C-style comment that is never closed is reported as an error.
169 AsmToken AsmLexer::LexSlash() {
171 case '*': break; // C style comment.
172 case '/': return ++CurPtr, LexLineComment();
173 default: return AsmToken(AsmToken::Slash, StringRef(CurPtr-1, 1));
177 ++CurPtr; // skip the star.
// Scan the comment body one character at a time.
179 int CurChar = getNextChar();
182 return ReturnError(TokStart, "unterminated comment");
184 // End of the comment?
185 if (CurPtr[0] != '/') break;
187 ++CurPtr; // End the */.
193 /// LexLineComment: Comment: #[^\n]*
/// Consumes characters until a newline, a carriage return, or EOF is read,
/// then produces either an Eof or an EndOfStatement token with empty text.
195 AsmToken AsmLexer::LexLineComment() {
196 // FIXME: This is broken if we happen to have a comment at the end of a file,
197 // which was .included, and which doesn't end with a newline.
198 int CurChar = getNextChar();
199 while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF)
200 CurChar = getNextChar();
203 return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
204 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 0));
/// SkipIgnoredIntegerSuffix - Look for an optional 'U' followed by up to two
/// 'L' characters at \p CurPtr. \p CurPtr is taken by reference so the
/// caller's position can be updated.
207 static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
208 // Skip ULL, UL, U, L and LL suffixes.
209 if (CurPtr[0] == 'U')
211 if (CurPtr[0] == 'L')
213 if (CurPtr[0] == 'L')
217 // Look ahead to search for first non-hex digit, if it's [hH], then we treat the
218 // integer as a hexadecimal, possibly with leading zeroes.
// On return CurPtr points either at the first non-hex-digit character (when
// an [hH] suffix was found or no hex letter was seen) or at the first hex
// letter encountered.
// NOTE(review): the return statement is not visible in this listing; callers
// compare the result against 16, so presumably it is 16 for the [hH] case and
// DefaultRadix otherwise - confirm.
219 static unsigned doLookAhead(const char *&CurPtr, unsigned DefaultRadix) {
220 const char *FirstHex = nullptr;
221 const char *LookAhead = CurPtr;
223 if (isdigit(*LookAhead)) {
225 } else if (isxdigit(*LookAhead)) {
// Hex letter (a-f/A-F) rather than a decimal digit.
227 FirstHex = LookAhead;
233 bool isHex = *LookAhead == 'h' || *LookAhead == 'H';
234 CurPtr = isHex || !FirstHex ? LookAhead : FirstHex;
/// intToken - Build the token for an integer literal with text \p Ref:
/// an Integer token when \p Value passes isIntN(64), otherwise a BigNum token.
240 static AsmToken intToken(StringRef Ref, APInt &Value)
242 if (Value.isIntN(64))
243 return AsmToken(AsmToken::Integer, Ref, Value);
244 return AsmToken(AsmToken::BigNum, Ref, Value);
247 /// LexDigit: First character is [0-9].
248 /// Local Label: [0-9][:]
249 /// Forward/Backward Label: [0-9][fb]
250 /// Binary integer: 0b[01]+
251 /// Octal integer: 0[0-7]+
252 /// Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH]
253 /// Decimal integer: [1-9][0-9]*
/// The first digit has already been consumed (the code inspects CurPtr[-1]).
/// Results are produced through intToken, LexFloatLiteral, LexHexFloatLiteral,
/// or ReturnError for malformed numbers.
254 AsmToken AsmLexer::LexDigit() {
255 // Decimal integer: [1-9][0-9]*
256 if (CurPtr[-1] != '0' || CurPtr[0] == '.') {
257 unsigned Radix = doLookAhead(CurPtr, 10);
258 bool isHex = Radix == 16;
259 // Check for floating point literals.
260 if (!isHex && (*CurPtr == '.' || *CurPtr == 'e')) {
262 return LexFloatLiteral();
265 StringRef Result(TokStart, CurPtr - TokStart);
// 128 bits of storage so that values that do not fit in 64 bits can still be
// parsed; intToken decides between Integer and BigNum.
267 APInt Value(128, 0, true);
268 if (Result.getAsInteger(Radix, Value))
269 return ReturnError(TokStart, !isHex ? "invalid decimal number" :
270 "invalid hexdecimal number");
272 // Consume the [bB][hH].
273 if (Radix == 2 || Radix == 16)
276 // The darwin/x86 (and x86-64) assembler accepts and ignores type
277 // suffixes on integer literals.
278 SkipIgnoredIntegerSuffix(CurPtr);
280 return intToken(Result, Value);
// Leading '0' followed by 'b': binary literal or a bare "0b".
283 if (*CurPtr == 'b') {
285 // See if we actually have "0b" as part of something like "jmp 0b\n"
286 if (!isdigit(CurPtr[0])) {
288 StringRef Result(TokStart, CurPtr - TokStart);
289 return AsmToken(AsmToken::Integer, Result, 0);
291 const char *NumStart = CurPtr;
292 while (CurPtr[0] == '0' || CurPtr[0] == '1')
295 // Requires at least one binary digit.
296 if (CurPtr == NumStart)
297 return ReturnError(TokStart, "invalid binary number");
299 StringRef Result(TokStart, CurPtr - TokStart);
301 APInt Value(128, 0, true);
// Parse only the digits, skipping the two-character "0b" prefix.
302 if (Result.substr(2).getAsInteger(2, Value))
303 return ReturnError(TokStart, "invalid binary number");
305 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
306 // suffixes on integer literals.
307 SkipIgnoredIntegerSuffix(CurPtr);
309 return intToken(Result, Value);
// Leading '0' followed by 'x': hexadecimal integer or hexadecimal
// floating-point literal.
312 if (*CurPtr == 'x') {
314 const char *NumStart = CurPtr;
315 while (isxdigit(CurPtr[0]))
318 // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
319 // diagnosed by LexHexFloatLiteral).
320 if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P')
321 return LexHexFloatLiteral(NumStart == CurPtr);
323 // Otherwise requires at least one hex digit.
324 if (CurPtr == NumStart)
325 return ReturnError(CurPtr-2, "invalid hexadecimal number");
327 APInt Result(128, 0);
328 if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result))
329 return ReturnError(TokStart, "invalid hexadecimal number");
331 // Consume the optional [hH].
332 if (*CurPtr == 'h' || *CurPtr == 'H')
335 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
336 // suffixes on integer literals.
337 SkipIgnoredIntegerSuffix(CurPtr);
339 return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
342 // Either octal or hexadecimal.
343 APInt Value(128, 0, true);
344 unsigned Radix = doLookAhead(CurPtr, 8);
345 bool isHex = Radix == 16;
346 StringRef Result(TokStart, CurPtr - TokStart);
347 if (Result.getAsInteger(Radix, Value))
348 return ReturnError(TokStart, !isHex ? "invalid octal number" :
349 "invalid hexdecimal number");
355 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
356 // suffixes on integer literals.
357 SkipIgnoredIntegerSuffix(CurPtr);
359 return intToken(Result, Value);
362 /// LexSingleQuote: Integer: 'b'
/// Lexes a character constant and returns it as an Integer token whose text is
/// the whole quoted form. A backslash escape is decoded through the switch
/// below; unrecognised escapes evaluate to the escaped character itself.
/// Malformed constants are reported through ReturnError.
363 AsmToken AsmLexer::LexSingleQuote() {
364 int CurChar = getNextChar();
367 CurChar = getNextChar();
370 return ReturnError(TokStart, "unterminated single quote");
372 CurChar = getNextChar();
375 return ReturnError(TokStart, "single quote way too long");
377 // The idea here being that 'c' is basically just an integral
379 StringRef Res = StringRef(TokStart,CurPtr - TokStart);
// Text beginning with a quote followed by a backslash is an escape sequence;
// Res[2] is the character after the backslash.
382 if (Res.startswith("\'\\")) {
383 char theChar = Res[2];
385 default: Value = theChar; break;
386 case '\'': Value = '\''; break;
387 case 't': Value = '\t'; break;
388 case 'n': Value = '\n'; break;
389 case 'b': Value = '\b'; break;
394 return AsmToken(AsmToken::Integer, Res, Value);
398 /// LexQuote: String: "..."
/// Reads characters until the closing double quote and returns a String token
/// whose text runs from TokStart to the current position. A string that is
/// never closed is reported as an error.
399 AsmToken AsmLexer::LexQuote() {
400 int CurChar = getNextChar();
401 // TODO: does gas allow multiline string constants?
402 while (CurChar != '"') {
403 if (CurChar == '\\') {
// Read the character following the backslash.
405 CurChar = getNextChar();
409 return ReturnError(TokStart, "unterminated string constant");
411 CurChar = getNextChar();
414 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
/// LexUntilEndOfStatement - Scan forward until the start of a comment, a
/// statement separator, a newline or carriage return, or the end of the
/// buffer, and return the text from TokStart up to that point.
417 StringRef AsmLexer::LexUntilEndOfStatement() {
420 while (!isAtStartOfComment(CurPtr) && // Start of line comment.
421 !isAtStatementSeparator(CurPtr) && // End of statement marker.
422 *CurPtr != '\n' && *CurPtr != '\r' &&
// A nul byte only stops the scan when it sits at the end of the buffer.
423 (*CurPtr != 0 || CurPtr != CurBuf.end())) {
426 return StringRef(TokStart, CurPtr-TokStart);
/// LexUntilEndOfLine - Scan forward until a newline, a carriage return, or the
/// end of the buffer, and return the text from TokStart up to that point.
429 StringRef AsmLexer::LexUntilEndOfLine() {
432 while (*CurPtr != '\n' && *CurPtr != '\r' &&
// A nul byte only stops the scan when it sits at the end of the buffer.
433 (*CurPtr != 0 || CurPtr != CurBuf.end())) {
436 return StringRef(TokStart, CurPtr-TokStart);
/// peekTok - Lex one token with LexToken() and then put the lexer back the way
/// it was: token start, current position, start-of-line flag, SkipSpace
/// setting, and the recorded error are all saved beforehand and restored
/// afterwards.
/// \param ShouldSkipSpace value assigned to SkipSpace while the token is lexed.
439 const AsmToken AsmLexer::peekTok(bool ShouldSkipSpace) {
440 const char *SavedTokStart = TokStart;
441 const char *SavedCurPtr = CurPtr;
442 bool SavedAtStartOfLine = isAtStartOfLine;
443 bool SavedSkipSpace = SkipSpace;
// Keep the current error so that an error raised while peeking does not
// replace it.
445 std::string SavedErr = getErr();
446 SMLoc SavedErrLoc = getErrLoc();
448 SkipSpace = ShouldSkipSpace;
449 AsmToken Token = LexToken();
451 SetError(SavedErrLoc, SavedErr);
453 SkipSpace = SavedSkipSpace;
454 isAtStartOfLine = SavedAtStartOfLine;
455 CurPtr = SavedCurPtr;
456 TokStart = SavedTokStart;
/// isAtStartOfComment - Return true if the text at \p Ptr begins with the
/// target's comment string (MAI.getCommentString()).
461 bool AsmLexer::isAtStartOfComment(const char *Ptr) {
462 const char *CommentString = MAI.getCommentString();
// Single-character comment string: compare just the first character.
464 if (CommentString[1] == '\0')
465 return CommentString[0] == Ptr[0];
467 // FIXME: special case for the bogus "##" comment string in X86MCAsmInfoDarwin
468 if (CommentString[1] == '#')
469 return CommentString[0] == Ptr[0];
// General case: match the full comment string as a prefix of Ptr.
471 return strncmp(Ptr, CommentString, strlen(CommentString)) == 0;
/// isAtStatementSeparator - Return true if the text at \p Ptr begins with the
/// target's statement separator string (MAI.getSeparatorString()).
474 bool AsmLexer::isAtStatementSeparator(const char *Ptr) {
475 return strncmp(Ptr, MAI.getSeparatorString(),
476 strlen(MAI.getSeparatorString())) == 0;
/// LexToken - Lex and return the next token. Comments and statement separators
/// are checked first; otherwise the token kind is chosen from the first
/// character, delegating to LexIdentifier, LexSlash, LexSingleQuote and
/// LexQuote for the multi-character forms visible below. The isAtStartOfLine
/// flag is updated along the way.
479 AsmToken AsmLexer::LexToken() {
481 // This always consumes at least one character.
482 int CurChar = getNextChar();
484 if (isAtStartOfComment(TokStart)) {
485 // If this comment starts with a '#', then return the Hash token and let
486 // the assembler parser see if it can be parsed as a cpp line filename
487 // comment. We do this only if we are at the start of a line.
488 if (CurChar == '#' && isAtStartOfLine)
489 return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
490 isAtStartOfLine = true;
491 return LexLineComment();
493 if (isAtStatementSeparator(TokStart)) {
// One character was already consumed above; skip the rest of the separator.
494 CurPtr += strlen(MAI.getSeparatorString()) - 1;
495 return AsmToken(AsmToken::EndOfStatement,
496 StringRef(TokStart, strlen(MAI.getSeparatorString())));
499 // If we're missing a newline at EOF, make sure we still get an
500 // EndOfStatement token before the Eof token.
501 if (CurChar == EOF && !isAtStartOfLine) {
502 isAtStartOfLine = true;
503 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
506 isAtStartOfLine = false;
509 // Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]*
510 if (isalpha(CurChar) || CurChar == '_' || CurChar == '.')
511 return LexIdentifier();
513 // Unknown character, emit an error.
514 return ReturnError(TokStart, "invalid character in input");
515 case EOF: return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
520 // Ignore whitespace.
// Absorb the run of spaces and tabs at the current position.
524 while (*CurPtr==' ' || *CurPtr=='\t') {
528 return AsmToken(AsmToken::Space, StringRef(TokStart, len));
530 case '\n': // FALL THROUGH.
// A line break ends the statement; the next token starts a new line.
532 isAtStartOfLine = true;
533 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
// Single-character punctuation tokens.
534 case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1));
535 case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1));
536 case '-': return AsmToken(AsmToken::Minus, StringRef(TokStart, 1));
537 case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1));
538 case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1));
539 case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1));
540 case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1));
541 case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1));
542 case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1));
543 case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1));
544 case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1));
545 case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1));
546 case '$': return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1));
547 case '@': return AsmToken(AsmToken::At, StringRef(TokStart, 1));
548 case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1));
// Operators that have both a one-character and a two-character form; the
// two-character form also consumes the second character.
551 return ++CurPtr, AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2));
552 return AsmToken(AsmToken::Equal, StringRef(TokStart, 1));
555 return ++CurPtr, AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2));
556 return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1));
557 case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1));
560 return ++CurPtr, AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2));
561 return AsmToken(AsmToken::Amp, StringRef(TokStart, 1));
564 return ++CurPtr, AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2));
565 return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1));
566 case '%': return AsmToken(AsmToken::Percent, StringRef(TokStart, 1));
567 case '/': return LexSlash();
568 case '#': return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
569 case '\'': return LexSingleQuote();
570 case '"': return LexQuote();
571 case '0': case '1': case '2': case '3': case '4':
572 case '5': case '6': case '7': case '8': case '9':
// Tokens that begin with '<'.
576 case '<': return ++CurPtr, AsmToken(AsmToken::LessLess,
577 StringRef(TokStart, 2));
578 case '=': return ++CurPtr, AsmToken(AsmToken::LessEqual,
579 StringRef(TokStart, 2));
580 case '>': return ++CurPtr, AsmToken(AsmToken::LessGreater,
581 StringRef(TokStart, 2));
582 default: return AsmToken(AsmToken::Less, StringRef(TokStart, 1));
// Tokens that begin with '>'.
586 case '>': return ++CurPtr, AsmToken(AsmToken::GreaterGreater,
587 StringRef(TokStart, 2));
588 case '=': return ++CurPtr, AsmToken(AsmToken::GreaterEqual,
589 StringRef(TokStart, 2));
590 default: return AsmToken(AsmToken::Greater, StringRef(TokStart, 1));
593 // TODO: Quoted identifiers (objc methods etc)
594 // local labels: [0-9][:]
595 // Forward/backward labels: [0-9][fb]
596 // Integers, fp constants, character constants.