lib/MC/MCParser/AsmLexer.cpp

   1 //===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This class implements the lexer for assembly files.
  11 //
  12 //===----------------------------------------------------------------------===//
  13
  14 #include "llvm/MC/MCParser/AsmLexer.h"
  15 #include "llvm/MC/MCAsmInfo.h"
  16 #include "llvm/Support/MemoryBuffer.h"
  17 #include "llvm/Support/SMLoc.h"
  18 #include <cctype>
  19 #include <cerrno>
  20 #include <cstdio>
  21 #include <cstdlib>
  22 using namespace llvm;
  23
  24 AsmLexer::AsmLexer(const MCAsmInfo &_MAI) : MAI(_MAI)  {
  25   CurBuf = NULL;
  26   CurPtr = NULL;
  27   isAtStartOfLine = true;
  28   AllowAtInIdentifier = !StringRef(MAI.getCommentString()).startswith("@");
  29 }
  30
  31 AsmLexer::~AsmLexer() {
  32 }
  33
  34 void AsmLexer::setBuffer(const MemoryBuffer *buf, const char *ptr) {
  35   CurBuf = buf;
  36
  37   if (ptr)
  38     CurPtr = ptr;
  39   else
  40     CurPtr = CurBuf->getBufferStart();
  41
  42   TokStart = 0;
  43 }
  44
  45 /// ReturnError - Set the error to the specified string at the specified
  46 /// location.  This is defined to always return AsmToken::Error.
  47 AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) {
  48   SetError(SMLoc::getFromPointer(Loc), Msg);
  49
  50   return AsmToken(AsmToken::Error, StringRef(Loc, 0));
  51 }
  52
  53 int AsmLexer::getNextChar() {
  54   char CurChar = *CurPtr++;
  55   switch (CurChar) {
  56   default:
  57     return (unsigned char)CurChar;
  58   case 0:
  59     // A nul character in the stream is either the end of the current buffer or
  60     // a random nul in the file.  Disambiguate that here.
  61     if (CurPtr-1 != CurBuf->getBufferEnd())
  62       return 0;  // Just whitespace.
  63
  64     // Otherwise, return end of file.
  65     --CurPtr;  // Another call to lex will return EOF again.
  66     return EOF;
  67   }
  68 }
  69
  70 /// LexFloatLiteral: [0-9]*[.][0-9]*([eE][+-]?[0-9]*)?
  71 ///
  72 /// The leading integral digit sequence and dot should have already been
  73 /// consumed, some or all of the fractional digit sequence *can* have been
  74 /// consumed.
  75 AsmToken AsmLexer::LexFloatLiteral() {
  76   // Skip the fractional digit sequence.
  77   while (isdigit(*CurPtr))
  78     ++CurPtr;
  79
  80   // Check for exponent; we intentionally accept a slighlty wider set of
  81   // literals here and rely on the upstream client to reject invalid ones (e.g.,
  82   // "1e+").
  83   if (*CurPtr == 'e' || *CurPtr == 'E') {
  84     ++CurPtr;
  85     if (*CurPtr == '-' || *CurPtr == '+')
  86       ++CurPtr;
  87     while (isdigit(*CurPtr))
  88       ++CurPtr;
  89   }
  90
  91   return AsmToken(AsmToken::Real,
  92                   StringRef(TokStart, CurPtr - TokStart));
  93 }
  94
  95 /// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+
  96 /// while making sure there are enough actual digits around for the constant to
  97 /// be valid.
  98 ///
  99 /// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed
 100 /// before we get here.
 101 AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
 102   assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') &&
 103          "unexpected parse state in floating hex");
 104   bool NoFracDigits = true;
 105
 106   // Skip the fractional part if there is one
 107   if (*CurPtr == '.') {
 108     ++CurPtr;
 109
 110     const char *FracStart = CurPtr;
 111     while (isxdigit(*CurPtr))
 112       ++CurPtr;
 113
 114     NoFracDigits = CurPtr == FracStart;
 115   }
 116
 117   if (NoIntDigits && NoFracDigits)
 118     return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
 119                                  "expected at least one significand digit");
 120
 121   // Make sure we do have some kind of proper exponent part
 122   if (*CurPtr != 'p' && *CurPtr != 'P')
 123     return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
 124                                  "expected exponent part 'p'");
 125   ++CurPtr;
 126
 127   if (*CurPtr == '+' || *CurPtr == '-')
 128     ++CurPtr;
 129
 130   // N.b. exponent digits are *not* hex
 131   const char *ExpStart = CurPtr;
 132   while (isdigit(*CurPtr))
 133     ++CurPtr;
 134
 135   if (CurPtr == ExpStart)
 136     return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
 137                                  "expected at least one exponent digit");
 138
 139   return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
 140 }
 141
 142 /// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@?]*
 143 static bool IsIdentifierChar(char c, bool AllowAt) {
 144   return isalnum(c) || c == '_' || c == '$' || c == '.' ||
 145          (c == '@' && AllowAt) || c == '?';
 146 }
 147 AsmToken AsmLexer::LexIdentifier() {
 148   // Check for floating point literals.
 149   if (CurPtr[-1] == '.' && isdigit(*CurPtr)) {
 150     // Disambiguate a .1243foo identifier from a floating literal.
 151     while (isdigit(*CurPtr))
 152       ++CurPtr;
 153     if (*CurPtr == 'e' || *CurPtr == 'E' ||
 154         !IsIdentifierChar(*CurPtr, AllowAtInIdentifier))
 155       return LexFloatLiteral();
 156   }
 157
 158   while (IsIdentifierChar(*CurPtr, AllowAtInIdentifier))
 159     ++CurPtr;
 160
 161   // Handle . as a special case.
 162   if (CurPtr == TokStart+1 && TokStart[0] == '.')
 163     return AsmToken(AsmToken::Dot, StringRef(TokStart, 1));
 164
 165   return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart));
 166 }
 167
 168 /// LexSlash: Slash: /
 169 ///           C-Style Comment: /* ... */
 170 AsmToken AsmLexer::LexSlash() {
 171   switch (*CurPtr) {
 172   case '*': break; // C style comment.
 173   case '/': return ++CurPtr, LexLineComment();
 174   default:  return AsmToken(AsmToken::Slash, StringRef(CurPtr-1, 1));
 175   }
 176
 177   // C Style comment.
 178   ++CurPtr;  // skip the star.
 179   while (1) {
 180     int CurChar = getNextChar();
 181     switch (CurChar) {
 182     case EOF:
 183       return ReturnError(TokStart, "unterminated comment");
 184     case '*':
 185       // End of the comment?
 186       if (CurPtr[0] != '/') break;
 187
 188       ++CurPtr;   // End the */.
 189       return LexToken();
 190     }
 191   }
 192 }
 193
 194 /// LexLineComment: Comment: #[^\n]*
 195 ///                        : //[^\n]*
 196 AsmToken AsmLexer::LexLineComment() {
 197   // FIXME: This is broken if we happen to a comment at the end of a file, which
 198   // was .included, and which doesn't end with a newline.
 199   int CurChar = getNextChar();
 200   while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF)
 201     CurChar = getNextChar();
 202
 203   if (CurChar == EOF)
 204     return AsmToken(AsmToken::Eof, StringRef(CurPtr, 0));
 205   return AsmToken(AsmToken::EndOfStatement, StringRef(CurPtr, 0));
 206 }
 207
 208 static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
 209   // Skip ULL, UL, U, L and LL suffices.
 210   if (CurPtr[0] == 'U')
 211     ++CurPtr;
 212   if (CurPtr[0] == 'L')
 213     ++CurPtr;
 214   if (CurPtr[0] == 'L')
 215     ++CurPtr;
 216 }
 217
 218 // Look ahead to search for first non-hex digit, if it's [hH], then we treat the
 219 // integer as a hexadecimal, possibly with leading zeroes.
 220 static unsigned doLookAhead(const char *&CurPtr, unsigned DefaultRadix) {
 221   const char *FirstHex = 0;
 222   const char *LookAhead = CurPtr;
 223   while (1) {
 224     if (isdigit(*LookAhead)) {
 225       ++LookAhead;
 226     } else if (isxdigit(*LookAhead)) {
 227       if (!FirstHex)
 228         FirstHex = LookAhead;
 229       ++LookAhead;
 230     } else {
 231       break;
 232     }
 233   }
 234   bool isHex = *LookAhead == 'h' || *LookAhead == 'H';
 235   CurPtr = isHex || !FirstHex ? LookAhead : FirstHex;
 236   if (isHex)
 237     return 16;
 238   return DefaultRadix;
 239 }
 240
 241 /// LexDigit: First character is [0-9].
 242 ///   Local Label: [0-9][:]
 243 ///   Forward/Backward Label: [0-9][fb]
 244 ///   Binary integer: 0b[01]+
 245 ///   Octal integer: 0[0-7]+
 246 ///   Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH]
 247 ///   Decimal integer: [1-9][0-9]*
 248 AsmToken AsmLexer::LexDigit() {
 249   // Decimal integer: [1-9][0-9]*
 250   if (CurPtr[-1] != '0' || CurPtr[0] == '.') {
 251     unsigned Radix = doLookAhead(CurPtr, 10);
 252     bool isHex = Radix == 16;
 253     // Check for floating point literals.
 254     if (!isHex && (*CurPtr == '.' || *CurPtr == 'e')) {
 255       ++CurPtr;
 256       return LexFloatLiteral();
 257     }
 258
 259     StringRef Result(TokStart, CurPtr - TokStart);
 260
 261     long long Value;
 262     if (Result.getAsInteger(Radix, Value)) {
 263       // Allow positive values that are too large to fit into a signed 64-bit
 264       // integer, but that do fit in an unsigned one, we just convert them over.
 265       unsigned long long UValue;
 266       if (Result.getAsInteger(Radix, UValue))
 267         return ReturnError(TokStart, !isHex ? "invalid decimal number" :
 268                            "invalid hexdecimal number");
 269       Value = (long long)UValue;
 270     }
 271
 272     // Consume the [bB][hH].
 273     if (Radix == 2 || Radix == 16)
 274       ++CurPtr;
 275
 276     // The darwin/x86 (and x86-64) assembler accepts and ignores type
 277     // suffices on integer literals.
 278     SkipIgnoredIntegerSuffix(CurPtr);
 279
 280     return AsmToken(AsmToken::Integer, Result, Value);
 281   }
 282
 283   if (*CurPtr == 'b') {
 284     ++CurPtr;
 285     // See if we actually have "0b" as part of something like "jmp 0b\n"
 286     if (!isdigit(CurPtr[0])) {
 287       --CurPtr;
 288       StringRef Result(TokStart, CurPtr - TokStart);
 289       return AsmToken(AsmToken::Integer, Result, 0);
 290     }
 291     const char *NumStart = CurPtr;
 292     while (CurPtr[0] == '0' || CurPtr[0] == '1')
 293       ++CurPtr;
 294
 295     // Requires at least one binary digit.
 296     if (CurPtr == NumStart)
 297       return ReturnError(TokStart, "invalid binary number");
 298
 299     StringRef Result(TokStart, CurPtr - TokStart);
 300
 301     long long Value;
 302     if (Result.substr(2).getAsInteger(2, Value))
 303       return ReturnError(TokStart, "invalid binary number");
 304
 305     // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
 306     // suffixes on integer literals.
 307     SkipIgnoredIntegerSuffix(CurPtr);
 308
 309     return AsmToken(AsmToken::Integer, Result, Value);
 310   }
 311
 312   if (*CurPtr == 'x') {
 313     ++CurPtr;
 314     const char *NumStart = CurPtr;
 315     while (isxdigit(CurPtr[0]))
 316       ++CurPtr;
 317
 318     // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
 319     // diagnosed by LexHexFloatLiteral).
 320     if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P')
 321       return LexHexFloatLiteral(NumStart == CurPtr);
 322
 323     // Otherwise requires at least one hex digit.
 324     if (CurPtr == NumStart)
 325       return ReturnError(CurPtr-2, "invalid hexadecimal number");
 326
 327     unsigned long long Result;
 328     if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result))
 329       return ReturnError(TokStart, "invalid hexadecimal number");
 330
 331     // Consume the optional [hH].
 332     if (*CurPtr == 'h' || *CurPtr == 'H')
 333       ++CurPtr;
 334
 335     // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
 336     // suffixes on integer literals.
 337     SkipIgnoredIntegerSuffix(CurPtr);
 338
 339     return AsmToken(AsmToken::Integer, StringRef(TokStart, CurPtr - TokStart),
 340                     (int64_t)Result);
 341   }
 342
 343   // Either octal or hexadecimal.
 344   long long Value;
 345   unsigned Radix = doLookAhead(CurPtr, 8);
 346   bool isHex = Radix == 16;
 347   StringRef Result(TokStart, CurPtr - TokStart);
 348   if (Result.getAsInteger(Radix, Value))
 349     return ReturnError(TokStart, !isHex ? "invalid octal number" :
 350                        "invalid hexdecimal number");
 351
 352   // Consume the [hH].
 353   if (Radix == 16)
 354     ++CurPtr;
 355
 356   // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
 357   // suffixes on integer literals.
 358   SkipIgnoredIntegerSuffix(CurPtr);
 359
 360   return AsmToken(AsmToken::Integer, Result, Value);
 361 }
 362
 363 /// LexSingleQuote: Integer: 'b'
 364 AsmToken AsmLexer::LexSingleQuote() {
 365   int CurChar = getNextChar();
 366
 367   if (CurChar == '\\')
 368     CurChar = getNextChar();
 369
 370   if (CurChar == EOF)
 371     return ReturnError(TokStart, "unterminated single quote");
 372
 373   CurChar = getNextChar();
 374
 375   if (CurChar != '\'')
 376     return ReturnError(TokStart, "single quote way too long");
 377
 378   // The idea here being that 'c' is basically just an integral
 379   // constant.
 380   StringRef Res = StringRef(TokStart,CurPtr - TokStart);
 381   long long Value;
 382
 383   if (Res.startswith("\'\\")) {
 384     char theChar = Res[2];
 385     switch (theChar) {
 386       default: Value = theChar; break;
 387       case '\'': Value = '\''; break;
 388       case 't': Value = '\t'; break;
 389       case 'n': Value = '\n'; break;
 390       case 'b': Value = '\b'; break;
 391     }
 392   } else
 393     Value = TokStart[1];
 394
 395   return AsmToken(AsmToken::Integer, Res, Value);
 396 }
 397
 398
 399 /// LexQuote: String: "..."
 400 AsmToken AsmLexer::LexQuote() {
 401   int CurChar = getNextChar();
 402   // TODO: does gas allow multiline string constants?
 403   while (CurChar != '"') {
 404     if (CurChar == '\\') {
 405       // Allow \", etc.
 406       CurChar = getNextChar();
 407     }
 408
 409     if (CurChar == EOF)
 410       return ReturnError(TokStart, "unterminated string constant");
 411
 412     CurChar = getNextChar();
 413   }
 414
 415   return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
 416 }
 417
 418 StringRef AsmLexer::LexUntilEndOfStatement() {
 419   TokStart = CurPtr;
 420
 421   while (!isAtStartOfComment(*CurPtr) &&    // Start of line comment.
 422          !isAtStatementSeparator(CurPtr) && // End of statement marker.
 423          *CurPtr != '\n' &&
 424          *CurPtr != '\r' &&
 425          (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) {
 426     ++CurPtr;
 427   }
 428   return StringRef(TokStart, CurPtr-TokStart);
 429 }
 430
 431 StringRef AsmLexer::LexUntilEndOfLine() {
 432   TokStart = CurPtr;
 433
 434   while (*CurPtr != '\n' &&
 435          *CurPtr != '\r' &&
 436          (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) {
 437     ++CurPtr;
 438   }
 439   return StringRef(TokStart, CurPtr-TokStart);
 440 }
 441
 442 bool AsmLexer::isAtStartOfComment(char Char) {
 443   // FIXME: This won't work for multi-character comment indicators like "//".
 444   return Char == *MAI.getCommentString();
 445 }
 446
 447 bool AsmLexer::isAtStatementSeparator(const char *Ptr) {
 448   return strncmp(Ptr, MAI.getSeparatorString(),
 449                  strlen(MAI.getSeparatorString())) == 0;
 450 }
 451
 452 AsmToken AsmLexer::LexToken() {
 453   TokStart = CurPtr;
 454   // This always consumes at least one character.
 455   int CurChar = getNextChar();
 456
 457   if (isAtStartOfComment(CurChar)) {
 458     // If this comment starts with a '#', then return the Hash token and let
 459     // the assembler parser see if it can be parsed as a cpp line filename
 460     // comment. We do this only if we are at the start of a line.
 461     if (CurChar == '#' && isAtStartOfLine)
 462       return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
 463     isAtStartOfLine = true;
 464     return LexLineComment();
 465   }
 466   if (isAtStatementSeparator(TokStart)) {
 467     CurPtr += strlen(MAI.getSeparatorString()) - 1;
 468     return AsmToken(AsmToken::EndOfStatement,
 469                     StringRef(TokStart, strlen(MAI.getSeparatorString())));
 470   }
 471
 472   // If we're missing a newline at EOF, make sure we still get an
 473   // EndOfStatement token before the Eof token.
 474   if (CurChar == EOF && !isAtStartOfLine) {
 475     isAtStartOfLine = true;
 476     return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
 477   }
 478
 479   isAtStartOfLine = false;
 480   switch (CurChar) {
 481   default:
 482     // Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]*
 483     if (isalpha(CurChar) || CurChar == '_' || CurChar == '.')
 484       return LexIdentifier();
 485
 486     // Unknown character, emit an error.
 487     return ReturnError(TokStart, "invalid character in input");
 488   case EOF: return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
 489   case 0:
 490   case ' ':
 491   case '\t':
 492     if (SkipSpace) {
 493       // Ignore whitespace.
 494       return LexToken();
 495     } else {
 496       int len = 1;
 497       while (*CurPtr==' ' || *CurPtr=='\t') {
 498         CurPtr++;
 499         len++;
 500       }
 501       return AsmToken(AsmToken::Space, StringRef(TokStart, len));
 502     }
 503   case '\n': // FALL THROUGH.
 504   case '\r':
 505     isAtStartOfLine = true;
 506     return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
 507   case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1));
 508   case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1));
 509   case '-': return AsmToken(AsmToken::Minus, StringRef(TokStart, 1));
 510   case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1));
 511   case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1));
 512   case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1));
 513   case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1));
 514   case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1));
 515   case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1));
 516   case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1));
 517   case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1));
 518   case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1));
 519   case '$': return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1));
 520   case '@': return AsmToken(AsmToken::At, StringRef(TokStart, 1));
 521   case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1));
 522   case '=':
 523     if (*CurPtr == '=')
 524       return ++CurPtr, AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2));
 525     return AsmToken(AsmToken::Equal, StringRef(TokStart, 1));
 526   case '|':
 527     if (*CurPtr == '|')
 528       return ++CurPtr, AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2));
 529     return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1));
 530   case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1));
 531   case '&':
 532     if (*CurPtr == '&')
 533       return ++CurPtr, AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2));
 534     return AsmToken(AsmToken::Amp, StringRef(TokStart, 1));
 535   case '!':
 536     if (*CurPtr == '=')
 537       return ++CurPtr, AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2));
 538     return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1));
 539   case '%': return AsmToken(AsmToken::Percent, StringRef(TokStart, 1));
 540   case '/': return LexSlash();
 541   case '#': return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
 542   case '\'': return LexSingleQuote();
 543   case '"': return LexQuote();
 544   case '0': case '1': case '2': case '3': case '4':
 545   case '5': case '6': case '7': case '8': case '9':
 546     return LexDigit();
 547   case '<':
 548     switch (*CurPtr) {
 549     case '<': return ++CurPtr, AsmToken(AsmToken::LessLess,
 550                                         StringRef(TokStart, 2));
 551     case '=': return ++CurPtr, AsmToken(AsmToken::LessEqual,
 552                                         StringRef(TokStart, 2));
 553     case '>': return ++CurPtr, AsmToken(AsmToken::LessGreater,
 554                                         StringRef(TokStart, 2));
 555     default: return AsmToken(AsmToken::Less, StringRef(TokStart, 1));
 556     }
 557   case '>':
 558     switch (*CurPtr) {
 559     case '>': return ++CurPtr, AsmToken(AsmToken::GreaterGreater,
 560                                         StringRef(TokStart, 2));
 561     case '=': return ++CurPtr, AsmToken(AsmToken::GreaterEqual,
 562                                         StringRef(TokStart, 2));
 563     default: return AsmToken(AsmToken::Greater, StringRef(TokStart, 1));
 564     }
 565
 566   // TODO: Quoted identifiers (objc methods etc)
 567   // local labels: [0-9][:]
 568   // Forward/backward labels: [0-9][fb]
 569   // Integers, fp constants, character constants.
 570   }
 571 }