utils/TableGen/TGLexer.cpp

   1 //===- TGLexer.cpp - Lexer for TableGen -----------------------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // Implement the Lexer for TableGen.
  11 //
  12 //===----------------------------------------------------------------------===//
  13
  14 #include "TGLexer.h"
  15 #include "llvm/ADT/Twine.h"
  16 #include "llvm/Support/SourceMgr.h"
  17 #include "llvm/Support/MemoryBuffer.h"
  18 #include "llvm/Config/config.h"
  19 #include <cctype>
  20 #include <cstdio>
  21 #include <cstdlib>
  22 #include <cstring>
  23 #include <cerrno>
  24 using namespace llvm;
  25
  26 TGLexer::TGLexer(SourceMgr &SM) : SrcMgr(SM) {
  27   CurBuffer = 0;
  28   CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
  29   CurPtr = CurBuf->getBufferStart();
  30   TokStart = 0;
  31 }
  32
  33 SMLoc TGLexer::getLoc() const {
  34   return SMLoc::getFromPointer(TokStart);
  35 }
  36
  37
  38 /// ReturnError - Set the error to the specified string at the specified
  39 /// location.  This is defined to always return tgtok::Error.
  40 tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) {
  41   PrintError(Loc, Msg);
  42   return tgtok::Error;
  43 }
  44
  45
  46 void TGLexer::PrintError(const char *Loc, const Twine &Msg) const {
  47   SrcMgr.PrintMessage(SMLoc::getFromPointer(Loc), Msg, "error");
  48 }
  49
  50 void TGLexer::PrintError(SMLoc Loc, const Twine &Msg) const {
  51   SrcMgr.PrintMessage(Loc, Msg, "error");
  52 }
  53
  54
  55 int TGLexer::getNextChar() {
  56   char CurChar = *CurPtr++;
  57   switch (CurChar) {
  58   default:
  59     return (unsigned char)CurChar;
  60   case 0: {
  61     // A nul character in the stream is either the end of the current buffer or
  62     // a random nul in the file.  Disambiguate that here.
  63     if (CurPtr-1 != CurBuf->getBufferEnd())
  64       return 0;  // Just whitespace.
  65
  66     // If this is the end of an included file, pop the parent file off the
  67     // include stack.
  68     SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
  69     if (ParentIncludeLoc != SMLoc()) {
  70       CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc);
  71       CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
  72       CurPtr = ParentIncludeLoc.getPointer();
  73       return getNextChar();
  74     }
  75
  76     // Otherwise, return end of file.
  77     --CurPtr;  // Another call to lex will return EOF again.
  78     return EOF;
  79   }
  80   case '\n':
  81   case '\r':
  82     // Handle the newline character by ignoring it and incrementing the line
  83     // count.  However, be careful about 'dos style' files with \n\r in them.
  84     // Only treat a \n\r or \r\n as a single line.
  85     if ((*CurPtr == '\n' || (*CurPtr == '\r')) &&
  86         *CurPtr != CurChar)
  87       ++CurPtr;  // Eat the two char newline sequence.
  88     return '\n';
  89   }
  90 }
  91
  92 tgtok::TokKind TGLexer::LexToken() {
  93   TokStart = CurPtr;
  94   // This always consumes at least one character.
  95   int CurChar = getNextChar();
  96
  97   switch (CurChar) {
  98   default:
  99     // Handle letters: [a-zA-Z_#]
 100     if (isalpha(CurChar) || CurChar == '_' || CurChar == '#')
 101       return LexIdentifier();
 102
 103     // Unknown character, emit an error.
 104     return ReturnError(TokStart, "Unexpected character");
 105   case EOF: return tgtok::Eof;
 106   case ':': return tgtok::colon;
 107   case ';': return tgtok::semi;
 108   case '.': return tgtok::period;
 109   case ',': return tgtok::comma;
 110   case '<': return tgtok::less;
 111   case '>': return tgtok::greater;
 112   case ']': return tgtok::r_square;
 113   case '{': return tgtok::l_brace;
 114   case '}': return tgtok::r_brace;
 115   case '(': return tgtok::l_paren;
 116   case ')': return tgtok::r_paren;
 117   case '=': return tgtok::equal;
 118   case '?': return tgtok::question;
 119
 120   case 0:
 121   case ' ':
 122   case '\t':
 123   case '\n':
 124   case '\r':
 125     // Ignore whitespace.
 126     return LexToken();
 127   case '/':
 128     // If this is the start of a // comment, skip until the end of the line or
 129     // the end of the buffer.
 130     if (*CurPtr == '/')
 131       SkipBCPLComment();
 132     else if (*CurPtr == '*') {
 133       if (SkipCComment())
 134         return tgtok::Error;
 135     } else // Otherwise, this is an error.
 136       return ReturnError(TokStart, "Unexpected character");
 137     return LexToken();
 138   case '-': case '+':
 139   case '0': case '1': case '2': case '3': case '4': case '5': case '6':
 140   case '7': case '8': case '9':
 141     return LexNumber();
 142   case '"': return LexString();
 143   case '$': return LexVarName();
 144   case '[': return LexBracket();
 145   case '!': return LexExclaim();
 146   }
 147 }
 148
 149 /// LexString - Lex "[^"]*"
 150 tgtok::TokKind TGLexer::LexString() {
 151   const char *StrStart = CurPtr;
 152
 153   CurStrVal = "";
 154
 155   while (*CurPtr != '"') {
 156     // If we hit the end of the buffer, report an error.
 157     if (*CurPtr == 0 && CurPtr == CurBuf->getBufferEnd())
 158       return ReturnError(StrStart, "End of file in string literal");
 159
 160     if (*CurPtr == '\n' || *CurPtr == '\r')
 161       return ReturnError(StrStart, "End of line in string literal");
 162
 163     if (*CurPtr != '\\') {
 164       CurStrVal += *CurPtr++;
 165       continue;
 166     }
 167
 168     ++CurPtr;
 169
 170     switch (*CurPtr) {
 171     case '\\': case '\'': case '"':
 172       // These turn into their literal character.
 173       CurStrVal += *CurPtr++;
 174       break;
 175     case 't':
 176       CurStrVal += '\t';
 177       ++CurPtr;
 178       break;
 179     case 'n':
 180       CurStrVal += '\n';
 181       ++CurPtr;
 182       break;
 183
 184     case '\n':
 185     case '\r':
 186       return ReturnError(CurPtr, "escaped newlines not supported in tblgen");
 187
 188     // If we hit the end of the buffer, report an error.
 189     case '\0':
 190       if (CurPtr == CurBuf->getBufferEnd())
 191         return ReturnError(StrStart, "End of file in string literal");
 192       // FALL THROUGH
 193     default:
 194       return ReturnError(CurPtr, "invalid escape in string literal");
 195     }
 196   }
 197
 198   ++CurPtr;
 199   return tgtok::StrVal;
 200 }
 201
 202 tgtok::TokKind TGLexer::LexVarName() {
 203   if (!isalpha(CurPtr[0]) && CurPtr[0] != '_')
 204     return ReturnError(TokStart, "Invalid variable name");
 205
 206   // Otherwise, we're ok, consume the rest of the characters.
 207   const char *VarNameStart = CurPtr++;
 208
 209   while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
 210     ++CurPtr;
 211
 212   CurStrVal.assign(VarNameStart, CurPtr);
 213   return tgtok::VarName;
 214 }
 215
 216
 217 tgtok::TokKind TGLexer::LexIdentifier() {
 218   // The first letter is [a-zA-Z_#].
 219   const char *IdentStart = TokStart;
 220
 221   // Match the rest of the identifier regex: [0-9a-zA-Z_#]*
 222   while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_' ||
 223          *CurPtr == '#')
 224     ++CurPtr;
 225
 226
 227   // Check to see if this identifier is a keyword.
 228   unsigned Len = CurPtr-IdentStart;
 229
 230   if (Len == 3 && !memcmp(IdentStart, "int", 3)) return tgtok::Int;
 231   if (Len == 3 && !memcmp(IdentStart, "bit", 3)) return tgtok::Bit;
 232   if (Len == 4 && !memcmp(IdentStart, "bits", 4)) return tgtok::Bits;
 233   if (Len == 6 && !memcmp(IdentStart, "string", 6)) return tgtok::String;
 234   if (Len == 4 && !memcmp(IdentStart, "list", 4)) return tgtok::List;
 235   if (Len == 4 && !memcmp(IdentStart, "code", 4)) return tgtok::Code;
 236   if (Len == 3 && !memcmp(IdentStart, "dag", 3)) return tgtok::Dag;
 237
 238   if (Len == 5 && !memcmp(IdentStart, "class", 5)) return tgtok::Class;
 239   if (Len == 3 && !memcmp(IdentStart, "def", 3)) return tgtok::Def;
 240   if (Len == 4 && !memcmp(IdentStart, "defm", 4)) return tgtok::Defm;
 241   if (Len == 10 && !memcmp(IdentStart, "multiclass", 10))
 242     return tgtok::MultiClass;
 243   if (Len == 5 && !memcmp(IdentStart, "field", 5)) return tgtok::Field;
 244   if (Len == 3 && !memcmp(IdentStart, "let", 3)) return tgtok::Let;
 245   if (Len == 2 && !memcmp(IdentStart, "in", 2)) return tgtok::In;
 246
 247   if (Len == 7 && !memcmp(IdentStart, "include", 7)) {
 248     if (LexInclude()) return tgtok::Error;
 249     return Lex();
 250   }
 251
 252   CurStrVal.assign(IdentStart, CurPtr);
 253   return tgtok::Id;
 254 }
 255
 256 /// LexInclude - We just read the "include" token.  Get the string token that
 257 /// comes next and enter the include.
 258 bool TGLexer::LexInclude() {
 259   // The token after the include must be a string.
 260   tgtok::TokKind Tok = LexToken();
 261   if (Tok == tgtok::Error) return true;
 262   if (Tok != tgtok::StrVal) {
 263     PrintError(getLoc(), "Expected filename after include");
 264     return true;
 265   }
 266
 267   // Get the string.
 268   std::string Filename = CurStrVal;
 269
 270
 271   CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr));
 272   if (CurBuffer == -1) {
 273     PrintError(getLoc(), "Could not find include file '" + Filename + "'");
 274     return true;
 275   }
 276
 277   // Save the line number and lex buffer of the includer.
 278   CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
 279   CurPtr = CurBuf->getBufferStart();
 280   return false;
 281 }
 282
 283 void TGLexer::SkipBCPLComment() {
 284   ++CurPtr;  // skip the second slash.
 285   while (1) {
 286     switch (*CurPtr) {
 287     case '\n':
 288     case '\r':
 289       return;  // Newline is end of comment.
 290     case 0:
 291       // If this is the end of the buffer, end the comment.
 292       if (CurPtr == CurBuf->getBufferEnd())
 293         return;
 294       break;
 295     }
 296     // Otherwise, skip the character.
 297     ++CurPtr;
 298   }
 299 }
 300
 301 /// SkipCComment - This skips C-style /**/ comments.  The only difference from C
 302 /// is that we allow nesting.
 303 bool TGLexer::SkipCComment() {
 304   ++CurPtr;  // skip the star.
 305   unsigned CommentDepth = 1;
 306
 307   while (1) {
 308     int CurChar = getNextChar();
 309     switch (CurChar) {
 310     case EOF:
 311       PrintError(TokStart, "Unterminated comment!");
 312       return true;
 313     case '*':
 314       // End of the comment?
 315       if (CurPtr[0] != '/') break;
 316
 317       ++CurPtr;   // End the */.
 318       if (--CommentDepth == 0)
 319         return false;
 320       break;
 321     case '/':
 322       // Start of a nested comment?
 323       if (CurPtr[0] != '*') break;
 324       ++CurPtr;
 325       ++CommentDepth;
 326       break;
 327     }
 328   }
 329 }
 330
 331 /// LexNumber - Lex:
 332 ///    [-+]?[0-9]+
 333 ///    0x[0-9a-fA-F]+
 334 ///    0b[01]+
 335 tgtok::TokKind TGLexer::LexNumber() {
 336   if (CurPtr[-1] == '0') {
 337     if (CurPtr[0] == 'x') {
 338       ++CurPtr;
 339       const char *NumStart = CurPtr;
 340       while (isxdigit(CurPtr[0]))
 341         ++CurPtr;
 342
 343       // Requires at least one hex digit.
 344       if (CurPtr == NumStart)
 345         return ReturnError(TokStart, "Invalid hexadecimal number");
 346
 347       errno = 0;
 348       CurIntVal = strtoll(NumStart, 0, 16);
 349       if (errno == EINVAL)
 350         return ReturnError(TokStart, "Invalid hexadecimal number");
 351       if (errno == ERANGE) {
 352         errno = 0;
 353         CurIntVal = (int64_t)strtoull(NumStart, 0, 16);
 354         if (errno == EINVAL)
 355           return ReturnError(TokStart, "Invalid hexadecimal number");
 356         if (errno == ERANGE)
 357           return ReturnError(TokStart, "Hexadecimal number out of range");
 358       }
 359       return tgtok::IntVal;
 360     } else if (CurPtr[0] == 'b') {
 361       ++CurPtr;
 362       const char *NumStart = CurPtr;
 363       while (CurPtr[0] == '0' || CurPtr[0] == '1')
 364         ++CurPtr;
 365
 366       // Requires at least one binary digit.
 367       if (CurPtr == NumStart)
 368         return ReturnError(CurPtr-2, "Invalid binary number");
 369       CurIntVal = strtoll(NumStart, 0, 2);
 370       return tgtok::IntVal;
 371     }
 372   }
 373
 374   // Check for a sign without a digit.
 375   if (!isdigit(CurPtr[0])) {
 376     if (CurPtr[-1] == '-')
 377       return tgtok::minus;
 378     else if (CurPtr[-1] == '+')
 379       return tgtok::plus;
 380   }
 381
 382   while (isdigit(CurPtr[0]))
 383     ++CurPtr;
 384   CurIntVal = strtoll(TokStart, 0, 10);
 385   return tgtok::IntVal;
 386 }
 387
 388 /// LexBracket - We just read '['.  If this is a code block, return it,
 389 /// otherwise return the bracket.  Match: '[' and '[{ ( [^}]+ | }[^]] )* }]'
 390 tgtok::TokKind TGLexer::LexBracket() {
 391   if (CurPtr[0] != '{')
 392     return tgtok::l_square;
 393   ++CurPtr;
 394   const char *CodeStart = CurPtr;
 395   while (1) {
 396     int Char = getNextChar();
 397     if (Char == EOF) break;
 398
 399     if (Char != '}') continue;
 400
 401     Char = getNextChar();
 402     if (Char == EOF) break;
 403     if (Char == ']') {
 404       CurStrVal.assign(CodeStart, CurPtr-2);
 405       return tgtok::CodeFragment;
 406     }
 407   }
 408
 409   return ReturnError(CodeStart-2, "Unterminated Code Block");
 410 }
 411
 412 /// LexExclaim - Lex '!' and '![a-zA-Z]+'.
 413 tgtok::TokKind TGLexer::LexExclaim() {
 414   if (!isalpha(*CurPtr))
 415     return ReturnError(CurPtr - 1, "Invalid \"!operator\"");
 416
 417   const char *Start = CurPtr++;
 418   while (isalpha(*CurPtr))
 419     ++CurPtr;
 420
 421   // Check to see which operator this is.
 422   switch (CurPtr - Start) {
 423   default:
 424     break;
 425   case 2:
 426     if (!memcmp(Start, "eq", 2)) return tgtok::XEq;
 427     if (!memcmp(Start, "if", 2)) return tgtok::XIf;
 428     break;
 429   case 3:
 430     if (!memcmp(Start, "car", 3)) return tgtok::XCar;
 431     if (!memcmp(Start, "cdr", 3)) return tgtok::XCdr;
 432     if (!memcmp(Start, "con", 3)) return tgtok::XConcat;
 433     if (!memcmp(Start, "shl", 3)) return tgtok::XSHL;
 434     if (!memcmp(Start, "sra", 3)) return tgtok::XSRA;
 435     if (!memcmp(Start, "srl", 3)) return tgtok::XSRL;
 436     break;
 437   case 4:
 438     if (!memcmp(Start, "cast", 4)) return tgtok::XCast;
 439     if (!memcmp(Start, "null", 4)) return tgtok::XNull;
 440     break;
 441   case 5:
 442     if (!memcmp(Start, "subst", 5)) return tgtok::XSubst;
 443     break;
 444   case 7:
 445     if (!memcmp(Start, "foreach", 7)) return tgtok::XForEach;
 446     break;
 447   case 9:
 448     if (!memcmp(Start, "strconcat", 9)) return tgtok::XStrConcat;
 449     break;
 450   }
 451
 452   return ReturnError(Start - 1, "Unknown operator");
 453 }
 454