utils/TableGen/TGLexer.cpp

   1 //===- TGLexer.cpp - Lexer for TableGen -----------------------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // Implement the Lexer for TableGen.
  11 //
  12 //===----------------------------------------------------------------------===//
  13
  14 #include "TGLexer.h"
  15 #include "llvm/Support/SourceMgr.h"
  16 #include "llvm/Support/Streams.h"
  17 #include "llvm/Support/MemoryBuffer.h"
  18 #include "llvm/Config/config.h"
  19 #include <cctype>
  20 #include <cstdio>
  21 #include <cstdlib>
  22 #include <cstring>
  23 #include <cerrno>
  24 using namespace llvm;
  25
  26 TGLexer::TGLexer(SourceMgr &SM) : SrcMgr(SM) {
  27   CurBuffer = 0;
  28   CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
  29   CurPtr = CurBuf->getBufferStart();
  30   TokStart = 0;
  31 }
  32
  33 SMLoc TGLexer::getLoc() const {
  34   return SMLoc::getFromPointer(TokStart);
  35 }
  36
  37
  38 /// ReturnError - Set the error to the specified string at the specified
  39 /// location.  This is defined to always return tgtok::Error.
  40 tgtok::TokKind TGLexer::ReturnError(const char *Loc, const std::string &Msg) {
  41   PrintError(Loc, Msg);
  42   return tgtok::Error;
  43 }
  44
  45
  46 void TGLexer::PrintError(const char *Loc, const std::string &Msg) const {
  47   SrcMgr.PrintMessage(SMLoc::getFromPointer(Loc), Msg, "error");
  48 }
  49
  50 void TGLexer::PrintError(SMLoc Loc, const std::string &Msg) const {
  51   SrcMgr.PrintMessage(Loc, Msg, "error");
  52 }
  53
  54
  55 int TGLexer::getNextChar() {
  56   char CurChar = *CurPtr++;
  57   switch (CurChar) {
  58   default:
  59     return (unsigned char)CurChar;
  60   case 0: {
  61     // A nul character in the stream is either the end of the current buffer or
  62     // a random nul in the file.  Disambiguate that here.
  63     if (CurPtr-1 != CurBuf->getBufferEnd())
  64       return 0;  // Just whitespace.
  65
  66     // If this is the end of an included file, pop the parent file off the
  67     // include stack.
  68     SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
  69     if (ParentIncludeLoc != SMLoc()) {
  70       CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc);
  71       CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
  72       CurPtr = ParentIncludeLoc.getPointer();
  73       return getNextChar();
  74     }
  75
  76     // Otherwise, return end of file.
  77     --CurPtr;  // Another call to lex will return EOF again.
  78     return EOF;
  79   }
  80   case '\n':
  81   case '\r':
  82     // Handle the newline character by ignoring it and incrementing the line
  83     // count.  However, be careful about 'dos style' files with \n\r in them.
  84     // Only treat a \n\r or \r\n as a single line.
  85     if ((*CurPtr == '\n' || (*CurPtr == '\r')) &&
  86         *CurPtr != CurChar)
  87       ++CurPtr;  // Eat the two char newline sequence.
  88     return '\n';
  89   }
  90 }
  91
  92 tgtok::TokKind TGLexer::LexToken() {
  93   TokStart = CurPtr;
  94   // This always consumes at least one character.
  95   int CurChar = getNextChar();
  96
  97   switch (CurChar) {
  98   default:
  99     // Handle letters: [a-zA-Z_]
 100     if (isalpha(CurChar) || CurChar == '_' || CurChar == '#')
 101       return LexIdentifier();
 102
 103     // Unknown character, emit an error.
 104     return ReturnError(TokStart, "Unexpected character");
 105   case EOF: return tgtok::Eof;
 106   case ':': return tgtok::colon;
 107   case ';': return tgtok::semi;
 108   case '.': return tgtok::period;
 109   case ',': return tgtok::comma;
 110   case '<': return tgtok::less;
 111   case '>': return tgtok::greater;
 112   case ']': return tgtok::r_square;
 113   case '{': return tgtok::l_brace;
 114   case '}': return tgtok::r_brace;
 115   case '(': return tgtok::l_paren;
 116   case ')': return tgtok::r_paren;
 117   case '=': return tgtok::equal;
 118   case '?': return tgtok::question;
 119
 120   case 0:
 121   case ' ':
 122   case '\t':
 123   case '\n':
 124   case '\r':
 125     // Ignore whitespace.
 126     return LexToken();
 127   case '/':
 128     // If this is the start of a // comment, skip until the end of the line or
 129     // the end of the buffer.
 130     if (*CurPtr == '/')
 131       SkipBCPLComment();
 132     else if (*CurPtr == '*') {
 133       if (SkipCComment())
 134         return tgtok::Error;
 135     } else // Otherwise, this is an error.
 136       return ReturnError(TokStart, "Unexpected character");
 137     return LexToken();
 138   case '-': case '+':
 139   case '0': case '1': case '2': case '3': case '4': case '5': case '6':
 140   case '7': case '8': case '9':
 141     return LexNumber();
 142   case '"': return LexString();
 143   case '$': return LexVarName();
 144   case '[': return LexBracket();
 145   case '!': return LexExclaim();
 146   }
 147 }
 148
 149 /// LexString - Lex "[^"]*"
 150 tgtok::TokKind TGLexer::LexString() {
 151   const char *StrStart = CurPtr;
 152
 153   CurStrVal = "";
 154
 155   while (*CurPtr != '"') {
 156     // If we hit the end of the buffer, report an error.
 157     if (*CurPtr == 0 && CurPtr == CurBuf->getBufferEnd())
 158       return ReturnError(StrStart, "End of file in string literal");
 159
 160     if (*CurPtr == '\n' || *CurPtr == '\r')
 161       return ReturnError(StrStart, "End of line in string literal");
 162
 163     if (*CurPtr != '\\') {
 164       CurStrVal += *CurPtr++;
 165       continue;
 166     }
 167
 168     ++CurPtr;
 169
 170     switch (*CurPtr) {
 171     case '\\': case '\'': case '"':
 172       // These turn into their literal character.
 173       CurStrVal += *CurPtr++;
 174       break;
 175     case 't':
 176       CurStrVal += '\t';
 177       ++CurPtr;
 178       break;
 179     case 'n':
 180       CurStrVal += '\n';
 181       ++CurPtr;
 182       break;
 183
 184     case '\n':
 185     case '\r':
 186       return ReturnError(CurPtr, "escaped newlines not supported in tblgen");
 187
 188     // If we hit the end of the buffer, report an error.
 189     case '\0':
 190       if (CurPtr == CurBuf->getBufferEnd())
 191         return ReturnError(StrStart, "End of file in string literal");
 192       // FALL THROUGH
 193     default:
 194       return ReturnError(CurPtr, "invalid escape in string literal");
 195     }
 196   }
 197
 198   ++CurPtr;
 199   return tgtok::StrVal;
 200 }
 201
 202 tgtok::TokKind TGLexer::LexVarName() {
 203   if (!isalpha(CurPtr[0]) && CurPtr[0] != '_')
 204     return ReturnError(TokStart, "Invalid variable name");
 205
 206   // Otherwise, we're ok, consume the rest of the characters.
 207   const char *VarNameStart = CurPtr++;
 208
 209   while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
 210     ++CurPtr;
 211
 212   CurStrVal.assign(VarNameStart, CurPtr);
 213   return tgtok::VarName;
 214 }
 215
 216
 217 tgtok::TokKind TGLexer::LexIdentifier() {
 218   // The first letter is [a-zA-Z_].
 219   const char *IdentStart = TokStart;
 220
 221   // Match the rest of the identifier regex: [0-9a-zA-Z_]*
 222   while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_'
 223          || *CurPtr == '#') {
 224     // If this contains a '#', make sure it's value
 225     if (*CurPtr == '#') {
 226       if (strncmp(CurPtr, "#NAME#", 6) != 0) {
 227         return tgtok::Error;
 228       }
 229       CurPtr += 6;
 230     }
 231     else {
 232       ++CurPtr;
 233     }
 234   }
 235
 236
 237   // Check to see if this identifier is a keyword.
 238   unsigned Len = CurPtr-IdentStart;
 239
 240   if (Len == 3 && !memcmp(IdentStart, "int", 3)) return tgtok::Int;
 241   if (Len == 3 && !memcmp(IdentStart, "bit", 3)) return tgtok::Bit;
 242   if (Len == 4 && !memcmp(IdentStart, "bits", 4)) return tgtok::Bits;
 243   if (Len == 6 && !memcmp(IdentStart, "string", 6)) return tgtok::String;
 244   if (Len == 4 && !memcmp(IdentStart, "list", 4)) return tgtok::List;
 245   if (Len == 4 && !memcmp(IdentStart, "code", 4)) return tgtok::Code;
 246   if (Len == 3 && !memcmp(IdentStart, "dag", 3)) return tgtok::Dag;
 247
 248   if (Len == 5 && !memcmp(IdentStart, "class", 5)) return tgtok::Class;
 249   if (Len == 3 && !memcmp(IdentStart, "def", 3)) return tgtok::Def;
 250   if (Len == 4 && !memcmp(IdentStart, "defm", 4)) return tgtok::Defm;
 251   if (Len == 10 && !memcmp(IdentStart, "multiclass", 10))
 252     return tgtok::MultiClass;
 253   if (Len == 5 && !memcmp(IdentStart, "field", 5)) return tgtok::Field;
 254   if (Len == 3 && !memcmp(IdentStart, "let", 3)) return tgtok::Let;
 255   if (Len == 2 && !memcmp(IdentStart, "in", 2)) return tgtok::In;
 256
 257   if (Len == 7 && !memcmp(IdentStart, "include", 7)) {
 258     if (LexInclude()) return tgtok::Error;
 259     return Lex();
 260   }
 261
 262   CurStrVal.assign(IdentStart, CurPtr);
 263   return tgtok::Id;
 264 }
 265
 266 /// LexInclude - We just read the "include" token.  Get the string token that
 267 /// comes next and enter the include.
 268 bool TGLexer::LexInclude() {
 269   // The token after the include must be a string.
 270   tgtok::TokKind Tok = LexToken();
 271   if (Tok == tgtok::Error) return true;
 272   if (Tok != tgtok::StrVal) {
 273     PrintError(getLoc(), "Expected filename after include");
 274     return true;
 275   }
 276
 277   // Get the string.
 278   std::string Filename = CurStrVal;
 279
 280
 281   CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr));
 282   if (CurBuffer == -1) {
 283     PrintError(getLoc(), "Could not find include file '" + Filename + "'");
 284     return true;
 285   }
 286
 287   // Save the line number and lex buffer of the includer.
 288   CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
 289   CurPtr = CurBuf->getBufferStart();
 290   return false;
 291 }
 292
 293 void TGLexer::SkipBCPLComment() {
 294   ++CurPtr;  // skip the second slash.
 295   while (1) {
 296     switch (*CurPtr) {
 297     case '\n':
 298     case '\r':
 299       return;  // Newline is end of comment.
 300     case 0:
 301       // If this is the end of the buffer, end the comment.
 302       if (CurPtr == CurBuf->getBufferEnd())
 303         return;
 304       break;
 305     }
 306     // Otherwise, skip the character.
 307     ++CurPtr;
 308   }
 309 }
 310
 311 /// SkipCComment - This skips C-style /**/ comments.  The only difference from C
 312 /// is that we allow nesting.
 313 bool TGLexer::SkipCComment() {
 314   ++CurPtr;  // skip the star.
 315   unsigned CommentDepth = 1;
 316
 317   while (1) {
 318     int CurChar = getNextChar();
 319     switch (CurChar) {
 320     case EOF:
 321       PrintError(TokStart, "Unterminated comment!");
 322       return true;
 323     case '*':
 324       // End of the comment?
 325       if (CurPtr[0] != '/') break;
 326
 327       ++CurPtr;   // End the */.
 328       if (--CommentDepth == 0)
 329         return false;
 330       break;
 331     case '/':
 332       // Start of a nested comment?
 333       if (CurPtr[0] != '*') break;
 334       ++CurPtr;
 335       ++CommentDepth;
 336       break;
 337     }
 338   }
 339 }
 340
 341 /// LexNumber - Lex:
 342 ///    [-+]?[0-9]+
 343 ///    0x[0-9a-fA-F]+
 344 ///    0b[01]+
 345 tgtok::TokKind TGLexer::LexNumber() {
 346   if (CurPtr[-1] == '0') {
 347     if (CurPtr[0] == 'x') {
 348       ++CurPtr;
 349       const char *NumStart = CurPtr;
 350       while (isxdigit(CurPtr[0]))
 351         ++CurPtr;
 352
 353       // Requires at least one hex digit.
 354       if (CurPtr == NumStart)
 355         return ReturnError(TokStart, "Invalid hexadecimal number");
 356
 357       errno = 0;
 358       CurIntVal = strtoll(NumStart, 0, 16);
 359       if (errno == EINVAL)
 360         return ReturnError(TokStart, "Invalid hexadecimal number");
 361       if (errno == ERANGE) {
 362         errno = 0;
 363         CurIntVal = (int64_t)strtoull(NumStart, 0, 16);
 364         if (errno == EINVAL)
 365           return ReturnError(TokStart, "Invalid hexadecimal number");
 366         if (errno == ERANGE)
 367           return ReturnError(TokStart, "Hexadecimal number out of range");
 368       }
 369       return tgtok::IntVal;
 370     } else if (CurPtr[0] == 'b') {
 371       ++CurPtr;
 372       const char *NumStart = CurPtr;
 373       while (CurPtr[0] == '0' || CurPtr[0] == '1')
 374         ++CurPtr;
 375
 376       // Requires at least one binary digit.
 377       if (CurPtr == NumStart)
 378         return ReturnError(CurPtr-2, "Invalid binary number");
 379       CurIntVal = strtoll(NumStart, 0, 2);
 380       return tgtok::IntVal;
 381     }
 382   }
 383
 384   // Check for a sign without a digit.
 385   if (!isdigit(CurPtr[0])) {
 386     if (CurPtr[-1] == '-')
 387       return tgtok::minus;
 388     else if (CurPtr[-1] == '+')
 389       return tgtok::plus;
 390   }
 391
 392   while (isdigit(CurPtr[0]))
 393     ++CurPtr;
 394   CurIntVal = strtoll(TokStart, 0, 10);
 395   return tgtok::IntVal;
 396 }
 397
 398 /// LexBracket - We just read '['.  If this is a code block, return it,
 399 /// otherwise return the bracket.  Match: '[' and '[{ ( [^}]+ | }[^]] )* }]'
 400 tgtok::TokKind TGLexer::LexBracket() {
 401   if (CurPtr[0] != '{')
 402     return tgtok::l_square;
 403   ++CurPtr;
 404   const char *CodeStart = CurPtr;
 405   while (1) {
 406     int Char = getNextChar();
 407     if (Char == EOF) break;
 408
 409     if (Char != '}') continue;
 410
 411     Char = getNextChar();
 412     if (Char == EOF) break;
 413     if (Char == ']') {
 414       CurStrVal.assign(CodeStart, CurPtr-2);
 415       return tgtok::CodeFragment;
 416     }
 417   }
 418
 419   return ReturnError(CodeStart-2, "Unterminated Code Block");
 420 }
 421
 422 /// LexExclaim - Lex '!' and '![a-zA-Z]+'.
 423 tgtok::TokKind TGLexer::LexExclaim() {
 424   if (!isalpha(*CurPtr))
 425     return ReturnError(CurPtr-1, "Invalid \"!operator\"");
 426
 427   const char *Start = CurPtr++;
 428   while (isalpha(*CurPtr))
 429     ++CurPtr;
 430
 431   // Check to see which operator this is.
 432   unsigned Len = CurPtr-Start;
 433
 434   if (Len == 3  && !memcmp(Start, "con", 3)) return tgtok::XConcat;
 435   if (Len == 3  && !memcmp(Start, "sra", 3)) return tgtok::XSRA;
 436   if (Len == 3  && !memcmp(Start, "srl", 3)) return tgtok::XSRL;
 437   if (Len == 3  && !memcmp(Start, "shl", 3)) return tgtok::XSHL;
 438   if (Len == 9  && !memcmp(Start, "strconcat", 9))   return tgtok::XStrConcat;
 439   if (Len == 10 && !memcmp(Start, "nameconcat", 10)) return tgtok::XNameConcat;
 440   if (Len == 5 && !memcmp(Start, "subst", 5)) return tgtok::XSubst;
 441   if (Len == 7 && !memcmp(Start, "foreach", 7)) return tgtok::XForEach;
 442   if (Len == 4 && !memcmp(Start, "cast", 4)) return tgtok::XCast;
 443   if (Len == 3 && !memcmp(Start, "car", 3)) return tgtok::XCar;
 444   if (Len == 3 && !memcmp(Start, "cdr", 3)) return tgtok::XCdr;
 445   if (Len == 4 && !memcmp(Start, "null", 4)) return tgtok::XNull;
 446   if (Len == 2 && !memcmp(Start, "if", 2)) return tgtok::XIf;
 447
 448   return ReturnError(Start-1, "Unknown operator");
 449 }
 450