utils/TableGen/TGLexer.cpp

   1 //===- TGLexer.cpp - Lexer for TableGen -----------------------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // Implement the Lexer for TableGen.
  11 //
  12 //===----------------------------------------------------------------------===//
  13
  14 #include "TGLexer.h"
  15 #include "llvm/Support/Streams.h"
  16 #include "llvm/Support/MemoryBuffer.h"
  17 #include <ostream>
  18 #include "llvm/Config/config.h"
  19 #include <cctype>
  20 #include <cstdio>
  21 #include <cstdlib>
  22 #include <cstring>
  23 #include <cerrno>
  24 using namespace llvm;
  25
  26 TGLexer::TGLexer(MemoryBuffer *StartBuf) : CurLineNo(1), CurBuf(StartBuf) {
  27   CurPtr = CurBuf->getBufferStart();
  28   TokStart = 0;
  29 }
  30
  31 TGLexer::~TGLexer() {
  32   while (!IncludeStack.empty()) {
  33     delete IncludeStack.back().Buffer;
  34     IncludeStack.pop_back();
  35   }
  36   delete CurBuf;
  37 }
  38
  39 /// ReturnError - Set the error to the specified string at the specified
  40 /// location.  This is defined to always return tgtok::Error.
  41 tgtok::TokKind TGLexer::ReturnError(const char *Loc, const std::string &Msg) {
  42   PrintError(Loc, Msg);
  43   return tgtok::Error;
  44 }
  45
  46 void TGLexer::PrintIncludeStack(std::ostream &OS) const {
  47   for (unsigned i = 0, e = IncludeStack.size(); i != e; ++i)
  48     OS << "Included from " << IncludeStack[i].Buffer->getBufferIdentifier()
  49        << ":" << IncludeStack[i].LineNo << ":\n";
  50   OS << "Parsing " << CurBuf->getBufferIdentifier() << ":"
  51      << CurLineNo << ": ";
  52 }
  53
  54 /// PrintError - Print the error at the specified location.
  55 void TGLexer::PrintError(const char *ErrorLoc,  const std::string &Msg) const {
  56   PrintIncludeStack(*cerr.stream());
  57   cerr << Msg << "\n";
  58   assert(ErrorLoc && "Location not specified!");
  59
  60   // Scan backward to find the start of the line.
  61   const char *LineStart = ErrorLoc;
  62   while (LineStart != CurBuf->getBufferStart() &&
  63          LineStart[-1] != '\n' && LineStart[-1] != '\r')
  64     --LineStart;
  65   // Get the end of the line.
  66   const char *LineEnd = ErrorLoc;
  67   while (LineEnd != CurBuf->getBufferEnd() &&
  68          LineEnd[0] != '\n' && LineEnd[0] != '\r')
  69     ++LineEnd;
  70   // Print out the line.
  71   cerr << std::string(LineStart, LineEnd) << "\n";
  72   // Print out spaces before the carat.
  73   for (const char *Pos = LineStart; Pos != ErrorLoc; ++Pos)
  74     cerr << (*Pos == '\t' ? '\t' : ' ');
  75   cerr << "^\n";
  76 }
  77
  78 int TGLexer::getNextChar() {
  79   char CurChar = *CurPtr++;
  80   switch (CurChar) {
  81   default:
  82     return (unsigned char)CurChar;
  83   case 0:
  84     // A nul character in the stream is either the end of the current buffer or
  85     // a random nul in the file.  Disambiguate that here.
  86     if (CurPtr-1 != CurBuf->getBufferEnd())
  87       return 0;  // Just whitespace.
  88
  89     // If this is the end of an included file, pop the parent file off the
  90     // include stack.
  91     if (!IncludeStack.empty()) {
  92       delete CurBuf;
  93       CurBuf = IncludeStack.back().Buffer;
  94       CurLineNo = IncludeStack.back().LineNo;
  95       CurPtr = IncludeStack.back().CurPtr;
  96       IncludeStack.pop_back();
  97       return getNextChar();
  98     }
  99
 100     // Otherwise, return end of file.
 101     --CurPtr;  // Another call to lex will return EOF again.
 102     return EOF;
 103   case '\n':
 104   case '\r':
 105     // Handle the newline character by ignoring it and incrementing the line
 106     // count.  However, be careful about 'dos style' files with \n\r in them.
 107     // Only treat a \n\r or \r\n as a single line.
 108     if ((*CurPtr == '\n' || (*CurPtr == '\r')) &&
 109         *CurPtr != CurChar)
 110       ++CurPtr;  // Eat the two char newline sequence.
 111
 112     ++CurLineNo;
 113     return '\n';
 114   }
 115 }
 116
 117 tgtok::TokKind TGLexer::LexToken() {
 118   TokStart = CurPtr;
 119   // This always consumes at least one character.
 120   int CurChar = getNextChar();
 121
 122   switch (CurChar) {
 123   default:
 124     // Handle letters: [a-zA-Z_]
 125     if (isalpha(CurChar) || CurChar == '_')
 126       return LexIdentifier();
 127
 128     // Unknown character, emit an error.
 129     return ReturnError(TokStart, "Unexpected character");
 130   case EOF: return tgtok::Eof;
 131   case ':': return tgtok::colon;
 132   case ';': return tgtok::semi;
 133   case '.': return tgtok::period;
 134   case ',': return tgtok::comma;
 135   case '<': return tgtok::less;
 136   case '>': return tgtok::greater;
 137   case ']': return tgtok::r_square;
 138   case '{': return tgtok::l_brace;
 139   case '}': return tgtok::r_brace;
 140   case '(': return tgtok::l_paren;
 141   case ')': return tgtok::r_paren;
 142   case '=': return tgtok::equal;
 143   case '?': return tgtok::question;
 144
 145   case 0:
 146   case ' ':
 147   case '\t':
 148   case '\n':
 149   case '\r':
 150     // Ignore whitespace.
 151     return LexToken();
 152   case '/':
 153     // If this is the start of a // comment, skip until the end of the line or
 154     // the end of the buffer.
 155     if (*CurPtr == '/')
 156       SkipBCPLComment();
 157     else if (*CurPtr == '*') {
 158       if (SkipCComment())
 159         return tgtok::Error;
 160     } else // Otherwise, this is an error.
 161       return ReturnError(TokStart, "Unexpected character");
 162     return LexToken();
 163   case '-': case '+':
 164   case '0': case '1': case '2': case '3': case '4': case '5': case '6':
 165   case '7': case '8': case '9':
 166     return LexNumber();
 167   case '"': return LexString();
 168   case '$': return LexVarName();
 169   case '[': return LexBracket();
 170   case '!': return LexExclaim();
 171   }
 172 }
 173
 174 /// LexString - Lex "[^"]*"
 175 tgtok::TokKind TGLexer::LexString() {
 176   const char *StrStart = CurPtr;
 177
 178   while (*CurPtr != '"') {
 179     // If we hit the end of the buffer, report an error.
 180     if (*CurPtr == 0 && CurPtr == CurBuf->getBufferEnd())
 181       return ReturnError(StrStart, "End of file in string literal");
 182
 183     if (*CurPtr == '\n' || *CurPtr == '\r')
 184       return ReturnError(StrStart, "End of line in string literal");
 185
 186     ++CurPtr;
 187   }
 188
 189   CurStrVal.assign(StrStart, CurPtr);
 190   ++CurPtr;
 191   return tgtok::StrVal;
 192 }
 193
 194 tgtok::TokKind TGLexer::LexVarName() {
 195   if (!isalpha(CurPtr[0]) && CurPtr[0] != '_')
 196     return ReturnError(TokStart, "Invalid variable name");
 197
 198   // Otherwise, we're ok, consume the rest of the characters.
 199   const char *VarNameStart = CurPtr++;
 200
 201   while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
 202     ++CurPtr;
 203
 204   CurStrVal.assign(VarNameStart, CurPtr);
 205   return tgtok::VarName;
 206 }
 207
 208
 209 tgtok::TokKind TGLexer::LexIdentifier() {
 210   // The first letter is [a-zA-Z_].
 211   const char *IdentStart = TokStart;
 212
 213   // Match the rest of the identifier regex: [0-9a-zA-Z_]*
 214   while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
 215     ++CurPtr;
 216
 217   // Check to see if this identifier is a keyword.
 218   unsigned Len = CurPtr-IdentStart;
 219
 220   if (Len == 3 && !memcmp(IdentStart, "int", 3)) return tgtok::Int;
 221   if (Len == 3 && !memcmp(IdentStart, "bit", 3)) return tgtok::Bit;
 222   if (Len == 4 && !memcmp(IdentStart, "bits", 4)) return tgtok::Bits;
 223   if (Len == 6 && !memcmp(IdentStart, "string", 6)) return tgtok::String;
 224   if (Len == 4 && !memcmp(IdentStart, "list", 4)) return tgtok::List;
 225   if (Len == 4 && !memcmp(IdentStart, "code", 4)) return tgtok::Code;
 226   if (Len == 3 && !memcmp(IdentStart, "dag", 3)) return tgtok::Dag;
 227
 228   if (Len == 5 && !memcmp(IdentStart, "class", 5)) return tgtok::Class;
 229   if (Len == 3 && !memcmp(IdentStart, "def", 3)) return tgtok::Def;
 230   if (Len == 4 && !memcmp(IdentStart, "defm", 4)) return tgtok::Defm;
 231   if (Len == 10 && !memcmp(IdentStart, "multiclass", 10))
 232     return tgtok::MultiClass;
 233   if (Len == 5 && !memcmp(IdentStart, "field", 5)) return tgtok::Field;
 234   if (Len == 3 && !memcmp(IdentStart, "let", 3)) return tgtok::Let;
 235   if (Len == 2 && !memcmp(IdentStart, "in", 2)) return tgtok::In;
 236
 237   if (Len == 7 && !memcmp(IdentStart, "include", 7)) {
 238     if (LexInclude()) return tgtok::Error;
 239     return Lex();
 240   }
 241
 242   CurStrVal.assign(IdentStart, CurPtr);
 243   return tgtok::Id;
 244 }
 245
 246 /// LexInclude - We just read the "include" token.  Get the string token that
 247 /// comes next and enter the include.
 248 bool TGLexer::LexInclude() {
 249   // The token after the include must be a string.
 250   tgtok::TokKind Tok = LexToken();
 251   if (Tok == tgtok::Error) return true;
 252   if (Tok != tgtok::StrVal) {
 253     PrintError(getLoc(), "Expected filename after include");
 254     return true;
 255   }
 256
 257   // Get the string.
 258   std::string Filename = CurStrVal;
 259
 260   // Try to find the file.
 261   MemoryBuffer *NewBuf = MemoryBuffer::getFile(Filename.c_str());
 262
 263   // If the file didn't exist directly, see if it's in an include path.
 264   for (unsigned i = 0, e = IncludeDirectories.size(); i != e && !NewBuf; ++i) {
 265     std::string IncFile = IncludeDirectories[i] + "/" + Filename;
 266     NewBuf = MemoryBuffer::getFile(IncFile.c_str());
 267   }
 268
 269   if (NewBuf == 0) {
 270     PrintError(getLoc(), "Could not find include file '" + Filename + "'");
 271     return true;
 272   }
 273
 274   // Save the line number and lex buffer of the includer.
 275   IncludeStack.push_back(IncludeRec(CurBuf, CurPtr, CurLineNo));
 276
 277   CurLineNo = 1;  // Reset line numbering.
 278   CurBuf = NewBuf;
 279   CurPtr = CurBuf->getBufferStart();
 280   return false;
 281 }
 282
 283 void TGLexer::SkipBCPLComment() {
 284   ++CurPtr;  // skip the second slash.
 285   while (1) {
 286     switch (*CurPtr) {
 287     case '\n':
 288     case '\r':
 289       return;  // Newline is end of comment.
 290     case 0:
 291       // If this is the end of the buffer, end the comment.
 292       if (CurPtr == CurBuf->getBufferEnd())
 293         return;
 294       break;
 295     }
 296     // Otherwise, skip the character.
 297     ++CurPtr;
 298   }
 299 }
 300
 301 /// SkipCComment - This skips C-style /**/ comments.  The only difference from C
 302 /// is that we allow nesting.
 303 bool TGLexer::SkipCComment() {
 304   ++CurPtr;  // skip the star.
 305   unsigned CommentDepth = 1;
 306
 307   while (1) {
 308     int CurChar = getNextChar();
 309     switch (CurChar) {
 310     case EOF:
 311       PrintError(TokStart, "Unterminated comment!");
 312       return true;
 313     case '*':
 314       // End of the comment?
 315       if (CurPtr[0] != '/') break;
 316
 317       ++CurPtr;   // End the */.
 318       if (--CommentDepth == 0)
 319         return false;
 320       break;
 321     case '/':
 322       // Start of a nested comment?
 323       if (CurPtr[0] != '*') break;
 324       ++CurPtr;
 325       ++CommentDepth;
 326       break;
 327     }
 328   }
 329 }
 330
 331 /// LexNumber - Lex:
 332 ///    [-+]?[0-9]+
 333 ///    0x[0-9a-fA-F]+
 334 ///    0b[01]+
 335 tgtok::TokKind TGLexer::LexNumber() {
 336   if (CurPtr[-1] == '0') {
 337     if (CurPtr[0] == 'x') {
 338       ++CurPtr;
 339       const char *NumStart = CurPtr;
 340       while (isxdigit(CurPtr[0]))
 341         ++CurPtr;
 342
 343       // Requires at least one hex digit.
 344       if (CurPtr == NumStart)
 345         return ReturnError(CurPtr-2, "Invalid hexadecimal number");
 346
 347       errno = 0;
 348       CurIntVal = strtoll(NumStart, 0, 16);
 349       if (errno == EINVAL)
 350         return ReturnError(CurPtr-2, "Invalid hexadecimal number");
 351       if (errno == ERANGE) {
 352         errno = 0;
 353         CurIntVal = (int64_t)strtoull(NumStart, 0, 16);
 354         if (errno == EINVAL)
 355           return ReturnError(CurPtr-2, "Invalid hexadecimal number");
 356         if (errno == ERANGE)
 357           return ReturnError(CurPtr-2, "Hexadecimal number out of range");
 358       }
 359       return tgtok::IntVal;
 360     } else if (CurPtr[0] == 'b') {
 361       ++CurPtr;
 362       const char *NumStart = CurPtr;
 363       while (CurPtr[0] == '0' || CurPtr[0] == '1')
 364         ++CurPtr;
 365
 366       // Requires at least one binary digit.
 367       if (CurPtr == NumStart)
 368         return ReturnError(CurPtr-2, "Invalid binary number");
 369       CurIntVal = strtoll(NumStart, 0, 2);
 370       return tgtok::IntVal;
 371     }
 372   }
 373
 374   // Check for a sign without a digit.
 375   if (!isdigit(CurPtr[0])) {
 376     if (CurPtr[-1] == '-')
 377       return tgtok::minus;
 378     else if (CurPtr[-1] == '+')
 379       return tgtok::plus;
 380   }
 381
 382   while (isdigit(CurPtr[0]))
 383     ++CurPtr;
 384   CurIntVal = strtoll(TokStart, 0, 10);
 385   return tgtok::IntVal;
 386 }
 387
 388 /// LexBracket - We just read '['.  If this is a code block, return it,
 389 /// otherwise return the bracket.  Match: '[' and '[{ ( [^}]+ | }[^]] )* }]'
 390 tgtok::TokKind TGLexer::LexBracket() {
 391   if (CurPtr[0] != '{')
 392     return tgtok::l_square;
 393   ++CurPtr;
 394   const char *CodeStart = CurPtr;
 395   while (1) {
 396     int Char = getNextChar();
 397     if (Char == EOF) break;
 398
 399     if (Char != '}') continue;
 400
 401     Char = getNextChar();
 402     if (Char == EOF) break;
 403     if (Char == ']') {
 404       CurStrVal.assign(CodeStart, CurPtr-2);
 405       return tgtok::CodeFragment;
 406     }
 407   }
 408
 409   return ReturnError(CodeStart-2, "Unterminated Code Block");
 410 }
 411
 412 /// LexExclaim - Lex '!' and '![a-zA-Z]+'.
 413 tgtok::TokKind TGLexer::LexExclaim() {
 414   if (!isalpha(*CurPtr))
 415     return ReturnError(CurPtr-1, "Invalid \"!operator\"");
 416
 417   const char *Start = CurPtr++;
 418   while (isalpha(*CurPtr))
 419     ++CurPtr;
 420
 421   // Check to see which operator this is.
 422   unsigned Len = CurPtr-Start;
 423
 424   if (Len == 3 && !memcmp(Start, "con", 3)) return tgtok::XConcat;
 425   if (Len == 3 && !memcmp(Start, "sra", 3)) return tgtok::XSRA;
 426   if (Len == 3 && !memcmp(Start, "srl", 3)) return tgtok::XSRL;
 427   if (Len == 3 && !memcmp(Start, "shl", 3)) return tgtok::XSHL;
 428   if (Len == 9 && !memcmp(Start, "strconcat", 9)) return tgtok::XStrConcat;
 429
 430   return ReturnError(Start-1, "Unknown operator");
 431 }
 432