module source.lexbase;

/**
 * Generic lexer machinery, designed to be mixed into a concrete lexer.
 *
 * Token is the token type to produce. BaseMap, KeywordMap and OperatorMap
 * map string prefixes to the member function that lexes them; they are
 * compiled into a trie of nested switch statements by lexerMixin (at the
 * bottom of this module), so dispatch happens one character at a time
 * with no runtime table lookup.
 */
mixin template LexBaseImpl(Token, alias BaseMap, alias KeywordMap,
                           alias OperatorMap) {
	// TODO: We shouldn't let consumer play with the internal state of the lexer.
	// Instead, we should provide accessor to useful members.
	// private:

	// The most recently lexed token, returned by front.
	Token t;

	import source.location;
	Position previous;
	Position base;

	// Byte offset of the next character to read within content.
	uint index;

	import std.bitmanip;
	mixin(bitfields!(
		bool, "tokenizeComments", 1,
		bool, "_skipStrings", 1,
		uint, "__derived", 30,
	));

	import source.context;
	Context context;

	// The text being lexed.
	string content;

	alias TokenRange = typeof(this);
	alias TokenType = typeof(Token.init.type);

	/**
	 * Return a copy of this lexer that emits (wc = true) or skips
	 * comment tokens.
	 */
	auto withComments(bool wc = true) {
		auto r = this.save;
		r.tokenizeComments = wc;
		return r;
	}

	@property
	bool decodeStrings() const {
		return !_skipStrings;
	}

	/**
	 * Return a copy of this lexer that decodes (sd = true) or skips
	 * over string literals.
	 */
	auto withStringDecoding(bool sd = true) {
		auto r = this.save;
		r._skipStrings = !sd;
		return r;
	}

	/**
	 * Return a copy of this lexer that:
	 * - skip over comments.
	 * - do not decode strings.
	 */
	auto getLookahead() {
		return withStringDecoding(false).withComments(false);
	}

	@property
	auto front() inout {
		return t;
	}

	void popFront() in {
		assert(front.type != TokenType.End);
	} do {
		previous = t.location.stop;
		t = getNextToken();

		/+
		// Exprerience the token deluge !
		if (t.type != TokenType.End) {
			import util.terminal, std.conv;
			outputCaretDiagnostics(
				t.location.getFullLocation(context),
				to!string(t.type),
			);
		}
		// +/
	}

	/**
	 * Fast forward this lexer to the position of fr, which must be a
	 * lookahead copy of this very lexer that is further along in content.
	 */
	void moveTo(ref TokenRange fr) in {
		assert(base is fr.base);
		assert(context is fr.context);
		assert(content is fr.content);
		assert(index < fr.index);
	} do {
		index = fr.index;
		t = fr.t;
	}

	@property
	auto save() inout {
		return this;
	}

	@property
	bool empty() const {
		return t.type == TokenType.End;
	}

private:
	// Whitespace, including the unicode line and paragraph separators.
	enum Skippable = [" ", "\t", "\v", "\f", "\n", "\r", "\u2028", "\u2029"];

	auto getNextToken() {
		// Build the dispatch map: whitespace is skipped in place while
		// keywords and operators route to their dedicated routines.
		static getLexerMap() {
			auto ret = BaseMap;

			foreach (op; Skippable) {
				ret[op] = "-skip";
			}

			foreach (kw, _; KeywordMap) {
				ret[kw] = "lexKeyword";
			}

			foreach (op, _; OperatorMap) {
				ret[op] = "lexOperator";
			}

			return ret;
		}

		while (true) {
			import source.lexbase;
			// pragma(msg, typeof(this));
			// pragma(msg, lexerMixin(getLexerMap()));
			mixin(lexerMixin(getLexerMap()));
		}

		// Necessary because of https://issues.dlang.org/show_bug.cgi?id=22688
		assert(0);
	}

	/**
	 * Build an Invalid token spanning [begin, index) carrying message.
	 */
	Token getError(uint begin, string message) {
		Token t;
		t.type = TokenType.Invalid;
		t.name = context.getName(message);
		t.location = base.getWithOffsets(begin, index);
		return t;
	}

	void popChar() in {
		assert(index < content.length);
	} do {
		index++;
	}

	void unpopChar() in {
		assert(index > 1);
	} do {
		index--;
	}

	void popSkippableChars() {
		static getLexerMap() {
			string[string] ret;

			foreach (op; Skippable) {
				ret[op] = "-skip";
			}

			return ret;
		}

		while (true) {
			import source.lexbase;
			// pragma(msg, typeof(this));
			// pragma(msg, lexerMixin(getLexerMap(), "__noop"));
			mixin(lexerMixin(getLexerMap(), "skip"));
		}
	}

	@property
	char frontChar() const {
		return content[index];
	}

	auto skip(string s)() {
		// Just skip over whitespace.
	}

	/**
	 * Identifiers.
	 */
	// Can c start an identifier? '_', a letter, or the lead byte of a
	// non-ASCII sequence (validated later via std.uni.isAlpha).
	static wantIdentifier(char c) {
		auto hc = c | 0x20;
		return c == '_' || (c & 0x80) || (hc >= 'a' && hc <= 'z');
	}

	/**
	 * Consume identifier characters and return the number of bytes
	 * consumed (0 when the current character cannot continue one).
	 */
	auto popIdChars() {
		const begin = index;
		while (true) {
			char c = frontChar;

			import std.ascii : isAlphaNum;
			while (c == '_' || isAlphaNum(c)) {
				popChar();
				c = frontChar;
			}

			if (c < 0x80) {
				break;
			}

			// This needs to be a size_t.
			size_t i = index;

			import std.utf;
			auto u = content.decode(i);

			import std.uni : isAlpha;
			if (!isAlpha(u)) {
				break;
			}

			index = cast(uint) i;
		}

		// Was `begin - index`, which wraps around for unsigned values as
		// soon as anything is consumed. Callers only test against zero,
		// but the value must be the actual count of bytes consumed.
		return index - begin;
	}

	auto lexIdentifier(string s : "" = "")() {
		uint begin = index;

		char c = frontChar;
		if (wantIdentifier(c) && popIdChars() > 0) {
			Token t;
			t.type = TokenType.Identifier;
			t.location = base.getWithOffsets(begin, index);
			t.name = context.getName(content[begin .. index]);

			return t;
		}

		// Make sure we don't stay in place.
		// Was `c | 0x80`, which is non zero for every character and sent
		// even '\0' through the decode path, pushing index past the end
		// of content. Only multi byte sequences need the decoder.
		if (c & 0x80) {
			import std.utf;
			size_t i = index;
			content.decode(i);
			index = cast(uint) i;
		} else if (c != '\0') {
			popChar();
		}

		return getError(begin, "Unexpected token.");
	}

	// Entry point used by the generated trie: s has already been consumed.
	auto lexIdentifier(string s)() if (s != "") {
		uint l = s.length;
		return lexIdentifier(index - l);
	}

	// Lex the remainder of an identifier starting at begin.
	auto lexIdentifier(uint begin) {
		popIdChars();

		Token t;
		t.type = TokenType.Identifier;
		t.location = base.getWithOffsets(begin, index);
		t.name = context.getName(content[begin .. index]);

		return t;
	}

	/**
	 * Operators.
	 */
	auto lexOperator(string s)() {
		enum Type = OperatorMap[s];
		uint l = s.length;

		Token t;
		t.type = Type;
		t.location = base.getWithOffsets(index - l, index);
		t.name = BuiltinName!s;

		return t;
	}

	/**
	 * Keywords.
	 */
	auto lexKeyword(string s)() {
		enum Type = KeywordMap[s];
		uint l = s.length;

		return lexKeyword(index - l, Type, BuiltinName!s);
	}

	import source.name;
	auto lexKeyword(uint begin, TokenType type, Name keyword) {
		auto idCharCount = popIdChars();

		Token t;
		t.type = type;
		t.name = keyword;
		t.location = base.getWithOffsets(begin, index);

		if (idCharCount == 0) {
			return t;
		}

		// This is an identifier that happened to start
		// like a keyword.
		t.type = TokenType.Identifier;
		t.name = context.getName(content[begin .. index]);

		return t;
	}

	/**
	 * Utilities to handle literals suffixes.
	 */
	auto lexLiteralSuffix(alias Suffixes,
	                      alias CustomSuffixes = null)(uint begin) {
		const prefixStart = index;
		alias fun = lexLiteralSuffixTpl!Suffixes.fun;

		static getLexerMap() {
			string[string] ret = CustomSuffixes;

			foreach (op, _; Suffixes) {
				ret[op] = "fun";
			}

			return ret;
		}

		while (true) {
			import source.lexbase;
			mixin(lexerMixin(getLexerMap(), "fun", ["begin", "prefixStart"]));
		}
	}

	template lexLiteralSuffixTpl(alias Suffixes) {
		auto fun(string s)(uint begin, uint prefixStart) {
			enum Kind = Suffixes[s];
			auto idCharCount = popIdChars();

			if (idCharCount != 0) {
				// We have something else.
				return getError(
					prefixStart,
					"Invalid suffix: " ~ content[prefixStart .. index],
				);
			}

			Token t;
			t.type = Kind;
			t.location = base.getWithOffsets(begin, index);

			return t;
		}
	}

	/**
	 * Comments.
	 */
	// Consume a comment whose opening delimiter s has already been read;
	// return the offset of the end of the comment.
	// NOTE(review): an unterminated block comment runs popChar past the
	// end of content — presumably content is sentinel terminated; confirm
	// against the consumers.
	uint popComment(string s)() {
		auto c = frontChar;

		static if (s == "//") {
			// TODO: check for unicode line break.
			while (c != '\n' && c != '\r') {
				if (c == 0) {
					return index;
				}

				popChar();
				c = frontChar;
			}

			uint ret = index;

			popChar();
			if (c == '\r') {
				if (frontChar == '\n') {
					popChar();
				}
			}

			return ret;
		} else static if (s == "/*") {
			while (true) {
				while (c != '*') {
					popChar();
					c = frontChar;
				}

				popChar();
				c = frontChar;

				if (c == '/') {
					popChar();
					return index;
				}
			}
		} else static if (s == "/+") {
			// Nesting comments: track the nesting depth in stack.
			uint stack = 0;
			while (true) {
				while (c != '+' && c != '/') {
					popChar();
					c = frontChar;
				}

				auto match = c;
				popChar();
				c = frontChar;

				switch (match) {
					case '+':
						if (c == '/') {
							popChar();
							if (!stack) {
								return index;
							}

							c = frontChar;
							stack--;
						}

						break;

					case '/':
						if (c == '+') {
							popChar();
							c = frontChar;

							stack++;
						}

						break;

					default:
						assert(0, "Unreachable.");
				}
			}
		} else {
			static assert(0, s ~ " isn't a known type of comment.");
		}
	}

	auto lexComment(string s)() {
		Token t;
		t.type = TokenType.Comment;

		uint begin = index - uint(s.length);
		uint end = popComment!s();

		t.location = base.getWithOffsets(begin, end);
		return t;
	}
}

// Make string iterable one char at a time, as used by the trie generator.
@property
char front(string s) {
	return s[0];
}

void popFront(ref string s) {
	s = s[1 .. $];
}

/**
 * Generate the code of a switch based trie dispatching over the keys of
 * ids: each key is matched one character at a time and, on a full match,
 * the associated routine is invoked with rtArgs. def is the fallback
 * routine used when nothing matches.
 */
string lexerMixin(string[string] ids, string def = "lexIdentifier",
                  string[] rtArgs = []) {
	return lexerMixin(ids, def, rtArgs, "");
}

private:

// Quote s as a D token string literal, escaping backquotes and nuls.
auto stringify(string s) {
	import std.array;
	return "`" ~ s.replace("`", "` ~ \"`\" ~ `")
	             .replace("\0", "` ~ \"\\0\" ~ `") ~ "`";
}

/**
 * Generate the statement invoking fun, instantiated with the matched
 * prefix base and called with rtArgs.
 *
 * A leading '-' makes the call continue the scan loop instead of
 * returning. A leading '?' encodes "?cond:lexCmd|skipCmd": return the
 * result of lexCmd when cond holds, otherwise run skipCmd and continue.
 */
auto getLexingCode(string fun, string[] rtArgs, string base) {
	import std.array;
	auto args = "!(" ~ stringify(base) ~ ")(" ~ rtArgs.join(", ") ~ ")";

	switch (fun[0]) {
		case '-':
			return "
				" ~ fun[1 .. $] ~ args ~ ";
				continue;";

		case '?':
			size_t i = 1;
			while (fun[i] != ':') {
				i++;
			}

			size_t endcond = i;
			while (fun[i] != '|') {
				i++;
			}

			auto cond = fun[1 .. endcond];
			auto lexCmd = fun[endcond + 1 .. i];
			auto skipCmd = fun[i + 1 .. $];

			return "
				if (" ~ cond ~ ") {
					return " ~ lexCmd ~ args ~ ";
				} else {
					" ~ skipCmd ~ args ~ ";
					continue;
				}";

		default:
			return "
				return " ~ fun ~ args ~ ";";
	}
}

// Recursive worker: base is the prefix matched so far.
string lexerMixin(string[string] ids, string def, string[] rtArgs,
                  string base) {
	auto defaultFun = def;
	string[string][char] nextLevel;
	foreach (id, fun; ids) {
		if (id == "") {
			defaultFun = fun;
		} else {
			nextLevel[id[0]][id[1 .. $]] = fun;
		}
	}

	auto ret = "
		switch(frontChar) {";

	foreach (c, subids; nextLevel) {
		// TODO: have a real function to handle that.
		string charLit;
		switch (c) {
			case '\0':
				charLit = "\\0";
				break;

			case '\'':
				charLit = "\\'";
				break;

			case '\t':
				charLit = "\\t";
				break;

			case '\v':
				charLit = "\\v";
				break;

			case '\f':
				charLit = "\\f";
				break;

			case '\n':
				charLit = "\\n";
				break;

			case '\r':
				charLit = "\\r";
				break;

			default:
				if (c < 0x80) {
					charLit = [c];
					break;
				}

				// Escape non-ASCII bytes as hex literals.
				static char toHexChar(ubyte n) {
					return ((n < 10) ? (n + '0') : (n - 10 + 'a')) & 0xff;
				}

				static string toHexString(ubyte c) {
					return [toHexChar(c >> 4), toHexChar(c & 0x0f)];
				}

				charLit = "\\x" ~ toHexString(c);
				break;
		}

		ret ~= "
			case '" ~ charLit ~ "':
				popChar();";

		auto newBase = base ~ c;
		if (subids.length == 1) {
			if (auto cdef = "" in subids) {
				// Leaf: emit the action directly, no inner switch.
				ret ~= getLexingCode(*cdef, rtArgs, newBase);
				continue;
			}
		}

		ret ~= lexerMixin(nextLevel[c], def, rtArgs, newBase);
	}

	if (base == "" || base[$ - 1] < 0x80) {
		ret ~= "
			default:" ~ getLexingCode(defaultFun, rtArgs, base) ~ "
		}
		";
	} else {
		ret ~= "
			default:
				// Do not exit in the middle of an unicode sequence.
				unpopChar();
				break;
		}

		// Fall back to the default instead.
		goto default;
		";
	}

	return ret;
}