module source.lexerutil;

/**
 * Shared implementation of a token range (lexer), meant to be mixed into a
 * concrete lexer type.
 *
 * Params:
 *     Token = the token type produced by the lexer. The code below reads and
 *             writes its `type`, `name` and `location` members.
 *     BaseMap = associative array mapping a token prefix to a handler
 *               descriptor (see getLexingCode at module level for the
 *               descriptor syntax).
 *     KeywordMap = associative array mapping keyword spellings to token types.
 *     OperatorMap = associative array mapping operator spellings to token types.
 *
 * The lexer treats '\0' as an end of input sentinel when scanning `content`.
 */
mixin template TokenRangeImpl(Token, alias BaseMap, alias KeywordMap, alias OperatorMap) {
	// TODO: We shouldn't let consumer play with the internal state of the lexer.
	// Instead, we should provide accessor to useful members.
	// private:

	// The current token, returned by front.
	Token t;

	import source.location;

	// End position of the token that was current before the last popFront.
	Position previous;

	// Position all token locations are computed from, via getWithOffsets.
	Position base;

	// Offset of the next character to lex in content.
	uint index;

	import std.bitmanip;
	mixin(bitfields!(
		bool, "tokenizeComments", 1,
		bool, "_skipStrings", 1,
		uint, "__derived", 30,
	));

	import source.context;
	Context context;

	// The text being lexed.
	string content;

	alias TokenRange = typeof(this);
	alias TokenType = typeof(Token.init.type);

	/**
	 * Return a copy of this lexer with the tokenizeComments flag set to wc.
	 */
	auto withComments(bool wc = true) {
		auto r = this.save;
		r.tokenizeComments = wc;
		return r;
	}

	/**
	 * Whether string literal contents are stored in the name of the
	 * tokens produced for them.
	 */
	@property
	bool decodeStrings() const {
		return !_skipStrings;
	}

	/**
	 * Return a copy of this lexer with string decoding turned on or off.
	 */
	auto withStringDecoding(bool sd = true) {
		auto r = this.save;
		r._skipStrings = !sd;
		return r;
	}

	/**
	 * Return a copy of this lexer that:
	 *  - skip over comments.
	 *  - do not decode strings.
	 */
	auto getLookahead() {
		return withStringDecoding(false).withComments(false);
	}

	/// The current token.
	@property
	auto front() inout {
		return t;
	}

	/**
	 * Advance to the next token, remembering where the current one stopped
	 * in previous. Must not be called once the End token has been reached.
	 */
	void popFront() in {
		assert(front.type != TokenType.End);
	} do {
		previous = t.location.stop;
		t = getNextToken();

		/+
		// Experience the token deluge !
		if (t.type != TokenType.End) {
			import util.terminal, std.conv;
			outputCaretDiagnostics(
				t.location.getFullLocation(context),
				to!string(t.type),
			);
		}
		// +/
	}

	/**
	 * Fast forward this lexer to the state of fr.
	 *
	 * fr must be lexing the same content, with the same base and context,
	 * and must be strictly ahead of this lexer.
	 */
	void moveTo(ref TokenRange fr) in {
		assert(base is fr.base);
		assert(context is fr.context);
		assert(content is fr.content);
		assert(index < fr.index);
	} do {
		index = fr.index;
		t = fr.t;
	}

	/// Return a copy of this lexer.
	@property
	auto save() inout {
		return this;
	}

	/// True once the current token is the End token.
	@property
	bool empty() const {
		return t.type == TokenType.End;
	}

private:
	// Character sequences that are skipped over between tokens.
	enum Skippable = [" ", "\t", "\v", "\f", "\n", "\r", "\u2028", "\u2029"];

	/**
	 * Lex and return the next token.
	 *
	 * The dispatch code is generated by lexerMixin from BaseMap extended
	 * with the skippable sequences, the keywords and the operators.
	 */
	auto getNextToken() {
		static getLexerMap() {
			auto ret = BaseMap;

			foreach (op; Skippable) {
				ret[op] = "-skip";
			}

			foreach (kw, _; KeywordMap) {
				ret[kw] = "lexKeyword";
			}

			foreach (op, _; OperatorMap) {
				ret[op] = "lexOperator";
			}

			return ret;
		}

		while (true) {
			import source.lexerutil;
			// pragma(msg, typeof(this));
			// pragma(msg, lexerMixin(getLexerMap()));
			mixin(lexerMixin(getLexerMap()));
		}

		// Necessary because of https://issues.dlang.org/show_bug.cgi?id=22688
		assert(0);
	}

	/**
	 * Turn t into an Invalid token carrying message as its name.
	 */
	void setError(ref Token t, string message) {
		t.type = TokenType.Invalid;
		t.name = context.getName(message);
	}

	/// Consume one char of content.
	void popChar() in {
		assert(index < content.length);
	} do {
		index++;
	}

	/**
	 * Give back one char of content.
	 *
	 * The generated lexing code calls this to back out of a partially
	 * matched multi-byte sequence. That can happen right after the very
	 * first char of the content was popped, so index == 1 must be accepted.
	 */
	void unpopChar() in {
		assert(index > 0);
	} do {
		index--;
	}

	/**
	 * Consume every skippable sequence found at the current position.
	 */
	void popSkippableChars() {
		static getLexerMap() {
			string[string] ret;

			foreach (op; Skippable) {
				ret[op] = "-skip";
			}

			return ret;
		}

		while (true) {
			import source.lexerutil;
			// pragma(msg, typeof(this));
			// pragma(msg, lexerMixin(getLexerMap(), "__noop"));
			mixin(lexerMixin(getLexerMap(), "skip"));
		}
	}

	/// The char at the current position.
	@property
	char frontChar() const {
		return content[index];
	}

	auto skip(string s)() {
		// Just skip over whitespace.
	}

	/**
	 * Consume the remainder of a comment whose opening sequence s has
	 * already been consumed.
	 *
	 * Returns: the offset used as the end of the comment. For line
	 * comments this is the offset of the line break, which is consumed
	 * but not part of the comment. For block comments it is the offset
	 * right after the closing sequence.
	 *
	 * If the '\0' sentinel is reached before the comment is closed, the
	 * scan stops there and the offset of the sentinel is returned.
	 */
	uint popComment(string s)() {
		auto c = frontChar;

		static if (s == "//") {
			// TODO: check for unicode line break.
			while (c != '\n' && c != '\r') {
				if (c == 0) {
					return index;
				}

				popChar();
				c = frontChar;
			}

			uint ret = index;

			popChar();
			if (c == '\r') {
				if (frontChar == '\n') {
					popChar();
				}
			}

			return ret;
		} else static if (s == "/*") {
			while (true) {
				while (c != '*' && c != '\0') {
					popChar();
					c = frontChar;
				}

				if (c == '\0') {
					// Unterminated comment: do not run past the sentinel.
					return index;
				}

				popChar();
				c = frontChar;

				if (c == '/') {
					popChar();
					return index;
				}
			}
		} else static if (s == "/+") {
			// Number of nested comments currently open.
			uint stack = 0;
			while (true) {
				while (c != '+' && c != '/' && c != '\0') {
					popChar();
					c = frontChar;
				}

				if (c == '\0') {
					// Unterminated comment: do not run past the sentinel.
					return index;
				}

				auto match = c;
				popChar();
				c = frontChar;

				switch (match) {
					case '+':
						if (c == '/') {
							popChar();
							if (!stack) {
								return index;
							}

							c = frontChar;
							stack--;
						}

						break;

					case '/':
						if (c == '+') {
							popChar();
							c = frontChar;

							stack++;
						}

						break;

					default:
						assert(0, "Unreachable.");
				}
			}
		} else {
			static assert(0, s ~ " isn't a known type of comment.");
		}
	}

	/**
	 * Lex a comment whose opening sequence s has just been consumed.
	 */
	auto lexComment(string s)() {
		Token t;
		t.type = TokenType.Comment;

		uint begin = index - uint(s.length);
		uint end = popComment!s();

		t.location = base.getWithOffsets(begin, end);
		return t;
	}

	/**
	 * Whether c can start an identifier: an underscore, an ASCII letter,
	 * or any byte with the high bit set.
	 */
	static wantIdentifier(char c) {
		auto hc = c | 0x20;
		return c == '_' || (c & 0x80) || (hc >= 'a' && hc <= 'z');
	}

	/**
	 * Lex an identifier starting at the current position.
	 *
	 * Produces an Invalid token if what is found cannot start an
	 * identifier.
	 */
	auto lexIdentifier(string s : "" = "")() {
		uint begin = index;

		char c = frontChar;
		if (!wantIdentifier(c)) {
			// Make sure we don't stay in place.
			if (c != '\0') {
				popChar();
			}

			Token t;
			setError(t, "Unexpected token");
			t.location = base.getWithOffsets(begin, index);
			return t;
		}

		if (c < 0x80) {
			popChar();
			return lexIdentifier(1);
		}

		// std.utf.decode takes its index by reference as a size_t.
		size_t i = index;

		import std.utf;
		auto u = content.decode(i);
		index = cast(uint) i;

		import std.uni;
		if (!isAlpha(u)) {
			Token t;
			setError(t, "Unexpected token");
			t.location = base.getWithOffsets(begin, index);
			return t;
		}

		return lexIdentifier(index - begin);
	}

	/**
	 * Lex an identifier whose prefix s has already been consumed.
	 */
	auto lexIdentifier(string s)() if (s != "") {
		return lexIdentifier(s.length);
	}

	/**
	 * Lex the remainder of an identifier whose first prefixLength chars
	 * have already been consumed.
	 */
	auto lexIdentifier(uint prefixLength) in {
		assert(prefixLength > 0);
		assert(index >= prefixLength);
	} do {
		Token t;
		t.type = TokenType.Identifier;
		immutable begin = index - prefixLength;

		while (true) {
			while (isIdChar(frontChar)) {
				popChar();
			}

			// An ASCII char that is not an identifier char ends the
			// identifier, no need to decode anything.
			if (!(frontChar & 0x80)) {
				break;
			}

			// std.utf.decode takes its index by reference as a size_t.
			size_t i = index;

			import std.utf;
			auto u = content.decode(i);

			import std.uni;
			if (!isAlpha(u)) {
				break;
			}

			index = cast(uint) i;
		}

		t.location = base.getWithOffsets(begin, index);
		t.name = context.getName(content[begin .. index]);

		return t;
	}

	/**
	 * Lex an escape sequence, the backslash having been consumed already,
	 * and append the value it denotes to decoded.
	 *
	 * Returns: false if the escape sequence is invalid.
	 */
	bool lexEscapeSequence(ref string decoded) {
		char c = frontChar;

		switch (c) {
			case '\'', '"', '\\':
				// Noop.
				break;

			case '?':
				assert(0, "WTF is \\?");

			case '0':
				c = '\0';
				break;

			case 'a':
				c = '\a';
				break;

			case 'b':
				c = '\b';
				break;

			case 'f':
				c = '\f';
				break;

			case 'r':
				c = '\r';
				break;

			case 'n':
				c = '\n';
				break;

			case 't':
				c = '\t';
				break;

			case 'v':
				c = '\v';
				break;

			case 'u', 'U':
				popChar();

				uint v = 0;

				// 4 hex digits after \u, 8 after \U.
				auto length = 4 * (c == 'U') + 4;
				foreach (i; 0 .. length) {
					c = frontChar;

					// n ends up >= 16 when c is not an hex digit.
					uint d = c - '0';
					uint h = ((c | 0x20) - 'a') & 0xff;
					uint n = (d < 10) ? d : (h + 10);

					if (n >= 16) {
						return false;
					}

					v |= n << (4 * (length - i - 1));
					popChar();
				}

				char[4] buf;

				import std.utf;
				auto i = encode(buf, v);

				decoded ~= buf[0 .. i];
				return true;

			case '&':
				assert(0, "HTML5 named character references not implemented");

			default:
				return false;
		}

		popChar();
		decoded ~= c;
		return true;
	}

	/**
	 * Lex a string literal in which escape sequences are not processed.
	 * The opening delimiter has been consumed, the token starts at begin.
	 */
	Token lexRawString(char Delimiter = '`')(uint begin) {
		Token t;
		t.type = TokenType.StringLiteral;

		size_t start = index;

		auto c = frontChar;
		while (c != Delimiter && c != '\0') {
			popChar();
			c = frontChar;
		}

		if (c == '\0') {
			setError(t, "Unexpected end of file");
			t.location = base.getWithOffsets(begin, index);
			return t;
		}

		if (decodeStrings) {
			string decoded = content[start .. index];
			t.name = context.getName(decoded);
		}

		popChar();

		t.location = base.getWithOffsets(begin, index);
		return t;
	}

	Token lexString(string s : "`")() {
		immutable begin = cast(uint) (index - s.length);
		return lexRawString!'`'(begin);
	}

	Token lexString(string s : "'")() {
		immutable begin = cast(uint) (index - s.length);
		return lexRawString!'\''(begin);
	}

	/**
	 * Lex a literal in which escape sequences are processed.
	 * The opening delimiter has been consumed, the token starts at begin.
	 *
	 * When decodeStrings is false, escape sequences are stepped over
	 * without being validated and the token gets no name.
	 */
	Token lexDecodedString(char Delimiter = '"', TokenType TT = TokenType.StringLiteral)(uint begin) {
		Token t;
		t.type = TT;

		// Start of the span of content not yet copied into decoded.
		size_t start = index;
		string decoded;

		auto c = frontChar;
		while (c != Delimiter && c != '\0') {
			if (c == '\\') {
				immutable beginEscape = index;

				if (decodeStrings) {
					scope(success) {
						start = index;
					}

					// Workaround for https://issues.dlang.org/show_bug.cgi?id=22271
					if (decoded == "") {
						decoded = content[start .. index];
					} else {
						decoded ~= content[start .. index];
					}

					popChar();
					if (!lexEscapeSequence(decoded)) {
						t.location = base.getWithOffsets(beginEscape, index);
						setError(t, "Invalid escape sequence");
						return t;
					}

					c = frontChar;
					continue;
				}

				popChar();
				c = frontChar;

				// Do not step over the sentinel when the input ends
				// right after the backslash.
				if (c == '\0') {
					break;
				}
			}

			popChar();
			c = frontChar;
		}

		if (c == '\0') {
			setError(t, "Unexpected end of file");
			t.location = base.getWithOffsets(begin, index);
			return t;
		}

		if (decodeStrings) {
			// Workaround for https://issues.dlang.org/show_bug.cgi?id=22271
			if (decoded == "") {
				decoded = content[start .. index];
			} else {
				decoded ~= content[start .. index];
			}

			t.name = context.getName(decoded);
		}

		popChar();

		t.location = base.getWithOffsets(begin, index);
		return t;
	}

	Token lexString(string s : `"`)() {
		immutable begin = cast(uint) (index - s.length);
		return lexDecodedString!'"'(begin);
	}

	Token lexCharacter(string s : `'`)() {
		immutable begin = cast(uint) (index - s.length);
		return lexDecodedString!('\'', TokenType.CharacterLiteral)(begin);
	}

	/**
	 * General integer lexing utilities.
	 */
	Token lexIntegralSuffix(uint begin) {
		Token t;
		t.type = TokenType.IntegerLiteral;

		auto c = frontChar;
		switch(c | 0x20) {
			case 'u':
				popChar();

				c = frontChar;
				if (c == 'L' || c == 'l') {
					popChar();
				}

				break;

			case 'l':
				popChar();

				c = frontChar;
				if (c == 'U' || c == 'u') {
					popChar();
				}

				break;

			case 'f':
				popChar();

				t.type = TokenType.FloatLiteral;
				break;

			default:
				break;
		}

		t.location = base.getWithOffsets(begin, index);
		return t;
	}

	/**
	 * Consume an optional float suffix and build the token for the
	 * literal starting at begin.
	 */
	Token lexFloatSuffix(uint begin) {
		Token t;

		const c = frontChar;
		const hc = c | 0x20;
		if (hc == 'f' || hc == 'l') {
			popChar();
		}

		t.location = base.getWithOffsets(begin, index);

		// l is an error for some unexplainable reason.
		if (c == 'l') {
			setError(t, "Use 'L' suffix instead of 'l'");
			return t;
		}

		t.type = TokenType.FloatLiteral;
		return t;
	}

	/**
	 * Lex the remainder of a numeric literal starting at begin.
	 *
	 * Params:
	 *     isFun = predicate telling if a char is a digit of the literal.
	 *     popFun = function consuming a run of digits and underscores.
	 *     E = lower case letter introducing the exponent.
	 */
	Token lexFloatLiteral(alias isFun, alias popFun, char E)(uint begin) {
		popFun();

		bool isFloat = false;
		if (frontChar == '.') {
			auto savePoint = index;

			popChar();
			if (frontChar == '.') {
				// Two dots in a row: give both back and lex an integer.
				index = savePoint;
				goto LexSuffix;
			}

			auto floatSavePoint = index;

			popSkippableChars();

			if (wantIdentifier(frontChar)) {
				// The dot is followed by something that can start an
				// identifier: give the dot back and lex an integer.
				index = savePoint;
				goto LexSuffix;
			}

			index = floatSavePoint;
			isFloat = true;

			if (isFun(frontChar)) {
				popChar();
				popFun();
			}
		}

		if ((frontChar | 0x20) == E) {
			isFloat = true;
			popChar();

			auto c = frontChar;
			if (c == '+' || c == '-') {
				popChar();
			}

			popFun();
		}

	LexSuffix:
		return isFloat ? lexFloatSuffix(begin) : lexIntegralSuffix(begin);
	}

	/**
	 * Binary literals.
	 */
	static bool isBinary(char c) {
		return c == '0' || c == '1';
	}

	void popBinary() {
		auto c = frontChar;
		while (isBinary(c) || c == '_') {
			popChar();
			c = frontChar;
		}
	}

	Token lexNumeric(string s : "0B")() {
		return lexNumeric!"0b"();
	}

	Token lexNumeric(string s : "0b")() {
		uint begin = index - 2;

		while (frontChar == '_') {
			popChar();
		}

		if (!isBinary(frontChar)) {
			Token t;
			t.location = base.getWithOffsets(begin, index);
			setError(t, "Invalid binary sequence");
			return t;
		}

		popBinary();
		return lexIntegralSuffix(begin);
	}

	/**
	 * Hexadecimal literals.
	 */
	static bool isHexadecimal(char c) {
		auto hc = c | 0x20;
		return (c >= '0' && c <= '9') || (hc >= 'a' && hc <= 'f');
	}

	void popHexadecimal() {
		auto c = frontChar;
		while (isHexadecimal(c) || c == '_') {
			popChar();
			c = frontChar;
		}
	}

	Token lexNumeric(string s : "0X")() {
		return lexNumeric!"0x"();
	}

	Token lexNumeric(string s : "0x")() {
		uint begin = index - 2;

		while (frontChar == '_') {
			popChar();
		}

		if (!isHexadecimal(frontChar)) {
			Token t;
			t.location = base.getWithOffsets(begin, index);
			setError(t, "Invalid hexadecimal sequence");
			return t;
		}

		return lexFloatLiteral!(isHexadecimal, popHexadecimal, 'p')(begin);
	}

	/**
	 * Decimal literals.
	 */
	static bool isDecimal(char c) {
		return c >= '0' && c <= '9';
	}

	void popDecimal() {
		auto c = frontChar;
		while (isDecimal(c) || c == '_') {
			popChar();
			c = frontChar;
		}
	}

	auto lexNumeric(string s)() if (s.length == 1 && isDecimal(s[0])) {
		return lexNumeric(s[0]);
	}

	auto lexNumeric(char c) in {
		assert(isDecimal(c));
	} do {
		return lexFloatLiteral!(isDecimal, popDecimal, 'e')(index - 1);
	}

	/**
	 * Keywords and identifiers.
	 */
	auto lexKeyword(string s)() {
		// If the keyword is followed by something that continues an
		// identifier, then it is the prefix of an identifier instead.
		auto c = frontChar;
		if (isIdChar(c)) {
			popChar();
			return lexIdentifier(s.length + 1);
		}

		if (c & 0x80) {
			size_t i = index;

			import std.utf;
			auto u = content.decode(i);

			import std.uni;
			if (isAlpha(u)) {
				auto l = cast(ubyte) (i - index);
				index += l;
				return lexIdentifier(s.length + l);
			}
		}

		enum Type = KeywordMap[s];

		uint l = s.length;

		Token t;
		t.type = Type;
		t.location = base.getWithOffsets(index - l, index);

		import source.name;
		t.name = BuiltinName!s;

		return t;
	}

	/**
	 * Build the token for the operator s, which has just been consumed.
	 */
	auto lexOperator(string s)() {
		enum Type = OperatorMap[s];

		uint l = s.length;

		Token t;
		t.type = Type;
		t.location = base.getWithOffsets(index - l, index);

		import source.name;
		t.name = BuiltinName!s;

		return t;
	}
}

/// First char of s.
@property
char front(string s) {
	return s[0];
}

/// Drop the first char of s.
void popFront(ref string s) {
	s = s[1 .. $];
}

/// Whether c is an underscore or an ASCII letter or digit.
auto isIdChar(char c) {
	import std.ascii;
	return c == '_' || isAlphaNum(c);
}

/**
 * Generate the source code of a switch dispatching on frontChar, to be
 * mixed in a loop within a lexer.
 *
 * Params:
 *     ids = maps the sequences to recognize to the descriptor of the
 *           handler to run when they are found. See getLexingCode for
 *           the syntax of descriptors. An entry with an empty key
 *           replaces def at the top level.
 *     def = descriptor of the handler to use when nothing matches.
 */
string lexerMixin(string[string] ids, string def = "lexIdentifier") {
	return lexerMixin(ids, def, "");
}

private:

/**
 * Generate a D expression which evaluates to s, using backquoted strings.
 */
auto stringify(string s) {
	import std.array;
	return "`" ~ s.replace("`", "` ~ \"`\" ~ `").replace("\0", "` ~ \"\\0\" ~ `") ~ "`";
}

/**
 * Generate the code running the handler described by fun once base has
 * been consumed. Handlers are instantiated with base as template argument.
 *
 * Descriptor syntax:
 *  - "-name": call name and continue the enclosing loop.
 *  - "?cond:lex|skip": if cond is true, return the result of lex,
 *    otherwise call skip and continue the enclosing loop.
 *  - "name": return the result of name.
 */
auto getLexingCode(string fun, string base) {
	auto args = "!(" ~ stringify(base) ~ ")()";

	switch (fun[0]) {
		case '-':
			return "
				" ~ fun[1 .. $] ~ args ~ ";
				continue;";

		case '?':
			size_t i = 1;
			while (fun[i] != ':') {
				i++;
			}

			size_t endcond = i;
			while (fun[i] != '|') {
				i++;
			}

			auto cond = fun[1 .. endcond];
			auto lexCmd = fun[endcond + 1 .. i];
			auto skipCmd = fun[i + 1 .. $];

			return "
				if (" ~ cond ~ ") {
					return " ~ lexCmd ~ args ~ ";
				} else {
					" ~ skipCmd ~ args ~ ";
					continue;
				}";

		default:
			return "
				return " ~ fun ~ args ~ ";";
	}
}

/**
 * Recursive worker for the public lexerMixin.
 *
 * base is the sequence already consumed when the generated switch runs.
 * The keys of ids are the suffixes which may follow it.
 */
string lexerMixin(string[string] ids, string def, string base) {
	auto defaultFun = def;

	// Group the ids by first char, keyed by what remains after it.
	string[string][char] nextLevel;
	foreach (id, fun; ids) {
		if (id == "") {
			defaultFun = fun;
		} else {
			nextLevel[id[0]][id[1 .. $]] = fun;
		}
	}

	auto ret = "
		switch(frontChar) {";

	foreach (c, subids; nextLevel) {
		// TODO: have a real function to handle that.
		string charLit;
		switch(c) {
			case '\0':
				charLit = "\\0";
				break;

			case '\'':
				charLit = "\\'";
				break;

			case '\\':
				// A lone backslash would escape the closing quote of the
				// generated character literal.
				charLit = "\\\\";
				break;

			case '\t':
				charLit = "\\t";
				break;

			case '\v':
				charLit = "\\v";
				break;

			case '\f':
				charLit = "\\f";
				break;

			case '\n':
				charLit = "\\n";
				break;

			case '\r':
				charLit = "\\r";
				break;

			default:
				if (c < 0x80) {
					charLit = [c];
					break;
				}

				static char toHexChar(ubyte n) {
					return ((n < 10) ? (n + '0') : (n - 10 + 'a')) & 0xff;
				}

				static string toHexString(ubyte c) {
					return [toHexChar(c >> 4), toHexChar(c & 0x0f)];
				}

				charLit = "\\x" ~ toHexString(c);
				break;
		}

		ret ~= "
			case '" ~ charLit ~ "':
				popChar();";

		auto newBase = base ~ c;
		if (subids.length == 1) {
			// Nothing longer starts with newBase, run its handler directly.
			if (auto cdef = "" in subids) {
				ret ~= getLexingCode(*cdef, newBase);
				continue;
			}
		}

		ret ~= lexerMixin(nextLevel[c], def, newBase);
	}

	if (base == "" || base[$ - 1] < 0x80) {
		ret ~= "
			default:" ~ getLexingCode(defaultFun, base) ~ "
		}
		";
	} else {
		ret ~= "
			default:
				// Do not exit in the middle of an unicode sequence.
				unpopChar();
				break;
		}

		// Fall back to the default instead.
		goto default;
		";
	}

	return ret;
}