1 module source.lexerutil;
2
3 mixin template TokenRangeImpl(Token, alias BaseMap, alias KeywordMap, alias OperatorMap) {
// TODO: We shouldn't let consumers play with the internal state of the lexer.
// Instead, we should provide accessors to the useful members.
6 // private:
// The current token, i.e. the value returned by front.
Token t;

import source.location;
// End position of the token that was current before the last popFront.
Position previous;
// Position that token offsets are applied to (see the getWithOffsets calls).
Position base;

// Offset in content of the next character to be lexed.
uint index;

import std.bitmanip;
// Lexer flags:
//  - tokenizeComments: set through withComments; presumably consulted by the
//    lexing code generated from BaseMap (not visible here, to be confirmed).
//  - _skipStrings: when set, string literal contents are not decoded
//    (see decodeStrings).
//  - __derived: remaining 30 bits; not used in this template, presumably
//    left for the lexer mixing this template in (to be confirmed).
mixin(bitfields!(
	bool, "tokenizeComments", 1,
	bool, "_skipStrings", 1,
	uint, "__derived", 30,
));

import source.context;
// Used to intern token names and error messages via getName.
Context context;

// The text being lexed. The lexing routines treat '\0' as the end marker.
string content;

alias TokenRange = typeof(this);
alias TokenType = typeof(Token.init.type);
29
/**
 * Return a copy of this lexer with the tokenizeComments flag set to wc.
 * The current lexer is left untouched.
 */
auto withComments(bool wc = true) {
	auto copy = save;
	copy.tokenizeComments = wc;
	return copy;
}
35
/// Whether the content of string literals is decoded while lexing.
@property
bool decodeStrings() const {
	// The flag is stored inverted.
	return _skipStrings == false;
}
40
/**
 * Return a copy of this lexer with string decoding turned on (sd == true)
 * or off (sd == false). The current lexer is left untouched.
 */
auto withStringDecoding(bool sd = true) {
	auto copy = save;
	// The stored flag is the negation of the requested setting.
	copy._skipStrings = !sd;
	return copy;
}
46
/**
 * Return a copy of this lexer that:
 *  - skips over comments.
 *  - does not decode strings.
 */
auto getLookahead() {
	auto undecoded = withStringDecoding(false);
	return undecoded.withComments(false);
}
55
/// The current token.
@property
auto front() inout {
	return this.t;
}
60
/**
 * Advance to the next token.
 *
 * Must not be called once the End token has been reached.
 */
void popFront() in {
	assert(t.type != TokenType.End);
} do {
	// Remember where the consumed token stopped before replacing it.
	previous = t.location.stop;
	t = getNextToken();

	/+
	// Experience the token deluge!
	if (t.type != TokenType.End) {
		import util.terminal, std.conv;
		outputCaretDiagnostics(
			t.location.getFullLocation(context),
			to!string(t.type),
		);
	}
	// +/
}
78
/**
 * Fast forward this lexer to the state of fr.
 *
 * fr must lex the same content, with the same base and context, and
 * must be strictly ahead of this lexer.
 * Only index and the current token are copied over.
 */
void moveTo(ref TokenRange fr) in {
	assert(base is fr.base);
	assert(context is fr.context);
	assert(content is fr.content);
	assert(index < fr.index);
} do {
	t = fr.t;
	index = fr.index;
}
88
/// Forward range primitive: returns a copy of this lexer by value.
@property
auto save() inout {
	auto copy = this;
	return copy;
}
93
/// True once the current token is the End token.
@property
bool empty() const {
	return !(t.type != TokenType.End);
}
98
99 private:
// Character sequences that are skipped over between tokens: ASCII
// whitespace and line breaks, plus the unicode line and paragraph
// separators (U+2028 and U+2029).
enum Skippable = [" ", "\t", "\v", "\f", "\n", "\r", "\u2028", "\u2029"];
101
/**
 * Lex and return the next token, using the code generated by lexerMixin.
 */
auto getNextToken() {
	// Table driving the code generation. Entries are registered on top of
	// BaseMap in this order: skippable sequences, keywords, then operators,
	// so that a later entry overrides an earlier one with the same key.
	static getLexerMap() {
		auto map = BaseMap;

		foreach (blank; Skippable) {
			map[blank] = "-skip";
		}

		foreach (keyword, _; KeywordMap) {
			map[keyword] = "lexKeyword";
		}

		foreach (symbol, _; OperatorMap) {
			map[symbol] = "lexOperator";
		}

		return map;
	}

	// The generated code either returns a token or uses `continue` to
	// restart the loop after skipping something.
	while (true) {
		import source.lexerutil;
		// pragma(msg, typeof(this));
		// pragma(msg, lexerMixin(getLexerMap()));
		mixin(lexerMixin(getLexerMap()));
	}

	// Necessary because of https://issues.dlang.org/show_bug.cgi?id=22688
	assert(0);
}
131
/// Turn t into an Invalid token whose name is the interned error message.
void setError(ref Token t, string message) {
	t.type = TokenType.Invalid;

	auto errorName = context.getName(message);
	t.name = errorName;
}
136
/// Consume one char of content. There must be one left to consume.
void popChar() in {
	assert(index < content.length);
} do {
	index += 1;
}
142
/**
 * Step back by one char.
 *
 * The generated lexer calls this to rewind when it bails out in the middle
 * of a multi-byte sequence. Such a sequence can start at offset 0, in which
 * case index is 1 after its lead byte was consumed, so the only requirement
 * is that there is something to rewind.
 */
void unpopChar() in {
	assert(index > 0);
} do {
	index--;
}
148
/**
 * Consume every skippable sequence (see Skippable) found at the current
 * position, and stop at the first char that does not start one.
 */
void popSkippableChars() {
	// Only the skippable sequences are registered; anything else reaches
	// the default handler, "skip", whose result is returned, which ends
	// the loop.
	static getLexerMap() {
		string[string] map;

		foreach (blank; Skippable) {
			map[blank] = "-skip";
		}

		return map;
	}

	while (true) {
		import source.lexerutil;
		// pragma(msg, typeof(this));
		// pragma(msg, lexerMixin(getLexerMap(), "skip"));
		mixin(lexerMixin(getLexerMap(), "skip"));
	}
}
167
/// The char at the current position in content.
@property
char frontChar() const {
	immutable current = content[index];
	return current;
}
172
/**
 * Handler for skippable sequences. The sequence s has already been consumed
 * by the generated lexer code, so there is nothing left to do.
 */
auto skip(string s)() {
	// Intentionally empty.
}
176
/**
 * Consume a comment whose opening sequence s has already been consumed.
 *
 * Params:
 *   s = the opening sequence: "//", "/*" or "/+".
 *
 * Returns: the offset at which the comment ends. For line comments this is
 * the offset of the line break (the line break itself is consumed too);
 * for block comments it is the offset just past the closing sequence.
 *
 * When the end of the input ('\0') is reached before the comment is closed,
 * the offset of the end marker is returned, for every kind of comment,
 * instead of reading past the end of the content.
 */
uint popComment(string s)() {
	auto c = frontChar;

	static if (s == "//") {
		// TODO: check for unicode line break.
		while (c != '\n' && c != '\r') {
			if (c == '\0') {
				return index;
			}

			popChar();
			c = frontChar;
		}

		uint ret = index;

		// Consume the line break, treating "\r\n" as a single one.
		popChar();
		if (c == '\r' && frontChar == '\n') {
			popChar();
		}

		return ret;
	} else static if (s == "/*") {
		while (true) {
			while (c != '*') {
				if (c == '\0') {
					// Unterminated comment: stop at the end marker.
					return index;
				}

				popChar();
				c = frontChar;
			}

			popChar();
			c = frontChar;

			if (c == '/') {
				popChar();
				return index;
			}
		}
	} else static if (s == "/+") {
		// Number of nested "/+" that still need to be closed.
		uint stack = 0;
		while (true) {
			while (c != '+' && c != '/') {
				if (c == '\0') {
					// Unterminated comment: stop at the end marker.
					return index;
				}

				popChar();
				c = frontChar;
			}

			auto match = c;
			popChar();
			c = frontChar;

			switch (match) {
				case '+':
					if (c == '/') {
						popChar();
						if (!stack) {
							return index;
						}

						c = frontChar;
						stack--;
					}

					break;

				case '/':
					if (c == '+') {
						popChar();
						c = frontChar;

						stack++;
					}

					break;

				default:
					assert(0, "Unreachable.");
			}
		}
	} else {
		static assert(0, s ~ " isn't a known type of comment.");
	}
}
261
/**
 * Lex a comment whose opening sequence s has just been consumed and
 * return the matching Comment token.
 */
auto lexComment(string s)() {
	// The token starts at the opening sequence, which is already behind us.
	immutable uint start = index - uint(s.length);
	immutable uint stop = popComment!s();

	Token t;
	t.type = TokenType.Comment;
	t.location = base.getWithOffsets(start, stop);
	return t;
}
272
/**
 * Whether c can start an identifier: '_', an ASCII letter, or any
 * non ASCII byte (high bit set).
 */
static wantIdentifier(char c) {
	if (c == '_' || (c & 0x80) != 0) {
		return true;
	}

	// Setting bit 5 maps ASCII uppercase letters onto lowercase ones.
	const lowered = c | 0x20;
	return 'a' <= lowered && lowered <= 'z';
}
277
/**
 * Lex an identifier starting at the current position, nothing of it having
 * been consumed yet.
 *
 * Returns an Invalid token ("Unexpected token") when the current char
 * cannot start an identifier.
 */
auto lexIdentifier(string s : "" = "")() {
	immutable uint begin = index;

	// Build the error token covering everything consumed since begin.
	Token unexpected() {
		Token t;
		setError(t, "Unexpected token");
		t.location = base.getWithOffsets(begin, index);
		return t;
	}

	immutable char c = frontChar;
	if (!wantIdentifier(c)) {
		// Make sure we don't stay in place.
		if (c != '\0') {
			popChar();
		}

		return unexpected();
	}

	if (c < 0x80) {
		// ASCII start: consume it and lex the rest.
		popChar();
		return lexIdentifier(1);
	}

	// Non ASCII start: decode the whole code point.
	// XXX: Dafuq does this need to be a size_t ?
	size_t next = index;

	import std.utf;
	auto u = content.decode(next);
	index = cast(uint) next;

	import std.uni;
	if (isAlpha(u)) {
		return lexIdentifier(index - begin);
	}

	return unexpected();
}
316
/// Lex an identifier whose non empty prefix s has already been consumed.
auto lexIdentifier(string s)() if (s != "") {
	enum uint consumed = s.length;
	return lexIdentifier(consumed);
}
320
/**
 * Lex the remainder of an identifier.
 *
 * Params:
 *   prefixLength = number of chars of the identifier that have already
 *                  been consumed; the token starts that far behind index.
 *
 * Returns: an Identifier token whose name is the interned identifier text.
 */
auto lexIdentifier(uint prefixLength) in {
	assert(prefixLength > 0);
	assert(index >= prefixLength);
} do {
	Token t;
	t.type = TokenType.Identifier;
	immutable begin = index - prefixLength;

	while (true) {
		// Fast path for ASCII identifier chars.
		while (isIdChar(frontChar)) {
			popChar();
		}

		// An ASCII char that is not an identifier char ends the identifier.
		// Only bytes with the high bit set need to be decoded.
		if (!(frontChar & 0x80)) {
			break;
		}

		// XXX: Dafuq does this need to be a size_t ?
		size_t i = index;

		import std.utf;
		auto u = content.decode(i);

		import std.uni;
		if (!isAlpha(u)) {
			// Not part of the identifier: leave it unconsumed.
			break;
		}

		index = cast(uint) i;
	}

	t.location = base.getWithOffsets(begin, index);
	t.name = context.getName(content[begin .. index]);

	return t;
}
357
/**
 * Lex an escape sequence, the backslash having already been consumed, and
 * append the char(s) it denotes to decoded.
 *
 * Returns: false when the sequence is not recognized or when a \u / \U
 * sequence does not have enough hexadecimal digits.
 */
bool lexEscapeSequence(ref string decoded) {
	char c = frontChar;

	switch (c) {
		case '\'', '"', '\\':
			// The char stands for itself.
			break;

		case '?':
			assert(0, "WTF is \\?");

		case '0':
			c = '\0';
			break;

		case 'a':
			c = '\a';
			break;

		case 'b':
			c = '\b';
			break;

		case 'f':
			c = '\f';
			break;

		case 'r':
			c = '\r';
			break;

		case 'n':
			c = '\n';
			break;

		case 't':
			c = '\t';
			break;

		case 'v':
			c = '\v';
			break;

		case 'u', 'U':
			popChar();

			// \u takes 4 hexadecimal digits, \U takes 8.
			immutable digitCount = (c == 'U') ? 8 : 4;

			uint v = 0;
			foreach (position; 0 .. digitCount) {
				immutable char digit = frontChar;

				// Either a decimal digit, or a letter taken case insensitively.
				uint asDecimal = digit - '0';
				uint asLetter = ((digit | 0x20) - 'a') & 0xff;
				uint nibble = (asDecimal < 10) ? asDecimal : (asLetter + 10);

				if (nibble >= 16) {
					return false;
				}

				// Digits come most significant first.
				v = (v << 4) | nibble;
				popChar();
			}

			char[4] buf;

			import std.utf;
			auto i = encode(buf, v);

			decoded ~= buf[0 .. i];
			return true;

		case '&':
			assert(0, "HTML5 named character references not implemented");

		default:
			return false;
	}

	// Single char escapes: consume the char and emit its value.
	popChar();
	decoded ~= c;
	return true;
}
441
/**
 * Lex a string in which escape sequences are not interpreted, the opening
 * delimiter having already been consumed.
 *
 * Params:
 *   begin = offset at which the token starts.
 *
 * Returns: a StringLiteral token, or an Invalid token when the end of the
 * input is reached before the closing delimiter.
 */
Token lexRawString(char Delimiter = '`')(uint begin) {
	Token t;
	t.type = TokenType.StringLiteral;

	immutable size_t start = index;

	// Look for the closing delimiter, or the end of the input.
	while (frontChar != Delimiter && frontChar != '\0') {
		popChar();
	}

	if (frontChar == '\0') {
		setError(t, "Unexpected end of file");
		t.location = base.getWithOffsets(begin, index);
		return t;
	}

	if (decodeStrings) {
		t.name = context.getName(content[start .. index]);
	}

	// Consume the closing delimiter.
	popChar();

	t.location = base.getWithOffsets(begin, index);
	return t;
}
470
/// Lex a backquote delimited raw string; the opening "`" is already consumed.
Token lexString(string s : "`")() {
	return lexRawString!('`')(cast(uint) (index - s.length));
}
475
/// Lex a single quote delimited raw string; the opening "'" is already consumed.
Token lexString(string s : "'")() {
	return lexRawString!('\'')(cast(uint) (index - s.length));
}
480
/**
 * Lex a literal in which escape sequences are interpreted, the opening
 * delimiter having already been consumed.
 *
 * Params:
 *   Delimiter = the closing delimiter to look for.
 *   TT = the type of the token to produce.
 *   begin = offset at which the token starts.
 *
 * Returns: a token of type TT. When decodeStrings is set, its name is the
 * interned decoded content. An Invalid token is returned on an invalid
 * escape sequence (only detected when decodeStrings is set) or when the end
 * of the input is reached before the closing delimiter.
 */
Token lexDecodedString(char Delimiter = '"', TokenType TT = TokenType.StringLiteral)(uint begin) {
	Token t;
	t.type = TT;

	// Start of the run of chars that has not been copied into decoded yet.
	size_t start = index;
	string decoded;

	auto c = frontChar;
	while (c != Delimiter && c != '\0') {
		if (c == '\\') {
			immutable beginEscape = index;

			if (decodeStrings) {
				// Whenever this scope is left without throwing, including
				// via the continue below, the next run starts right after
				// whatever was consumed.
				scope(success) {
					start = index;
				}

				// Flush the chars preceding the escape sequence.
				// Workaround for https://issues.dlang.org/show_bug.cgi?id=22271
				if (decoded == "") {
					decoded = content[start .. index];
				} else {
					decoded ~= content[start .. index];
				}

				// Consume the backslash, then decode what follows.
				popChar();
				if (!lexEscapeSequence(decoded)) {
					t.location = base.getWithOffsets(beginEscape, index);
					setError(t, "Invalid escape sequence");
					return t;
				}

				c = frontChar;
				continue;
			}

			// Not decoding: consume the backslash here; the escaped char is
			// consumed below, so that an escaped delimiter does not end the
			// literal.
			popChar();
			c = frontChar;
		}

		popChar();
		c = frontChar;
	}

	if (c == '\0') {
		setError(t, "Unexpected end of file");
		t.location = base.getWithOffsets(begin, index);
		return t;
	}

	if (decodeStrings) {
		// Flush the trailing run of chars.
		// Workaround for https://issues.dlang.org/show_bug.cgi?id=22271
		if (decoded == "") {
			decoded = content[start .. index];
		} else {
			decoded ~= content[start .. index];
		}

		t.name = context.getName(decoded);
	}

	// Consume the closing delimiter.
	popChar();

	t.location = base.getWithOffsets(begin, index);
	return t;
}
546
/// Lex a double quoted string; the opening quote is already consumed.
Token lexString(string s : `"`)() {
	return lexDecodedString!('"')(cast(uint) (index - s.length));
}
551
/// Lex a character literal; the opening single quote is already consumed.
Token lexCharacter(string s : `'`)() {
	immutable start = cast(uint) (index - s.length);
	return lexDecodedString!('\'', TokenType.CharacterLiteral)(start);
}
556
557 /**
558 * General integer lexing utilities.
559 */
/**
 * Consume the optional suffix of an integer literal and build its token.
 *
 * Accepted suffixes, case insensitively: u, l, ul, lu and f. The f suffix
 * turns the literal into a FloatLiteral.
 *
 * Params:
 *   begin = offset at which the literal starts.
 */
Token lexIntegralSuffix(uint begin) {
	Token t;
	t.type = TokenType.IntegerLiteral;

	// Setting bit 5 folds ASCII uppercase letters onto lowercase ones.
	immutable lowered = frontChar | 0x20;
	if (lowered == 'u') {
		popChar();

		// Optional l after u.
		if ((frontChar | 0x20) == 'l') {
			popChar();
		}
	} else if (lowered == 'l') {
		popChar();

		// Optional u after l.
		if ((frontChar | 0x20) == 'u') {
			popChar();
		}
	} else if (lowered == 'f') {
		popChar();
		t.type = TokenType.FloatLiteral;
	}

	t.location = base.getWithOffsets(begin, index);
	return t;
}
599
/**
 * Consume the optional suffix (f, F, l or L) of a floating point literal
 * and build its token.
 *
 * A lowercase l is consumed but yields an Invalid token.
 *
 * Params:
 *   begin = offset at which the literal starts.
 */
Token lexFloatSuffix(uint begin) {
	immutable suffix = frontChar;

	switch (suffix | 0x20) {
		case 'f', 'l':
			popChar();
			break;

		default:
			break;
	}

	Token t;
	t.location = base.getWithOffsets(begin, index);

	// l is an error for some unexplainable reason.
	if (suffix != 'l') {
		t.type = TokenType.FloatLiteral;
		return t;
	}

	setError(t, "Use 'L' suffix instead of 'l'");
	return t;
}
620
/**
 * Lex a numeric literal that may turn out to be an integer or a float.
 *
 * Params:
 *   isFun = predicate telling whether a char is a digit in the literal's base.
 *   popFun = consumes a run of digits (and '_') in the literal's base.
 *   E = lowercase letter introducing the exponent.
 *   begin = offset at which the literal starts.
 *
 * Returns: the result of lexFloatSuffix when a fractional part or an
 * exponent was found, of lexIntegralSuffix otherwise.
 */
Token lexFloatLiteral(alias isFun, alias popFun, char E)(uint begin) {
	// Integral part.
	popFun();

	bool isFloat = false;
	if (frontChar == '.') {
		auto savePoint = index;

		popChar();
		if (frontChar == '.') {
			// Two dots in a row: the first one is not a decimal point.
			// Rewind to it and lex what we have as an integer.
			index = savePoint;
			goto LexSuffix;
		}

		auto floatSavePoint = index;

		popSkippableChars();

		if (wantIdentifier(frontChar)) {
			// The dot is followed, possibly after skippable chars, by
			// something that starts an identifier: the dot is not part of
			// the literal. Rewind to it and lex what we have as an integer.
			index = savePoint;
			goto LexSuffix;
		}

		// The skippable chars were only consumed to look ahead.
		index = floatSavePoint;
		isFloat = true;

		// Fractional part, if any.
		if (isFun(frontChar)) {
			popChar();
			popFun();
		}
	}

	// Exponent, matched case insensitively, with an optional sign.
	if ((frontChar | 0x20) == E) {
		isFloat = true;
		popChar();

		auto c = frontChar;
		if (c == '+' || c == '-') {
			popChar();
		}

		popFun();
	}

LexSuffix:
	return isFloat ? lexFloatSuffix(begin) : lexIntegralSuffix(begin);
}
667
668 /**
669 * Binary literals.
670 */
/// Whether c is a binary digit.
static bool isBinary(char c) {
	return '0' <= c && c <= '1';
}
674
/// Consume a run of binary digits and '_' separators.
void popBinary() {
	while (isBinary(frontChar) || frontChar == '_') {
		popChar();
	}
}
682
/// The uppercase "0B" prefix is handled exactly like "0b".
Token lexNumeric(string s : "0B")() {
	alias lexLowercasePrefix = lexNumeric!"0b";
	return lexLowercasePrefix();
}
686
/**
 * Lex a binary literal, its "0b" prefix having already been consumed.
 * At least one binary digit is required after optional leading '_'.
 */
Token lexNumeric(string s : "0b")() {
	immutable uint begin = index - 2;

	// Leading separators.
	while (frontChar == '_') {
		popChar();
	}

	if (isBinary(frontChar)) {
		popBinary();
		return lexIntegralSuffix(begin);
	}

	Token t;
	t.location = base.getWithOffsets(begin, index);
	setError(t, "Invalid binary sequence");
	return t;
}
704
705 /**
706 * Hexadecimal literals.
707 */
/// Whether c is an hexadecimal digit, letters being case insensitive.
static bool isHexadecimal(char c) {
	if ('0' <= c && c <= '9') {
		return true;
	}

	// Setting bit 5 folds ASCII uppercase letters onto lowercase ones.
	const lowered = c | 0x20;
	return 'a' <= lowered && lowered <= 'f';
}
712
/// Consume a run of hexadecimal digits and '_' separators.
void popHexadecimal() {
	while (isHexadecimal(frontChar) || frontChar == '_') {
		popChar();
	}
}
720
/// The uppercase "0X" prefix is handled exactly like "0x".
Token lexNumeric(string s : "0X")() {
	alias lexLowercasePrefix = lexNumeric!"0x";
	return lexLowercasePrefix();
}
724
/**
 * Lex an hexadecimal literal, its "0x" prefix having already been consumed.
 * At least one hexadecimal digit is required after optional leading '_'.
 * The exponent, if any, is introduced by 'p'.
 */
Token lexNumeric(string s : "0x")() {
	immutable uint begin = index - 2;

	// Leading separators.
	while (frontChar == '_') {
		popChar();
	}

	if (isHexadecimal(frontChar)) {
		return lexFloatLiteral!(isHexadecimal, popHexadecimal, 'p')(begin);
	}

	Token t;
	t.location = base.getWithOffsets(begin, index);
	setError(t, "Invalid hexadecimal sequence");
	return t;
}
741
742 /**
743 * Decimal literals.
744 */
/// Whether c is a decimal digit.
static bool isDecimal(char c) {
	return '0' <= c && c <= '9';
}
748
/// Consume a run of decimal digits and '_' separators.
void popDecimal() {
	while (isDecimal(frontChar) || frontChar == '_') {
		popChar();
	}
}
756
/// Lex a decimal literal whose first digit s has already been consumed.
auto lexNumeric(string s)() if (s.length == 1 && isDecimal(s[0])) {
	enum char firstDigit = s[0];
	return lexNumeric(firstDigit);
}
760
/**
 * Lex a decimal literal whose first digit c has already been consumed.
 * The exponent, if any, is introduced by 'e'.
 */
auto lexNumeric(char c) in {
	assert(isDecimal(c));
} do {
	// The literal starts at the digit that was just consumed.
	immutable uint begin = index - 1;
	return lexFloatLiteral!(isDecimal, popDecimal, 'e')(begin);
}
766
767 /**
768 * Keywords and identifiers.
769 */
/**
 * Build the token for keyword s, which has just been consumed, unless it is
 * followed by something that continues an identifier, in which case the
 * whole thing is lexed as an identifier instead.
 */
auto lexKeyword(string s)() {
	enum Type = KeywordMap[s];

	immutable c = frontChar;

	// The keyword is only the beginning of a longer ASCII identifier.
	if (isIdChar(c)) {
		popChar();
		return lexIdentifier(s.length + 1);
	}

	// Same thing when the next code point is a non ASCII letter.
	if (c & 0x80) {
		size_t next = index;

		import std.utf;
		auto u = content.decode(next);

		import std.uni;
		if (isAlpha(u)) {
			auto l = cast(ubyte) (next - index);
			index += l;
			return lexIdentifier(s.length + l);
		}
	}

	enum uint keywordLength = s.length;

	Token t;
	t.type = Type;
	t.location = base.getWithOffsets(index - keywordLength, index);

	import source.name;
	t.name = BuiltinName!s;

	return t;
}
804
/// Build the token for operator s, which has just been consumed.
auto lexOperator(string s)() {
	enum Type = OperatorMap[s];
	enum uint operatorLength = s.length;

	import source.name;

	Token t;
	t.type = Type;
	t.location = base.getWithOffsets(index - operatorLength, index);
	t.name = BuiltinName!s;

	return t;
}
819 }
820
/// Range primitive over strings: the first char (code unit) of s.
@property
char front(string s) {
	immutable first = s[0];
	return first;
}
825
/// Range primitive over strings: drop the first char (code unit) of s.
void popFront(ref string s) {
	s = s[1 .. s.length];
}
829
/// Whether c is an ASCII char allowed inside an identifier: '_', a letter or a digit.
auto isIdChar(char c) {
	import std.ascii : isAlphaNum;

	if (c == '_') {
		return true;
	}

	return isAlphaNum(c);
}
834
/**
 * Generate the source code of a lexer, meant to be mixed in a loop inside a
 * type providing frontChar, popChar and unpopChar.
 *
 * The generated code is a tree of nested `switch(frontChar)` statements,
 * one level per char of the keys in ids.
 *
 * Params:
 *   ids = maps the char sequences to recognize to the action to take once
 *         they have been consumed. An action is one of:
 *          - "fun": generates `return fun!(sequence)();`
 *          - "-fun": generates `fun!(sequence)(); continue;`
 *          - "?cond:lexFun|skipFun": generates the first form with lexFun
 *            when cond is true, the second form with skipFun otherwise.
 *         The empty key, if present, replaces def at the top level.
 *   def = action used when no key matches.
 *
 * Returns: the generated code.
 */
string lexerMixin(string[string] ids, string def = "lexIdentifier") {
	return lexerMixin(ids, def, "");
}

private:

/// Turn s into the source code of a D expression evaluating to s.
auto stringify(string s) {
	import std.array;
	// Backquoted strings cannot contain a backquote, so splice these, as
	// well as null chars, in as separate double quoted strings.
	return "`" ~ s.replace("`", "` ~ \"`\" ~ `").replace("\0", "` ~ \"\\0\" ~ `") ~ "`";
}

/**
 * Generate the code running action fun once the sequence base has been
 * consumed. See lexerMixin for the syntax of actions.
 */
auto getLexingCode(string fun, string base) {
	auto args = "!(" ~ stringify(base) ~ ")()";

	switch (fun[0]) {
		case '-':
			// Run the action, then go around the enclosing loop again.
			return "\n" ~ fun[1 .. $] ~ args ~ ";\ncontinue;";

		case '?':
			// Format: ?cond:lexCmd|skipCmd
			size_t i = 1;
			while (fun[i] != ':') {
				i++;
			}

			size_t endcond = i;
			while (fun[i] != '|') {
				i++;
			}

			auto cond = fun[1 .. endcond];
			auto lexCmd = fun[endcond + 1 .. i];
			auto skipCmd = fun[i + 1 .. $];

			return "\nif (" ~ cond ~ ") {\nreturn " ~ lexCmd ~ args
				~ ";\n} else {\n" ~ skipCmd ~ args ~ ";\ncontinue;\n}";

		default:
			return "\nreturn " ~ fun ~ args ~ ";";
	}
}

/**
 * Source code of the body of a char literal denoting c, i.e. what goes
 * between the single quotes.
 */
string toCharLiteralBody(char c) {
	static char toHexChar(ubyte n) {
		return ((n < 10) ? (n + '0') : (n - 10 + 'a')) & 0xff;
	}

	static string toHexString(ubyte c) {
		return [toHexChar(c >> 4), toHexChar(c & 0x0f)];
	}

	switch (c) {
		case '\0':
			return "\\0";

		case '\'':
			return "\\'";

		case '\\':
			// A lone backslash would escape the closing quote.
			return "\\\\";

		case '\t':
			return "\\t";

		case '\v':
			return "\\v";

		case '\f':
			return "\\f";

		case '\n':
			return "\\n";

		case '\r':
			return "\\r";

		default:
			if (c < 0x80) {
				return [c];
			}

			// Non ASCII code units are emitted as hexadecimal escapes.
			return "\\x" ~ toHexString(c);
	}
}

/**
 * Recursive worker behind the public lexerMixin.
 *
 * Params:
 *   ids = actions keyed by what remains of each sequence once base has
 *         been consumed.
 *   def = default action, passed down unchanged to the nested levels.
 *   base = the chars consumed so far to reach this level.
 */
string lexerMixin(string[string] ids, string def, string base) {
	auto defaultFun = def;

	// Group the sequences by first char; the empty sequence, if any, is
	// what to do when nothing longer matches at this level.
	string[string][char] nextLevel;
	foreach (id, fun; ids) {
		if (id == "") {
			defaultFun = fun;
		} else {
			nextLevel[id[0]][id[1 .. $]] = fun;
		}
	}

	auto ret = "\nswitch(frontChar) {";

	foreach (c, subids; nextLevel) {
		ret ~= "\ncase '" ~ toCharLiteralBody(c) ~ "':\npopChar();";

		auto newBase = base ~ c;
		if (subids.length == 1) {
			// Nothing longer can match: run the action right away.
			if (auto cdef = "" in subids) {
				ret ~= getLexingCode(*cdef, newBase);
				continue;
			}
		}

		ret ~= lexerMixin(subids, def, newBase);
	}

	if (base == "" || base[$ - 1] < 0x80) {
		ret ~= "\ndefault:" ~ getLexingCode(defaultFun, base) ~ "\n}\n";
	} else {
		// The last char consumed is part of a multi-byte sequence: rewind
		// it and let the enclosing switch handle the default case.
		ret ~= "\ndefault:\n"
			~ "// Do not exit in the middle of an unicode sequence.\n"
			~ "unpopChar();\nbreak;\n}\n\n"
			~ "// Fall back to the default instead.\n"
			~ "goto default;\n";
	}

	return ret;
}