module source.lexbase;

/**
 * Generic lexer machinery, designed to be mixed into a concrete lexer.
 *
 * Token is the token type to produce. BaseMap, KeywordMap and OperatorMap
 * map string prefixes to the member function that lexes them; they are
 * compiled into a trie of nested switch statements by lexerMixin (at the
 * bottom of this module), so dispatch happens one character at a time
 * with no runtime table lookup.
 */
mixin template LexBaseImpl(Token, alias BaseMap, alias KeywordMap,
                           alias OperatorMap) {
	// TODO: We shouldn't let consumer play with the internal state of the lexer.
	// Instead, we should provide accessor to useful members.
	// private:

	// The most recently lexed token, returned by front.
	Token t;

	import source.location;
	Position previous;
	Position base;

	// Byte offset of the next character to read within content.
	uint index;

	import std.bitmanip;
	mixin(bitfields!(
		bool, "tokenizeComments", 1,
		bool, "_skipStrings", 1,
		uint, "__derived", 30,
	));

	import source.context;
	Context context;

	// The text being lexed.
	string content;

	alias TokenRange = typeof(this);
	alias TokenType = typeof(Token.init.type);

	/**
	 * Return a copy of this lexer that emits (wc = true) or skips
	 * comment tokens.
	 */
	auto withComments(bool wc = true) {
		auto r = this.save;
		r.tokenizeComments = wc;
		return r;
	}

	@property
	bool decodeStrings() const {
		return !_skipStrings;
	}

	/**
	 * Return a copy of this lexer that decodes (sd = true) or skips
	 * over string literals.
	 */
	auto withStringDecoding(bool sd = true) {
		auto r = this.save;
		r._skipStrings = !sd;
		return r;
	}

	/**
	 * Return a copy of this lexer that:
	 * - skip over comments.
	 * - do not decode strings.
	 */
	auto getLookahead() {
		return withStringDecoding(false).withComments(false);
	}

	@property
	auto front() inout {
		return t;
	}

	void popFront() in {
		assert(front.type != TokenType.End);
	} do {
		previous = t.location.stop;
		t = getNextToken();

		/+
		// Exprerience the token deluge !
		if (t.type != TokenType.End) {
			import util.terminal, std.conv;
			outputCaretDiagnostics(
				t.location.getFullLocation(context),
				to!string(t.type),
			);
		}
		// +/
	}

	/**
	 * Fast forward this lexer to the position of fr, which must be a
	 * lookahead copy of this very lexer that is further along in content.
	 */
	void moveTo(ref TokenRange fr) in {
		assert(base is fr.base);
		assert(context is fr.context);
		assert(content is fr.content);
		assert(index < fr.index);
	} do {
		index = fr.index;
		t = fr.t;
	}

	@property
	auto save() inout {
		return this;
	}

	@property
	bool empty() const {
		return t.type == TokenType.End;
	}

private:
	// Whitespace, including the unicode line and paragraph separators.
	enum Skippable = [" ", "\t", "\v", "\f", "\n", "\r", "\u2028", "\u2029"];

	auto getNextToken() {
		// Build the dispatch map: whitespace is skipped in place while
		// keywords and operators route to their dedicated routines.
		static getLexerMap() {
			auto ret = BaseMap;

			foreach (op; Skippable) {
				ret[op] = "-skip";
			}

			foreach (kw, _; KeywordMap) {
				ret[kw] = "lexKeyword";
			}

			foreach (op, _; OperatorMap) {
				ret[op] = "lexOperator";
			}

			return ret;
		}

		while (true) {
			import source.lexbase;
			// pragma(msg, typeof(this));
			// pragma(msg, lexerMixin(getLexerMap()));
			mixin(lexerMixin(getLexerMap()));
		}

		// Necessary because of https://issues.dlang.org/show_bug.cgi?id=22688
		assert(0);
	}

	/**
	 * Build an Invalid token spanning [begin, index) carrying message.
	 */
	Token getError(uint begin, string message) {
		Token t;
		t.type = TokenType.Invalid;
		t.name = context.getName(message);
		t.location = base.getWithOffsets(begin, index);
		return t;
	}

	void popChar() in {
		assert(index < content.length);
	} do {
		index++;
	}

	void unpopChar() in {
		assert(index > 1);
	} do {
		index--;
	}

	void popSkippableChars() {
		static getLexerMap() {
			string[string] ret;

			foreach (op; Skippable) {
				ret[op] = "-skip";
			}

			return ret;
		}

		while (true) {
			import source.lexbase;
			// pragma(msg, typeof(this));
			// pragma(msg, lexerMixin(getLexerMap(), "__noop"));
			mixin(lexerMixin(getLexerMap(), "skip"));
		}
	}

	@property
	char frontChar() const {
		return content[index];
	}

	auto skip(string s)() {
		// Just skip over whitespace.
	}

	/**
	 * Identifiers.
	 */
	// Can c start an identifier? '_', a letter, or the lead byte of a
	// non-ASCII sequence (validated later via std.uni.isAlpha).
	static wantIdentifier(char c) {
		auto hc = c | 0x20;
		return c == '_' || (c & 0x80) || (hc >= 'a' && hc <= 'z');
	}

	/**
	 * Consume identifier characters and return the number of bytes
	 * consumed (0 when the current character cannot continue one).
	 */
	auto popIdChars() {
		const begin = index;
		while (true) {
			char c = frontChar;

			import std.ascii : isAlphaNum;
			while (c == '_' || isAlphaNum(c)) {
				popChar();
				c = frontChar;
			}

			if (c < 0x80) {
				break;
			}

			// This needs to be a size_t.
			size_t i = index;

			import std.utf;
			auto u = content.decode(i);

			import std.uni : isAlpha;
			if (!isAlpha(u)) {
				break;
			}

			index = cast(uint) i;
		}

		// Was `begin - index`, which wraps around for unsigned values as
		// soon as anything is consumed. Callers only test against zero,
		// but the value must be the actual count of bytes consumed.
		return index - begin;
	}

	auto lexIdentifier(string s : "" = "")() {
		uint begin = index;

		char c = frontChar;
		if (wantIdentifier(c) && popIdChars() > 0) {
			Token t;
			t.type = TokenType.Identifier;
			t.location = base.getWithOffsets(begin, index);
			t.name = context.getName(content[begin .. index]);

			return t;
		}

		// Make sure we don't stay in place.
		// Was `c | 0x80`, which is non zero for every character and sent
		// even '\0' through the decode path, pushing index past the end
		// of content. Only multi byte sequences need the decoder.
		if (c & 0x80) {
			import std.utf;
			size_t i = index;
			content.decode(i);
			index = cast(uint) i;
		} else if (c != '\0') {
			popChar();
		}

		return getError(begin, "Unexpected token.");
	}

	// Entry point used by the generated trie: s has already been consumed.
	auto lexIdentifier(string s)() if (s != "") {
		uint l = s.length;
		return lexIdentifier(index - l);
	}

	// Lex the remainder of an identifier starting at begin.
	auto lexIdentifier(uint begin) {
		popIdChars();

		Token t;
		t.type = TokenType.Identifier;
		t.location = base.getWithOffsets(begin, index);
		t.name = context.getName(content[begin .. index]);

		return t;
	}

	/**
	 * Operators.
	 */
	auto lexOperator(string s)() {
		enum Type = OperatorMap[s];
		uint l = s.length;

		Token t;
		t.type = Type;
		t.location = base.getWithOffsets(index - l, index);
		t.name = BuiltinName!s;

		return t;
	}

	/**
	 * Keywords.
	 */
	auto lexKeyword(string s)() {
		enum Type = KeywordMap[s];
		uint l = s.length;

		return lexKeyword(index - l, Type, BuiltinName!s);
	}

	import source.name;
	auto lexKeyword(uint begin, TokenType type, Name keyword) {
		auto idCharCount = popIdChars();

		Token t;
		t.type = type;
		t.name = keyword;
		t.location = base.getWithOffsets(begin, index);

		if (idCharCount == 0) {
			return t;
		}

		// This is an identifier that happened to start
		// like a keyword.
		t.type = TokenType.Identifier;
		t.name = context.getName(content[begin .. index]);

		return t;
	}

	/**
	 * Utilities to handle literals suffixes.
	 */
	auto lexLiteralSuffix(alias Suffixes,
	                      alias CustomSuffixes = null)(uint begin) {
		const prefixStart = index;
		alias fun = lexLiteralSuffixTpl!Suffixes.fun;

		static getLexerMap() {
			string[string] ret = CustomSuffixes;

			foreach (op, _; Suffixes) {
				ret[op] = "fun";
			}

			return ret;
		}

		while (true) {
			import source.lexbase;
			mixin(lexerMixin(getLexerMap(), "fun", ["begin", "prefixStart"]));
		}
	}

	template lexLiteralSuffixTpl(alias Suffixes) {
		auto fun(string s)(uint begin, uint prefixStart) {
			enum Kind = Suffixes[s];
			auto idCharCount = popIdChars();

			if (idCharCount != 0) {
				// We have something else.
				return getError(
					prefixStart,
					"Invalid suffix: " ~ content[prefixStart .. index],
				);
			}

			Token t;
			t.type = Kind;
			t.location = base.getWithOffsets(begin, index);

			return t;
		}
	}

	/**
	 * Comments.
	 */
	// Consume a comment whose opening delimiter s has already been read;
	// return the offset of the end of the comment.
	// NOTE(review): an unterminated block comment runs popChar past the
	// end of content — presumably content is sentinel terminated; confirm
	// against the consumers.
	uint popComment(string s)() {
		auto c = frontChar;

		static if (s == "//") {
			// TODO: check for unicode line break.
			while (c != '\n' && c != '\r') {
				if (c == 0) {
					return index;
				}

				popChar();
				c = frontChar;
			}

			uint ret = index;

			popChar();
			if (c == '\r') {
				if (frontChar == '\n') {
					popChar();
				}
			}

			return ret;
		} else static if (s == "/*") {
			while (true) {
				while (c != '*') {
					popChar();
					c = frontChar;
				}

				popChar();
				c = frontChar;

				if (c == '/') {
					popChar();
					return index;
				}
			}
		} else static if (s == "/+") {
			// Nesting comments: track the nesting depth in stack.
			uint stack = 0;
			while (true) {
				while (c != '+' && c != '/') {
					popChar();
					c = frontChar;
				}

				auto match = c;
				popChar();
				c = frontChar;

				switch (match) {
					case '+':
						if (c == '/') {
							popChar();
							if (!stack) {
								return index;
							}

							c = frontChar;
							stack--;
						}

						break;

					case '/':
						if (c == '+') {
							popChar();
							c = frontChar;

							stack++;
						}

						break;

					default:
						assert(0, "Unreachable.");
				}
			}
		} else {
			static assert(0, s ~ " isn't a known type of comment.");
		}
	}

	auto lexComment(string s)() {
		Token t;
		t.type = TokenType.Comment;

		uint begin = index - uint(s.length);
		uint end = popComment!s();

		t.location = base.getWithOffsets(begin, end);
		return t;
	}
}

// Make string iterable one char at a time, as used by the trie generator.
@property
char front(string s) {
	return s[0];
}

void popFront(ref string s) {
	s = s[1 .. $];
}

/**
 * Generate the code of a switch based trie dispatching over the keys of
 * ids: each key is matched one character at a time and, on a full match,
 * the associated routine is invoked with rtArgs. def is the fallback
 * routine used when nothing matches.
 */
string lexerMixin(string[string] ids, string def = "lexIdentifier",
                  string[] rtArgs = []) {
	return lexerMixin(ids, def, rtArgs, "");
}

private:

// Quote s as a D token string literal, escaping backquotes and nuls.
auto stringify(string s) {
	import std.array;
	return "`" ~ s.replace("`", "` ~ \"`\" ~ `")
	             .replace("\0", "` ~ \"\\0\" ~ `") ~ "`";
}

/**
 * Generate the statement invoking fun, instantiated with the matched
 * prefix base and called with rtArgs.
 *
 * A leading '-' makes the call continue the scan loop instead of
 * returning. A leading '?' encodes "?cond:lexCmd|skipCmd": return the
 * result of lexCmd when cond holds, otherwise run skipCmd and continue.
 */
auto getLexingCode(string fun, string[] rtArgs, string base) {
	import std.array;
	auto args = "!(" ~ stringify(base) ~ ")(" ~ rtArgs.join(", ") ~ ")";

	switch (fun[0]) {
		case '-':
			return "
				" ~ fun[1 .. $] ~ args ~ ";
				continue;";

		case '?':
			size_t i = 1;
			while (fun[i] != ':') {
				i++;
			}

			size_t endcond = i;
			while (fun[i] != '|') {
				i++;
			}

			auto cond = fun[1 .. endcond];
			auto lexCmd = fun[endcond + 1 .. i];
			auto skipCmd = fun[i + 1 .. $];

			return "
				if (" ~ cond ~ ") {
					return " ~ lexCmd ~ args ~ ";
				} else {
					" ~ skipCmd ~ args ~ ";
					continue;
				}";

		default:
			return "
				return " ~ fun ~ args ~ ";";
	}
}

// Recursive worker: base is the prefix matched so far.
string lexerMixin(string[string] ids, string def, string[] rtArgs,
                  string base) {
	auto defaultFun = def;
	string[string][char] nextLevel;
	foreach (id, fun; ids) {
		if (id == "") {
			defaultFun = fun;
		} else {
			nextLevel[id[0]][id[1 .. $]] = fun;
		}
	}

	auto ret = "
		switch(frontChar) {";

	foreach (c, subids; nextLevel) {
		// TODO: have a real function to handle that.
		string charLit;
		switch (c) {
			case '\0':
				charLit = "\\0";
				break;

			case '\'':
				charLit = "\\'";
				break;

			case '\t':
				charLit = "\\t";
				break;

			case '\v':
				charLit = "\\v";
				break;

			case '\f':
				charLit = "\\f";
				break;

			case '\n':
				charLit = "\\n";
				break;

			case '\r':
				charLit = "\\r";
				break;

			default:
				if (c < 0x80) {
					charLit = [c];
					break;
				}

				// Escape non-ASCII bytes as hex literals.
				static char toHexChar(ubyte n) {
					return ((n < 10) ? (n + '0') : (n - 10 + 'a')) & 0xff;
				}

				static string toHexString(ubyte c) {
					return [toHexChar(c >> 4), toHexChar(c & 0x0f)];
				}

				charLit = "\\x" ~ toHexString(c);
				break;
		}

		ret ~= "
			case '" ~ charLit ~ "':
				popChar();";

		auto newBase = base ~ c;
		if (subids.length == 1) {
			if (auto cdef = "" in subids) {
				// Leaf: emit the action directly, no inner switch.
				ret ~= getLexingCode(*cdef, rtArgs, newBase);
				continue;
			}
		}

		ret ~= lexerMixin(nextLevel[c], def, rtArgs, newBase);
	}

	if (base == "" || base[$ - 1] < 0x80) {
		ret ~= "
			default:" ~ getLexingCode(defaultFun, rtArgs, base) ~ "
		}
		";
	} else {
		ret ~= "
			default:
				// Do not exit in the middle of an unicode sequence.
				unpopChar();
				break;
		}

		// Fall back to the default instead.
		goto default;
		";
	}

	return ret;
}