// A JSON lexer assembled from the shared lexer mixins
// (source.lexbase, source.lexnumeric, source.lexstring).
module source.jsonlexer;

import source.context;
import source.location;
enum TokenType {
	Invalid = 0,
	
	Begin,
	End,
	
	// Comments
	Comment,
	
	// Literals
	StringLiteral,
	// JSON has no character literals; single-quoted strings
	// lex to the same token type as double-quoted ones.
	CharacterLiteral = StringLiteral,
	IntegerLiteral,
	FloatLiteral,
	
	// Identifier
	Identifier,
	
	// Keywords
	Null, True, False,
	
	// Operators
	OpenParen,    // (
	CloseParen,   // )
	OpenBracket,  // [
	CloseBracket, // ]
	OpenBrace,    // {
	CloseBrace,   // }
	Comma,        // ,
	Colon,        // :
}

struct Token {
	import source.location;
	Location location;
	
	TokenType type;
	
	import source.name;
	Name name;
	
	import source.context;
	string toString(Context context) {
		// Identifiers and keywords carry an interned name; for
		// everything else, slice the token's text out of the source.
		return (type >= TokenType.Identifier)
			? name.toString(context)
			: location.getFullLocation(context).getSlice();
	}
}

auto lex(Position base, Context context) {
	auto lexer = JsonLexer();
	
	lexer.content = base.getFullPosition(context).getSource().getContent();
	lexer.t.type = TokenType.Begin;
	
	lexer.context = context;
	lexer.base = base;
	lexer.previous = base;
	
	lexer.t.location = Location(base, base.getWithOffset(lexer.index));
	return lexer;
}

struct JsonLexer {
	// Maps token prefixes to the lexing routine that handles them.
	// Values of the form "?flag:ifSet|ifUnset" dispatch on a lexer flag:
	// comments are tokenized via lexComment when tokenizeComments is set,
	// and skipped via popComment otherwise.
	enum BaseMap = () {
		auto ret = [
			// Comments
			"//" : "?tokenizeComments:lexComment|popComment",
			"/*" : "?tokenizeComments:lexComment|popComment",
			"/+" : "?tokenizeComments:lexComment|popComment",
			
			// Integer literals.
			"0b" : "lexNumeric",
			"0B" : "lexNumeric",
			"0x" : "lexNumeric",
			"0X" : "lexNumeric",
			
			// String literals.
			`"` : "lexString",
			"'" : "lexString",
		];
		
		// Every decimal digit starts a numeric literal.
		foreach (i; 0 .. 10) {
			import std.conv;
			ret[to!string(i)] = "lexNumeric";
		}
		
		return ret;
	}();
	
	enum KeywordMap = [
		"null"  : TokenType.Null,
		"true"  : TokenType.True,
		"false" : TokenType.False,
	];
	
	enum OperatorMap = [
		"("  : TokenType.OpenParen,
		")"  : TokenType.CloseParen,
		"["  : TokenType.OpenBracket,
		"]"  : TokenType.CloseBracket,
		"{"  : TokenType.OpenBrace,
		"}"  : TokenType.CloseBrace,
		","  : TokenType.Comma,
		":"  : TokenType.Colon,
		// The NUL sentinel terminating the input produces the End token.
		"\0" : TokenType.End,
	];
	
	import source.lexbase;
	mixin LexBaseImpl!(Token, BaseMap, KeywordMap, OperatorMap);
	
	import source.lexnumeric;
	// JSON literals take no suffixes, so the empty suffix maps
	// straight to the plain integer and float token types.
	mixin LexNumericImpl!(Token, [
		"" : TokenType.IntegerLiteral,
	], [
		"" : TokenType.FloatLiteral,
	]);
	
	import source.lexstring;
	mixin LexStringImpl!(Token, [
		"" : TokenType.StringLiteral,
	]);
}

unittest {
	auto context = new Context();
	
	auto testlexer(string s) {
		auto base = context.registerMixin(Location.init, s ~ '\0');
		return lex(base, context);
	}
	
	import source.parserutil;
	
	{
		auto lex = testlexer("");
		lex.match(TokenType.Begin);
		assert(lex.front.type == TokenType.End);
	}
	
	{
		auto lex = testlexer("null(aa[{]true})false");
		lex.match(TokenType.Begin);
		lex.match(TokenType.Null);
		lex.match(TokenType.OpenParen);
		
		auto t = lex.front;
		assert(t.type == TokenType.Identifier);
		assert(t.toString(context) == "aa");
		
		lex.popFront();
		lex.match(TokenType.OpenBracket);
		lex.match(TokenType.OpenBrace);
		lex.match(TokenType.CloseBracket);
		lex.match(TokenType.True);
		lex.match(TokenType.CloseBrace);
		lex.match(TokenType.CloseParen);
		lex.match(TokenType.False);
		
		assert(lex.front.type == TokenType.End);
	}
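	
	// Numeric literals are wired in through LexNumericImpl above, but the
	// tests don't otherwise exercise them. A minimal sketch, assuming
	// whitespace separates tokens, "42" and "0x2A" lex as IntegerLiteral,
	// and "3.14" as FloatLiteral:
	{
		auto lex = testlexer("42 0x2A 3.14");
		lex.match(TokenType.Begin);
		lex.match(TokenType.IntegerLiteral);
		lex.match(TokenType.IntegerLiteral);
		lex.match(TokenType.FloatLiteral);
		
		assert(lex.front.type == TokenType.End);
	}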
	
	{
		auto lex = testlexer(`"""foobar"'''balibalo'"\""'"'"'"`);
		lex.match(TokenType.Begin);
		
		foreach (expected; [`""`, `"foobar"`, `''`, `'balibalo'`, `"\""`, `'"'`, `"'"`]) {
			auto t = lex.front;
			
			assert(t.type == TokenType.StringLiteral);
			assert(t.toString(context) == expected);
			lex.popFront();
		}
		
		assert(lex.front.type == TokenType.End);
	}
	
	// Check unterminated strings
	{
		auto lex = testlexer(`"`);
		lex.match(TokenType.Begin);
		
		auto t = lex.front;
		assert(t.type == TokenType.Invalid);
	}
	
	{
		auto lex = testlexer(`"\`);
		lex.match(TokenType.Begin);
		
		auto t = lex.front;
		assert(t.type == TokenType.Invalid);
	}
	
	{
		auto lex = testlexer(`'`);
		lex.match(TokenType.Begin);
		
		auto t = lex.front;
		assert(t.type == TokenType.Invalid);
	}
	
	{
		auto lex = testlexer(`'\`);
		lex.match(TokenType.Begin);
		
		auto t = lex.front;
		assert(t.type == TokenType.Invalid);
	}
	
	// Check unicode support
	{
		auto lex = testlexer(`"\U0001F0BD\u0393α\u1FD6\u03B1\U0001FA01🙈🙉🙊\U0001F71A"`);
		lex.match(TokenType.Begin);
		
		auto t = lex.front;
		
		assert(t.type == TokenType.StringLiteral);
		assert(t.name.toString(context) == "🂽Γαῖα🨁🙈🙉🙊🜚");
		lex.popFront();
		
		assert(lex.front.type == TokenType.End);
	}
	
	// Check malformed unicode escape sequences
	{
		auto lex = testlexer(`"\U0001F0B"`);
		lex.match(TokenType.Begin);
		assert(lex.front.type == TokenType.Invalid);
	}
	
	{
		auto lex = testlexer(`"\u039"`);
		lex.match(TokenType.Begin);
		assert(lex.front.type == TokenType.Invalid);
	}
	
	{
		auto lex = testlexer(`"\u039G"`);
		lex.match(TokenType.Begin);
		assert(lex.front.type == TokenType.Invalid);
	}
	
	{
		auto lex = testlexer(`"\u03@3"`);
		lex.match(TokenType.Begin);
		assert(lex.front.type == TokenType.Invalid);
	}
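	
	// Comment handling sketch: BaseMap routes "//", "/*" and "/+" through
	// lexComment or popComment depending on the tokenizeComments flag. This
	// tolerates either default by skipping a Comment token when one shows up.
	{
		auto lex = testlexer("// hello\nnull");
		lex.match(TokenType.Begin);
		
		if (lex.front.type == TokenType.Comment) {
			lex.popFront();
		}
		
		lex.match(TokenType.Null);
		assert(lex.front.type == TokenType.End);
	}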
}