1 module vayne.source.lexer;
2 
3 
import std.array;
import std.ascii;
import std.exception;
import std.format;
import std.string;


import vayne.source.source;
import vayne.source.token;
13 
14 
15 
/// Error raised by the lexer on malformed input (unterminated string or
/// block comment, bad exponent, ...).
class LexerException : Exception {
	/// Position inside the source buffer where the error was detected.
	void* ptr;

	/// Records the offending buffer position alongside the message.
	this(T)(T* ptr, string msg) {
		super(msg);
		this.ptr = cast(void*)ptr;
	}
}
24 
25 
/// Hand-rolled tokenizer for vayne template expressions. Exposes an
/// input-range interface (front/popFront/empty) over Tokens; the last
/// token produced is always Token.Kind.EndOfInput, after which the range
/// becomes empty.
struct Lexer {
	/// Starts lexing `source` at the given location.
	this(Source source, SourceLoc loc) {
		source_ = source;
		empty_ = false;
		loc_ = loc;

		popFront;	// prime current_ so front() is valid immediately
	}

	/// Starts lexing `source` at line 1.
	this(Source source) {
		this(source, SourceLoc(source.id, 1));
	}

	/// Consumes the current token; the range only becomes empty after the
	/// EndOfInput token itself has been consumed.
	void popFront() {
		assert(!empty);

		if (current_.kind == Token.Kind.EndOfInput) {
			empty_ = true;
		} else {
			eat;
		}
	}

	/// The most recently scanned token.
	Token front() const {
		return current_;
	}

	bool empty() const {
		return empty_;
	}

private:
	// Scans the next token from the source buffer into current_.
	void eat() {
		// NOTE(review): assumes source_.buffer is the unconsumed remainder of
		// the input and that assigning source_.ptr persists the new read
		// position — confirm against vayne.source.source.
		auto ptr = source_.buffer.ptr;
		scope (exit) source_.ptr = ptr;
		auto end = source_.end;

		eatSpaces(ptr, end);

		while (ptr != end) {
			auto start = ptr;
			auto ch = *ptr++;

			switch(ch) {
			case '\"':
			case '\'':
			case '`':
				current_ = eatString(ch, ptr, end);
				return;
			case '/':
				// Either a comment ("//..." or "/*...*/") or a separator
				// ("/", "/=").
				if (ptr != end) {
					if (*ptr == ch) {
						eatUntil('\n', ptr, end);
						eatSpaces(ptr, end);
						continue;
					} else if (*ptr == '*') {
						eatBlockComment(ptr, end);
						eatSpaces(ptr, end);
						continue;
					}
				}
				current_ = eatSep(ch, ptr - 1, ptr, end);
				return;
			case '.':
				// ".", ".." or "..." separators, or a float such as ".5".
				if (ptr != end) {
					if (*ptr == '.') {
						++ptr;
						if ((ptr != end) && (*ptr == '.'))
							++ptr;
					} else if (isDigit(*ptr)) {
						current_ = eatNumeric(ch, ptr - 1, ptr, end);
						return;
					}
				}
				current_ = Token(Token.Kind.Separator, start[0..ptr - start], loc_);
				return;
			default:
				if (isAlpha(ch) || (ch == '_')) {
					// Identifier or keyword.
					while ((ptr != end) && (isAlphaNum(*ptr) || (*ptr == '_')))
						++ptr;
					auto length = ptr - start;
					auto name = start[0..length];

					// Keyword lookup, keyed on identifier length to keep the
					// string comparisons cheap.
					switch(length) {
					case 2:
						if (name == "in") {
							current_ = Token(name, Token.KeywordKind.In, 0, loc_);
							return;
						}
						if (name == "as") {
							current_ = Token(name, Token.KeywordKind.As, 0, loc_);
							return;
						}
						break;
					case 3:
						if (name == "def") {
							current_ = Token(name, Token.KeywordKind.Def, 0, loc_);
							return;
						}
						if (name == "set") {
							current_ = Token(name, Token.KeywordKind.Set, 0, loc_);
							return;
						}
						if (name == "pop") {
							current_ = Token(name, Token.KeywordKind.Pop, 0, loc_);
							return;
						}
						break;
					case 4:
						if (name == "null") {
							current_ = Token(name, Token.KeywordKind.Null, 0, loc_);
							return;
						}
						if (name == "true") {
							current_ = Token(name, Token.KeywordKind.True, 0, loc_);
							return;
						}
						if (name == "push") {
							current_ = Token(name, Token.KeywordKind.Push, 0, loc_);
							return;
						}
						break;
					case 5:
						if (name == "false") {
							current_ = Token(name, Token.KeywordKind.False, 0, loc_);
							return;
						}
						if (name == "undef") {
							current_ = Token(name, Token.KeywordKind.Undef, 0, loc_);
							return;
						}
						break;
					default:
						break;
					}
					current_ = Token(Token.Kind.Identifier, start[0..length], loc_);
				} else if (isDigit(ch)) {
					current_ = eatNumeric(ch, ptr - 1, ptr, end);
				} else {
					current_ = eatSep(ch, ptr - 1, ptr, end);
				}
				return;
			}
		}

		// The loop above only falls through once the input is exhausted.
		current_ = Token(Token.Kind.EndOfInput, null, loc_);
	}

	// Skips whitespace, tracking line numbers.
	void eatSpaces(T)(ref T* ptr, ref T* end) {
		while ((ptr != end) && isWhite(*ptr)) {
			if (*ptr == '\n')
				++loc_.line;
			++ptr;
		}
	}

	// Advances ptr up to (not past) the next occurrence of ch, or to end.
	void eatUntil(T)(T ch, ref T* ptr, T* end) {
		while ((ptr != end) && (*ptr != ch)) {
			if (*ptr == '\n')
				++loc_.line;
			++ptr;
		}
	}

	// Skips a (possibly nested) "/* ... */" block comment; on entry ptr points
	// at the '*' of the opening "/*". Throws LexerException on unterminated
	// input.
	void eatBlockComment(T)(ref T* ptr, T* end) {
		auto start = ptr;
		auto opens = 1;

		while (opens) {
			if (ptr == end)
				throw new LexerException(start, "unterminated block comment");

			while((ptr != end) && (*ptr != '/')) {
				if (*ptr == '\n')
					++loc_.line;
				++ptr;
			}

			// fix: the inner loop can stop at end; guard before dereferencing
			if ((ptr != end) && (*ptr == '/')) {
				if (*(ptr - 1) == '*') {
					// "*/" closes one nesting level
					++ptr;
					--opens;
				} else {
					++ptr;
					if ((ptr != end) && (*ptr == '*')) {
						// "/*" opens a nested level
						++ptr;
						++opens;
					}
				}
			}
		}
	}

	// Scans a separator/operator token starting with ch. Depending on the
	// leading character the operator may be doubled ("<<", "&&", ...) and/or
	// take a trailing '=' ("<=", "<<=", "+=", ...).
	Token eatSep(T)(T ch, T* start, ref T* ptr, T* end) {
		assert(!isWhite(ch));

		auto doubleEnabled = false;
		auto equalsEnabled = false;
		auto doubleEquals = false;	// doubled form may itself take '=' (e.g. "<<=")

		if (ptr != end) {
			switch(ch) {
			case '>':
			case '<':
			case '^':
				doubleEquals = true;
				doubleEnabled = true;
				equalsEnabled = true;
				goto default;
			case '+':
			case '-':
			case '|':
			case '&':
				doubleEnabled = true;
				equalsEnabled = true;
				goto default;
			case '*':
			case '/':
			case '%':
			case '!':
			case '~':
			case '=':
				equalsEnabled = true;
				goto default;
			default:
				if (doubleEnabled) {
					if (*ptr == ch) {
						++ptr;
						if (doubleEquals && (ptr != end)) {
							if (*ptr == '=')
								++ptr;
						}
					} else if (equalsEnabled && (*ptr == '=')) {
						++ptr;
					}
				} else if (equalsEnabled) {
					if (*ptr == '=')
						++ptr;
				}
			}
		}

		return Token(Token.Kind.Separator, start[0..ptr - start], loc_);
	}

	// Consumes an optional literal suffix (identifier glued to the literal,
	// e.g. the "f" in "1.0f") and returns its length in bytes.
	size_t eatSuffix(T)(ref T* ptr, T* end) {
		auto start = ptr;

		if (ptr != end) {
			if (isAlpha(*ptr) || (*ptr == '_')) {
				++ptr;
				while ((ptr != end) && (isAlphaNum(*ptr) || (*ptr == '_')))
					++ptr;
			}
		}

		return ptr - start;
	}

	// Scans a decimal or floating-point literal; ch is the already-consumed
	// first character. '\'' is accepted as a digit separator. Throws on a
	// malformed exponent.
	Token eatFloat(T)(T ch, T* start, ref T* ptr, T* end) {
		auto dot = (ch == '.');

		// fix: guard the *(ptr + 1) lookahead — a literal ending in '.' at the
		// end of the buffer previously read one byte past end.
		while ((ptr != end) && ((isDigit(*ptr) || (*ptr == '\'') || (!dot && (*ptr == '.') && (((ptr + 1) == end) || (*(ptr + 1) != '.')))))) {
			if (*ptr == '.')
				dot = true;
			++ptr;
		}

		size_t suffixSize;
		if (ptr != end) {
			if ((*ptr == 'e') || (*ptr == 'E')) {
				++ptr;
				if (ptr != end) {
					if ((*ptr == '-') || (*ptr == '+'))
						++ptr;
					while ((ptr != end) && (isDigit(*ptr)))
						++ptr;
				}
				// The exponent must end in at least one digit ("1e", "1e+"
				// are rejected).
				if (!isDigit(*(ptr - 1)))
					throw new LexerException(ptr - 1, "invalid exponent in floating-point literal");
			}

			suffixSize = eatSuffix(ptr, end);
		}

		return Token(start[0..((ptr - start) - suffixSize)], (dot ? Token.LiteralKind.Float : Token.LiteralKind.Dec), suffixSize, 0, loc_);
	}

	// Scans a numeric literal; ch is the already-consumed first character. A
	// leading '0' may introduce hex ("0x"), binary ("0b"), octal ("0o") or
	// explicit decimal ("0d") prefixes; anything else goes through eatFloat.
	Token eatNumeric(T)(T ch, T* start, ref T* ptr, T* end) {
		if (ch == '0') {
			auto base = Token.LiteralKind.Dec;

			size_t suffixSize;
			if (ptr != end) {
				// fix: every *(ptr + 1) lookahead below is now guarded — a
				// buffer ending in "0x"/"0b"/"0o"/"0d" previously read one
				// byte past end.
				switch (std.ascii.toLower(*ptr)) {
				case 'x':
					if (((ptr + 1) != end) && isHexDigit(*(ptr + 1))) {
						++ptr;
						base = Token.LiteralKind.Hex;
						while ((ptr != end) && (isHexDigit(*ptr)))
							++ptr;
					}
					break;
				case 'b':
					if (((ptr + 1) != end) && ((*(ptr + 1) == '0') || (*(ptr + 1) == '1'))) {
						++ptr;
						while ((ptr != end) && ((*ptr == '0') || (*ptr == '1')))
							++ptr;
						base = Token.LiteralKind.Bin;
					}
					break;
				case 'o':
					if (((ptr + 1) != end) && (*(ptr + 1) >= '0') && (*(ptr + 1) <= '7')) {
						++ptr;
						while ((ptr != end) && (*ptr >= '0') && (*ptr <= '7'))
							++ptr;
						base = Token.LiteralKind.Oct;
					}
					break;
				case '.':
					return eatFloat(ch, start, ptr, end);
				case 'd':
					// "0d123" — explicit decimal; the slice excludes the prefix.
					if (((ptr + 1) != end) && (*(ptr + 1) >= '0') && (*(ptr + 1) <= '9')) {
						start = ptr + 1;
						++ptr;
					}
					goto default;
				default:
					while ((ptr != end) && isDigit(*ptr))
						++ptr;
					break;
				}

				suffixSize = eatSuffix(ptr, end);
			}
			return Token(start[0..((ptr - start) - suffixSize)], base, suffixSize, 0, loc_);
		} else {
			return eatFloat(ch, start, ptr, end);
		}
	}

	// Scans a string or char literal; ch is the already-consumed opening quote
	// and ptr points at the first content character. Backquoted strings are
	// raw (no unescaping). Throws on unterminated literals.
	Token eatString(immutable(char) ch, ref immutable(char)* ptr, immutable(char)* end) {
		auto start = ptr;
		auto needsUnescaping = false;

		while ((ptr != end) && (*ptr != ch)) {
			if (*ptr != '\\') {
				if (*ptr == '\n')
					++loc_.line;
				++ptr;
			} else {
				// Skip the escape sequence so an escaped quote doesn't close
				// the literal.
				++ptr;
				if (ptr != end) {
					needsUnescaping = true;
					++ptr;
				}
			}
		}

		// fix: the loop exits with either ptr == end (unterminated) or *ptr ==
		// ch; the old "*ptr != ch" test dereferenced past end when unterminated.
		if (ptr == end)
			throw new LexerException(start, "unterminated string-literal");

		++ptr;
		size_t flags = 0;
		if (needsUnescaping && (ch != '`'))
			flags |= Token.Flags.NeedsUnescaping;

		auto suffixSize = eatSuffix(ptr, end);

		return Token(start[0..(ptr - start - suffixSize - 1)], (((ch == '\"') || (ch == '`')) ? Token.LiteralKind.String : Token.LiteralKind.Char), suffixSize, flags, loc_);
	}

	bool empty_;		// true once EndOfInput has been consumed
	Token current_;		// token returned by front()
	Source source_;		// input buffer; read position persisted in eat()
	SourceLoc loc_;		// current line, attached to every emitted token
}