1 module vayne.source.lexer;
2 
3 
4 import std.array;
5 import std.ascii;
6 import std.exception;
7 import std.format;
8 import std.string;
9 
10 
11 import vayne.source.source;
12 import vayne.source.token;
13 
14 
15 
/// Raised when the lexer encounters malformed input.  The `ptr` member
/// records where in the source buffer the error was detected, so callers
/// can map the failure back to a location.
class LexerException : Exception {
	this(U)(U* where, string msg) {
		super(msg);
		ptr = cast(void*)where;
	}

	/// Position in the source buffer at which lexing failed.
	void* ptr;
}
24 
25 
/++
 + Hand-written scanner that turns the contents of a `Source` buffer into
 + `Token`s, exposed as an input range (`front`/`popFront`/`empty`).  The
 + last token produced is `Token.Kind.EndOfInput`; consuming it empties the
 + range.
 +
 + The lexer walks raw `immutable(char)*` pointers into `source_.buffer` and
 + mirrors its progress back into `source_.ptr` (even on error), so callers
 + can see how far lexing got.
 +/
struct Lexer {
	/// Begin lexing `source` at an explicit location (e.g. resuming mid-file).
	this(Source source, SourceLoc loc) {
		source_ = source;
		empty_ = false;
		loc_ = loc;

		popFront;
	}

	/// Begin lexing `source` from line 1.
	this(Source source) {
		this(source, SourceLoc(source.id, 1));
	}

	/// Advance to the next token; consuming EndOfInput empties the range.
	void popFront() {
		assert(!empty);

		if (current_.kind == Token.Kind.EndOfInput) {
			empty_ = true;
		} else {
			eat;
		}
	}

	/// The most recently scanned token.
	Token front() const {
		return current_;
	}

	/// True once EndOfInput has been consumed via popFront.
	bool empty() const {
		return empty_;
	}

private:
	// Scan the next token into current_, skipping whitespace and comments.
	void eat() {
		auto ptr = source_.buffer.ptr;
		scope (exit) source_.ptr = ptr;	// record progress even when we throw
		auto end = source_.end;

		eatSpaces(ptr, end);

		while (ptr != end) {
			auto start = ptr;
			auto ch = *ptr++;

			switch(ch) {
			case '\"':
			case '\'':
			case '`':
				current_ = eatString(ch, ptr, end);
				return;
			case '/':
				if (ptr != end) {
					if (*ptr == ch) {
						// "//" line comment
						eatUntil('\n', ptr, end);
						eatSpaces(ptr, end);
						continue;
					} else if (*ptr == '*') {
						// "/*" block comment (may nest)
						eatBlockComment(ptr, end);
						eatSpaces(ptr, end);
						continue;
					}
				}
				current_ = eatSep(ch, ptr - 1, ptr, end);
				return;
			case '.':
				if (ptr != end) {
					if (*ptr == '.') {
						// ".." or "..." separator
						++ptr;
						if ((ptr != end) && (*ptr == '.'))
							++ptr;
					} else if (isDigit(*ptr)) {
						// ".5"-style floating-point literal
						current_ = eatNumeric(ch, ptr - 1, ptr, end);
						return;
					}
				}
				current_ = Token(Token.Kind.Separator, start[0..ptr - start], loc_);
				return;
			default:
				if (isAlpha(ch) || (ch == '_')) {
					while ((ptr != end) && (isAlphaNum(*ptr) || (*ptr == '_')))
						++ptr;
					auto length = ptr - start;
					auto name = start[0..length];

					// keyword lookup, dispatched on length to keep comparisons cheap
					switch(length) {
					case 2:
						if (name == "in") {
							current_ = Token(name, Token.KeywordKind.In, 0, loc_);
							return;
						}
						if (name == "as") {
							current_ = Token(name, Token.KeywordKind.As, 0, loc_);
							return;
						}
						break;
					case 3:
						if (name == "def") {
							current_ = Token(name, Token.KeywordKind.Def, 0, loc_);
							return;
						}
						break;
					case 4:
						if (name == "null") {
							current_ = Token(name, Token.KeywordKind.Null, 0, loc_);
							return;
						}
						if (name == "true") {
							current_ = Token(name, Token.KeywordKind.True, 0, loc_);
							return;
						}
						break;
					case 5:
						if (name == "false") {
							current_ = Token(name, Token.KeywordKind.False, 0, loc_);
							return;
						}
						if (name == "undef") {
							current_ = Token(name, Token.KeywordKind.Undef, 0, loc_);
							return;
						}
						break;
					default:
						break;
					}
					current_ = Token(Token.Kind.Identifier, start[0..length], loc_);
				} else if (isDigit(ch)) {
					current_ = eatNumeric(ch, ptr - 1, ptr, end);
				} else {
					current_ = eatSep(ch, ptr - 1, ptr, end);
				}
				return;
			}
		}

		if (ptr >= end)
			current_ = Token(Token.Kind.EndOfInput, null, loc_);
	}

	// Skip whitespace, counting newlines into loc_.line.
	void eatSpaces(T)(ref T* ptr, ref T* end) {
		while ((ptr != end) && isWhite(*ptr)) {
			if (*ptr == '\n')
				++loc_.line;
			++ptr;
		}
	}

	// Advance ptr up to (not past) the next occurrence of ch, counting newlines.
	void eatUntil(T)(T ch, ref T* ptr, T* end) {
		while ((ptr != end) && (*ptr != ch)) {
			if (*ptr == '\n')
				++loc_.line;
			++ptr;
		}
	}

	// Consume a (possibly nested) block comment.  On entry ptr points at the
	// '*' of the opening "/*"; on exit it points just past the closing "*/".
	// Throws if the comment is still open at end-of-input.  "*/" and "/*" are
	// consumed as two-character units, so a '*' can never serve as part of
	// both the opening and the closing marker (i.e. "/*/" does not close).
	void eatBlockComment(T)(ref T* ptr, T* end) {
		auto start = ptr;
		auto opens = 1;

		++ptr;	// step over the opening '*'

		while (opens) {
			if (ptr == end)
				throw new LexerException(start, "unterminated block comment");

			auto ch = *ptr++;
			if (ch == '\n') {
				++loc_.line;
			} else if ((ch == '*') && (ptr != end) && (*ptr == '/')) {
				++ptr;
				--opens;	// "*/" closes one level
			} else if ((ch == '/') && (ptr != end) && (*ptr == '*')) {
				++ptr;
				++opens;	// "/*" opens a nested level
			}
		}
	}

	// Scan a separator/operator whose first character ch has already been
	// consumed (start points at it).  Greedily matches doubled forms ("<<",
	// "&&"), '=' suffixes ("+=", "!=") and doubled-plus-equals ("<<="),
	// leaving ptr just past the longest match.
	Token eatSep(T)(T ch, T* start, ref T* ptr, T* end) {
		assert(!isWhite(ch));

		auto doubleEnabled = false;	// "cc" is an operator (e.g. "<<", "&&")
		auto equalsEnabled = false;	// "c=" is an operator (e.g. "+=", "!=")
		auto doubleEquals = false;	// "cc=" is an operator (e.g. "<<=")

		if (ptr != end) {
			switch(ch) {
			case '>':
			case '<':
			case '^':
				doubleEquals = true;
				doubleEnabled = true;
				equalsEnabled = true;
				goto default;
			case '+':
			case '-':
			case '|':
			case '&':
				doubleEnabled = true;
				equalsEnabled = true;
				goto default;
			case '*':
			case '/':
			case '%':
			case '!':
			case '~':
			case '=':
				equalsEnabled = true;
				goto default;
			default:
				if (doubleEnabled) {
					if (*ptr == ch) {
						++ptr;
						if (doubleEquals && (ptr != end)) {
							assert(doubleEnabled);
							if (*ptr == '=')
								++ptr;
						}
					} else if (equalsEnabled && (*ptr == '=')) {
						++ptr;
					}
				} else if (equalsEnabled) {
					if (*ptr == '=')
						++ptr;
				}
			}
		}

		return Token(Token.Kind.Separator, start[0..ptr - start], loc_);
	}

	// Consume an identifier-shaped literal suffix, returning its length.
	size_t eatSuffix(T)(ref T* ptr, T* end) {
		auto start = ptr;

		if (ptr != end) {
			if (isAlpha(*ptr) || (*ptr == '_')) {
				++ptr;
				while ((ptr != end) && (isAlphaNum(*ptr) || (*ptr == '_')))
					++ptr;
			}
		}

		return ptr - start;
	}

	// Scan the rest of a decimal or floating-point literal; ch is its first
	// character (a digit or '.') and start points at it.
	Token eatFloat(T)(T ch, T* start, ref T* ptr, T* end) {
		auto dot = (ch == '.');

		// Digits and '\'' group separators; a single '.' switches to float,
		// while ".." is left for the separator scanner.  The (ptr + 1) == end
		// test keeps the look-ahead inside the buffer (e.g. input ending "1.").
		while ((ptr != end) && (isDigit(*ptr) || (*ptr == '\'') || (!dot && (*ptr == '.') && (((ptr + 1) == end) || (*(ptr + 1) != '.'))))) {
			if (*ptr == '.')
				dot = true;
			++ptr;
		}

		size_t suffixSize;
		if (ptr != end) {
			if ((*ptr == 'e') || (*ptr == 'E')) {
				++ptr;
				if (ptr != end) {
					if ((*ptr == '-') || (*ptr == '+'))
						++ptr;
					while ((ptr != end) && (isDigit(*ptr)))
						++ptr;
				}
				// ptr > start here (the exponent marker was consumed), so
				// ptr - 1 is always in bounds; a non-digit there means the
				// exponent had no digits.
				if (!isDigit(*(ptr - 1)))
					throw new LexerException(ptr - 1, "invalid exponent in floating-point literal");
			}

			suffixSize = eatSuffix(ptr, end);
		}

		return Token(start[0..((ptr - start) - suffixSize)], (dot ? Token.LiteralKind.Float : Token.LiteralKind.Dec), suffixSize, 0, loc_);
	}

	// Scan a numeric literal whose first character ch has been consumed.  A
	// leading '0' dispatches on the base prefix (0x/0b/0o/0d); anything else
	// goes through eatFloat.
	Token eatNumeric(T)(T ch, T* start, ref T* ptr, T* end) {
		if (ch == '0') {
			auto base = Token.LiteralKind.Dec;

			size_t suffixSize;
			if (ptr != end) {
				// Every *(ptr + 1) look-ahead is bounds-guarded so "0x", "0b",
				// "0o" or "0d" at the very end of the buffer cannot read past it.
				switch (std.ascii.toLower(*ptr)) {
				case 'x':
					if (((ptr + 1) != end) && isHexDigit(*(ptr + 1))) {
						++ptr;
						base = Token.LiteralKind.Hex;
						while ((ptr != end) && (isHexDigit(*ptr)))
							++ptr;
					}
					break;
				case 'b':
					if (((ptr + 1) != end) && ((*(ptr + 1) == '0') || (*(ptr + 1) == '1'))) {
						++ptr;
						while ((ptr != end) && ((*ptr == '0') || (*ptr == '1')))
							++ptr;
						base = Token.LiteralKind.Bin;
					}
					break;
				case 'o':
					if (((ptr + 1) != end) && (*(ptr + 1) >= '0') && (*(ptr + 1) <= '7')) {
						++ptr;
						while ((ptr != end) && (*ptr >= '0') && (*ptr <= '7'))
							++ptr;
						base = Token.LiteralKind.Oct;
					}
					break;
				case '.':
					return eatFloat(ch, start, ptr, end);
				case 'd':
					// explicit decimal prefix "0d..."; the prefix itself is
					// excluded from the token text
					if (((ptr + 1) != end) && (*(ptr + 1) >= '0') && (*(ptr + 1) <= '9')) {
						start = ptr + 1;
						++ptr;
					}
					goto default;
				default:
					while ((ptr != end) && isDigit(*ptr))
						++ptr;
					break;
				}

				suffixSize = eatSuffix(ptr, end);
			}
			return Token(start[0..((ptr - start) - suffixSize)], base, suffixSize, 0, loc_);
		} else {
			return eatFloat(ch, start, ptr, end);
		}
	}

	// Scan a string or character literal.  ch is the opening delimiter; on
	// entry ptr points just past it, on exit just past any trailing suffix.
	Token eatString(immutable(char) ch, ref immutable(char)* ptr, immutable(char)* end) {
		auto start = ptr;
		auto needsUnescaping = false;

		while ((ptr != end) && (*ptr != ch)) {
			if ((*ptr == '\\') && (ch != '`')) {
				// Escape sequence: skip the escaped character so an escaped
				// delimiter does not terminate the literal.  Backquoted
				// literals are raw (never unescaped, see flags below), so a
				// backslash there is an ordinary character.
				++ptr;
				if (ptr != end) {
					needsUnescaping = true;
					++ptr;
				}
			} else {
				if (*ptr == '\n')
					++loc_.line;
				++ptr;
			}
		}

		// The loop stops either on the closing delimiter or at end-of-input;
		// test the pointer rather than dereferencing it, so an unterminated
		// literal cannot read past the buffer.
		if (ptr == end)
			throw new LexerException(start, "unterminated string-literal");

		++ptr;	// consume the closing delimiter
		size_t flags = 0;
		if (needsUnescaping && (ch != '`'))
			flags |= Token.Flags.NeedsUnescaping;

		auto suffixSize = eatSuffix(ptr, end);

		return Token(start[0..(ptr - start - suffixSize - 1)], (((ch == '\"') || (ch == '`')) ? Token.LiteralKind.String : Token.LiteralKind.Char), suffixSize, flags, loc_);
	}

	bool empty_;	// true once EndOfInput has been consumed
	Token current_;	// token returned by front
	Source source_;	// buffer being lexed; source_.ptr mirrors progress
	SourceLoc loc_;	// current location (line counter) stamped onto tokens
}