module vayne.source.lexer;


import std.array;
import std.ascii;
import std.exception;
import std.format;
import std.string;	// FIX: was "std..string" (double dot) — a compile error


import vayne.source.source;
import vayne.source.token;


/// Thrown on malformed input (unterminated string/comment, bad exponent, ...).
/// `ptr` records where in the source buffer the error was detected.
class LexerException : Exception {
	this(T)(T* ptr, string msg) {
		super(msg);
		this.ptr = cast(void*)ptr;
	}

	void* ptr;
}


/// Forward range of `Token`s scanned out of a `Source` buffer.
/// The first token is produced eagerly by the constructor; `front`/`popFront`/`empty`
/// follow the usual D input-range protocol, with a final `EndOfInput` token emitted
/// before the range becomes empty.
struct Lexer {
	this(Source source, SourceLoc loc) {
		source_ = source;
		empty_ = false;
		loc_ = loc;

		popFront;
	}

	this(Source source) {
		this(source, SourceLoc(source.id, 1));
	}

	void popFront() {
		assert(!empty);

		// EndOfInput is itself yielded once; only after it is consumed does the
		// range report empty.
		if (current_.kind == Token.Kind.EndOfInput) {
			empty_ = true;
		} else {
			eat;
		}
	}

	Token front() const {
		return current_;
	}

	bool empty() const {
		return empty_;
	}

private:
	// Scan one token starting at the current cursor and store it in current_.
	// NOTE(review): assumes source_.buffer.ptr reflects the current read position and
	// that the scope(exit) write-back to source_.ptr advances the cursor — confirm
	// against vayne.source.source.
	void eat() {
		auto ptr = source_.buffer.ptr;
		scope (exit) source_.ptr = ptr;
		auto end = source_.end;

		eatSpaces(ptr, end);

		while (ptr != end) {
			auto start = ptr;
			auto ch = *ptr++;

			switch (ch) {
			case '\"':
			case '\'':
			case '`':
				current_ = eatString(ch, ptr, end);
				return;
			case '/':
				if (ptr != end) {
					if (*ptr == ch) {
						// line comment "//" — skip to end of line, then retry
						eatUntil('\n', ptr, end);
						eatSpaces(ptr, end);
						continue;
					} else if (*ptr == '*') {
						// (nestable) block comment "/* ... */"
						eatBlockComment(ptr, end);
						eatSpaces(ptr, end);
						continue;
					}
				}
				current_ = eatSep(ch, ptr - 1, ptr, end);
				return;
			case '.':
				if (ptr != end) {
					if (*ptr == '.') {
						// ".." or "..." separator
						++ptr;
						if ((ptr != end) && (*ptr == '.'))
							++ptr;
					} else if (isDigit(*ptr)) {
						// ".5"-style floating-point literal
						current_ = eatNumeric(ch, ptr - 1, ptr, end);
						return;
					}
				}
				current_ = Token(Token.Kind.Separator, start[0..ptr - start], loc_);
				return;
			default:
				if (isAlpha(ch) || (ch == '_')) {
					while ((ptr != end) && (isAlphaNum(*ptr) || (*ptr == '_')))
						++ptr;
					auto length = ptr - start;
					auto name = start[0..length];

					// Keywords are dispatched on length first to keep the
					// string comparisons cheap.
					switch (length) {
					case 2:
						if (name == "in") {
							current_ = Token(name, Token.KeywordKind.In, 0, loc_);
							return;
						}
						if (name == "as") {
							current_ = Token(name, Token.KeywordKind.As, 0, loc_);
							return;
						}
						break;
					case 3:
						if (name == "def") {
							current_ = Token(name, Token.KeywordKind.Def, 0, loc_);
							return;
						}
						if (name == "set") {
							current_ = Token(name, Token.KeywordKind.Set, 0, loc_);
							return;
						}
						if (name == "pop") {
							current_ = Token(name, Token.KeywordKind.Pop, 0, loc_);
							return;
						}
						break;
					case 4:
						if (name == "null") {
							current_ = Token(name, Token.KeywordKind.Null, 0, loc_);
							return;
						}
						if (name == "true") {
							current_ = Token(name, Token.KeywordKind.True, 0, loc_);
							return;
						}
						if (name == "push") {
							current_ = Token(name, Token.KeywordKind.Push, 0, loc_);
							return;
						}
						break;
					case 5:
						if (name == "false") {
							current_ = Token(name, Token.KeywordKind.False, 0, loc_);
							return;
						}
						if (name == "undef") {
							current_ = Token(name, Token.KeywordKind.Undef, 0, loc_);
							return;
						}
						break;
					default:
						break;
					}
					current_ = Token(Token.Kind.Identifier, start[0..length], loc_);
				} else if (isDigit(ch)) {
					current_ = eatNumeric(ch, ptr - 1, ptr, end);
				} else {
					current_ = eatSep(ch, ptr - 1, ptr, end);
				}
				return;
			}
		}

		if (ptr >= end)
			current_ = Token(Token.Kind.EndOfInput, null, loc_);
	}

	// Skip whitespace, counting newlines into loc_.line.
	void eatSpaces(T)(ref T* ptr, ref T* end) {
		while ((ptr != end) && isWhite(*ptr)) {
			if (*ptr == '\n')
				++loc_.line;
			++ptr;
		}
	}

	// Advance ptr up to (not past) the next occurrence of ch, counting newlines.
	void eatUntil(T)(T ch, ref T* ptr, T* end) {
		while ((ptr != end) && (*ptr != ch)) {
			if (*ptr == '\n')
				++loc_.line;
			++ptr;
		}
	}

	// Skip a nestable "/* ... */" block comment; on entry ptr points at the opening '*'.
	// Throws LexerException if the buffer ends before every level is closed.
	void eatBlockComment(T)(ref T* ptr, T* end) {
		auto start = ptr;
		auto opens = 1;

		while (opens) {
			if (ptr == end)
				throw new LexerException(start, "unterminated block comment");

			while ((ptr != end) && (*ptr != '/')) {
				if (*ptr == '\n')
					++loc_.line;
				++ptr;
			}

			// FIX: guard against dereferencing end when the buffer runs out
			// mid-comment; the check at the top of the loop then reports the error.
			if ((ptr != end) && (*ptr == '/')) {
				if (*(ptr - 1) == '*') {
					// "*/" closes one nesting level
					++ptr;
					--opens;
				} else {
					++ptr;
					if ((ptr != end) && (*ptr == '*')) {
						// "/*" opens a nested level
						++ptr;
						++opens;
					}
				}
			}
		}
	}

	// Scan a separator/operator token starting with ch (already consumed; start points
	// at it). Handles doubled forms (">>", "&&", ...), "op=" forms, and ">>=" style
	// doubled-plus-equals forms.
	Token eatSep(T)(T ch, T* start, ref T* ptr, T* end) {
		assert(!isWhite(ch));

		auto doubleEnabled = false;		// "xx" form exists (e.g. "<<", "&&")
		auto equalsEnabled = false;		// "x=" form exists (e.g. "<=", "+=")
		auto doubleEquals = false;		// "xx=" form exists (e.g. "<<=")

		if (ptr != end) {
			switch (ch) {
			case '>':
			case '<':
			case '^':
				doubleEquals = true;
				doubleEnabled = true;
				equalsEnabled = true;
				goto default;
			case '+':
			case '-':
			case '|':
			case '&':
				doubleEnabled = true;
				equalsEnabled = true;
				goto default;
			case '*':
			case '/':
			case '%':
			case '!':
			case '~':
			case '=':
				equalsEnabled = true;
				goto default;
			default:
				if (doubleEnabled) {
					if (*ptr == ch) {
						++ptr;
						if (doubleEquals && (ptr != end)) {
							assert(doubleEnabled);
							if (*ptr == '=')
								++ptr;
						}
					} else if (equalsEnabled && (*ptr == '=')) {
						++ptr;
					}
				} else if (equalsEnabled) {
					if (*ptr == '=')
						++ptr;
				}
			}
		}

		return Token(Token.Kind.Separator, start[0..ptr - start], loc_);
	}

	// Consume an identifier-shaped literal suffix (e.g. "u", "f", "_foo") and
	// return its length in bytes (0 if none).
	size_t eatSuffix(T)(ref T* ptr, T* end) {
		auto start = ptr;

		if (ptr != end) {
			if (isAlpha(*ptr) || (*ptr == '_')) {
				++ptr;
				while ((ptr != end) && (isAlphaNum(*ptr) || (*ptr == '_')))
					++ptr;
			}
		}

		return ptr - start;
	}

	// Scan the remainder of a decimal or floating-point literal; ch is the first
	// (already consumed) character, start points at it. Accepts ' digit separators,
	// at most one '.', and an optional e/E exponent. The literal kind is Float iff
	// a '.' was seen.
	Token eatFloat(T)(T ch, T* start, ref T* ptr, T* end) {
		auto dot = (ch == '.');

		// A '.' is only part of the number if it is not the start of a ".." token.
		// FIX: bounds-check the one-character look-ahead instead of reading *(ptr + 1)
		// past end when the literal ends the buffer.
		while ((ptr != end) && ((isDigit(*ptr) || (*ptr == '\'')
				|| (!dot && (*ptr == '.') && (((ptr + 1) == end) || (*(ptr + 1) != '.')))))) {
			if (*ptr == '.')
				dot = true;
			++ptr;
		}

		size_t suffixSize;
		if (ptr != end) {
			if ((*ptr == 'e') || (*ptr == 'E')) {
				++ptr;
				if (ptr != end) {
					if ((*ptr == '-') || (*ptr == '+'))
						++ptr;
					while ((ptr != end) && (isDigit(*ptr)))
						++ptr;
				}
				// An exponent must end in at least one digit ("1e", "1e+" are errors).
				if (!isDigit(*(ptr - 1)))
					throw new LexerException(ptr - 1, "invalid exponent in floating-point literal");
			}

			suffixSize = eatSuffix(ptr, end);
		}

		return Token(start[0..((ptr - start) - suffixSize)], (dot ? Token.LiteralKind.Float : Token.LiteralKind.Dec), suffixSize, 0, loc_);
	}

	// Scan a numeric literal whose first character ch has been consumed. A leading
	// '0' may introduce a based literal (0x.., 0b.., 0o.., 0d..) or a float (0.5);
	// anything else is handed to eatFloat.
	Token eatNumeric(T)(T ch, T* start, ref T* ptr, T* end) {
		if (ch == '0') {
			auto base = Token.LiteralKind.Dec;

			size_t suffixSize;
			if (ptr != end) {
				// FIX: every *(ptr + 1) look-ahead below is now bounds-checked;
				// previously "0x"/"0b"/"0o"/"0d" at end-of-buffer read past end.
				switch (std.ascii.toLower(*ptr)) {
				case 'x':
					if (((ptr + 1) != end) && isHexDigit(*(ptr + 1))) {
						++ptr;
						base = Token.LiteralKind.Hex;
						while ((ptr != end) && (isHexDigit(*ptr)))
							++ptr;
					}
					break;
				case 'b':
					if (((ptr + 1) != end) && ((*(ptr + 1) == '0') || (*(ptr + 1) == '1'))) {
						++ptr;
						while ((ptr != end) && ((*ptr == '0') || (*ptr == '1')))
							++ptr;
						base = Token.LiteralKind.Bin;
					}
					break;
				case 'o':
					if (((ptr + 1) != end) && (*(ptr + 1) >= '0') && (*(ptr + 1) <= '7')) {
						++ptr;
						while ((ptr != end) && (*ptr >= '0') && (*ptr <= '7'))
							++ptr;
						base = Token.LiteralKind.Oct;
					}
					break;
				case '.':
					return eatFloat(ch, start, ptr, end);
				case 'd':
					// "0d123" — explicit decimal prefix; the token text excludes "0d".
					if (((ptr + 1) != end) && (*(ptr + 1) >= '0') && (*(ptr + 1) <= '9')) {
						start = ptr + 1;
						++ptr;
					}
					goto default;
				default:
					while ((ptr != end) && isDigit(*ptr))
						++ptr;
					break;
				}

				suffixSize = eatSuffix(ptr, end);
			}
			return Token(start[0..((ptr - start) - suffixSize)], base, suffixSize, 0, loc_);
		} else {
			return eatFloat(ch, start, ptr, end);
		}
	}

	// Scan a string/char literal; ch is the opening quote (", ' or `) and ptr points
	// just past it. Backslash escapes are skipped (and flagged for later unescaping,
	// except in raw `backtick` strings); newlines inside the literal bump loc_.line.
	Token eatString(immutable(char) ch, ref immutable(char)* ptr, immutable(char)* end) {
		auto start = ptr;
		auto needsUnescaping = false;

		while ((ptr != end) && (*ptr != ch)) {
			if (*ptr != '\\') {
				if (*ptr == '\n')
					++loc_.line;
				++ptr;
			} else {
				++ptr;
				if (ptr != end) {
					needsUnescaping = true;
					++ptr;
				}
			}
		}

		// FIX: the loop only exits with ptr == end or *ptr == ch, so test the
		// pointer — the old "*ptr != ch" check dereferenced end on an
		// unterminated literal.
		if (ptr == end)
			throw new LexerException(start, "unterminated string-literal");

		++ptr;		// consume the closing quote
		size_t flags = 0;
		if (needsUnescaping && (ch != '`'))
			flags |= Token.Flags.NeedsUnescaping;

		auto suffixSize = eatSuffix(ptr, end);

		// Token text excludes the closing quote and any suffix.
		return Token(start[0..(ptr - start - suffixSize - 1)], (((ch == '\"') || (ch == '`')) ? Token.LiteralKind.String : Token.LiteralKind.Char), suffixSize, flags, loc_);
	}

	bool empty_;
	Token current_;
	Source source_;
	SourceLoc loc_;
}