module vayne.source.lexer;


import std.array;
import std.ascii;
import std.exception;
import std.format;
import std.string;


import vayne.source.source;
import vayne.source.token;


// Thrown on malformed input; ptr records the address inside the source
// buffer where the problem was detected so callers can report a location.
class LexerException : Exception {
	this(T)(T* ptr, string msg) {
		super(msg);
		this.ptr = cast(void*)ptr;
	}

	void* ptr;
}


// Forward-range tokenizer over a Source buffer.
//
// front() is the current Token, popFront() advances, and the range becomes
// empty one step after the EndOfInput token has been consumed. Whitespace
// and comments are skipped; newlines are counted into loc_.line.
struct Lexer {
	this(Source source, SourceLoc loc) {
		source_ = source;
		empty_ = false;
		loc_ = loc;

		popFront;
	}

	this(Source source) {
		this(source, SourceLoc(source.id, 1));
	}

	void popFront() {
		assert(!empty);

		if (current_.kind == Token.Kind.EndOfInput) {
			empty_ = true;
		} else {
			eat;
		}
	}

	Token front() const {
		return current_;
	}

	bool empty() const {
		return empty_;
	}

private:
	// Scan the next token into current_. The cursor is read from
	// source_.buffer.ptr and the final position is written back to
	// source_.ptr on every exit path via scope(exit).
	void eat() {
		auto ptr = source_.buffer.ptr;
		scope (exit) source_.ptr = ptr;
		auto end = source_.end;

		eatSpaces(ptr, end);

		while (ptr != end) {
			auto start = ptr;
			auto ch = *ptr++;

			switch (ch) {
			case '\"':
			case '\'':
			case '`':
				current_ = eatString(ch, ptr, end);
				return;
			case '/':
				if (ptr != end) {
					if (*ptr == ch) {
						// line comment "//" - skip to end of line
						eatUntil('\n', ptr, end);
						eatSpaces(ptr, end);
						continue;
					} else if (*ptr == '*') {
						// nesting block comment "/* ... */"
						eatBlockComment(ptr, end);
						eatSpaces(ptr, end);
						continue;
					}
				}
				current_ = eatSep(ch, ptr - 1, ptr, end);
				return;
			case '.':
				if (ptr != end) {
					if (*ptr == '.') {
						// ".." or "..."
						++ptr;
						if ((ptr != end) && (*ptr == '.'))
							++ptr;
					} else if (isDigit(*ptr)) {
						// leading-dot float literal such as ".5"
						current_ = eatNumeric(ch, ptr - 1, ptr, end);
						return;
					}
				}
				current_ = Token(Token.Kind.Separator, start[0..ptr - start], loc_);
				return;
			default:
				if (isAlpha(ch) || (ch == '_')) {
					while ((ptr != end) && (isAlphaNum(*ptr) || (*ptr == '_')))
						++ptr;
					auto length = ptr - start;
					auto name = start[0..length];

					// keyword lookup dispatched on length first so most
					// identifiers compare against only a couple of candidates
					switch (length) {
					case 2:
						if (name == "in") {
							current_ = Token(name, Token.KeywordKind.In, 0, loc_);
							return;
						}
						if (name == "as") {
							current_ = Token(name, Token.KeywordKind.As, 0, loc_);
							return;
						}
						break;
					case 3:
						if (name == "def") {
							current_ = Token(name, Token.KeywordKind.Def, 0, loc_);
							return;
						}
						break;
					case 4:
						if (name == "null") {
							current_ = Token(name, Token.KeywordKind.Null, 0, loc_);
							return;
						}
						if (name == "true") {
							current_ = Token(name, Token.KeywordKind.True, 0, loc_);
							return;
						}
						break;
					case 5:
						if (name == "false") {
							current_ = Token(name, Token.KeywordKind.False, 0, loc_);
							return;
						}
						if (name == "undef") {
							current_ = Token(name, Token.KeywordKind.Undef, 0, loc_);
							return;
						}
						break;
					default:
						break;
					}
					current_ = Token(Token.Kind.Identifier, start[0..length], loc_);
				} else if (isDigit(ch)) {
					current_ = eatNumeric(ch, ptr - 1, ptr, end);
				} else {
					current_ = eatSep(ch, ptr - 1, ptr, end);
				}
				return;
			}
		}

		if (ptr >= end)
			current_ = Token(Token.Kind.EndOfInput, null, loc_);
	}

	// Skip whitespace, counting newlines into loc_.line.
	void eatSpaces(T)(ref T* ptr, T* end) {
		while ((ptr != end) && isWhite(*ptr)) {
			if (*ptr == '\n')
				++loc_.line;
			++ptr;
		}
	}

	// Advance ptr up to (not past) the next occurrence of ch, counting newlines.
	void eatUntil(T)(T ch, ref T* ptr, T* end) {
		while ((ptr != end) && (*ptr != ch)) {
			if (*ptr == '\n')
				++loc_.line;
			++ptr;
		}
	}

	// Consume a block comment; on entry ptr points at the '*' of the opening
	// "/*". Comments nest. Throws LexerException if input ends inside one.
	void eatBlockComment(T)(ref T* ptr, T* end) {
		auto start = ptr;
		auto opens = 1;

		while (opens) {
			if (ptr == end)
				throw new LexerException(start, "unterminated block comment");

			while ((ptr != end) && (*ptr != '/')) {
				if (*ptr == '\n')
					++loc_.line;
				++ptr;
			}

			// fix: the scan above may have stopped at end, so check bounds
			// before dereferencing; the unterminated case is then reported by
			// the check at the top of the outer loop
			if ((ptr != end) && (*ptr == '/')) {
				if (*(ptr - 1) == '*') {
					++ptr;      // "*/" - close one nesting level
					--opens;
				} else {
					++ptr;
					if ((ptr != end) && (*ptr == '*')) {
						++ptr;  // "/*" - open a nested level
						++opens;
					}
				}
			}
		}
	}

	// Consume a separator/operator token whose first char ch has already been
	// eaten (start points at it). Folds doubled forms (">>", "&&", "++", ...)
	// and '='-combined forms ("+=", "==", ">>=", ...) into a single token.
	Token eatSep(T)(T ch, T* start, ref T* ptr, T* end) {
		assert(!isWhite(ch));

		auto doubleEnabled = false;  // ch may double up: >> << ^^ ++ -- || &&
		auto equalsEnabled = false;  // ch may take '=': += *= == != ...
		auto doubleEquals = false;   // the doubled form may take '=': >>= <<= ^^=

		if (ptr != end) {
			switch (ch) {
			case '>':
			case '<':
			case '^':
				doubleEquals = true;
				doubleEnabled = true;
				equalsEnabled = true;
				goto default;
			case '+':
			case '-':
			case '|':
			case '&':
				doubleEnabled = true;
				equalsEnabled = true;
				goto default;
			case '*':
			case '/':
			case '%':
			case '!':
			case '~':
			case '=':
				equalsEnabled = true;
				goto default;
			default:
				if (doubleEnabled) {
					if (*ptr == ch) {
						++ptr;
						if (doubleEquals && (ptr != end)) {
							assert(doubleEnabled);
							if (*ptr == '=')
								++ptr;
						}
					} else if (equalsEnabled && (*ptr == '=')) {
						++ptr;
					}
				} else if (equalsEnabled) {
					if (*ptr == '=')
						++ptr;
				}
			}
		}

		return Token(Token.Kind.Separator, start[0..ptr - start], loc_);
	}

	// Consume an identifier-like literal suffix (e.g. the "f" in "1.0f") and
	// return its length; 0 if there is none.
	size_t eatSuffix(T)(ref T* ptr, T* end) {
		auto start = ptr;

		if (ptr != end) {
			if (isAlpha(*ptr) || (*ptr == '_')) {
				++ptr;
				while ((ptr != end) && (isAlphaNum(*ptr) || (*ptr == '_')))
					++ptr;
			}
		}

		return ptr - start;
	}

	// Consume a decimal or floating-point literal starting with ch (already
	// eaten; start points at it). Accepts ' as a digit separator, a single
	// '.' and an optional e/E exponent; a trailing identifier becomes the
	// literal's suffix and is excluded from the token text.
	Token eatFloat(T)(T ch, T* start, ref T* ptr, T* end) {
		auto dot = (ch == '.');

		// fix: bounds-check ptr + 1 before the ".."-lookahead, which used to
		// read one past the end of the buffer for a trailing '.'
		while ((ptr != end) && (isDigit(*ptr) || (*ptr == '\'') || (!dot && (*ptr == '.') && (((ptr + 1) == end) || (*(ptr + 1) != '.'))))) {
			if (*ptr == '.')
				dot = true;
			++ptr;
		}

		size_t suffixSize;
		if (ptr != end) {
			if ((*ptr == 'e') || (*ptr == 'E')) {
				++ptr;
				if (ptr != end) {
					if ((*ptr == '-') || (*ptr == '+'))
						++ptr;
					while ((ptr != end) && (isDigit(*ptr)))
						++ptr;
				}
				// exponent must end in at least one digit
				if (!isDigit(*(ptr - 1)))
					throw new LexerException(ptr - 1, "invalid exponent in floating-point literal");
			}

			suffixSize = eatSuffix(ptr, end);
		}

		// NOTE(review): a literal like "1e5" (exponent but no dot) is
		// classified Dec, not Float - confirm downstream consumers expect this
		return Token(start[0..((ptr - start) - suffixSize)], (dot ? Token.LiteralKind.Float : Token.LiteralKind.Dec), suffixSize, 0, loc_);
	}

	// Consume a numeric literal whose first digit ch has already been eaten.
	// A leading '0' selects base prefixes: 0x hex, 0b binary, 0o octal and
	// 0d explicit decimal; everything else is delegated to eatFloat.
	Token eatNumeric(T)(T ch, T* start, ref T* ptr, T* end) {
		if (ch == '0') {
			auto base = Token.LiteralKind.Dec;

			size_t suffixSize;
			if (ptr != end) {
				switch (std.ascii.toLower(*ptr)) {
				case 'x':
					// fix: bounds-check ptr + 1 before dereferencing (input
					// may end right after the prefix character)
					if (((ptr + 1) != end) && isHexDigit(*(ptr + 1))) {
						++ptr;
						base = Token.LiteralKind.Hex;
						while ((ptr != end) && (isHexDigit(*ptr)))
							++ptr;
					}
					break;
				case 'b':
					if (((ptr + 1) != end) && ((*(ptr + 1) == '0') || (*(ptr + 1) == '1'))) {
						++ptr;
						while ((ptr != end) && ((*ptr == '0') || (*ptr == '1')))
							++ptr;
						base = Token.LiteralKind.Bin;
					}
					break;
				case 'o':
					if (((ptr + 1) != end) && (*(ptr + 1) >= '0') && (*(ptr + 1) <= '7')) {
						++ptr;
						while ((ptr != end) && (*ptr >= '0') && (*ptr <= '7'))
							++ptr;
						base = Token.LiteralKind.Oct;
					}
					break;
				case '.':
					return eatFloat(ch, start, ptr, end);
				case 'd':
					// "0d123" - explicit decimal; token text excludes the "0d"
					if (((ptr + 1) != end) && (*(ptr + 1) >= '0') && (*(ptr + 1) <= '9')) {
						start = ptr + 1;
						++ptr;
					}
					goto default;
				default:
					while ((ptr != end) && isDigit(*ptr))
						++ptr;
					break;
				}

				suffixSize = eatSuffix(ptr, end);
			}
			return Token(start[0..((ptr - start) - suffixSize)], base, suffixSize, 0, loc_);
		} else {
			return eatFloat(ch, start, ptr, end);
		}
	}

	// Consume a string ('"', '`') or char ('\'') literal; on entry ptr points
	// just past the opening quote. Token text excludes the quotes and any
	// suffix. Backslash escapes are skipped over (and flagged for later
	// unescaping except in backtick strings, which are treated as raw).
	Token eatString(immutable(char) ch, ref immutable(char)* ptr, immutable(char)* end) {
		auto start = ptr;
		auto needsUnescaping = false;

		while ((ptr != end) && (*ptr != ch)) {
			if (*ptr != '\\') {
				if (*ptr == '\n')
					++loc_.line;
				++ptr;
			} else {
				++ptr;
				if (ptr != end) {
					needsUnescaping = true;
					++ptr;
				}
			}
		}

		// fix: test for exhaustion instead of dereferencing *ptr, which read
		// one past the end of the buffer on unterminated input (the loop only
		// stops with ptr != end when *ptr == ch)
		if (ptr == end)
			throw new LexerException(start, "unterminated string-literal");

		++ptr;  // consume the closing quote
		size_t flags = 0;
		if (needsUnescaping && (ch != '`'))
			flags |= Token.Flags.NeedsUnescaping;

		auto suffixSize = eatSuffix(ptr, end);

		return Token(start[0..(ptr - start - suffixSize - 1)], (((ch == '\"') || (ch == '`')) ? Token.LiteralKind.String : Token.LiteralKind.Char), suffixSize, flags, loc_);
	}

	bool empty_;
	Token current_;
	Source source_;
	SourceLoc loc_;
}