ScriptStack 1.0.5
Loading...
Searching...
No Matches
Lexer.cs
Go to the documentation of this file.
1using System;
2using System.Collections.Generic;
3using System.Text;
4
6{
7
20 public class Lexer
21 {
22
23 #region Private Enumerated Types
24
60
61 #endregion
62
63 #region public variables
64
71
72 public DefaultRealType DefaultReal { get; set; } = DefaultRealType.Decimal;
73
74 #endregion
75
76 #region Private Variables
77
78 private List<string> lines;
79 private int line;
80 private int column;
81 private State state;
82
83 // Multiline strings
84 private int stringStartLine;
85 private int stringStartColumn;
86 private string stringStartLineText;
87
88 #endregion
89
90 #region Private Methods
91
/// <summary>
/// Reports a character that the lexer cannot process.
/// </summary>
/// <param name="ch">The offending character; it is embedded in the error message.</param>
/// <exception cref="LexerException">
/// Always thrown. The current line index, column and the text of the current line are passed along.
/// </exception>
private void InvalidCharacter(char ch)
{
    string message = "Unerwartetes Zeichen '" + ch + "'.\n";

    throw new LexerException(message, line, column, lines[line]);
}
96
/// <summary>
/// True once the line cursor has moved past the last stored line, i.e. nothing is left to read.
/// </summary>
private bool EndOfSource => line >= lines.Count;
101
/// <summary>
/// Returns the character under the cursor and advances the cursor by one position.
/// </summary>
/// <remarks>
/// When the last character of a line has been consumed the cursor wraps to column 0 of the next line.
/// </remarks>
/// <returns>The character that was read.</returns>
/// <exception cref="LexerException">Thrown when the cursor is already at the end of the source.</exception>
private char ReadChar()
{
    if (EndOfSource)
    {
        throw new LexerException("Das Ende des TokenStream wurde erreicht.");
    }

    string text = lines[line];
    char result = text[column++];

    bool lineExhausted = column >= text.Length;
    if (lineExhausted)
    {
        // continue at the beginning of the following line
        column = 0;
        line++;
    }

    return result;
}
121
/// <summary>
/// Moves the cursor one character backwards, undoing the most recent <c>ReadChar</c>.
/// </summary>
/// <remarks>
/// Stepping back from column 0 lands on the last character of the previous line.
/// </remarks>
/// <exception cref="LexerException">Thrown when the cursor is at the very first character.</exception>
private void UndoChar()
{
    bool atBeginning = line == 0 && column == 0;
    if (atBeginning)
    {
        throw new LexerException("Der Anfang des TokenStream wurde erreicht.");
    }

    if (column > 0)
    {
        // still inside the current line
        column--;
        return;
    }

    // wrap around to the end of the previous line
    line--;
    column = lines[line].Length - 1;
}
140
141 #endregion
142
143 #region Public Methods
144
/// <summary>
/// Creates a lexer over the given source lines.
/// </summary>
/// <remarks>
/// The list is copied and "\r\n" is appended to every entry, so each stored line is
/// terminated by a line break and is never empty.
/// </remarks>
/// <param name="lines">The source text, one entry per line, without line terminators.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="lines"/> is null.</exception>
public Lexer(List<string> lines)
{
    if (lines == null)
    {
        throw new ArgumentNullException(nameof(lines));
    }

    this.lines = new List<string>(lines.Count);

    // The loop variable deliberately does not reuse the name of the 'line' field.
    foreach (string sourceLine in lines)
    {
        this.lines.Add(sourceLine + "\r\n");
    }

    line = 0;
    column = 0;
    state = State.None;
}
160
/// <summary>
/// Reads the next character if one is available, without throwing at the end of the source.
/// </summary>
/// <param name="ch">Receives the character read, or '\0' when the source is exhausted.</param>
/// <returns>True when a character was read; false at the end of the source.</returns>
private bool TryReadChar(out char ch)
{
    bool available = !EndOfSource;

    ch = available ? ReadChar() : '\0';

    return available;
}
172
/// <summary>
/// Tokenizes the complete source held by this lexer.
/// </summary>
/// <remarks>
/// The method resets the cursor and runs a character-driven state machine: in
/// <c>State.None</c> a character either yields a token directly or selects the state that
/// finishes a longer token (operator, identifier, number, string, comment). States that
/// need one character of lookahead hand the character back with <c>UndoChar</c> when it
/// does not belong to the token. The line and column stored in most tokens are the cursor
/// position at the moment the token is emitted (i.e. after the last character read);
/// empty and triple-quoted strings store the position of their opening quote instead.
/// </remarks>
/// <returns>The tokens in source order.</returns>
/// <exception cref="LexerException">
/// Thrown for an unexpected character, an invalid character literal, an unsupported escape
/// sequence, a line break inside a single-line string, an invalid hexadecimal digit, or
/// when the source ends in the middle of a token.
/// </exception>
public List<Token> GetTokens()
{
    line = 0;
    column = 0;
    state = State.None;

    string lexeme = null;
    List<Token> tokenStream = new List<Token>();

    while (!EndOfSource)
    {
        // Text of the line the next character comes from. It is captured before reading
        // because ReadChar may already move the cursor on to the following line.
        string currentLine = lines[line];
        char ch = ReadChar();

        switch (state)
        {
            case State.None:
                switch (ch)
                {
                    case ' ':
                    case '\t':
                    case '\r':
                    case '\n':
                        break;

                    case '(':
                        AddSymbol(tokenStream, TokenType.LeftParen, "(", currentLine);
                        break;
                    case ')':
                        AddSymbol(tokenStream, TokenType.RightParen, ")", currentLine);
                        break;
                    case '[':
                        AddSymbol(tokenStream, TokenType.LeftBracket, "[", currentLine);
                        break;
                    case ']':
                        AddSymbol(tokenStream, TokenType.RightBracket, "]", currentLine);
                        break;
                    case '{':
                        AddSymbol(tokenStream, TokenType.LeftBrace, "{", currentLine);
                        break;
                    case '}':
                        AddSymbol(tokenStream, TokenType.RightBrace, "}", currentLine);
                        break;
                    case '.':
                        AddSymbol(tokenStream, TokenType.Period, ".", currentLine);
                        break;
                    case ':':
                        AddSymbol(tokenStream, TokenType.Colon, ":", currentLine);
                        break;
                    case ',':
                        AddSymbol(tokenStream, TokenType.Comma, ",", currentLine);
                        break;
                    case ';':
                        AddSymbol(tokenStream, TokenType.SemiColon, ";", currentLine);
                        break;

                    // Operators that may be followed by a second character are finished
                    // in their own state once the next character is known.
                    case '=':
                        state = State.Assign;
                        break;
                    case '+':
                        state = State.Plus;
                        break;
                    case '-':
                        state = State.Minus;
                        break;
                    case '*':
                        state = State.Multiply;
                        break;
                    case '/':
                        state = State.Divide;
                        break;
                    case '%':
                        state = State.Modulo;
                        break;
                    case '^':
                        state = State.Xor;
                        break;
                    case '&':
                        state = State.And;
                        break;
                    case '|':
                        state = State.Or;
                        break;
                    case '!':
                        state = State.Not;
                        break;
                    case '>':
                        state = State.Greater;
                        break;
                    case '<':
                        state = State.Less;
                        break;
                    case '~':
                        state = State.BinaryNot;
                        break;

                    case '\"':
                    {
                        // Step back onto the opening quote so that the remembered start
                        // position is the quote itself, then consume the quote again.
                        UndoChar();
                        stringStartLine = line;
                        stringStartColumn = column;
                        stringStartLineText = lines[line];
                        ReadChar();

                        lexeme = "";

                        // Look ahead to tell "..." , "" and """ apart.
                        if (!TryReadChar(out char n1))
                        {
                            // a lone quote at the end of the source: unterminated string
                            state = State.String;
                            break;
                        }

                        if (n1 != '\"')
                        {
                            // ordinary string; n1 is its first character, hand it back
                            UndoChar();
                            state = State.String;
                            break;
                        }

                        if (!TryReadChar(out char n2))
                        {
                            // "" at the end of the source: empty string
                            tokenStream.Add(new Token(TokenType.String, "", stringStartLine, stringStartColumn, stringStartLineText));
                            state = State.None;
                            break;
                        }

                        if (n2 == '\"')
                        {
                            // """ opens a multi-line string
                            state = State.MultiLineString;
                        }
                        else
                        {
                            // "" is an empty string; n2 already belongs to the next token
                            UndoChar();
                            tokenStream.Add(new Token(TokenType.String, "", stringStartLine, stringStartColumn, stringStartLineText));
                            state = State.None;
                        }

                        break;
                    }

                    case '\'':
                        lexeme = "";
                        state = State.Char;
                        break;

                    default:
                        if (char.IsLetter(ch) || ch == '_')
                        {
                            state = State.Identifier;
                            lexeme = "" + ch;
                        }
                        else if (char.IsDigit(ch))
                        {
                            lexeme = "" + ch;
                            state = State.Number;
                        }
                        else
                        {
                            // nothing can start with this character
                            InvalidCharacter(ch);
                        }
                        break;
                }
                break;

            case State.BinaryNot:
                if (ch == '=')
                {
                    AddSymbol(tokenStream, TokenType.AssignBinaryNot, "~=", currentLine);
                }
                else
                {
                    AddSymbol(tokenStream, TokenType.BinaryNot, "~", currentLine);
                    UndoChar();
                }
                state = State.None;
                break;

            case State.Divide:
                switch (ch)
                {
                    case '/':
                        state = State.InlineComment;
                        break;
                    case '*':
                        state = State.BlockComment;
                        break;
                    case '=':
                        AddSymbol(tokenStream, TokenType.AssignDivide, "/=", currentLine);
                        state = State.None;
                        break;
                    default:
                        AddSymbol(tokenStream, TokenType.Divide, "/", currentLine);
                        UndoChar();
                        state = State.None;
                        break;
                }
                break;

            case State.InlineComment:
                // skip everything up to the end of the line
                if (ch == '\n')
                {
                    state = State.None;
                }
                break;

            case State.BlockComment:
                if (ch == '*')
                {
                    char next = ReadChar();

                    if (next == '/')
                    {
                        state = State.None;
                    }
                    else
                    {
                        // Hand the character back: it may itself be the '*' of the
                        // closing "*/" (as in "**/").
                        UndoChar();
                    }
                }
                break;

            case State.Assign:
                if (ch == '=')
                {
                    AddSymbol(tokenStream, TokenType.Equal, "==", currentLine);
                }
                else
                {
                    AddSymbol(tokenStream, TokenType.Assign, "=", currentLine);
                    UndoChar();
                }
                state = State.None;
                break;

            case State.Plus:
                if (ch == '+')
                {
                    AddSymbol(tokenStream, TokenType.Increment, "++", currentLine);
                }
                else if (ch == '=')
                {
                    AddSymbol(tokenStream, TokenType.AssignPlus, "+=", currentLine);
                }
                else
                {
                    AddSymbol(tokenStream, TokenType.Plus, "+", currentLine);
                    UndoChar();
                }
                state = State.None;
                break;

            case State.Minus:
                if (ch == '-')
                {
                    AddSymbol(tokenStream, TokenType.Decrement, "--", currentLine);
                }
                else if (ch == '=')
                {
                    AddSymbol(tokenStream, TokenType.AssignMinus, "-=", currentLine);
                }
                else
                {
                    AddSymbol(tokenStream, TokenType.Minus, "-", currentLine);
                    UndoChar();
                }
                state = State.None;
                break;

            case State.Multiply:
                if (ch == '=')
                {
                    AddSymbol(tokenStream, TokenType.AssignMultiply, "*=", currentLine);
                }
                else
                {
                    AddSymbol(tokenStream, TokenType.Multiply, "*", currentLine);
                    UndoChar();
                }
                state = State.None;
                break;

            case State.Xor:
                if (ch == '=')
                {
                    AddSymbol(tokenStream, TokenType.AssignXor, "^=", currentLine);
                }
                else
                {
                    // bitwise XOR '^'
                    AddSymbol(tokenStream, TokenType.Xor, "^", currentLine);
                    UndoChar();
                }
                state = State.None;
                break;

            case State.Modulo:
                if (ch == '=')
                {
                    AddSymbol(tokenStream, TokenType.AssignModulo, "%=", currentLine);
                }
                else
                {
                    AddSymbol(tokenStream, TokenType.Modulo, "%", currentLine);
                    UndoChar();
                }
                state = State.None;
                break;

            case State.And:
                if (ch == '&')
                {
                    AddSymbol(tokenStream, TokenType.And, "&&", currentLine);
                }
                else if (ch == '=')
                {
                    AddSymbol(tokenStream, TokenType.AssignBinaryAnd, "&=", currentLine);
                }
                else
                {
                    // bitwise AND '&'
                    AddSymbol(tokenStream, TokenType.BinaryAnd, "&", currentLine);
                    UndoChar();
                }
                state = State.None;
                break;

            case State.Or:
                if (ch == '|')
                {
                    AddSymbol(tokenStream, TokenType.Or, "||", currentLine);
                }
                else if (ch == '=')
                {
                    AddSymbol(tokenStream, TokenType.AssignBinaryOr, "|=", currentLine);
                }
                else
                {
                    // bitwise OR '|'
                    AddSymbol(tokenStream, TokenType.BinaryOr, "|", currentLine);
                    UndoChar();
                }
                state = State.None;
                break;

            case State.Not:
                if (ch == '=')
                {
                    AddSymbol(tokenStream, TokenType.NotEqual, "!=", currentLine);
                }
                else
                {
                    AddSymbol(tokenStream, TokenType.Not, "!", currentLine);
                    UndoChar();
                }
                state = State.None;
                break;

            case State.Greater:
                if (ch == '=')
                {
                    AddSymbol(tokenStream, TokenType.GreaterEqual, ">=", currentLine);
                }
                else if (ch == '>')
                {
                    AddSymbol(tokenStream, TokenType.ShiftRight, ">>", currentLine);
                }
                else
                {
                    AddSymbol(tokenStream, TokenType.Greater, ">", currentLine);
                    UndoChar();
                }
                state = State.None;
                break;

            case State.Less:
                if (ch == '=')
                {
                    AddSymbol(tokenStream, TokenType.LessEqual, "<=", currentLine);
                }
                else if (ch == '<')
                {
                    AddSymbol(tokenStream, TokenType.ShiftLeft, "<<", currentLine);
                }
                else
                {
                    AddSymbol(tokenStream, TokenType.Less, "<", currentLine);
                    UndoChar();
                }
                state = State.None;
                break;

            case State.Identifier:
                if (char.IsLetterOrDigit(ch) || ch == '_')
                {
                    lexeme += ch;
                }
                else
                {
                    TokenType tokenType = ClassifyWord(lexeme);

                    if (tokenType == TokenType.Boolean)
                    {
                        // boolean literals carry their value instead of their text
                        tokenStream.Add(new Token(tokenType, lexeme == "true", line, column, currentLine));
                    }
                    else
                    {
                        tokenStream.Add(new Token(tokenType, lexeme, line, column, currentLine));
                    }

                    UndoChar();
                    state = State.None;
                }
                break;

            case State.Char:
            {
                // Collect everything up to the closing quote. A backslash keeps the
                // character after it, so an escaped quote does not end the literal.
                while (ch != '\'')
                {
                    lexeme += ch;

                    if (ch == '\\')
                    {
                        lexeme += ReadChar();
                    }

                    ch = ReadChar();
                }

                char charValue = DecodeCharLiteral(lexeme, currentLine);
                tokenStream.Add(new Token(TokenType.Char, charValue, line, column, currentLine));
                state = State.None;
                break;
            }

            case State.String:
                if (ch == '\"')
                {
                    // closing quote: the string is complete
                    tokenStream.Add(new Token(TokenType.String, lexeme, line, column, currentLine));
                    state = State.None;
                }
                else if (ch == '\\')
                {
                    state = State.EscapeString;
                }
                else if (ch == '\r' || ch == '\n')
                {
                    throw new LexerException("Ein String darf sich nicht auf mehrere Zeilen erstrecken.", line, column, currentLine);
                }
                else
                {
                    lexeme += ch;
                }
                break;

            case State.EscapeString:
                // an escape always returns to the enclosing single-line string
                lexeme += DecodeEscape(ch, currentLine);
                state = State.String;
                break;

            case State.MultiLineString:
                if (ch == '\"')
                {
                    state = State.MultiLineStringQuote1;
                }
                else if (ch == '\\')
                {
                    state = State.EscapeMultiLineString;
                }
                else if (ch != '\r')
                {
                    // '\r' is dropped (stored lines end in "\r\n"); '\n' is kept
                    lexeme += ch;
                }
                break;

            case State.MultiLineStringQuote1:
                if (ch == '\"')
                {
                    state = State.MultiLineStringQuote2;
                }
                else
                {
                    // not a terminator, just a single quote inside the string
                    lexeme += '\"';
                    if (ch != '\r')
                    {
                        lexeme += ch;
                    }
                    state = State.MultiLineString;
                }
                break;

            case State.MultiLineStringQuote2:
                if (ch == '\"')
                {
                    // """ closes the multi-line string
                    tokenStream.Add(new Token(TokenType.String, lexeme, stringStartLine, stringStartColumn, stringStartLineText));
                    state = State.None;
                }
                else
                {
                    // not a terminator, just two quotes inside the string
                    lexeme += "\"\"";
                    if (ch != '\r')
                    {
                        lexeme += ch;
                    }
                    state = State.MultiLineString;
                }
                break;

            case State.EscapeMultiLineString:
                // same escapes as single-line strings, but return to the multi-line string
                lexeme += DecodeEscape(ch, currentLine);
                state = State.MultiLineString;
                break;

            case State.Number:
                // While lexing, numbers are collected as text and converted when complete.
                if (char.IsDigit(ch))
                {
                    lexeme += ch;
                }
                else if (ch == '.')
                {
                    lexeme += '.';

                    // the configured default decides how a real literal is parsed
                    switch (DefaultReal)
                    {
                        case DefaultRealType.Float:
                            state = State.Float;
                            break;
                        case DefaultRealType.Double:
                            state = State.Double;
                            break;
                        case DefaultRealType.Decimal:
                        default:
                            state = State.Decimal;
                            break;
                    }
                }
                else if (ch == 'x')
                {
                    lexeme += ch;
                    state = State.Hex;
                }
                else if (ch == 'b')
                {
                    // suffix 'b': the digits collected so far are binary
                    int binaryValue = Convert.ToInt32(lexeme, 2);
                    tokenStream.Add(new Token(TokenType.Integer, binaryValue, line, column, currentLine));
                    state = State.None;
                }
                else if (ch == 'o')
                {
                    // suffix 'o': the digits collected so far are octal
                    int octalValue = Convert.ToInt32(lexeme, 8);
                    tokenStream.Add(new Token(TokenType.Integer, octalValue, line, column, currentLine));
                    state = State.None;
                }
                else
                {
                    int intValue = int.Parse(lexeme, System.Globalization.CultureInfo.InvariantCulture);
                    tokenStream.Add(new Token(TokenType.Integer, intValue, line, column, currentLine));
                    UndoChar();
                    state = State.None;
                }
                break;

            case State.Float:
                if (char.IsDigit(ch))
                {
                    lexeme += ch;
                }
                else
                {
                    float floatValue = float.Parse(lexeme, System.Globalization.CultureInfo.InvariantCulture);
                    tokenStream.Add(new Token(TokenType.Float, floatValue, line, column, currentLine));
                    UndoChar();
                    state = State.None;
                }
                break;

            case State.Double:
                if (char.IsDigit(ch))
                {
                    lexeme += ch;
                }
                else
                {
                    double doubleValue = double.Parse(lexeme, System.Globalization.CultureInfo.InvariantCulture);
                    tokenStream.Add(new Token(TokenType.Double, doubleValue, line, column, currentLine));
                    UndoChar();
                    state = State.None;
                }
                break;

            case State.Decimal:
                if (char.IsDigit(ch))
                {
                    lexeme += ch;
                }
                else if (ch == 'f')
                {
                    // suffix 'f' switches the literal to float
                    state = State.Float;
                }
                else if (ch == 'd')
                {
                    // suffix 'd' switches the literal to double
                    state = State.Double;
                }
                else
                {
                    decimal decimalValue = decimal.Parse(lexeme, System.Globalization.CultureInfo.InvariantCulture);
                    tokenStream.Add(new Token(TokenType.Decimal, decimalValue, line, column, currentLine));
                    UndoChar();
                    state = State.None;
                }
                break;

            case State.Hex:
                if (char.IsDigit(ch) || char.IsLetter(ch))
                {
                    bool isHexLetter = (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F');

                    if (char.IsLetter(ch) && !isHexLetter)
                    {
                        throw new LexerException("Ein hexadezimaler Wert darf ausser Zahlen nur Buchstaben von 'a' - 'f' bzw. 'A' - 'F' enthalten.", line, column, currentLine);
                    }

                    lexeme += ch;
                }
                else
                {
                    int hexValue = Convert.ToInt32(lexeme, 16);
                    tokenStream.Add(new Token(TokenType.Integer, hexValue, line, column, currentLine));
                    UndoChar();
                    state = State.None;
                }
                break;

            default:
                throw new LexerException("Unbekannter Lexer Status '" + state + "'.");
        }
    }

    // the source must not end in the middle of a token
    if (state != State.None)
    {
        throw new LexerException("Unerwartetes Ende des TokenStream.");
    }

    return tokenStream;
}

/// <summary>
/// Appends a token whose value is its own source text, positioned at the current cursor.
/// </summary>
private void AddSymbol(List<Token> tokenStream, TokenType type, string text, string currentLine)
{
    tokenStream.Add(new Token(type, text, line, column, currentLine));
}

/// <summary>
/// Maps a completed word to its keyword token type, or to <c>TokenType.Identifier</c>.
/// </summary>
private static TokenType ClassifyWord(string word)
{
    switch (word)
    {
        case "null": return TokenType.Null;
        case "true":
        case "false": return TokenType.Boolean;
        case "if": return TokenType.If;
        case "else": return TokenType.Else;
        case "while": return TokenType.While;
        case "for": return TokenType.For;
        case "foreach": return TokenType.Foreach;
        case "in": return TokenType.In;
        case "switch": return TokenType.Switch;
        case "case": return TokenType.Case;
        case "default": return TokenType.Default;
        case "break": return TokenType.Break;
        case "continue": return TokenType.Continue;
        case "function": return TokenType.Function;
        case "return": return TokenType.Return;
        case "shared": return TokenType.Shared;
        case "var": return TokenType.Var;
        case "include": return TokenType.Include;
        case "lock": return TokenType.Lock;
        case "run": return TokenType.Run;
        case "yield": return TokenType.Yield;
        case "notify": return TokenType.Notify;
        case "wait": return TokenType.Wait;
        default: return TokenType.Identifier;
    }
}

/// <summary>
/// Translates the character following a backslash inside a string into the character it stands for.
/// </summary>
/// <exception cref="LexerException">Thrown for an escape character that is not supported.</exception>
private char DecodeEscape(char ch, string currentLine)
{
    switch (ch)
    {
        case '"': return '\"';
        case '\\': return '\\';
        case 'n': return '\n';
        case 't': return '\t';
        case 'r': return '\r';
        case 'b': return '\b';
        default:
            throw new LexerException("Das Escapezeichen '\\" + ch + "' kann in Strings nicht verarbeitet werden.", line, column, currentLine);
    }
}

/// <summary>
/// Converts the text between two single quotes into the character it denotes.
/// </summary>
/// <exception cref="LexerException">
/// Thrown when the text is neither a supported escape sequence nor exactly one character.
/// </exception>
private char DecodeCharLiteral(string lexeme, string currentLine)
{
    switch (lexeme)
    {
        case "\\n": return '\n';
        case "\\t": return '\t';
        case "\\b": return '\b';
        case "\\r": return '\r';
        case "\\f": return '\f';
        case "\\\'": return '\'';
        case "\\\"": return '\"';
        case "\\\\": return '\\';
    }

    if (lexeme.Length != 1)
    {
        throw new LexerException("Ein 'Character' darf genau ein Zeichen lang sein - ausgenommen Steuerzeichen!", line, column, currentLine);
    }

    return lexeme[0];
}
954
955 #endregion
956
957 }
958
959}
bool TryReadChar(out char ch)
Definition Lexer.cs:161
Lexer(List< string > lines)
Definition Lexer.cs:145
void InvalidCharacter(char ch)
Definition Lexer.cs:92
List< Token > GetTokens()
Definition Lexer.cs:177
List< string > lines
Definition Lexer.cs:78
DefaultRealType DefaultReal
Definition Lexer.cs:72
A lexical token or simply token is a string with an assigned and thus identified meaning.
Definition Token.cs:101
TokenType
Known types of Token.
Definition Token.cs:12