scanner.cpp
1 //------------------------------------------------------------------------------
2 // scanner.cpp
3 //------------------------------------------------------------------------------
4 //
5 // This library is free software; you can redistribute it and/or
6 // modify it under the terms of the GNU Lesser General Public
7 // License as published by the Free Software Foundation; either
8 // version 2.1 of the License, or (at your option) any later version.
9 //
10 // This library is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 // Lesser General Public License for more details.
14 //
15 // You should have received a copy of the GNU Lesser General Public
16 // License along with this library; if not, write to the Free Software
17 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
18 // 02110-1301 USA
19 //
20 //------------------------------------------------------------------------------
21 // Copyright (C) 2010 Braden "Blzut3" Obrzut <admin@maniacsvault.net>
22 //------------------------------------------------------------------------------
23 
24 #include <cmath>
25 #include <cstdio>
26 #include <cstdlib>
27 
28 #include "scanner.h"
29 
30 DClass<Scanner>
31 {
32 public:
33  Scanner::ParserState nextState, prevState, state;
34 
35  char *data;
36  unsigned int length;
37 
38  unsigned int line;
39  unsigned int lineStart;
40  unsigned int logicalPosition;
41  unsigned int scanPos;
42 
43  bool needNext; // If checkToken returns false this will be false.
44 
45  QString scriptIdentifier;
46 };
47 
48 DClass<Scanner::ParserState>
49 {
50 public:
51  QString str;
52  unsigned int number;
53  double decimal;
54  bool boolean;
55  char token;
56  unsigned int tokenLine;
57  unsigned int tokenLinePosition;
58  unsigned int scanPos;
59 };
60 
// NOTE(review): presumably these instantiate the private-data pointer support
// for the two DClass<> specializations above - confirm against the DPointered
// macro definition.
DPointered(Scanner::ParserState)
DPointered(Scanner)

// Optional callback that receives every message produced by scriptMessage().
// While it is null, messages are written to stderr instead.
void (*Scanner::messageHandler)(MessageLevel, const char *, va_list) = nullptr;
65 
66 static const char *const TokenNames[TK_NumSpecialTokens] =
67 {
68  "Identifier",
69  "String Constant",
70  "Integer Constant",
71  "Float Constant",
72  "Boolean Constant",
73  "Logical And",
74  "Logical Or",
75  "Equals",
76  "Not Equals",
77  "Greater Than or Equals"
78  "Less Than or Equals",
79  "Left Shift",
80  "Right Shift",
81  "Increment",
82  "Decrement",
83  "Pointer Member",
84  "Scope Resolution",
85  "Macro Concatenation",
86  "Assign Sum",
87  "Assign Difference",
88  "Assign Product",
89  "Assign Quotient",
90  "Assign Modulus",
91  "Assign Left Shift",
92  "Assign Right Shift",
93  "Assign Bitwise And",
94  "Assign Bitwise Or",
95  "Assign Exclusive Or",
96  "Ellipsis"
97 };
98 
100 
101 Scanner::Scanner(const char *data, int length)
102 {
103  d->line = 1;
104  d->lineStart = 0;
105  d->logicalPosition = 0;
106  d->scanPos = 0;
107  d->needNext = true;
108  if (length == -1)
109  length = strlen(data);
110  d->length = length;
111  d->data = new char[length];
112  memcpy(d->data, data, length);
113 
115 
116  d->state.setScanPos(d->scanPos);
117 }
118 
// Releases the copy of the script text made by the constructor.
Scanner::~Scanner()
{
	delete[] d->data;
}
123 
124 // Here's my answer to the preprocessor screwing up line numbers. What we do is
125 // after a new line in CheckForWhitespace, look for a comment in the form of
126 // "/*meta:filename:line*/"
127 void Scanner::checkForMeta()
128 {
129  if (d->scanPos + 10 < d->length)
130  {
131  char metaCheck[8];
132  memcpy(metaCheck, d->data + d->scanPos, 7);
133  metaCheck[7] = 0;
134  if (strcmp(metaCheck, "/*meta:") == 0)
135  {
136  d->scanPos += 7;
137  int metaStart = d->scanPos;
138  int fileLength = 0;
139  int lineLength = 0;
140  while (d->scanPos < d->length)
141  {
142  char thisChar = d->data[d->scanPos];
143  char nextChar = d->scanPos + 1 < d->length ? d->data[d->scanPos + 1] : 0;
144  if (thisChar == '*' && nextChar == '/')
145  {
146  lineLength = d->scanPos - metaStart - 1 - fileLength;
147  d->scanPos += 2;
148  break;
149  }
150  if (thisChar == ':' && fileLength == 0)
151  fileLength = d->scanPos - metaStart;
152  d->scanPos++;
153  }
154  if (fileLength > 0 && lineLength > 0)
155  {
156  setScriptIdentifier(QString::fromUtf8(d->data + metaStart, fileLength));
157  QString lineNumber = QString::fromUtf8(d->data + metaStart + fileLength + 1, lineLength);
158  d->line = atoi(lineNumber.toUtf8().constData());
159  d->lineStart = d->scanPos;
160  }
161  }
162  }
163 }
164 
166 {
167  int comment = 0; // 1 = till next new line, 2 = till end block
168  while (d->scanPos < d->length)
169  {
170  char cur = d->data[d->scanPos];
171  char next = d->scanPos + 1 < d->length ? d->data[d->scanPos + 1] : 0;
172  if (comment == 2)
173  {
174  if (cur != '*' || next != '/')
175  {
176  if (cur == '\n' || cur == '\r')
177  {
178  d->scanPos++;
179  if (comment == 1)
180  comment = 0;
181 
182  // Do a quick check for Windows style new line
183  if (cur == '\r' && next == '\n')
184  d->scanPos++;
185  incrementLine();
186  }
187  else
188  d->scanPos++;
189  }
190  else
191  {
192  comment = 0;
193  d->scanPos += 2;
194  }
195  continue;
196  }
197 
198  if (cur == ' ' || cur == '\t' || cur == 0)
199  d->scanPos++;
200  else if (cur == '\n' || cur == '\r')
201  {
202  d->scanPos++;
203  if (comment == 1)
204  comment = 0;
205 
206  // Do a quick check for Windows style new line
207  if (cur == '\r' && next == '\n')
208  d->scanPos++;
209  incrementLine();
210  checkForMeta();
211  }
212  else if (cur == '/' && comment == 0)
213  {
214  switch (next)
215  {
216  case '/':
217  comment = 1;
218  break;
219  case '*':
220  comment = 2;
221  break;
222  default:
223  return;
224  }
225  d->scanPos += 2;
226  }
227  else
228  {
229  if (comment == 0)
230  return;
231  else
232  d->scanPos++;
233  }
234  }
235 }
236 
237 bool Scanner::checkToken(char token)
238 {
239  if (d->needNext)
240  {
241  if (!nextToken(false))
242  return false;
243  }
244 
245  // An int can also be a float.
246  if (d->nextState.token() == token || (d->nextState.token() == TK_IntConst && token == TK_FloatConst))
247  {
248  d->needNext = true;
249  expandState();
250  return true;
251  }
252  d->needNext = false;
253  return false;
254 }
255 
// Line on which the current token starts.
int Scanner::currentLine() const
{
	return d->state.tokenLine();
}

// Offset of the current token from the start of its line.
int Scanner::currentLinePos() const
{
	return d->state.tokenLinePosition();
}

// The scanner's logical position.
int Scanner::currentPos() const
{
	return d->logicalPosition;
}

// Index of the next character that will be examined.
unsigned int Scanner::currentScanPos() const
{
	return d->scanPos;
}
275 
277 {
278  d->scanPos = d->nextState.scanPos();
279  d->logicalPosition = d->scanPos;
281 
282  d->prevState = d->state;
283  d->state = d->nextState;
284 }
285 
287 {
288  d->line++;
289  d->lineStart = d->scanPos;
290 }
291 
292 bool Scanner::nextString()
293 {
294  d->nextState.setTokenLine(d->line);
295  d->nextState.setTokenLinePosition(d->scanPos - d->lineStart);
296  d->nextState.setToken(TK_NoToken);
297  if (!d->needNext)
298  d->scanPos = d->state.scanPos();
300  if (d->scanPos >= d->length)
301  return false;
302 
303  int start = d->scanPos;
304  int end = d->scanPos;
305  bool quoted = d->data[d->scanPos] == '"';
306  if (quoted) // String Constant
307  {
308  end = ++start; // Remove starting quote
309  d->scanPos++;
310  while (d->scanPos < d->length)
311  {
312  char cur = d->data[d->scanPos];
313  if (cur == '"')
314  end = d->scanPos;
315  else if (cur == '\\')
316  {
317  d->scanPos += 2;
318  continue;
319  }
320  d->scanPos++;
321  if (start != end)
322  break;
323  }
324  }
325  else // Unquoted string
326  {
327  while (d->scanPos < d->length)
328  {
329  char cur = d->data[d->scanPos];
330  switch (cur)
331  {
332  default:
333  break;
334  case ' ':
335  case '\t':
336  case '\n':
337  case '\r':
338  end = d->scanPos;
339  break;
340  }
341  if (start != end)
342  break;
343  d->scanPos++;
344  }
345  if (d->scanPos == d->length)
346  end = d->scanPos;
347  }
348  if (end - start > 0)
349  {
350  d->nextState.setScanPos(d->scanPos);
351  QString thisString = QString::fromUtf8(d->data + start, end - start);
352  if (quoted)
353  unescape(thisString);
354  d->nextState.setStr(thisString);
355  d->nextState.setToken(TK_StringConst);
356  expandState();
357  d->needNext = true;
358  return true;
359  }
361  return false;
362 }
363 
364 bool Scanner::nextToken(bool autoExpandState)
365 {
366  if (!d->needNext)
367  {
368  d->needNext = true;
369  if (autoExpandState)
370  expandState();
371  return true;
372  }
373 
374  d->nextState.setTokenLine(d->line);
375  d->nextState.setTokenLinePosition(d->scanPos - d->lineStart);
376  d->nextState.setToken(TK_NoToken);
377  if (d->scanPos >= d->length)
378  {
379  if (autoExpandState)
380  expandState();
381  return false;
382  }
383 
384  unsigned int start = d->scanPos;
385  unsigned int end = d->scanPos;
386  int integerBase = 10;
387  bool floatHasDecimal = false;
388  bool floatHasExponent = false;
389  bool stringFinished = false; // Strings are the only things that can have 0 length tokens.
390 
391  char cur = d->data[d->scanPos++];
392  // Determine by first character
393  if (cur == '_' || (cur >= 'A' && cur <= 'Z') || (cur >= 'a' && cur <= 'z'))
394  d->nextState.setToken(TK_Identifier);
395  else if (cur >= '0' && cur <= '9')
396  {
397  if (cur == '0')
398  integerBase = 8;
399  d->nextState.setToken(TK_IntConst);
400  }
401  else if (cur == '.' && d->scanPos < d->length && d->data[d->scanPos] != '.')
402  {
403  floatHasDecimal = true;
404  d->nextState.setToken(TK_FloatConst);
405  }
406  else if (cur == '"')
407  {
408  end = ++start; // Move the start up one character so we don't have to trim it later.
409  d->nextState.setToken(TK_StringConst);
410  }
411  else
412  {
413  end = d->scanPos;
414  d->nextState.setToken(cur);
415 
416  // Now check for operator tokens
417  if (d->scanPos < d->length)
418  {
419  char next = d->data[d->scanPos];
420  if (cur == '&' && next == '&')
421  d->nextState.setToken(TK_AndAnd);
422  else if (cur == '|' && next == '|')
423  d->nextState.setToken(TK_OrOr);
424  else if (
425  (cur == '<' && next == '<') ||
426  (cur == '>' && next == '>')
427  )
428  {
429  // Next for 3 character tokens
430  if (d->scanPos + 1 > d->length && d->data[d->scanPos + 1] == '=')
431  {
432  d->scanPos++;
433  d->nextState.setToken(cur == '<' ? TK_ShiftLeftEq : TK_ShiftRightEq);
434  }
435  else
436  d->nextState.setToken(cur == '<' ? TK_ShiftLeft : TK_ShiftRight);
437  }
438  else if (cur == '#' && next == '#')
439  d->nextState.setToken(TK_MacroConcat);
440  else if (cur == ':' && next == ':')
441  d->nextState.setToken(TK_ScopeResolution);
442  else if (cur == '+' && next == '+')
443  d->nextState.setToken(TK_Increment);
444  else if (cur == '-')
445  {
446  if (next == '-')
447  d->nextState.setToken(TK_Decrement);
448  else if (next == '>')
449  d->nextState.setToken(TK_PointerMember);
450  }
451  else if (cur == '.' && next == '.' &&
452  d->scanPos + 1 < d->length && d->data[d->scanPos + 1] == '.')
453  {
454  d->nextState.setToken(TK_Ellipsis);
455  ++d->scanPos;
456  }
457  else if (next == '=')
458  {
459  switch (cur)
460  {
461  case '=':
462  d->nextState.setToken(TK_EqEq);
463  break;
464  case '!':
465  d->nextState.setToken(TK_NotEq);
466  break;
467  case '>':
468  d->nextState.setToken(TK_GtrEq);
469  break;
470  case '<':
471  d->nextState.setToken(TK_LessEq);
472  break;
473  case '+':
474  d->nextState.setToken(TK_AddEq);
475  break;
476  case '-':
477  d->nextState.setToken(TK_SubEq);
478  break;
479  case '*':
480  d->nextState.setToken(TK_MulEq);
481  break;
482  case '/':
483  d->nextState.setToken(TK_DivEq);
484  break;
485  case '%':
486  d->nextState.setToken(TK_ModEq);
487  break;
488  case '&':
489  d->nextState.setToken(TK_AndEq);
490  break;
491  case '|':
492  d->nextState.setToken(TK_OrEq);
493  break;
494  case '^':
495  d->nextState.setToken(TK_XorEq);
496  break;
497  default:
498  break;
499  }
500  }
501 
502  if (d->nextState.token() != cur)
503  {
504  d->scanPos++;
505  end = d->scanPos;
506  }
507  }
508  }
509 
510  if (start == end)
511  {
512  while (d->scanPos < d->length)
513  {
514  cur = d->data[d->scanPos];
515  switch (d->nextState.token())
516  {
517  default:
518  break;
519  case TK_Identifier:
520  if (cur != '_' && (cur < 'A' || cur > 'Z') && (cur < 'a' || cur > 'z') && (cur < '0' || cur > '9'))
521  end = d->scanPos;
522  break;
523  case TK_IntConst:
524  if (cur == '.' || (d->scanPos - 1 != start && cur == 'e'))
525  d->nextState.setToken(TK_FloatConst);
526  else if ((cur == 'x' || cur == 'X') && d->scanPos - 1 == start)
527  {
528  integerBase = 16;
529  break;
530  }
531  else
532  {
533  switch (integerBase)
534  {
535  default:
536  if (cur < '0' || cur > '9')
537  end = d->scanPos;
538  break;
539  case 8:
540  if (cur < '0' || cur > '7')
541  end = d->scanPos;
542  break;
543  case 16:
544  if ((cur < '0' || cur > '9') && (cur < 'A' || cur > 'F') && (cur < 'a' || cur > 'f'))
545  end = d->scanPos;
546  break;
547  }
548  break;
549  }
550  [[gnu::fallthrough]];
551  case TK_FloatConst:
552  if (cur < '0' || cur > '9')
553  {
554  if (!floatHasDecimal && cur == '.')
555  {
556  floatHasDecimal = true;
557  break;
558  }
559  else if (!floatHasExponent && cur == 'e')
560  {
561  floatHasDecimal = true;
562  floatHasExponent = true;
563  if (d->scanPos + 1 < d->length)
564  {
565  char next = d->data[d->scanPos + 1];
566  if ((next < '0' || next > '9') && next != '+' && next != '-')
567  end = d->scanPos;
568  else
569  d->scanPos++;
570  }
571  break;
572  }
573  end = d->scanPos;
574  }
575  break;
576  case TK_StringConst:
577  if (cur == '"')
578  {
579  stringFinished = true;
580  end = d->scanPos;
581  d->scanPos++;
582  }
583  else if (cur == '\\')
584  d->scanPos++; // Will add two since the loop automatically adds one
585  break;
586  }
587  if (start == end && !stringFinished)
588  d->scanPos++;
589  else
590  break;
591  }
592  // Handle small tokens at the end of a file.
593  if (d->scanPos == d->length && !stringFinished)
594  end = d->scanPos;
595  }
596 
597  d->nextState.setScanPos(d->scanPos);
598  if (end - start > 0 || stringFinished)
599  {
600  d->nextState.setStr(QByteArray(d->data + start, end - start));
601  if (d->nextState.token() == TK_FloatConst)
602  {
603  if (floatHasDecimal && d->nextState.str().length() == 1)
604  {
605  // Don't treat a lone '.' as a decimal.
606  d->nextState.setToken('.');
607  }
608  else
609  {
610  d->nextState.setDecimal(d->nextState.str().toDouble(nullptr));
611  d->nextState.setNumber(static_cast<int>(d->nextState.decimal()));
612  d->nextState.setBoolean(d->nextState.number() != 0);
613  }
614  }
615  else if (d->nextState.token() == TK_IntConst)
616  {
617  d->nextState.setNumber(d->nextState.str().toUInt(nullptr, integerBase));
618  d->nextState.setDecimal(d->nextState.number());
619  d->nextState.setBoolean(d->nextState.number() != 0);
620  }
621  else if (d->nextState.token() == TK_Identifier)
622  {
623  // Check for a boolean constant.
624  if (d->nextState.str().compare("true") == 0)
625  {
626  d->nextState.setToken(TK_BoolConst);
627  d->nextState.setBoolean(true);
628  }
629  else if (d->nextState.str().compare("false") == 0)
630  {
631  d->nextState.setToken(TK_BoolConst);
632  d->nextState.setBoolean(false);
633  }
634  }
635  else if (d->nextState.token() == TK_StringConst)
636  {
637  QString str = d->nextState.str();
638  d->nextState.setStr(unescape(str));
639  }
640  if (autoExpandState)
641  expandState();
642  return true;
643  }
644  d->nextState.setToken(TK_NoToken);
645  if (autoExpandState)
646  expandState();
647  return false;
648 }
649 
650 void Scanner::mustGetToken(unsigned char token)
651 {
652  if (!checkToken(token))
653  {
654  expandState();
655  if (token < TK_NumSpecialTokens && d->state.token() < TK_NumSpecialTokens)
656  scriptMessage(Scanner::ML_ERROR, "Expected '%s' but got '%s' instead.", TokenNames[token], TokenNames[static_cast<unsigned>(d->state.token())]);
657  else if (token < TK_NumSpecialTokens && d->state.token() >= TK_NumSpecialTokens)
658  scriptMessage(Scanner::ML_ERROR, "Expected '%s' but got '%c' instead.", TokenNames[token], d->state.token());
659  else if (token >= TK_NumSpecialTokens && d->state.token() < TK_NumSpecialTokens)
660  scriptMessage(Scanner::ML_ERROR, "Expected '%c' but got '%s' instead.", token, TokenNames[static_cast<unsigned>(d->state.token())]);
661  else
662  scriptMessage(Scanner::ML_ERROR, "Expected '%c' but got '%c' instead.", token, d->state.token());
663  }
664 }
665 
666 void Scanner::rewind()
667 {
668  d->needNext = false;
669  d->nextState = d->state;
670  d->state = d->prevState;
671  d->scanPos = d->state.scanPos();
672 
673  d->line = d->prevState.tokenLine();
674  d->logicalPosition = d->prevState.tokenLinePosition();
675 }
676 
// Returns the scanner's own copy of the script text. The constructor copies
// exactly the given number of bytes, so no terminating NUL is added.
const char *Scanner::scriptData() const
{
	return d->data;
}
681 
682 void Scanner::scriptMessage(MessageLevel level, const char *error, ...) const
683 {
684  const char *messageLevel;
685  switch (level)
686  {
687  default:
688  messageLevel = "Notice";
689  break;
690  case ML_WARNING:
691  messageLevel = "Warning";
692  break;
693  case ML_ERROR:
694  messageLevel = "Error";
695  break;
696  }
697 
698  char *newMessage = new char[strlen(error) + d->scriptIdentifier.length() + 25];
699  sprintf(newMessage, "%s:%d:%d:%s: %s\n", d->scriptIdentifier.toUtf8().constData(), currentLine(), currentLinePos(), messageLevel, error);
700  va_list list;
701  va_start(list, error);
702  if (messageHandler)
703  messageHandler(level, newMessage, list);
704  else
705  vfprintf(stderr, newMessage, list);
706  va_end(list);
707  delete[] newMessage;
708 
709  if (!messageHandler && level == ML_ERROR)
710  exit(0);
711 }
712 
// Sets the name that scriptMessage() puts in front of every message.
void Scanner::setScriptIdentifier(const QString &ident)
{
	d->scriptIdentifier = ident;
}
717 
718 int Scanner::skipLine()
719 {
720  int ret = currentPos();
721  while (d->logicalPosition < d->length)
722  {
723  char thisChar = d->data[d->logicalPosition];
724  char nextChar = d->logicalPosition + 1 < d->length ? d->data[d->logicalPosition + 1] : 0;
725  if (thisChar == '\n' || thisChar == '\r')
726  {
727  ret = d->logicalPosition++; // Return the first newline character we see.
728  if (nextChar == '\r')
729  d->logicalPosition++;
730  incrementLine();
732  break;
733  }
734  d->logicalPosition++;
735  }
736  if (d->logicalPosition > d->scanPos)
737  {
738  d->scanPos = d->logicalPosition;
740  d->needNext = true;
741  d->logicalPosition = d->scanPos;
742  }
743  return ret;
744 }
745 
// Gives access to the current token.
Scanner::ParserState &Scanner::state()
{
	return d->state;
}

// Read-only access to the current token.
const Scanner::ParserState &Scanner::state() const
{
	return d->state;
}
755 
757 {
758  return d->scanPos < d->length;
759 }
760 
762 // NOTE: Be sure that '\\' is the first thing in the array otherwise it will re-escape.
763 static char escapeCharacters[] = {'\\', '"', 0};
764 const QString &Scanner::escape(QString &str)
765 {
766  for (unsigned int i = 0; escapeCharacters[i] != 0; i++)
767  {
768  // += 2 because we'll be inserting 1 character.
769  for (int p = 0; p < str.length() && (p = str.indexOf(escapeCharacters[i], p)) != -1; p += 2)
770  {
771  str.insert(p, '\\');
772  }
773  }
774  return str;
775 }
776 const QString &Scanner::unescape(QString &str)
777 {
778  for (unsigned int i = 0; escapeCharacters[i] != 0; i++)
779  {
780  QString sequence = "\\" + QString(escapeCharacters[i]);
781  for (int p = 0; p < str.length() && (p = str.indexOf(sequence, p)) != -1; p++)
782  str.replace(str.indexOf(sequence, p), 2, escapeCharacters[i]);
783  }
784  return str;
785 }
787 
// Creates a state; no member is assigned here.
Scanner::ParserState::ParserState()
{
}

Scanner::ParserState::~ParserState()
{
}

// Text of the token.
const QString &Scanner::ParserState::str() const
{
	return d->str;
}

void Scanner::ParserState::setStr(const QString &v)
{
	d->str = v;
}

// Integer value of the token.
unsigned int Scanner::ParserState::number() const
{
	return d->number;
}

void Scanner::ParserState::setNumber(unsigned int v)
{
	d->number = v;
}

// Floating point value of the token.
double Scanner::ParserState::decimal() const
{
	return d->decimal;
}

void Scanner::ParserState::setDecimal(double v)
{
	d->decimal = v;
}

// Boolean value of the token.
bool Scanner::ParserState::boolean() const
{
	return d->boolean;
}

void Scanner::ParserState::setBoolean(bool v)
{
	d->boolean = v;
}

// Type of the token: a TK_* constant or a literal character.
char Scanner::ParserState::token() const
{
	return d->token;
}

void Scanner::ParserState::setToken(char v)
{
	d->token = v;
}

// Line on which the token starts.
unsigned int Scanner::ParserState::tokenLine() const
{
	return d->tokenLine;
}

void Scanner::ParserState::setTokenLine(unsigned int v)
{
	d->tokenLine = v;
}

// Offset of the token from the start of its line.
unsigned int Scanner::ParserState::tokenLinePosition() const
{
	return d->tokenLinePosition;
}

void Scanner::ParserState::setTokenLinePosition(unsigned int v)
{
	d->tokenLinePosition = v;
}

// Scan position recorded for the token.
unsigned int Scanner::ParserState::scanPos() const
{
	return d->scanPos;
}

void Scanner::ParserState::setScanPos(unsigned int v)
{
	d->scanPos = v;
}