cprover
smt2_tokenizer.cpp
Go to the documentation of this file.
1 /*******************************************************************\
2 
3 Module:
4 
5 Author: Daniel Kroening, kroening@kroening.com
6 
7 \*******************************************************************/
8 
9 #include "smt2_tokenizer.h"
10 
11 #include <istream>
12 
14 {
15  // any non-empty sequence of letters, digits and the characters
16  // ~ ! @ $ % ^ & * _ - + = < > . ? /
17  // that does not start with a digit and is not a reserved word.
18 
19  return isalnum(ch) ||
20  ch=='~' || ch=='!' || ch=='@' || ch=='$' || ch=='%' ||
21  ch=='^' || ch=='&' || ch=='*' || ch=='_' || ch=='-' ||
22  ch=='+' || ch=='=' || ch=='<' || ch=='>' || ch=='.' ||
23  ch=='?' || ch=='/';
24 }
25 
27 {
28  // any non-empty sequence of letters, digits and the characters
29  // ~ ! @ $ % ^ & * _ - + = < > . ? /
30  // that does not start with a digit and is not a reserved word.
31 
32  buffer.clear();
33 
34  char ch;
35  while(in->get(ch))
36  {
38  {
39  buffer+=ch;
40  }
41  else
42  {
43  in->unget(); // put back
44  return SYMBOL;
45  }
46  }
47 
48  // eof -- this is ok here
49  if(buffer.empty())
50  return END_OF_FILE;
51  else
52  return SYMBOL;
53 }
54 
56 {
57  // we accept any sequence of digits and dots
58 
59  buffer.clear();
60 
61  char ch;
62  while(in->get(ch))
63  {
64  if(isdigit(ch) || ch=='.')
65  {
66  buffer+=ch;
67  }
68  else
69  {
70  in->unget(); // put back
71  return NUMERAL;
72  }
73  }
74 
75  // eof -- this is ok here
76  if(buffer.empty())
77  return END_OF_FILE;
78  else
79  return NUMERAL;
80 }
81 
83 {
84  // we accept any sequence of '0' or '1'
85 
86  buffer.clear();
87  buffer+='#';
88  buffer+='b';
89 
90  char ch;
91  while(in->get(ch))
92  {
93  if(ch=='0' || ch=='1')
94  {
95  buffer+=ch;
96  }
97  else
98  {
99  in->unget(); // put back
100  return NUMERAL;
101  }
102  }
103 
104  // eof -- this is ok here
105  if(buffer.empty())
106  return END_OF_FILE;
107  else
108  return NUMERAL;
109 }
110 
112 {
113  // we accept any sequence of '0'-'9', 'a'-'f', 'A'-'F'
114 
115  buffer.clear();
116  buffer+='#';
117  buffer+='x';
118 
119  char ch;
120  while(in->get(ch))
121  {
122  if(isxdigit(ch))
123  {
124  buffer+=ch;
125  }
126  else
127  {
128  in->unget(); // put back
129  return NUMERAL;
130  }
131  }
132 
133  // eof -- this is ok here
134  if(buffer.empty())
135  return END_OF_FILE;
136  else
137  return NUMERAL;
138 }
139 
141 {
142  // any sequence of printable ASCII characters (including space,
143  // tab, and line-breaking characters) except for the backslash
144  // character \, that starts and ends with | and does not otherwise
145  // contain |
146 
147  buffer.clear();
148 
149  char ch;
150  while(in->get(ch))
151  {
152  if(ch=='|')
153  return SYMBOL; // done
154 
155  buffer+=ch;
156 
157  if(ch=='\n')
158  line_no++;
159  }
160 
161  // Hmpf. Eof before end of quoted symbol. This is an error.
162  error() << "EOF within quoted symbol" << eom;
163  return ERROR;
164 }
165 
167 {
168  buffer.clear();
169 
170  char ch;
171  while(in->get(ch))
172  {
173  if(ch=='"')
174  {
175  // quotes may be escaped by repeating
176  if(in->get(ch))
177  {
178  if(ch=='"')
179  {
180  }
181  else
182  {
183  in->unget();
184  return STRING_LITERAL; // done
185  }
186  }
187  else
188  return STRING_LITERAL; // done
189  }
190  buffer+=ch;
191  }
192 
193  // Hmpf. Eof before end of string literal. This is an error.
194  error() << "EOF within string literal" << eom;
195  return ERROR;
196 }
197 
199 {
200  if(peeked)
201  return peeked=false, token;
202 
203  char ch;
204 
205  while(in->get(ch))
206  {
207  switch(ch)
208  {
209  case '\n':
210  line_no++;
211  break;
212 
213  case ' ':
214  case '\r':
215  case '\t':
216  case static_cast<char>(160): // non-breaking space
217  // skip any whitespace
218  break;
219 
220  case ';': // comment
221  // skip until newline
222  while(in->get(ch))
223  {
224  if(ch=='\n')
225  {
226  line_no++;
227  break;
228  }
229  }
230  break;
231 
232  case '(':
233  // produce sub-expression
234  return token=OPEN;
235 
236  case ')':
237  // done with sub-expression
238  return token=CLOSE;
239 
240  case '|': // quoted symbol
241  return token=get_quoted_symbol();
242 
243  case '"': // string literal
244  return token=get_string_literal();
245 
246  case ':': // keyword
247  return token=get_simple_symbol();
248 
249  case '#':
250  if(in->get(ch))
251  {
252  if(ch=='b')
253  return token=get_bin_numeral();
254  else if(ch=='x')
255  return token=get_hex_numeral();
256  else
257  {
258  error() << "unknown numeral token" << eom;
259  return token=ERROR;
260  }
261  }
262  else
263  {
264  error() << "unexpected EOF in numeral token" << eom;
265  return token=ERROR;
266  }
267  break;
268 
269  default: // likely a simple symbol or a numeral
270  if(isdigit(ch))
271  {
272  in->unget();
273  return token=get_decimal_numeral();
274  }
275  else if(is_simple_symbol_character(ch))
276  {
277  in->unget();
278  return token=get_simple_symbol();
279  }
280  else
281  {
282  // illegal character, error
283  error() << "unexpected character `" << ch << '\'' << eom;
284  return token=ERROR;
285  }
286  }
287  }
288 
289  return token=END_OF_FILE;
290 }
std::istream * in
Definition: parser.h:26
tokent get_decimal_numeral()
static mstreamt & eom(mstreamt &m)
Definition: message.h:272
tokent get_bin_numeral()
tokent get_hex_numeral()
enum { NONE, END_OF_FILE, ERROR, STRING_LITERAL, NUMERAL, SYMBOL, OPEN, CLOSE } tokent
tokent get_string_literal()
static bool is_simple_symbol_character(char)
#define STRING_LITERAL
tokent get_quoted_symbol()
unsigned line_no
Definition: parser.h:136
std::string buffer
#define CLOSE
Definition: xml_y.tab.cpp:147
mstreamt & error()
tokent get_simple_symbol()