1 /*
2 * ModeShape (http://www.modeshape.org)
3 * See the COPYRIGHT.txt file distributed with this work for information
4 * regarding copyright ownership. Some portions may be licensed
5 * to Red Hat, Inc. under one or more contributor license agreements.
6 * See the AUTHORS.txt file in the distribution for a full listing of
7 * individual contributors.
8 *
9 * ModeShape is free software. Unless otherwise indicated, all code in ModeShape
10 * is licensed to you under the terms of the GNU Lesser General Public License as
11 * published by the Free Software Foundation; either version 2.1 of
12 * the License, or (at your option) any later version.
13 *
14 * ModeShape is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this software; if not, write to the Free
21 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
22 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
23 */
24 package org.modeshape.common.text;
25
26 import java.util.ArrayList;
27 import java.util.Iterator;
28 import java.util.List;
29 import java.util.ListIterator;
30 import java.util.NoSuchElementException;
31 import net.jcip.annotations.Immutable;
32 import net.jcip.annotations.NotThreadSafe;
33 import org.modeshape.common.CommonI18n;
34 import org.modeshape.common.util.CheckArg;
35 import org.modeshape.common.xml.XmlCharacters;
36
37 /**
38 * A foundation for basic parsers that tokenizes input content and allows parsers to easily access and use those tokens. A
39 * {@link TokenStream} object literally represents the stream of {@link Token} objects that each represent a word, symbol, comment
40 * or other lexically-relevant piece of information. This simple framework makes it very easy to create a parser that walks
41 * through (or "consumes") the tokens in the order they appear and do something useful with that content (usually creating another
42 * representation of the content, such as some domain-specific Abstract Syntax Tree or object model).
43 * <p>
44 * </p>
45 * <h3>The parts</h3>
46 * <p>
47 * This simple framework consists of a couple of pieces that fit together to do the whole job of parsing input content.
48 * </p>
49 * <p>
50 * The {@link Tokenizer} is responsible for consuming the character-level input content and constructing {@link Token} objects for
51 * the different words, symbols, or other meaningful elements contained in the content. Each Token object is a simple object that
52 * records the character(s) that make up the token's value, but it does this in a very lightweight and efficient way by pointing
53 * to the original character stream. Each token can be assigned a parser-specific integral <i>token type</i> that may make it
54 * easier to quickly figure out later in the process what kind of information each token represents. The general idea is to
55 * keep the Tokenizer logic very simple, and very often Tokenizers will merely look for the different kinds of characters (e.g.,
56 * symbols, letters, digits, etc.) as well as things like quoted strings and comments. However, Tokenizers are never called by the
57 * parser, but instead are always given to the TokenStream that then calls the Tokenizer at the appropriate time.
58 * </p>
59 * <p>
60 * The {@link TokenStream} is supplied the input content, a Tokenizer implementation, and a few options. Its job is to prepare the
61 * content for processing, call the Tokenizer implementation to create the series of Token objects, and then provide an interface
62 * for walking through and consuming the tokens. This interface makes it possible to discover the value and type of the current
63 * token, and consume the current token and move to the next token. Plus, the interface has been designed to make the code that
64 * works with the tokens to be as readable as possible.
65 * </p>
66 * <p>
67 * The final component in this framework is the <b>Parser</b>. The parser is really any class that takes as input the content to
68 * be parsed and that outputs some meaningful information. The parser will do this by defining the Tokenizer, constructing a
69 * TokenStream object, and then using the TokenStream to walk through the sequence of Tokens and produce some meaningful
70 * representation of the content. Parsers can create instances of some object model, or they can create a domain-specific Abstract
71 * Syntax Tree representation.
72 * </p>
73 * <p>
74 * The benefit of breaking the responsibility along these lines is that the TokenStream implementation is able to encapsulate
75 * quite a bit of very tedious and very useful functionality, while still allowing a lot of flexibility as to what makes up the
76 * different tokens. It also makes the parser very easy to write and read (and thus maintain), without placing very many
77 * restrictions on how that logic is to be defined. Plus, because the TokenStream takes responsibility for tracking the positions
78 * of every token (including line and column numbers), it can automatically produce meaningful errors.
79 * </p>
80 * <h3>Consuming tokens</h3>
81 * <p>
82 * A parser works with the tokens on the TokenStream using a variety of methods:
83 * <ul>
84 * <li>The {@link #start()} method must be called before any of the other methods. It performs initialization and tokenizing, and
85 * prepares the internal state by finding the first token and setting an internal <i>current token</i> reference.</li>
86 * <li>The {@link #hasNext()} method can be called repeatedly to determine if there is another token after the <i>current
87 * token</i>. This is often useful when an unknown number of tokens is to be processed, and behaves very similarly to the
88 * {@link Iterator#hasNext()} method.</li>
89 * <li>The {@link #consume()} method returns the {@link Token#value() value} of the <i>current token</i> and moves the <i>current
90 * token</i> pointer to the next available token.</li>
91 * <li>The {@link #consume(String)} and {@link #consume(char)} methods look at the <i>current token</i> and ensure the token's
92 * {@link Token#value() value} matches the value supplied as a method parameter, or they throw a {@link ParsingException} if the
93 * values don't match. The {@link #consume(int)} method works similarly, except that it attempts to match the token's
94 * {@link Token#type() type}. And, the {@link #consume(String, String...)} is a convenience method that is equivalent to calling
95 * {@link #consume(String)} for each of the arguments.</li>
96 * <li>The {@link #canConsume(String)} and {@link #canConsume(char)} methods look at the <i>current token</i> and check whether
97 * the token's {@link Token#value() value} matches the value supplied as a method parameter. If there is a match, the method
98 * advances the <i>current token</i> reference and returns true. Otherwise, the <i>current token</i> does not match and the method
99 * returns false without advancing the <i>current token</i> reference or throwing a ParsingException. Similarly, the
100 * {@link #canConsume(int)} method checks the token's {@link Token#type() type} rather than the value, consuming the token and
101 * returning true if there is a match, or just returning false if there is no match. The {@link #canConsume(String, String...)}
102 * method determines whether all of the supplied values can be consumed in the given order.</li>
103 * <li>The {@link #matches(String)} and {@link #matches(char)} methods look at the <i>current token</i> and check whether the
104 * token's {@link Token#value() value} matches the value supplied as a method parameter. The method then returns whether there was
105 * a match, but does <i>not</i> advance the <i>current token</i> pointer. Similarly, the {@link #matches(int)} method checks the
106 * token's {@link Token#type() type} rather than the value. The {@link #matches(String, String...)} method is a convenience method
107 * that is equivalent to calling {@link #matches(String)} for each of the arguments, and the {@link #matches(int, int...)} method
108 * is a convenience method that is equivalent to calling {@link #matches(int)} for each of the arguments.</li>
110 * <li>The {@link #matchesAnyOf(String, String...)} methods look at the <i>current token</i> and check whether the token's
111 * {@link Token#value() value} matches at least one of the values supplied as method parameters. The method then returns whether
112 * there was a match, but does <i>not</i> advance the <i>current token</i> pointer. Similarly, the
113 * {@link #matchesAnyOf(int, int...)} method checks the token's {@link Token#type() type} rather than the value.</li> </ul>
114 * </p>
115 * <p>
116 * With these methods, it's very easy to create a parser that looks at the current token to decide what to do, and then consume
117 * that token, and repeat this process.
118 * </p>
119 * <h3>Example parser</h3>
120 * <p>
121 * Here is an example of a very simple parser that parses very simple and limited SQL <code>SELECT</code> and <code>DELETE</code>
122 * statements, such as <code>SELECT * FROM Customers</code> or
123 * <code>SELECT Name, StreetAddress AS Address, City, Zip FROM Customers</code> or
124 * <code>DELETE FROM Customers WHERE Zip=12345</code>:
125 *
126 * <pre>
127 * public class SampleSqlSelectParser {
128 * public List<Statement> parse( String ddl ) {
129 * TokenStream tokens = new TokenStream(ddl, new SqlTokenizer(), false);
130 * List<Statement> statements = new LinkedList<Statement>();
131 * tokens.start();
132 * while (tokens.hasNext()) {
133 * if (tokens.matches("SELECT")) {
134 * statements.add(parseSelect(tokens));
135 * } else {
136 * statements.add(parseDelete(tokens));
137 * }
138 * }
139 * return statements;
140 * }
141 *
142 * protected Select parseSelect( TokenStream tokens ) throws ParsingException {
143 * tokens.consume("SELECT");
144 * List<Column> columns = parseColumns(tokens);
145 * tokens.consume("FROM");
146 * String tableName = tokens.consume();
147 * return new Select(tableName, columns);
148 * }
149 *
150 * protected List<Column> parseColumns( TokenStream tokens ) throws ParsingException {
151 * List<Column> columns = new LinkedList<Column>();
152 * if (tokens.matches('*')) {
153 * tokens.consume(); // leave the columns empty to signal wildcard
154 * } else {
155 * // Read names until we see a ','
156 * do {
157 * String columnName = tokens.consume();
158 * if (tokens.canConsume("AS")) {
159 * String columnAlias = tokens.consume();
160 * columns.add(new Column(columnName, columnAlias));
161 * } else {
162 * columns.add(new Column(columnName, null));
163 * }
164 * } while (tokens.canConsume(','));
165 * }
166 * return columns;
167 * }
168 *
169 * protected Delete parseDelete( TokenStream tokens ) throws ParsingException {
170 * tokens.consume("DELETE", "FROM");
171 * String tableName = tokens.consume();
172 * tokens.consume("WHERE");
173 * String lhs = tokens.consume();
174 * tokens.consume('=');
175 * String rhs = tokens.consume();
176 * return new Delete(tableName, new Criteria(lhs, rhs));
177 * }
178 * }
179 * public abstract class Statement { ... }
180 * public class Select extends Statement { ... }
181 * public class Delete extends Statement { ... }
182 * public class Column { ... }
183 * </pre>
184 *
185 * This example shows an idiomatic way of writing a parser that is stateless and thread-safe. The <code>parse(...)</code> method
186 * takes the input as a parameter, and returns the domain-specific representation that resulted from the parsing. All other
187 * methods are utility methods that simply encapsulate common logic or make the code more readable.
188 * </p>
189 * <p>
190 * In the example, the <code>parse(...)</code> first creates a TokenStream object (using a Tokenizer implementation that is not
191 * shown), and then loops as long as there are more tokens to read. As it loops, if the next token is "SELECT", the parser calls
192 * the <code>parseSelect(...)</code> method which immediately consumes a "SELECT" token, the names of the columns separated by
193 * commas (or a '*' if all columns are to be selected), a "FROM" token, and the name of the table being queried. The
194 * <code>parseSelect(...)</code> method returns a <code>Select</code> object, which is then added to the list of statements in the
195 * <code>parse(...)</code> method. The parser handles the "DELETE" statements in a similar manner.
196 * </p>
197 * <h3>Case sensitivity</h3>
198 * <p>
199 * Very often grammars do not require the case of keywords to match. This can make parsing a challenge, because all combinations
200 * of case need to be used. The TokenStream framework provides a very simple solution that requires no more effort than providing
201 * a boolean parameter to the constructor.
202 * </p>
203 * <p>
204 * When a <code>false</code> value is provided for the <code>caseSensitive</code> parameter, the TokenStream performs all
205 * matching operations as if each token's value were in uppercase only. This means that the arguments supplied to the
206 * <code>matches(...)</code>, <code>canConsume(...)</code>, and <code>consume(...)</code> methods should be upper-cased. Note that
207 * the <i>actual value</i> of each token remains the <i>actual</i> case as it appears in the input.
208 * </p>
209 * <p>
210 * Of course, when the TokenStream is created with a <code>true</code> value for the <code>caseSensitive</code> parameter, the
211 * matching is performed using the <i>actual</i> value as it appears in the input content
212 * </p>
213 * <h3>Whitespace</h3>
214 * <p>
215 * Many grammars are independent of lines breaks or whitespace, allowing a lot of flexibility when writing the content. The
216 * TokenStream framework makes it very easy to ignore line breaks and whitespace. To do so, the Tokenizer implementation must
217 * simply not include the line break character sequences and whitespace in the token ranges. Since none of the tokens contain
218 * whitespace, the parser never has to deal with them.
219 * </p>
220 * <p>
221 * Of course, many parsers will require that some whitespace be included. For example, whitespace within a quoted string may be
222 * needed by the parser. In this case, the Tokenizer should simply include the whitespace characters in the tokens.
223 * </p>
224 * <h3>Writing a Tokenizer</h3>
225 * <p>
226 * Each parser will likely have its own {@link Tokenizer} implementation that contains the parser-specific logic about how to
227 * break the content into token objects. Generally, the easiest way to do this is to simply iterate through the character sequence
228 * passed into the {@link Tokenizer#tokenize(CharacterStream, Tokens) tokenize(...)} method, and use a switch statement to decide
229 * what to do.
230 * </p>
231 * <p>
232 * Here is the code for a very basic Tokenizer implementation that ignores whitespace, line breaks and Java-style (multi-line and
233 * end-of-line) comments, while constructing single tokens for each quoted string.
234 *
235 * <pre>
236 * public class BasicTokenizer implements Tokenizer {
237 * public void tokenize( CharacterStream input,
238 * Tokens tokens ) throws ParsingException {
239 * while (input.hasNext()) {
240 * char c = input.next();
241 * switch (c) {
242 * case ' ':
243 * case '\t':
244 * case '\n':
245 * case '\r':
246 * // Just skip these whitespace characters ...
247 * break;
248 * case '-':
249 * case '(':
250 * case ')':
251 * case '{':
252 * case '}':
253 * case '*':
254 * case ',':
255 * case ';':
256 * case '+':
257 * case '%':
258 * case '?':
259 * case '$':
260 * case '[':
261 * case ']':
262 * case '!':
263 * case '<':
264 * case '>':
265 * case '|':
266 * case '=':
267 * case ':':
268 * tokens.addToken(input.index(), input.index() + 1, SYMBOL);
269 * break;
270 * case '.':
271 * tokens.addToken(input.index(), input.index() + 1, DECIMAL);
272 * break;
273 * case '\"':
275 * int startIndex = input.index();
276 * Position startingPosition = input.position();
277 * boolean foundClosingQuote = false;
278 * while (input.hasNext()) {
279 * c = input.next();
280 * if (c == '\\' && input.isNext('"')) {
281 * c = input.next(); // consume the " character since it is escaped
282 * } else if (c == '"') {
283 * foundClosingQuote = true;
284 * break;
285 * }
286 * }
287 * if (!foundClosingQuote) {
288 * throw new ParsingException(startingPosition, "No matching closing double quote found");
289 * }
290 * int endIndex = input.index() + 1; // beyond last character read
291 * tokens.addToken(startIndex, endIndex, DOUBLE_QUOTED_STRING);
292 * break;
293 * case '\'':
294 * startIndex = input.index();
295 * startingPosition = input.position();
296 * foundClosingQuote = false;
297 * while (input.hasNext()) {
298 * c = input.next();
299 * if (c == '\\' && input.isNext('\'')) {
300 * c = input.next(); // consume the ' character since it is escaped
301 * } else if (c == '\'') {
302 * foundClosingQuote = true;
303 * break;
304 * }
305 * }
306 * if (!foundClosingQuote) {
307 * throw new ParsingException(startingPosition, "No matching closing single quote found");
308 * }
309 * endIndex = input.index() + 1; // beyond last character read
310 * tokens.addToken(startIndex, endIndex, SINGLE_QUOTED_STRING);
311 * break;
312 * case '/':
313 * startIndex = input.index();
314 * if (input.isNext('/')) {
315 * // End-of-line comment ...
316 * boolean foundLineTerminator = false;
317 * while (input.hasNext()) {
318 * c = input.next();
319 * if (c == '\n' || c == '\r') {
320 * foundLineTerminator = true;
321 * break;
322 * }
323 * }
324 * endIndex = input.index(); // the token won't include the '\n' or '\r' character(s)
325 * if (!foundLineTerminator) ++endIndex; // must point beyond last char
326 * if (c == '\r' && input.isNext('\n')) input.next();
327 * if (useComments) {
328 * tokens.addToken(startIndex, endIndex, COMMENT);
329 * }
330 * } else if (input.isNext('*')) {
331 * // Multi-line comment ...
332 * while (input.hasNext() && !input.isNext('*', '/')) {
333 * c = input.next();
334 * }
335 * if (input.hasNext()) input.next(); // consume the '*'
336 * if (input.hasNext()) input.next(); // consume the '/'
337 * if (useComments) {
338 * endIndex = input.index() + 1; // the token will include the '/' and '*' characters
339 * tokens.addToken(startIndex, endIndex, COMMENT);
340 * }
341 * } else {
342 * // just a regular slash ...
343 * tokens.addToken(startIndex, startIndex + 1, SYMBOL);
344 * }
345 * break;
346 * default:
347 * startIndex = input.index();
348 * // Read until another whitespace/symbol/decimal/slash is found
349 * while (input.hasNext() && !(input.isNextWhitespace() || input.isNextAnyOf("/.-(){}*,;+%?$[]!<>|=:"))) {
350 * c = input.next();
351 * }
352 * endIndex = input.index() + 1; // beyond last character that was included
353 * tokens.addToken(startIndex, endIndex, WORD);
354 * }
355 * }
356 * }
357 * }
358 * </pre>
359 * Tokenizers with exactly this behavior can actually be created using the {@link #basicTokenizer(boolean)} method. So while this very
360 * basic implementation is not meant to be used in all situations, it may be useful in some situations.
361 * </p>
362 */
363 @NotThreadSafe
364 public class TokenStream {
365
366 /**
367 * A constant that can be used with the {@link #matches(String)}, {@link #matches(String, String...)},
368 * {@link #consume(String)}, {@link #consume(String, String...)}, {@link #canConsume(String)} and
369 * {@link #canConsume(String, String...)} methods to signal that any value is allowed to be matched.
370 * <p>
371 * Note that this exact instance must be used; an equivalent string will not work.
372 * </p>
373 */
374 public static final String ANY_VALUE = "any value";
375 /**
376 * A constant that can be used with the {@link #matches(int)}, {@link #matches(int, int...)}, {@link #consume(int)}, and
377 * {@link #canConsume(int)} methods to signal that any token type is allowed to be matched.
378 */
379 public static final int ANY_TYPE = Integer.MIN_VALUE;
380
381 protected final String inputString;
382 protected final String inputUppercased;
383 private final char[] inputContent;
384 private final boolean caseSensitive;
385 private final Tokenizer tokenizer;
386 private List<Token> tokens;
387 /**
388 * This class navigates the Token objects using this iterator. However, because it very often needs to access the
389 * "current token" in the "consume(...)" and "canConsume(...)" and "matches(...)" methods, the class caches a "current token"
390 * and makes this iterator point to the 2nd token.
391 *
392 * <pre>
393 * T1 T2 T3 T4 T5
394 * ˆ ˆ ˆ
395 * | | |
396 * | | +- The position of the tokenIterator, where tokenIterator.next() will return T3
397 * | +---- The token referenced by currentToken
398 * +-------- The logical position of the TokenStream object, where the "consume()" would return T2
399 * </pre>
400 */
401 private ListIterator<Token> tokenIterator;
402 private Token currentToken;
403 private boolean completed;
404
/**
 * Create a token stream over the supplied content. Nothing is tokenized here: the content and tokenizer are only captured,
 * and the tokenizer is invoked later by {@link #start()}.
 *
 * @param content the input content to be tokenized; checked with {@code CheckArg.isNotNull}
 * @param tokenizer the tokenizer that will break the content into tokens; checked with {@code CheckArg.isNotNull}
 * @param caseSensitive true if matching should use the content exactly as supplied, or false if an upper-cased copy of the
 *        content should be kept for matching
 */
public TokenStream( String content,
                    Tokenizer tokenizer,
                    boolean caseSensitive ) {
    CheckArg.isNotNull(content, "content");
    CheckArg.isNotNull(tokenizer, "tokenizer");
    this.inputString = content;
    this.inputContent = content.toCharArray();
    this.caseSensitive = caseSensitive;
    // Upper-case with a fixed locale: the no-argument toUpperCase() follows the JVM's default locale, so under (for example)
    // a Turkish locale 'i' becomes a dotted capital 'İ' and a keyword such as "insert" would no longer match "INSERT".
    // NOTE(review): toUpperCase can still change the length of the text for a few characters (e.g. 'ß' -> "SS"); if tokens
    // index into inputUppercased by position, such input could misalign - confirm against the token factories.
    this.inputUppercased = caseSensitive ? content : content.toUpperCase(java.util.Locale.ROOT);
    this.tokenizer = tokenizer;
}
416
417 /**
418 * Begin the token stream, including (if required) the tokenization of the input content.
419 *
420 * @return this object for easy method chaining; never null
421 * @throws ParsingException if an error occurs during tokenization of the content
422 */
423 public TokenStream start() throws ParsingException {
424 // Create the tokens ...
425 if (tokens == null) {
426 TokenFactory tokenFactory = caseSensitive ? new CaseSensitiveTokenFactory() : new CaseInsensitiveTokenFactory();
427 CharacterStream characterStream = new CharacterArrayStream(inputContent);
428 tokenizer.tokenize(characterStream, tokenFactory);
429 this.tokens = initializeTokens(tokenFactory.getTokens());
430 }
431
432 // Create the iterator ...
433 tokenIterator = this.tokens.listIterator();
434 moveToNextToken();
435 return this;
436 }
437
438 /**
439 * Method to allow subclasses to preprocess the set of tokens and return the correct tokens to use. The default behavior is to
440 * simply return the supplied tokens.
441 *
442 * @param tokens
443 * @return list of tokens.
444 */
445 protected List<Token> initializeTokens( List<Token> tokens ) {
446 return tokens;
447 }
448
449 /**
450 * Method to allow tokens to be re-used from the start without re-tokenizing content.
451 */
452 public void rewind() {
453 // recreate the iterator ...
454 tokenIterator = this.tokens.listIterator();
455 completed = false;
456 currentToken = null;
457 moveToNextToken();
458 }
459
460 /**
461 * Get the position of the previous token.
462 *
463 * @return the previous token's position; never null
464 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
465 * @throws NoSuchElementException if there is no previous token
466 */
467 public Position previousPosition() {
468 return previousToken().position();
469 }
470
471 /**
472 * Get the position of the next (or current) token.
473 *
474 * @return the current token's position; never null
475 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
476 * @throws NoSuchElementException if there is no previous token
477 */
478 public Position nextPosition() {
479 return currentToken().position();
480 }
481
482 /**
483 * Convert the value of this token to an integer, return it, and move to the next token.
484 *
485 * @return the current token's value, converted to an integer
486 * @throws ParsingException if there is no such token to consume, or if the token cannot be converted to an integer
487 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
488 */
489 public int consumeInteger() throws ParsingException, IllegalStateException {
490 if (completed) throwNoMoreContent();
491 // Get the value from the current token ...
492 String value = currentToken().value();
493 try {
494 int result = Integer.parseInt(value);
495 moveToNextToken();
496 return result;
497 } catch (NumberFormatException e) {
498 Position position = currentToken().position();
499 String msg = CommonI18n.expectingValidIntegerAtLineAndColumn.text(value, position.getLine(), position.getColumn());
500 throw new ParsingException(position, msg);
501 }
502 }
503
504 /**
505 * Convert the value of this token to a long, return it, and move to the next token.
506 *
507 * @return the current token's value, converted to an integer
508 * @throws ParsingException if there is no such token to consume, or if the token cannot be converted to a long
509 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
510 */
511 public long consumeLong() throws ParsingException, IllegalStateException {
512 if (completed) throwNoMoreContent();
513 // Get the value from the current token ...
514 String value = currentToken().value();
515 try {
516 long result = Long.parseLong(value);
517 moveToNextToken();
518 return result;
519 } catch (NumberFormatException e) {
520 Position position = currentToken().position();
521 String msg = CommonI18n.expectingValidLongAtLineAndColumn.text(value, position.getLine(), position.getColumn());
522 throw new ParsingException(position, msg);
523 }
524 }
525
526 /**
527 * Convert the value of this token to an integer, return it, and move to the next token.
528 *
529 * @return the current token's value, converted to an integer
530 * @throws ParsingException if there is no such token to consume, or if the token cannot be converted to an integer
531 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
532 */
533 public boolean consumeBoolean() throws ParsingException, IllegalStateException {
534 if (completed) throwNoMoreContent();
535 // Get the value from the current token ...
536 String value = currentToken().value();
537 try {
538 boolean result = Boolean.parseBoolean(value);
539 moveToNextToken();
540 return result;
541 } catch (NumberFormatException e) {
542 Position position = currentToken().position();
543 String msg = CommonI18n.expectingValidBooleanAtLineAndColumn.text(value, position.getLine(), position.getColumn());
544 throw new ParsingException(position, msg);
545 }
546 }
547
548 /**
549 * Return the value of this token and move to the next token.
550 *
551 * @return the value of the current token
552 * @throws ParsingException if there is no such token to consume
553 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
554 */
555 public String consume() throws ParsingException, IllegalStateException {
556 if (completed) throwNoMoreContent();
557 // Get the value from the current token ...
558 String result = currentToken().value();
559 moveToNextToken();
560 return result;
561 }
562
563 protected void throwNoMoreContent() throws ParsingException {
564 String msg = CommonI18n.noMoreContent.text();
565 Position pos = tokens.isEmpty() ? new Position(-1, 1, 0) : tokens.get(tokens.size() - 1).position();
566 throw new ParsingException(pos, msg);
567 }
568
569 /**
570 * Attempt to consume this current token as long as it matches the expected value, or throw an exception if the token does not
571 * match.
572 * <p>
573 * The {@link #ANY_VALUE ANY_VALUE} constant can be used in the expected values as a wildcard.
574 * </p>
575 *
576 * @param expected the expected value of the current token
577 * @throws ParsingException if the current token doesn't match the supplied value
578 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
579 */
580 public void consume( String expected ) throws ParsingException, IllegalStateException {
581 if (completed) {
582 String msg = CommonI18n.noMoreContentButWasExpectingToken.text(expected);
583 throw new ParsingException(tokens.get(tokens.size() - 1).position(), msg);
584 }
585 // Get the value from the current token ...
586 if (expected != ANY_VALUE && !currentToken().matches(expected)) {
587 String found = currentToken().value();
588 Position pos = currentToken().position();
589 String fragment = generateFragment();
590 String msg = CommonI18n.unexpectedToken.text(expected, found, pos.getLine(), pos.getColumn(), fragment);
591 throw new ParsingException(pos, msg);
592 }
593 moveToNextToken();
594 }
595
596 /**
597 * Attempt to consume this current token as long as it matches the expected character, or throw an exception if the token does
598 * not match.
599 *
600 * @param expected the expected character of the current token
601 * @throws ParsingException if the current token doesn't match the supplied value
602 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
603 */
604 public void consume( char expected ) throws ParsingException, IllegalStateException {
605 if (completed) {
606 String msg = CommonI18n.noMoreContentButWasExpectingCharacter.text(expected);
607 throw new ParsingException(tokens.get(tokens.size() - 1).position(), msg);
608 }
609 // Get the value from the current token ...
610 if (!currentToken().matches(expected)) {
611 String found = currentToken().value();
612 Position pos = currentToken().position();
613 String fragment = generateFragment();
614 String msg = CommonI18n.unexpectedCharacter.text(expected, found, pos.getLine(), pos.getColumn(), fragment);
615 throw new ParsingException(pos, msg);
616 }
617 moveToNextToken();
618 }
619
620 /**
621 * Attempt to consume this current token as long as it matches the expected character, or throw an exception if the token does
622 * not match.
623 * <p>
624 * The {@link #ANY_TYPE ANY_TYPE} constant can be used in the expected values as a wildcard.
625 * </p>
626 *
627 * @param expectedType the expected token type of the current token
628 * @throws ParsingException if the current token doesn't match the supplied value
629 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
630 */
631 public void consume( int expectedType ) throws ParsingException, IllegalStateException {
632 if (completed) {
633 String msg = CommonI18n.noMoreContentButWasExpectingTokenType.text(expectedType);
634 throw new ParsingException(tokens.get(tokens.size() - 1).position(), msg);
635 }
636 // Get the value from the current token ...
637 if (expectedType != ANY_TYPE && currentToken().type() != expectedType) {
638 String found = currentToken().value();
639 Position pos = currentToken().position();
640 String fragment = generateFragment();
641 String msg = CommonI18n.unexpectedTokenType.text(expectedType, found, pos.getLine(), pos.getColumn(), fragment);
642 throw new ParsingException(pos, msg);
643 }
644 moveToNextToken();
645 }
646
647 /**
648 * Attempt to consume this current token as the next tokens as long as they match the expected values, or throw an exception
649 * if the token does not match.
650 * <p>
651 * The {@link #ANY_VALUE ANY_VALUE} constant can be used in the expected values as a wildcard.
652 * </p>
653 *
654 * @param expected the expected value of the current token
655 * @param expectedForNextTokens the expected values fo the following tokens
656 * @throws ParsingException if the current token doesn't match the supplied value
657 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
658 */
659 public void consume( String expected,
660 String... expectedForNextTokens ) throws ParsingException, IllegalStateException {
661 consume(expected);
662 for (String nextExpected : expectedForNextTokens) {
663 consume(nextExpected);
664 }
665 }
666
667 /**
668 * Attempt to consume this current token as the next tokens as long as they match the expected values, or throw an exception
669 * if the token does not match.
670 * <p>
671 * The {@link #ANY_VALUE ANY_VALUE} constant can be used in the expected values as a wildcard.
672 * </p>
673 *
674 * @param nextTokens the expected values for the next tokens
675 * @throws ParsingException if the current token doesn't match the supplied value
676 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
677 */
678 public void consume( String[] nextTokens ) throws ParsingException, IllegalStateException {
679 for (String nextExpected : nextTokens) {
680 consume(nextExpected);
681 }
682 }
683
684 /**
685 * Attempt to consume this current token as the next tokens as long as they match the expected values, or throw an exception
686 * if the token does not match.
687 * <p>
688 * The {@link #ANY_VALUE ANY_VALUE} constant can be used in the expected values as a wildcard.
689 * </p>
690 *
691 * @param nextTokens the expected values for the next tokens
692 * @throws ParsingException if the current token doesn't match the supplied value
693 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
694 */
695 public void consume( Iterable<String> nextTokens ) throws ParsingException, IllegalStateException {
696 for (String nextExpected : nextTokens) {
697 consume(nextExpected);
698 }
699 }
700
701 /**
702 * Attempt to consume this current token if it matches the expected value, and return whether this method was indeed able to
703 * consume the token.
704 * <p>
705 * The {@link #ANY_VALUE ANY_VALUE} constant can be used in the expected value as a wildcard.
706 * </p>
707 *
708 * @param expected the expected value of the current token token
709 * @return true if the current token did match and was consumed, or false if the current token did not match and therefore was
710 * not consumed
711 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
712 */
713 public boolean canConsume( String expected ) throws IllegalStateException {
714 if (!matches(expected)) return false;
715 moveToNextToken();
716 return true;
717 }
718
719 /**
720 * Attempt to consume this current token if it matches the expected value, and return whether this method was indeed able to
721 * consume the token.
722 *
723 * @param expected the expected value of the current token token
724 * @return true if the current token did match and was consumed, or false if the current token did not match and therefore was
725 * not consumed
726 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
727 */
728 public boolean canConsume( char expected ) throws IllegalStateException {
729 if (!matches(expected)) return false;
730 moveToNextToken();
731 return true;
732 }
733
734 /**
735 * Attempt to consume this current token if it matches the expected token type, and return whether this method was indeed able
736 * to consume the token.
737 * <p>
738 * The {@link #ANY_TYPE ANY_TYPE} constant can be used in the expected type as a wildcard.
739 * </p>
740 *
741 * @param expectedType the expected token type of the current token
742 * @return true if the current token did match and was consumed, or false if the current token did not match and therefore was
743 * not consumed
744 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
745 */
746 public boolean canConsume( int expectedType ) throws IllegalStateException {
747 if (!matches(expectedType)) return false;
748 moveToNextToken();
749 return true;
750 }
751
752 /**
753 * Attempt to consume this current token and the next tokens if and only if they match the expected values, and return whether
754 * this method was indeed able to consume all of the supplied tokens.
755 * <p>
756 * This is <i>not</i> the same as calling {@link #canConsume(String)} for each of the supplied arguments, since this method
757 * ensures that <i>all</i> of the supplied values can be consumed.
758 * </p>
759 * <p>
760 * This method <i>is</i> equivalent to calling the following:
761 *
762 * <pre>
763 *
764 * if (tokens.matches(currentExpected, expectedForNextTokens)) {
765 * tokens.consume(currentExpected, expectedForNextTokens);
766 * }
767 *
768 * </pre>
769 *
770 * </p>
771 * <p>
772 * The {@link #ANY_VALUE ANY_VALUE} constant can be used in the expected values as a wildcard.
773 * </p>
774 *
775 * @param currentExpected the expected value of the current token
776 * @param expectedForNextTokens the expected values fo the following tokens
777 * @return true if the current token did match and was consumed, or false if the current token did not match and therefore was
778 * not consumed
779 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
780 */
781 public boolean canConsume( String currentExpected,
782 String... expectedForNextTokens ) throws IllegalStateException {
783 if (completed) return false;
784 ListIterator<Token> iter = tokens.listIterator(tokenIterator.previousIndex());
785 if (!iter.hasNext()) return false;
786 Token token = iter.next();
787 if (currentExpected != ANY_VALUE && !token.matches(currentExpected)) return false;
788 for (String nextExpected : expectedForNextTokens) {
789 if (!iter.hasNext()) return false;
790 token = iter.next();
791 if (nextExpected == ANY_VALUE) continue;
792 if (!token.matches(nextExpected)) return false;
793 }
794 this.tokenIterator = iter;
795 this.currentToken = tokenIterator.hasNext() ? tokenIterator.next() : null;
796 this.completed = this.currentToken == null;
797 return true;
798 }
799
800 /**
801 * Attempt to consume this current token and the next tokens if and only if they match the expected values, and return whether
802 * this method was indeed able to consume all of the supplied tokens.
803 * <p>
804 * This is <i>not</i> the same as calling {@link #canConsume(String)} for each of the supplied arguments, since this method
805 * ensures that <i>all</i> of the supplied values can be consumed.
806 * </p>
807 * <p>
808 * This method <i>is</i> equivalent to calling the following:
809 *
810 * <pre>
811 *
812 * if (tokens.matches(currentExpected, expectedForNextTokens)) {
813 * tokens.consume(currentExpected, expectedForNextTokens);
814 * }
815 *
816 * </pre>
817 *
818 * </p>
819 * <p>
820 * The {@link #ANY_VALUE ANY_VALUE} constant can be used in the expected values as a wildcard.
821 * </p>
822 *
823 * @param nextTokens the expected values of the next tokens
824 * @return true if the current token did match and was consumed, or false if the current token did not match and therefore was
825 * not consumed
826 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
827 */
828 public boolean canConsume( String[] nextTokens ) throws IllegalStateException {
829 if (completed) return false;
830 ListIterator<Token> iter = tokens.listIterator(tokenIterator.previousIndex());
831 Token token = null;
832 for (String nextExpected : nextTokens) {
833 if (!iter.hasNext()) return false;
834 token = iter.next();
835 if (nextExpected == ANY_VALUE) continue;
836 if (!token.matches(nextExpected)) return false;
837 }
838 this.tokenIterator = iter;
839 this.currentToken = tokenIterator.hasNext() ? tokenIterator.next() : null;
840 this.completed = this.currentToken == null;
841 return true;
842 }
843
844 /**
845 * Attempt to consume this current token and the next tokens if and only if they match the expected values, and return whether
846 * this method was indeed able to consume all of the supplied tokens.
847 * <p>
848 * This is <i>not</i> the same as calling {@link #canConsume(String)} for each of the supplied arguments, since this method
849 * ensures that <i>all</i> of the supplied values can be consumed.
850 * </p>
851 * <p>
852 * This method <i>is</i> equivalent to calling the following:
853 *
854 * <pre>
855 *
856 * if (tokens.matches(currentExpected, expectedForNextTokens)) {
857 * tokens.consume(currentExpected, expectedForNextTokens);
858 * }
859 *
860 * </pre>
861 *
862 * </p>
863 * <p>
864 * The {@link #ANY_VALUE ANY_VALUE} constant can be used in the expected values as a wildcard.
865 * </p>
866 *
867 * @param nextTokens the expected values of the next tokens
868 * @return true if the current token did match and was consumed, or false if the current token did not match and therefore was
869 * not consumed
870 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
871 */
872 public boolean canConsume( Iterable<String> nextTokens ) throws IllegalStateException {
873 if (completed) return false;
874 ListIterator<Token> iter = tokens.listIterator(tokenIterator.previousIndex());
875 Token token = null;
876 for (String nextExpected : nextTokens) {
877 if (!iter.hasNext()) return false;
878 token = iter.next();
879 if (nextExpected == ANY_VALUE) continue;
880 if (!token.matches(nextExpected)) return false;
881 }
882 this.tokenIterator = iter;
883 this.currentToken = tokenIterator.hasNext() ? tokenIterator.next() : null;
884 this.completed = this.currentToken == null;
885 return true;
886 }
887
888 /**
889 * Attempt to consume the next token if it matches one of the supplied values.
890 *
891 * @param firstOption the first option for the value of the current token
892 * @param additionalOptions the additional options for the value of the current token
893 * @return true if the current token's value did match one of the suplied options, or false otherwise
894 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
895 */
896 public boolean canConsumeAnyOf( String firstOption,
897 String... additionalOptions ) throws IllegalStateException {
898 if (completed) return false;
899 if (canConsume(firstOption)) return true;
900 for (String nextOption : additionalOptions) {
901 if (canConsume(nextOption)) return true;
902 }
903 return false;
904 }
905
906 /**
907 * Attempt to consume the next token if it matches one of the supplied values.
908 *
909 * @param options the options for the value of the current token
910 * @return true if the current token's value did match one of the suplied options, or false otherwise
911 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
912 */
913 public boolean canConsumeAnyOf( String[] options ) throws IllegalStateException {
914 if (completed) return false;
915 for (String option : options) {
916 if (canConsume(option)) return true;
917 }
918 return false;
919 }
920
921 /**
922 * Attempt to consume the next token if it matches one of the supplied values.
923 *
924 * @param options the options for the value of the current token
925 * @return true if the current token's value did match one of the suplied options, or false otherwise
926 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
927 */
928 public boolean canConsumeAnyOf( Iterable<String> options ) throws IllegalStateException {
929 if (completed) return false;
930 for (String option : options) {
931 if (canConsume(option)) return true;
932 }
933 return false;
934 }
935
936 /**
937 * Attempt to consume the next token if it matches one of the supplied types.
938 *
939 * @param firstTypeOption the first option for the type of the current token
940 * @param additionalTypeOptions the additional options for the type of the current token
941 * @return true if the current token's type matched one of the supplied options, or false otherwise
942 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
943 */
944 public boolean canConsumeAnyOf( int firstTypeOption,
945 int... additionalTypeOptions ) throws IllegalStateException {
946 if (completed) return false;
947 if (canConsume(firstTypeOption)) return true;
948 for (int nextTypeOption : additionalTypeOptions) {
949 if (canConsume(nextTypeOption)) return true;
950 }
951 return false;
952 }
953
954 /**
955 * Attempt to consume the next token if it matches one of the supplied types.
956 *
957 * @param typeOptions the options for the type of the current token
958 * @return true if the current token's type matched one of the supplied options, or false otherwise
959 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
960 */
961 public boolean canConsumeAnyOf( int[] typeOptions ) throws IllegalStateException {
962 if (completed) return false;
963 for (int nextTypeOption : typeOptions) {
964 if (canConsume(nextTypeOption)) return true;
965 }
966 return false;
967 }
968
969 /**
970 * Determine if the current token matches the expected value.
971 * <p>
972 * The {@link #ANY_VALUE ANY_VALUE} constant can be used as a wildcard.
973 * </p>
974 *
975 * @param expected the expected value of the current token token
976 * @return true if the current token did match, or false if the current token did not match
977 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
978 */
979 public boolean matches( String expected ) throws IllegalStateException {
980 return !completed && (expected == ANY_VALUE || currentToken().matches(expected));
981 }
982
983 /**
984 * Determine if the current token matches the expected value.
985 *
986 * @param expected the expected value of the current token token
987 * @return true if the current token did match, or false if the current token did not match
988 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
989 */
990 public boolean matches( char expected ) throws IllegalStateException {
991 return !completed && currentToken().matches(expected);
992 }
993
994 /**
995 * Determine if the current token matches the expected token type.
996 *
997 * @param expectedType the expected token type of the current token
998 * @return true if the current token did match, or false if the current token did not match
999 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
1000 */
1001 public boolean matches( int expectedType ) throws IllegalStateException {
1002 return !completed && currentToken().matches(expectedType);
1003 }
1004
1005 /**
1006 * Determine if the next few tokens match the expected values.
1007 * <p>
1008 * The {@link #ANY_VALUE ANY_VALUE} constant can be used in the expected values as a wildcard.
1009 * </p>
1010 *
1011 * @param currentExpected the expected value of the current token
1012 * @param expectedForNextTokens the expected values for the following tokens
1013 * @return true if the tokens did match, or false otherwise
1014 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
1015 */
1016 public boolean matches( String currentExpected,
1017 String... expectedForNextTokens ) throws IllegalStateException {
1018 if (completed) return false;
1019 ListIterator<Token> iter = tokens.listIterator(tokenIterator.previousIndex());
1020 if (!iter.hasNext()) return false;
1021 Token token = iter.next();
1022 if (currentExpected != ANY_VALUE && !token.matches(currentExpected)) return false;
1023 for (String nextExpected : expectedForNextTokens) {
1024 if (!iter.hasNext()) return false;
1025 token = iter.next();
1026 if (nextExpected == ANY_VALUE) continue;
1027 if (!token.matches(nextExpected)) return false;
1028 }
1029 return true;
1030 }
1031
1032 /**
1033 * Determine if the next few tokens match the expected values.
1034 * <p>
1035 * The {@link #ANY_VALUE ANY_VALUE} constant can be used in the expected values as a wildcard.
1036 * </p>
1037 *
1038 * @param nextTokens the expected value of the next tokens
1039 * @return true if the tokens did match, or false otherwise
1040 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
1041 */
1042 public boolean matches( String[] nextTokens ) throws IllegalStateException {
1043 if (completed) return false;
1044 ListIterator<Token> iter = tokens.listIterator(tokenIterator.previousIndex());
1045 Token token = null;
1046 for (String nextExpected : nextTokens) {
1047 if (!iter.hasNext()) return false;
1048 token = iter.next();
1049 if (nextExpected == ANY_VALUE) continue;
1050 if (!token.matches(nextExpected)) return false;
1051 }
1052 return true;
1053 }
1054
1055 /**
1056 * Determine if the next few tokens match the expected values.
1057 * <p>
1058 * The {@link #ANY_VALUE ANY_VALUE} constant can be used in the expected values as a wildcard.
1059 * </p>
1060 *
1061 * @param nextTokens the expected value of the next tokens
1062 * @return true if the tokens did match, or false otherwise
1063 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
1064 */
1065 public boolean matches( Iterable<String> nextTokens ) throws IllegalStateException {
1066 if (completed) return false;
1067 ListIterator<Token> iter = tokens.listIterator(tokenIterator.previousIndex());
1068 Token token = null;
1069 for (String nextExpected : nextTokens) {
1070 if (!iter.hasNext()) return false;
1071 token = iter.next();
1072 if (nextExpected == ANY_VALUE) continue;
1073 if (!token.matches(nextExpected)) return false;
1074 }
1075 return true;
1076 }
1077
1078 /**
1079 * Determine if the next few tokens have the supplied types.
1080 * <p>
1081 * The {@link #ANY_TYPE ANY_TYPE} constant can be used in the expected values as a wildcard.
1082 * </p>
1083 *
1084 * @param currentExpectedType the expected type of the current token
1085 * @param expectedTypeForNextTokens the expected type for the following tokens
1086 * @return true if the tokens did match, or false otherwise
1087 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
1088 */
1089 public boolean matches( int currentExpectedType,
1090 int... expectedTypeForNextTokens ) throws IllegalStateException {
1091 if (completed) return false;
1092 ListIterator<Token> iter = tokens.listIterator(tokenIterator.previousIndex());
1093 if (!iter.hasNext()) return false;
1094 Token token = iter.next();
1095 if (currentExpectedType != ANY_TYPE && currentToken().type() != currentExpectedType) return false;
1096 for (int nextExpectedType : expectedTypeForNextTokens) {
1097 if (!iter.hasNext()) return false;
1098 token = iter.next();
1099 if (nextExpectedType == ANY_TYPE) continue;
1100 if (token.type() != nextExpectedType) return false;
1101 }
1102 return true;
1103 }
1104
1105 /**
1106 * Determine if the next few tokens have the supplied types.
1107 * <p>
1108 * The {@link #ANY_TYPE ANY_TYPE} constant can be used in the expected values as a wildcard.
1109 * </p>
1110 *
1111 * @param typesForNextTokens the expected type for each of the next tokens
1112 * @return true if the tokens did match, or false otherwise
1113 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
1114 */
1115 public boolean matches( int[] typesForNextTokens ) throws IllegalStateException {
1116 if (completed) return false;
1117 ListIterator<Token> iter = tokens.listIterator(tokenIterator.previousIndex());
1118 Token token = null;
1119 for (int nextExpectedType : typesForNextTokens) {
1120 if (!iter.hasNext()) return false;
1121 token = iter.next();
1122 if (nextExpectedType == ANY_TYPE) continue;
1123 if (!token.matches(nextExpectedType)) return false;
1124 }
1125 return true;
1126 }
1127
1128 /**
1129 * Determine if the next token matches one of the supplied values.
1130 *
1131 * @param firstOption the first option for the value of the current token
1132 * @param additionalOptions the additional options for the value of the current token
1133 * @return true if the current token's value did match one of the suplied options, or false otherwise
1134 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
1135 */
1136 public boolean matchesAnyOf( String firstOption,
1137 String... additionalOptions ) throws IllegalStateException {
1138 if (completed) return false;
1139 Token current = currentToken();
1140 if (current.matches(firstOption)) return true;
1141 for (String nextOption : additionalOptions) {
1142 if (current.matches(nextOption)) return true;
1143 }
1144 return false;
1145 }
1146
1147 /**
1148 * Determine if the next token matches one of the supplied values.
1149 *
1150 * @param options the options for the value of the current token
1151 * @return true if the current token's value did match one of the suplied options, or false otherwise
1152 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
1153 */
1154 public boolean matchesAnyOf( String[] options ) throws IllegalStateException {
1155 if (completed) return false;
1156 Token current = currentToken();
1157 for (String option : options) {
1158 if (current.matches(option)) return true;
1159 }
1160 return false;
1161 }
1162
1163 /**
1164 * Determine if the next token matches one of the supplied values.
1165 *
1166 * @param options the options for the value of the current token
1167 * @return true if the current token's value did match one of the suplied options, or false otherwise
1168 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
1169 */
1170 public boolean matchesAnyOf( Iterable<String> options ) throws IllegalStateException {
1171 if (completed) return false;
1172 Token current = currentToken();
1173 for (String option : options) {
1174 if (current.matches(option)) return true;
1175 }
1176 return false;
1177 }
1178
1179 /**
1180 * Determine if the next token have one of the supplied types.
1181 *
1182 * @param firstTypeOption the first option for the type of the current token
1183 * @param additionalTypeOptions the additional options for the type of the current token
1184 * @return true if the current token's type matched one of the supplied options, or false otherwise
1185 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
1186 */
1187 public boolean matchesAnyOf( int firstTypeOption,
1188 int... additionalTypeOptions ) throws IllegalStateException {
1189 if (completed) return false;
1190 int currentType = currentToken().type();
1191 if (currentType == firstTypeOption) return true;
1192 for (int nextTypeOption : additionalTypeOptions) {
1193 if (currentType == nextTypeOption) return true;
1194 }
1195 return false;
1196 }
1197
1198 /**
1199 * Determine if the next token have one of the supplied types.
1200 *
1201 * @param typeOptions the options for the type of the current token
1202 * @return true if the current token's type matched one of the supplied options, or false otherwise
1203 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
1204 */
1205 public boolean matchesAnyOf( int[] typeOptions ) throws IllegalStateException {
1206 if (completed) return false;
1207 int currentType = currentToken().type();
1208 for (int nextTypeOption : typeOptions) {
1209 if (currentType == nextTypeOption) return true;
1210 }
1211 return false;
1212 }
1213
1214 /**
1215 * Determine if this stream has another token to be consumed.
1216 *
1217 * @return true if there is another token ready for consumption, or false otherwise
1218 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
1219 */
1220 public boolean hasNext() {
1221 if (tokenIterator == null) {
1222 throw new IllegalStateException(CommonI18n.startMethodMustBeCalledBeforeNext.text());
1223 }
1224 return !completed;
1225 }
1226
1227 /**
1228 * {@inheritDoc}
1229 *
1230 * @see java.lang.Object#toString()
1231 */
1232 @Override
1233 public String toString() {
1234 ListIterator<Token> iter = tokens.listIterator(tokenIterator.previousIndex());
1235 StringBuilder sb = new StringBuilder();
1236 if (iter.hasNext()) {
1237 sb.append(iter.next());
1238 int count = 1;
1239 while (iter.hasNext()) {
1240 if (count > 20) {
1241 sb.append(" ...");
1242 break;
1243 }
1244 sb.append(" ");
1245 ++count;
1246 sb.append(iter.next());
1247 }
1248 }
1249 return sb.toString();
1250 }
1251
1252 private void moveToNextToken() {
1253 // And move the currentToken to the next token ...
1254 if (!tokenIterator.hasNext()) {
1255 completed = true;
1256 currentToken = null;
1257 } else {
1258 currentToken = tokenIterator.next();
1259 }
1260 }
1261
1262 /**
1263 * Get the current token.
1264 *
1265 * @return the current token; never null
1266 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
1267 * @throws NoSuchElementException if there are no more tokens
1268 */
1269 final Token currentToken() throws IllegalStateException, NoSuchElementException {
1270 if (currentToken == null) {
1271 if (completed) {
1272 throw new NoSuchElementException(CommonI18n.noMoreContent.text());
1273 }
1274 throw new IllegalStateException(CommonI18n.startMethodMustBeCalledBeforeConsumingOrMatching.text());
1275 }
1276 assert currentToken != null;
1277 return currentToken;
1278 }
1279
1280 /**
1281 * Gets the content string starting at the first position (inclusive) and continuing up to the end position (exclusive).
1282 *
1283 * @param starting the position marking the beginning of the desired content string.
1284 * @param end the position located directly after the returned content string; can be null, which means end of content
1285 * @return the content string; never null
1286 */
1287 public String getContentBetween( Position starting,
1288 Position end ) {
1289 CheckArg.isNotNull(starting, "starting");
1290
1291 int startIndex = starting.getIndexInContent();
1292 int endIndex = inputString.length();
1293 if (end != null) {
1294 endIndex = end.getIndexInContent();
1295 }
1296
1297 if (startIndex >= endIndex) {
1298 throw new IllegalArgumentException(CommonI18n.endPositionMustBeGreaterThanStartingPosition.text(startIndex, endIndex));
1299 }
1300
1301 return inputString.substring(startIndex, endIndex);
1302 }
1303
1304 /**
1305 * Get the previous token. This does not modify the state.
1306 *
1307 * @return the previous token; never null
1308 * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
1309 * @throws NoSuchElementException if there is no previous token
1310 */
1311 final Token previousToken() throws IllegalStateException, NoSuchElementException {
1312 if (currentToken == null) {
1313 if (completed) {
1314 if (tokens.isEmpty()) {
1315 throw new NoSuchElementException(CommonI18n.noMoreContent.text());
1316 }
1317 return tokens.get(tokens.size() - 1);
1318 }
1319 throw new IllegalStateException(CommonI18n.startMethodMustBeCalledBeforeConsumingOrMatching.text());
1320 }
1321 if (tokenIterator.previousIndex() == 0) {
1322 throw new NoSuchElementException(CommonI18n.noMoreContent.text());
1323 }
1324 return tokens.get(tokenIterator.previousIndex() - 1);
1325 }
1326
1327 String generateFragment() {
1328 // Find the current position ...
1329 assert currentToken != null;
1330 int startIndex = currentToken.startIndex();
1331 return generateFragment(inputString, startIndex, 20, " ===>> ");
1332 }
1333
1334 /**
1335 * Utility method to generate a highlighted fragment of a particular point in the stream.
1336 *
1337 * @param content the content from which the fragment should be taken; may not be null
1338 * @param indexOfProblem the index of the problem point that should be highlighted; must be a valid index in the content
1339 * @param charactersToIncludeBeforeAndAfter the maximum number of characters before and after the problem point to include in
1340 * the fragment
1341 * @param highlightText the text that should be included in the fragment at the problem point to highlight the location, or an
1342 * empty string if there should be no highlighting
1343 * @return the highlighted fragment; never null
1344 */
1345 static String generateFragment( String content,
1346 int indexOfProblem,
1347 int charactersToIncludeBeforeAndAfter,
1348 String highlightText ) {
1349 assert content != null;
1350 assert indexOfProblem < content.length();
1351 // Find the substring that immediately precedes the current position ...
1352 int beforeStart = Math.max(0, indexOfProblem - charactersToIncludeBeforeAndAfter);
1353 String before = content.substring(beforeStart, indexOfProblem);
1354
1355 // Find the substring that immediately follows the current position ...
1356 int afterEnd = Math.min(indexOfProblem + charactersToIncludeBeforeAndAfter, content.length());
1357 String after = content.substring(indexOfProblem, afterEnd);
1358
1359 return before + (highlightText != null ? highlightText : "") + after;
1360 }
1361
1362 /**
1363 * Interface for a Tokenizer component responsible for processing the characters in a {@link CharacterStream} and constructing
1364 * the appropriate {@link Token} objects.
1365 */
1366 public static interface Tokenizer {
1367 /**
1368 * Process the supplied characters and construct the appropriate {@link Token} objects.
1369 *
1370 * @param input the character input stream; never null
1371 * @param tokens the factory for {@link Token} objects, which records the order in which the tokens are created
1372 * @throws ParsingException if there is an error while processing the character stream (e.g., a quote is not closed, etc.)
1373 */
1374 void tokenize( CharacterStream input,
1375 Tokens tokens ) throws ParsingException;
1376 }
1377
1378 /**
1379 * Interface used by a {@link Tokenizer} to iterate through the characters in the content input to the {@link TokenStream}.
1380 */
1381 public static interface CharacterStream {
1382
1383 /**
1384 * Determine if there is another character available in this stream.
1385 *
1386 * @return true if there is another character (and {@link #next()} can be called), or false otherwise
1387 */
1388 boolean hasNext();
1389
1390 /**
1391 * Obtain the next character value, and advance the stream.
1392 *
1393 * @return the next character
1394 * @throws NoSuchElementException if there is no {@link #hasNext() next character}
1395 */
1396 char next();
1397
1398 /**
1399 * Get the index for the last character returned from {@link #next()}.
1400 *
1401 * @return the index of the last character returned
1402 */
1403 int index();
1404
1405 /**
1406 * Get the position for the last character returned from {@link #next()}.
1407 *
1408 * @param startIndex
1409 * @return the position of the last character returned; never null
1410 */
1411 Position position( int startIndex );
1412
1413 /**
* Determine if the next character on the stream is a {@link Character#isWhitespace(char) whitespace character}. This
* method does <i>not</i> advance the stream.
1416 *
1417 * @return true if there is a {@link #next() next} character and it is a whitespace character, or false otherwise
1418 */
1419 boolean isNextWhitespace();
1420
1421 /**
* Determine if the next character on the stream is a {@link Character#isLetterOrDigit(char) letter or digit}. This method
* does <i>not</i> advance the stream.
1424 *
1425 * @return true if there is a {@link #next() next} character and it is a letter or digit, or false otherwise
1426 */
1427 boolean isNextLetterOrDigit();
1428
1429 /**
* Determine if the next character on the stream is a {@link XmlCharacters#isValid(int) valid XML character}. This method
* does <i>not</i> advance the stream.
1432 *
1433 * @return true if there is a {@link #next() next} character and it is a valid XML character, or false otherwise
1434 */
1435 boolean isNextValidXmlCharacter();
1436
1437 /**
1438 * Determine if the next character on the sream is a {@link XmlCharacters#isValidName(int) valid XML NCName character}.
1439 * This method does <i>not</i> advance the stream.
1440 *
1441 * @return true if there is a {@link #next() next} character and it is a valid XML Name character, or false otherwise
1442 */
1443 boolean isNextValidXmlNameCharacter();
1444
1445 /**
1446 * Determine if the next character on the sream is a {@link XmlCharacters#isValidNcName(int) valid XML NCName character}.
1447 * This method does <i>not</i> advance the stream.
1448 *
1449 * @return true if there is a {@link #next() next} character and it is a valid XML NCName character, or false otherwise
1450 */
1451 boolean isNextValidXmlNcNameCharacter();
1452
1453 /**
1454 * Determine if the next character on the sream is the supplied value. This method does <i>not</i> advance the stream.
1455 *
1456 * @param c the character value to compare to the next character on the stream
1457 * @return true if there is a {@link #next() next} character and it is the supplied character, or false otherwise
1458 */
1459 boolean isNext( char c );
1460
1461 /**
1462 * Determine if the next two characters on the stream match the supplied values. This method does <i>not</i> advance the
1463 * stream.
1464 *
1465 * @param nextChar the character value to compare to the next character on the stream
1466 * @param followingChar the character value to compare to the character immediately after the next character on the stream
1467 * @return true if there are at least two characters left on the stream and the first matches <code>nextChar</code> and
1468 * the second matches <code>followingChar</code>
1469 */
1470 boolean isNext( char nextChar,
1471 char followingChar );
1472
1473 /**
1474 * Determine if the next three characters on the sream match the supplied values. This method does <i>not</i> advance the
1475 * stream.
1476 *
1477 * @param nextChar the character value to compare to the next character on the stream
1478 * @param nextChar2 the character value to compare to the second character on the stream
1479 * @param nextChar3 the character value to compare to the second character on the stream
1480 * @return true if there are at least two characters left on the stream and the first matches <code>nextChar</code> and
1481 * the second matches <code>followingChar</code>
1482 */
1483 boolean isNext( char nextChar,
1484 char nextChar2,
1485 char nextChar3 );
1486
1487 /**
1488 * Determine if the next character on the stream matches one of the supplied characters. This method does <i>not</i>
1489 * advance the stream.
1490 *
1491 * @param characters the characters to match
1492 * @return true if there is a {@link #next() next} character and it does match one of the supplied characters, or false
1493 * otherwise
1494 */
1495 boolean isNextAnyOf( char[] characters );
1496
1497 /**
1498 * Determine if the next character on the stream matches one of the supplied characters. This method does <i>not</i>
1499 * advance the stream.
1500 *
1501 * @param characters the characters to match
1502 * @return true if there is a {@link #next() next} character and it does match one of the supplied characters, or false
1503 * otherwise
1504 */
1505 boolean isNextAnyOf( String characters );
1506
1507 }
1508
1509 /**
1510 * A factory for Token objects, used by a {@link Tokenizer} to create tokens in the correct order.
1511 */
1512 public static interface Tokens {
1513 /**
1514 * Create a single-character token at the supplied index in the character stream. The token type is set to 0, meaning this
1515 * is equivalent to calling <code>addToken(index,index+1)</code> or <code>addToken(index,index+1,0)</code>.
1516 *
1517 * @param position the position (line and column numbers) of this new token; may not be null
1518 * @param index the index of the character to appear in the token; must be a valid index in the stream
1519 */
1520 void addToken( Position position,
1521 int index );
1522
1523 /**
1524 * Create a single- or multi-character token with the characters in the range given by the starting and ending index in
1525 * the character stream. The character at the ending index is <i>not</i> included in the token (as this is standard
1526 * practice when using 0-based indexes). The token type is set to 0, meaning this is equivalent to calling <code>
1527 * addToken(startIndex,endIndex,0)</code> .
1528 *
1529 * @param position the position (line and column numbers) of this new token; may not be null
1530 * @param startIndex the index of the first character to appear in the token; must be a valid index in the stream
1531 * @param endIndex the index just past the last character to appear in the token; must be a valid index in the stream
1532 */
1533 void addToken( Position position,
1534 int startIndex,
1535 int endIndex );
1536
1537 /**
1538 * Create a single- or multi-character token with the supplied type and with the characters in the range given by the
1539 * starting and ending index in the character stream. The character at the ending index is <i>not</i> included in the
1540 * token (as this is standard practice when using 0-based indexes).
1541 *
1542 * @param position the position (line and column numbers) of this new token; may not be null
1543 * @param startIndex the index of the first character to appear in the token; must be a valid index in the stream
1544 * @param endIndex the index just past the last character to appear in the token; must be a valid index in the stream
1545 * @param type the type of the token
1546 */
1547 void addToken( Position position,
1548 int startIndex,
1549 int endIndex,
1550 int type );
1551 }
1552
1553 /**
1554 * The interface defining a token, which references the characters in the actual input character stream.
1555 *
1556 * @see CaseSensitiveTokenFactory
1557 * @see CaseInsensitiveTokenFactory
1558 */
1559 @Immutable
1560 public interface Token {
1561 /**
1562 * Get the value of the token, in actual case.
1563 *
1564 * @return the value
1565 */
1566 String value();
1567
1568 /**
1569 * Determine if the token matches the supplied string.
1570 *
1571 * @param expected the expected value
1572 * @return true if the token's value matches the supplied value, or false otherwise
1573 */
1574 boolean matches( String expected );
1575
1576 /**
1577 * Determine if the token matches the supplied character.
1578 *
1579 * @param expected the expected character value
1580 * @return true if the token's value matches the supplied character value, or false otherwise
1581 */
1582 boolean matches( char expected );
1583
1584 /**
1585 * Determine if the token matches the supplied type.
1586 *
1587 * @param expectedType the expected integer type
1588 * @return true if the token's value matches the supplied integer type, or false otherwise
1589 */
1590 boolean matches( int expectedType );
1591
1592 /**
1593 * Get the type of the token.
1594 *
1595 * @return the token's type
1596 */
1597 int type();
1598
1599 /**
1600 * Get the index in the raw stream for the first character in the token.
1601 *
1602 * @return the starting index of the token
1603 */
1604 int startIndex();
1605
1606 /**
1607 * Get the index in the raw stream past the last character in the token.
1608 *
1609 * @return the ending index of the token, which is past the last character
1610 */
1611 int endIndex();
1612
1613 /**
1614 * Get the length of the token, which is equivalent to <code>endIndex() - startIndex()</code>.
1615 *
1616 * @return the length
1617 */
1618 int length();
1619
1620 /**
1621 * Get the position of this token, which includes the line number and column number of the first character in the token.
1622 *
1623 * @return the position; never null
1624 */
1625 Position position();
1626
1627 /**
1628 * Bitmask ORed with existing type value.
1629 *
1630 * @param typeMask
1631 * @return copy of Token with new type
1632 */
1633 Token withType( int typeMask );
1634 }
1635
1636 /**
1637 * An immutable {@link Token} that implements matching using case-sensitive logic.
1638 */
1639 @Immutable
1640 protected class CaseSensitiveToken implements Token {
1641 private final int startIndex;
1642 private final int endIndex;
1643 private final int type;
1644 private final Position position;
1645
1646 public CaseSensitiveToken( int startIndex,
1647 int endIndex,
1648 int type,
1649 Position position ) {
1650 this.startIndex = startIndex;
1651 this.endIndex = endIndex;
1652 this.type = type;
1653 this.position = position;
1654 }
1655
1656 /**
1657 * {@inheritDoc}
1658 *
1659 * @see org.modeshape.common.text.TokenStream.Token#withType(int)
1660 */
1661 public Token withType( int typeMask ) {
1662 int type = this.type | typeMask;
1663 return new CaseSensitiveToken(startIndex, endIndex, type, position);
1664 }
1665
1666 /**
1667 * {@inheritDoc}
1668 *
1669 * @see org.modeshape.common.text.TokenStream.Token#type()
1670 */
1671 public final int type() {
1672 return type;
1673 }
1674
1675 /**
1676 * {@inheritDoc}
1677 *
1678 * @see org.modeshape.common.text.TokenStream.Token#startIndex()
1679 */
1680 public final int startIndex() {
1681 return startIndex;
1682 }
1683
1684 /**
1685 * {@inheritDoc}
1686 *
1687 * @see org.modeshape.common.text.TokenStream.Token#endIndex()
1688 */
1689 public final int endIndex() {
1690 return endIndex;
1691 }
1692
1693 /**
1694 * {@inheritDoc}
1695 *
1696 * @see org.modeshape.common.text.TokenStream.Token#length()
1697 */
1698 public final int length() {
1699 return endIndex - startIndex;
1700 }
1701
1702 /**
1703 * {@inheritDoc}
1704 *
1705 * @see org.modeshape.common.text.TokenStream.Token#matches(char)
1706 */
1707 public final boolean matches( char expected ) {
1708 return length() == 1 && matchString().charAt(startIndex) == expected;
1709 }
1710
1711 /**
1712 * {@inheritDoc}
1713 *
1714 * @see org.modeshape.common.text.TokenStream.Token#matches(java.lang.String)
1715 */
1716 public final boolean matches( String expected ) {
1717 return matchString().substring(startIndex, endIndex).equals(expected);
1718 }
1719
1720 /**
1721 * {@inheritDoc}
1722 *
1723 * @see org.modeshape.common.text.TokenStream.Token#matches(int)
1724 */
1725 public final boolean matches( int expectedType ) {
1726 return expectedType == ANY_TYPE || (currentToken().type() & expectedType) == expectedType;
1727 }
1728
1729 /**
1730 * {@inheritDoc}
1731 *
1732 * @see org.modeshape.common.text.TokenStream.Token#value()
1733 */
1734 public final String value() {
1735 return inputString.substring(startIndex, endIndex);
1736 }
1737
1738 /**
1739 * {@inheritDoc}
1740 *
1741 * @see org.modeshape.common.text.TokenStream.Token#position()
1742 */
1743 public Position position() {
1744 return position;
1745 }
1746
1747 protected String matchString() {
1748 return inputString;
1749 }
1750
1751 /**
1752 * {@inheritDoc}
1753 *
1754 * @see java.lang.Object#toString()
1755 */
1756 @Override
1757 public String toString() {
1758 return value();
1759 }
1760 }
1761
1762 @Immutable
1763 protected class CaseInsensitiveToken extends CaseSensitiveToken {
1764 public CaseInsensitiveToken( int startIndex,
1765 int endIndex,
1766 int type,
1767 Position position ) {
1768 super(startIndex, endIndex, type, position);
1769 }
1770
1771 /**
1772 * {@inheritDoc}
1773 *
1774 * @see org.modeshape.common.text.TokenStream.CaseSensitiveToken#matchString()
1775 */
1776 @Override
1777 protected String matchString() {
1778 return inputUppercased;
1779 }
1780
1781 /**
1782 * {@inheritDoc}
1783 *
1784 * @see org.modeshape.common.text.TokenStream.Token#withType(int)
1785 */
1786 @Override
1787 public Token withType( int typeMask ) {
1788 int type = this.type() | typeMask;
1789 return new CaseInsensitiveToken(startIndex(), endIndex(), type, position());
1790 }
1791 }
1792
1793 protected abstract class TokenFactory implements Tokens {
1794 protected final List<Token> tokens = new ArrayList<Token>();
1795
1796 /**
1797 * {@inheritDoc}
1798 *
1799 * @see org.modeshape.common.text.TokenStream.Tokens#addToken(Position, int)
1800 */
1801 public void addToken( Position position,
1802 int index ) {
1803 addToken(position, index, index + 1, 0);
1804 }
1805
1806 /**
1807 * {@inheritDoc}
1808 *
1809 * @see org.modeshape.common.text.TokenStream.Tokens#addToken(Position, int, int)
1810 */
1811 public final void addToken( Position position,
1812 int startIndex,
1813 int endIndex ) {
1814 addToken(position, startIndex, endIndex, 0);
1815 }
1816
1817 /**
1818 * @return tokens
1819 */
1820 public List<Token> getTokens() {
1821 return tokens;
1822 }
1823 }
1824
1825 public class CaseSensitiveTokenFactory extends TokenFactory {
1826 /**
1827 * {@inheritDoc}
1828 *
1829 * @see org.modeshape.common.text.TokenStream.TokenFactory#addToken(Position,int, int, int)
1830 */
1831 public void addToken( Position position,
1832 int startIndex,
1833 int endIndex,
1834 int type ) {
1835 tokens.add(new CaseSensitiveToken(startIndex, endIndex, type, position));
1836 }
1837 }
1838
1839 public class CaseInsensitiveTokenFactory extends TokenFactory {
1840 /**
1841 * {@inheritDoc}
1842 *
1843 * @see org.modeshape.common.text.TokenStream.TokenFactory#addToken(Position,int, int, int)
1844 */
1845 public void addToken( Position position,
1846 int startIndex,
1847 int endIndex,
1848 int type ) {
1849 tokens.add(new CaseInsensitiveToken(startIndex, endIndex, type, position));
1850 }
1851 }
1852
1853 /**
1854 * An implementation of {@link CharacterStream} that works with a single character array.
1855 */
1856 public static final class CharacterArrayStream implements CharacterStream {
1857 private final char[] content;
1858 private int lastIndex = -1;
1859 private final int maxIndex;
1860 private int lineNumber = 1;
1861 private int columnNumber = 0;
1862 private boolean nextCharMayBeLineFeed;
1863
1864 public CharacterArrayStream( char[] content ) {
1865 this.content = content;
1866 this.maxIndex = content.length - 1;
1867 }
1868
1869 /**
1870 * {@inheritDoc}
1871 *
1872 * @see org.modeshape.common.text.TokenStream.CharacterStream#hasNext()
1873 */
1874 public boolean hasNext() {
1875 return lastIndex < maxIndex;
1876 }
1877
1878 /**
1879 * {@inheritDoc}
1880 *
1881 * @see org.modeshape.common.text.TokenStream.CharacterStream#index()
1882 */
1883 public int index() {
1884 return lastIndex;
1885 }
1886
1887 /**
1888 * {@inheritDoc}
1889 *
1890 * @param startIndex
1891 * @return the position of the token. never null
1892 * @see org.modeshape.common.text.TokenStream.CharacterStream#position(int)
1893 */
1894 public Position position( int startIndex ) {
1895 return new Position(startIndex, lineNumber, columnNumber);
1896 }
1897
1898 /**
1899 * {@inheritDoc}
1900 *
1901 * @see org.modeshape.common.text.TokenStream.CharacterStream#next()
1902 */
1903 public char next() {
1904 if (lastIndex >= maxIndex) {
1905 throw new NoSuchElementException();
1906 }
1907 char result = content[++lastIndex];
1908 ++columnNumber;
1909 if (result == '\r') {
1910 nextCharMayBeLineFeed = true;
1911 ++lineNumber;
1912 columnNumber = 0;
1913 } else if (result == '\n') {
1914 if (!nextCharMayBeLineFeed) ++lineNumber;
1915 columnNumber = 0;
1916 } else if (nextCharMayBeLineFeed) {
1917 nextCharMayBeLineFeed = false;
1918 }
1919 return result;
1920 }
1921
1922 /**
1923 * {@inheritDoc}
1924 *
1925 * @see org.modeshape.common.text.TokenStream.CharacterStream#isNext(char)
1926 */
1927 public boolean isNext( char c ) {
1928 int nextIndex = lastIndex + 1;
1929 return nextIndex <= maxIndex && content[nextIndex] == c;
1930 }
1931
1932 /**
1933 * {@inheritDoc}
1934 *
1935 * @see org.modeshape.common.text.TokenStream.CharacterStream#isNext(char, char)
1936 */
1937 public boolean isNext( char nextChar1,
1938 char nextChar2 ) {
1939 int nextIndex1 = lastIndex + 1;
1940 int nextIndex2 = lastIndex + 2;
1941 return nextIndex2 <= maxIndex && content[nextIndex1] == nextChar1 && content[nextIndex2] == nextChar2;
1942 }
1943
1944 /**
1945 * {@inheritDoc}
1946 *
1947 * @see org.modeshape.common.text.TokenStream.CharacterStream#isNext(char, char, char)
1948 */
1949 public boolean isNext( char nextChar1,
1950 char nextChar2,
1951 char nextChar3 ) {
1952 int nextIndex1 = lastIndex + 1;
1953 int nextIndex2 = lastIndex + 2;
1954 int nextIndex3 = lastIndex + 3;
1955 return nextIndex3 <= maxIndex && content[nextIndex1] == nextChar1 && content[nextIndex2] == nextChar2
1956 && content[nextIndex3] == nextChar3;
1957 }
1958
1959 /**
1960 * {@inheritDoc}
1961 *
1962 * @see org.modeshape.common.text.TokenStream.CharacterStream#isNextAnyOf(char[])
1963 */
1964 public boolean isNextAnyOf( char[] characters ) {
1965 int nextIndex = lastIndex + 1;
1966 if (nextIndex <= maxIndex) {
1967 char nextChar = content[lastIndex + 1];
1968 for (char c : characters) {
1969 if (c == nextChar) return true;
1970 }
1971 }
1972 return false;
1973 }
1974
1975 /**
1976 * {@inheritDoc}
1977 *
1978 * @see org.modeshape.common.text.TokenStream.CharacterStream#isNextAnyOf(java.lang.String)
1979 */
1980 public boolean isNextAnyOf( String characters ) {
1981 int nextIndex = lastIndex + 1;
1982 if (nextIndex <= maxIndex) {
1983 char nextChar = content[lastIndex + 1];
1984 if (characters.indexOf(nextChar) != -1) return true;
1985 }
1986 return false;
1987 }
1988
1989 /**
1990 * {@inheritDoc}
1991 *
1992 * @see org.modeshape.common.text.TokenStream.CharacterStream#isNextWhitespace()
1993 */
1994 public boolean isNextWhitespace() {
1995 int nextIndex = lastIndex + 1;
1996 return nextIndex <= maxIndex && Character.isWhitespace(content[nextIndex]);
1997 }
1998
1999 /**
2000 * {@inheritDoc}
2001 *
2002 * @see org.modeshape.common.text.TokenStream.CharacterStream#isNextLetterOrDigit()
2003 */
2004 public boolean isNextLetterOrDigit() {
2005 int nextIndex = lastIndex + 1;
2006 return nextIndex <= maxIndex && Character.isLetterOrDigit(content[nextIndex]);
2007 }
2008
2009 /**
2010 * {@inheritDoc}
2011 *
2012 * @see org.modeshape.common.text.TokenStream.CharacterStream#isNextValidXmlCharacter()
2013 */
2014 public boolean isNextValidXmlCharacter() {
2015 int nextIndex = lastIndex + 1;
2016 return nextIndex <= maxIndex && XmlCharacters.isValid(content[nextIndex]);
2017 }
2018
2019 /**
2020 * {@inheritDoc}
2021 *
2022 * @see org.modeshape.common.text.TokenStream.CharacterStream#isNextValidXmlNameCharacter()
2023 */
2024 public boolean isNextValidXmlNameCharacter() {
2025 int nextIndex = lastIndex + 1;
2026 return nextIndex <= maxIndex && XmlCharacters.isValidName(content[nextIndex]);
2027 }
2028
2029 /**
2030 * {@inheritDoc}
2031 *
2032 * @see org.modeshape.common.text.TokenStream.CharacterStream#isNextValidXmlNcNameCharacter()
2033 */
2034 public boolean isNextValidXmlNcNameCharacter() {
2035 int nextIndex = lastIndex + 1;
2036 return nextIndex <= maxIndex && XmlCharacters.isValidNcName(content[nextIndex]);
2037 }
2038 }
2039
2040 /**
2041 * Obtain a basic {@link Tokenizer} implementation that ignores whitespace but includes tokens for individual symbols, the
2042 * period ('.'), single-quoted strings, double-quoted strings, whitespace-delimited words, and optionally comments.
2043 * <p>
2044 * Note that the resulting Tokenizer may not be appropriate in many situations, but is provided merely as a convenience for
2045 * those situations that happen to be able to use it.
2046 * </p>
2047 *
2048 * @param includeComments true if the comments should be retained and be included in the token stream, or false if comments
2049 * should be stripped and not included in the token stream
2050 * @return the tokenizer; never null
2051 */
2052 public static BasicTokenizer basicTokenizer( boolean includeComments ) {
2053 return new BasicTokenizer(includeComments);
2054 }
2055
2056 /**
2057 * A basic {@link Tokenizer} implementation that ignores whitespace but includes tokens for individual symbols, the period
2058 * ('.'), single-quoted strings, double-quoted strings, whitespace-delimited words, and optionally comments.
2059 * <p>
2060 * Note this Tokenizer may not be appropriate in many situations, but is provided merely as a convenience for those situations
2061 * that happen to be able to use it.
2062 * </p>
2063 */
2064 public static class BasicTokenizer implements Tokenizer {
2065 /**
2066 * The {@link Token#type() token type} for tokens that represent an unquoted string containing a character sequence made
2067 * up of non-whitespace and non-symbol characters.
2068 */
2069 public static final int WORD = 1;
2070 /**
2071 * The {@link Token#type() token type} for tokens that consist of an individual "symbol" character. The set of characters
2072 * includes: <code>-(){}*,;+%?$[]!<>|=:</code>
2073 */
2074 public static final int SYMBOL = 2;
2075 /**
2076 * The {@link Token#type() token type} for tokens that consist of an individual '.' character.
2077 */
2078 public static final int DECIMAL = 4;
2079 /**
2080 * The {@link Token#type() token type} for tokens that consist of all the characters within single-quotes. Single quote
2081 * characters are included if they are preceded (escaped) by a '\' character.
2082 */
2083 public static final int SINGLE_QUOTED_STRING = 8;
2084 /**
2085 * The {@link Token#type() token type} for tokens that consist of all the characters within double-quotes. Double quote
2086 * characters are included if they are preceded (escaped) by a '\' character.
2087 */
2088 public static final int DOUBLE_QUOTED_STRING = 16;
2089 /**
2090 * The {@link Token#type() token type} for tokens that consist of all the characters between "/*" and "*/" or between
2091 * "//" and the next line terminator (e.g., '\n', '\r' or "\r\n").
2092 */
2093 public static final int COMMENT = 32;
2094
2095 private final boolean useComments;
2096
2097 protected BasicTokenizer( boolean useComments ) {
2098 this.useComments = useComments;
2099 }
2100
2101 /**
2102 * {@inheritDoc}
2103 *
2104 * @see org.modeshape.common.text.TokenStream.Tokenizer#tokenize(CharacterStream, Tokens)
2105 */
2106 public void tokenize( CharacterStream input,
2107 Tokens tokens ) throws ParsingException {
2108 while (input.hasNext()) {
2109 char c = input.next();
2110 switch (c) {
2111 case ' ':
2112 case '\t':
2113 case '\n':
2114 case '\r':
2115 // Just skip these whitespace characters ...
2116 break;
2117 case '-':
2118 case '(':
2119 case ')':
2120 case '{':
2121 case '}':
2122 case '*':
2123 case ',':
2124 case ';':
2125 case '+':
2126 case '%':
2127 case '?':
2128 case '$':
2129 case '[':
2130 case ']':
2131 case '!':
2132 case '<':
2133 case '>':
2134 case '|':
2135 case '=':
2136 case ':':
2137 tokens.addToken(input.position(input.index()), input.index(), input.index() + 1, SYMBOL);
2138 break;
2139 case '.':
2140 tokens.addToken(input.position(input.index()), input.index(), input.index() + 1, DECIMAL);
2141 break;
2142 case '\"':
2143 int startIndex = input.index();
2144 Position startingPosition = input.position(startIndex);
2145 boolean foundClosingQuote = false;
2146 while (input.hasNext()) {
2147 c = input.next();
2148 if (c == '\\' && input.isNext('"')) {
2149 c = input.next(); // consume the ' character since it is escaped
2150 } else if (c == '"') {
2151 foundClosingQuote = true;
2152 break;
2153 }
2154 }
2155 if (!foundClosingQuote) {
2156 String msg = CommonI18n.noMatchingDoubleQuoteFound.text(startingPosition.getLine(),
2157 startingPosition.getColumn());
2158 throw new ParsingException(startingPosition, msg);
2159 }
2160 int endIndex = input.index() + 1; // beyond last character read
2161 tokens.addToken(startingPosition, startIndex, endIndex, DOUBLE_QUOTED_STRING);
2162 break;
2163 case '\'':
2164 startIndex = input.index();
2165 startingPosition = input.position(startIndex);
2166 foundClosingQuote = false;
2167 while (input.hasNext()) {
2168 c = input.next();
2169 if (c == '\\' && input.isNext('\'')) {
2170 c = input.next(); // consume the ' character since it is escaped
2171 } else if (c == '\'') {
2172 foundClosingQuote = true;
2173 break;
2174 }
2175 }
2176 if (!foundClosingQuote) {
2177 String msg = CommonI18n.noMatchingSingleQuoteFound.text(startingPosition.getLine(),
2178 startingPosition.getColumn());
2179 throw new ParsingException(startingPosition, msg);
2180 }
2181 endIndex = input.index() + 1; // beyond last character read
2182 tokens.addToken(startingPosition, startIndex, endIndex, SINGLE_QUOTED_STRING);
2183 break;
2184 case '/':
2185 startIndex = input.index();
2186 startingPosition = input.position(startIndex);
2187 if (input.isNext('/')) {
2188 // End-of-line comment ...
2189 boolean foundLineTerminator = false;
2190 while (input.hasNext()) {
2191 c = input.next();
2192 if (c == '\n' || c == '\r') {
2193 foundLineTerminator = true;
2194 break;
2195 }
2196 }
2197 endIndex = input.index(); // the token won't include the '\n' or '\r' character(s)
2198 if (!foundLineTerminator) ++endIndex; // must point beyond last char
2199 if (c == '\r' && input.isNext('\n')) input.next();
2200 if (useComments) {
2201 tokens.addToken(startingPosition, startIndex, endIndex, COMMENT);
2202 }
2203 } else if (input.isNext('*')) {
2204 // Multi-line comment ...
2205 while (input.hasNext() && !input.isNext('*', '/')) {
2206 c = input.next();
2207 }
2208 if (input.hasNext()) input.next(); // consume the '*'
2209 if (input.hasNext()) input.next(); // consume the '/'
2210 if (useComments) {
2211 endIndex = input.index() + 1; // the token will include the '/' and '*' characters
2212 tokens.addToken(startingPosition, startIndex, endIndex, COMMENT);
2213 }
2214 } else {
2215 // just a regular slash ...
2216 tokens.addToken(startingPosition, startIndex, startIndex + 1, SYMBOL);
2217 }
2218 break;
2219 default:
2220 startIndex = input.index();
2221 startingPosition = input.position(startIndex);
2222 // Read until another whitespace/symbol/decimal/slash is found
2223 while (input.hasNext() && !(input.isNextWhitespace() || input.isNextAnyOf("/.-(){}*,;+%?$[]!<>|=:"))) {
2224 c = input.next();
2225 }
2226 endIndex = input.index() + 1; // beyond last character that was included
2227 tokens.addToken(startingPosition, startIndex, endIndex, WORD);
2228 }
2229 }
2230 }
2231 }
2232 }