1 /* 2 * ModeShape (http://www.modeshape.org) 3 * See the COPYRIGHT.txt file distributed with this work for information 4 * regarding copyright ownership. Some portions may be licensed 5 * to Red Hat, Inc. under one or more contributor license agreements. 6 * See the AUTHORS.txt file in the distribution for a full listing of 7 * individual contributors. 8 * 9 * ModeShape is free software. Unless otherwise indicated, all code in ModeShape 10 * is licensed to you under the terms of the GNU Lesser General Public License as 11 * published by the Free Software Foundation; either version 2.1 of 12 * the License, or (at your option) any later version. 13 * 14 * ModeShape is distributed in the hope that it will be useful, 15 * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17 * Lesser General Public License for more details. 18 * 19 * You should have received a copy of the GNU Lesser General Public 20 * License along with this software; if not, write to the Free 21 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 22 * 02110-1301 USA, or see the FSF site: http://www.fsf.org. 23 */ 24 package org.modeshape.common.text; 25 26 import java.util.ArrayList; 27 import java.util.Iterator; 28 import java.util.List; 29 import java.util.ListIterator; 30 import java.util.NoSuchElementException; 31 import net.jcip.annotations.Immutable; 32 import net.jcip.annotations.NotThreadSafe; 33 import org.modeshape.common.CommonI18n; 34 import org.modeshape.common.util.CheckArg; 35 import org.modeshape.common.xml.XmlCharacters; 36 37 /** 38 * A foundation for basic parsers that tokenizes input content and allows parsers to easily access and use those tokens. A 39 * {@link TokenStream} object literally represents the stream of {@link Token} objects that each represent a word, symbol, comment 40 * or other lexically-relevant piece of information. 
This simple framework makes it very easy to create a parser that walks 41 * through (or "consumes") the tokens in the order they appear and do something useful with that content (usually creating another 42 * representation of the content, such as some domain-specific Abstract Syntax Tree or object model). 43 * <p> 44 * </p> 45 * <h3>The parts</h3> 46 * <p> 47 * This simple framework consists of a couple of pieces that fit together to do the whole job of parsing input content. 48 * </p> 49 * <p> 50 * The {@link Tokenizer} is responsible for consuming the character-level input content and constructing {@link Token} objects for 51 * the different words, symbols, or other meaningful elements contained in the content. Each Token object is a simple object that 52 * records the character(s) that make up the token's value, but it does this in a very lightweight and efficient way by pointing 53 * to the original character stream. Each token can be assigned a parser-specific integral <i>token type</i> that may make it 54 * easier to do quickly figure out later in the process what kind of information each token represents. The general idea is to 55 * keep the Tokenizer logic very simple, and very often Tokenizers will merely look for the different kinds of characters (e.g., 56 * symbols, letters, digits, etc.) as well as things like quoted strings and comments. However, Tokenizers are never called by the 57 * parser, but instead are always given to the TokenStream that then calls the Tokenizer at the appropriate time. 58 * </p> 59 * <p> 60 * The {@link TokenStream} is supplied the input content, a Tokenizer implementation, and a few options. Its job is to prepare the 61 * content for processing, call the Tokenizer implementation to create the series of Token objects, and then provide an interface 62 * for walking through and consuming the tokens. 
This interface makes it possible to discover the value and type of the current 63 * token, and consume the current token and move to the next token. Plus, the interface has been designed to make the code that 64 * works with the tokens to be as readable as possible. 65 * </p> 66 * <p> 67 * The final component in this framework is the <b>Parser</b>. The parser is really any class that takes as input the content to 68 * be parsed and that outputs some meaningful information. The parser will do this by defining the Tokenizer, constructing a 69 * TokenStream object, and then using the TokenStream to walk through the sequence of Tokens and produce some meaningful 70 * representation of the content. Parsers can create instances of some object model, or they can create a domain-specific Abstract 71 * Syntax Tree representation. 72 * </p> 73 * <p> 74 * The benefit of breaking the responsibility along these lines is that the TokenStream implementation is able to encapsulate 75 * quite a bit of very tedious and very useful functionality, while still allowing a lot of flexibility as to what makes up the 76 * different tokens. It also makes the parser very easy to write and read (and thus maintain), without placing very many 77 * restrictions on how that logic is to be defined. Plus, because the TokenStream takes responsibility for tracking the positions 78 * of every token (including line and column numbers), it can automatically produce meaningful errors. 79 * </p> 80 * <h3>Consuming tokens</h3> 81 * <p> 82 * A parser works with the tokens on the TokenStream using a variety of methods: 83 * <ul> 84 * <li>The {@link #start()} method must be called before any of the other methods. 
It performs initialization and tokenizing, and 85 * prepares the internal state by finding the first token and setting an internal <i>current token</i> reference.</li> 86 * <li>The {@link #hasNext()} method can be called repeatedly to determine if there is another token after the <i>current 87 * token</i>. This is often useful when an unknown number of tokens is to be processed, and behaves very similarly to the 88 * {@link Iterator#hasNext()} method.</li> 89 * <li>The {@link #consume()} method returns the {@link Token#value() value} of the <i>current token</i> and moves the <i>current 90 * token</i> pointer to the next available token.</li> 91 * <li>The {@link #consume(String)} and {@link #consume(char)} methods look at the <i>current token</i> and ensure the token's 92 * {@link Token#value() value} matches the value supplied as a method parameter, or they throw a {@link ParsingException} if the 93 * values don't match. The {@link #consume(int)} method works similarly, except that it attempts to match the token's 94 * {@link Token#type() type}. And, the {@link #consume(String, String...)} is a convenience method that is equivalent to calling 95 * {@link #consume(String)} for each of the arguments.</li> 96 * <li>The {@link #canConsume(String)} and {@link #canConsume(char)} methods look at the <i>current token</i> and check whether 97 * the token's {@link Token#value() value} matches the value supplied as a method parameter. If there is a match, the method 98 * advances the <i>current token</i> reference and returns true. Otherwise, the <i>current token</i> does not match and the method 99 * returns false without advancing the <i>current token</i> reference or throwing a ParsingException. Similarly, the 100 * {@link #canConsume(int)} method checks the token's {@link Token#type() type} rather than the value, consuming the token and 101 * returning true if there is a match, or just returning false if there is no match. 
 * The {@link #canConsume(String, String...)}
 * method determines whether all of the supplied values can be consumed in the given order.</li>
 * <li>The {@link #matches(String)} and {@link #matches(char)} methods look at the <i>current token</i> and check whether the
 * token's {@link Token#value() value} matches the value supplied as a method parameter. The method then returns whether there was
 * a match, but does <i>not</i> advance the <i>current token</i> pointer. Similarly, the {@link #matches(int)} method checks the
 * token's {@link Token#type() type} rather than the value. The {@link #matches(String, String...)} method is a convenience method
 * that is equivalent to calling {@link #matches(String)} for each of the arguments, and the {@link #matches(int, int...)} method
 * is a convenience method that is equivalent to calling {@link #matches(int)} for each of the arguments.</li>
 * <li>The {@link #matchesAnyOf(String, String...)} methods look at the <i>current token</i> and check whether the token's
 * {@link Token#value() value} matches at least one of the values supplied as method parameters. The method then returns whether
 * there was a match, but does <i>not</i> advance the <i>current token</i> pointer. Similarly, the
 * {@link #matchesAnyOf(int, int...)} method checks the token's {@link Token#type() type} rather than the value.</li>
 * </ul>
 * </p>
 * <p>
 * With these methods, it's very easy to create a parser that looks at the current token to decide what to do, and then consume
 * that token, and repeat this process.
 * </p>
 * <h3>Example parser</h3>
 * <p>
 * Here is an example of a very simple parser that parses very simple and limited SQL <code>SELECT</code> and <code>DELETE</code>
 * statements, such as <code>SELECT * FROM Customers</code> or
 * <code>SELECT Name, StreetAddress AS Address, City, Zip FROM Customers</code> or
 * <code>DELETE FROM Customers WHERE Zip=12345</code>:
 *
 * <pre>
 * public class SampleSqlSelectParser {
 *     public List<Statement> parse( String ddl ) {
 *         TokenStream tokens = new TokenStream(ddl, new SqlTokenizer(), false);
 *         List<Statement> statements = new LinkedList<Statement>();
 *         tokens.start();
 *         while (tokens.hasNext()) {
 *             if (tokens.matches("SELECT")) {
 *                 statements.add(parseSelect(tokens));
 *             } else {
 *                 statements.add(parseDelete(tokens));
 *             }
 *         }
 *         return statements;
 *     }
 *
 *     protected Select parseSelect( TokenStream tokens ) throws ParsingException {
 *         tokens.consume("SELECT");
 *         List<Column> columns = parseColumns(tokens);
 *         tokens.consume("FROM");
 *         String tableName = tokens.consume();
 *         return new Select(tableName, columns);
 *     }
 *
 *     protected List<Column> parseColumns( TokenStream tokens ) throws ParsingException {
 *         List<Column> columns = new LinkedList<Column>();
 *         if (tokens.matches('*')) {
 *             tokens.consume(); // leave the columns empty to signal wildcard
 *         } else {
 *             // Read names until we see a ','
 *             do {
 *                 String columnName = tokens.consume();
 *                 if (tokens.canConsume("AS")) {
 *                     String columnAlias = tokens.consume();
 *                     columns.add(new Column(columnName, columnAlias));
 *                 } else {
 *                     columns.add(new Column(columnName, null));
 *                 }
 *             } while (tokens.canConsume(','));
 *         }
 *         return columns;
 *     }
 *
 *     protected Delete parseDelete( TokenStream tokens ) throws ParsingException {
 *         tokens.consume("DELETE", "FROM");
 *         String tableName =
tokens.consume(); 172 * tokens.consume("WHERE"); 173 * String lhs = tokens.consume(); 174 * tokens.consume('='); 175 * String rhs = tokens.consume(); 176 * return new Delete(tableName, new Criteria(lhs, rhs)); 177 * } 178 * } 179 * public abstract class Statement { ... } 180 * public class Query extends Statement { ... } 181 * public class Delete extends Statement { ... } 182 * public class Column { ... } 183 * </pre> 184 * 185 * This example shows an idiomatic way of writing a parser that is stateless and thread-safe. The <code>parse(...)</code> method 186 * takes the input as a parameter, and returns the domain-specific representation that resulted from the parsing. All other 187 * methods are utility methods that simply encapsulate common logic or make the code more readable. 188 * </p> 189 * <p> 190 * In the example, the <code>parse(...)</code> first creates a TokenStream object (using a Tokenizer implementation that is not 191 * shown), and then loops as long as there are more tokens to read. As it loops, if the next token is "SELECT", the parser calls 192 * the <code>parseSelect(...)</code> method which immediately consumes a "SELECT" token, the names of the columns separated by 193 * commas (or a '*' if there all columns are to be selected), a "FROM" token, and the name of the table being queried. The 194 * <code>parseSelect(...)</code> method returns a <code>Select</code> object, which then added to the list of statements in the 195 * <code>parse(...)</code> method. The parser handles the "DELETE" statements in a similar manner. 196 * </p> 197 * <h3>Case sensitivity</h3> 198 * <p> 199 * Very often grammars to not require the case of keywords to match. This can make parsing a challenge, because all combinations 200 * of case need to be used. The TokenStream framework provides a very simple solution that requires no more effort than providing 201 * a boolean parameter to the constructor. 
202 * </p> 203 * <p> 204 * When a <code>false</code> value is provided for the the <code>caseSensitive</code> parameter, the TokenStream performs all 205 * matching operations as if each token's value were in uppercase only. This means that the arguments supplied to the 206 * <code>match(...)</code>, <code>canConsume(...)</code>, and <code>consume(...)</code> methods should be upper-cased. Note that 207 * the <i>actual value</i> of each token remains the <i>actual</i> case as it appears in the input. 208 * </p> 209 * <p> 210 * Of course, when the TokenStream is created with a <code>true</code> value for the <code>caseSensitive</code> parameter, the 211 * matching is performed using the <i>actual</i> value as it appears in the input content 212 * </p> 213 * <h3>Whitespace</h3> 214 * <p> 215 * Many grammars are independent of lines breaks or whitespace, allowing a lot of flexibility when writing the content. The 216 * TokenStream framework makes it very easy to ignore line breaks and whitespace. To do so, the Tokenizer implementation must 217 * simply not include the line break character sequences and whitespace in the token ranges. Since none of the tokens contain 218 * whitespace, the parser never has to deal with them. 219 * </p> 220 * <p> 221 * Of course, many parsers will require that some whitespace be included. For example, whitespace within a quoted string may be 222 * needed by the parser. In this case, the Tokenizer should simply include the whitespace characters in the tokens. 223 * </p> 224 * <h3>Writing a Tokenizer</h3> 225 * <p> 226 * Each parser will likely have its own {@link Tokenizer} implementation that contains the parser-specific logic about how to 227 * break the content into token objects. Generally, the easiest way to do this is to simply iterate through the character sequence 228 * passed into the {@link Tokenizer#tokenize(CharacterStream, Tokens) tokenize(...)} method, and use a switch statement to decide 229 * what to do. 
 * </p>
 * <p>
 * Here is the code for a very basic Tokenizer implementation that ignores whitespace, line breaks and Java-style (multi-line and
 * end-of-line) comments, while constructing single tokens for each quoted string.
 *
 * <pre>
 * public class BasicTokenizer implements Tokenizer {
 *     public void tokenize( CharacterStream input,
 *                           Tokens tokens ) throws ParsingException {
 *         while (input.hasNext()) {
 *             char c = input.next();
 *             switch (c) {
 *                 case ' ':
 *                 case '\t':
 *                 case '\n':
 *                 case '\r':
 *                     // Just skip these whitespace characters ...
 *                     break;
 *                 case '-':
 *                 case '(':
 *                 case ')':
 *                 case '{':
 *                 case '}':
 *                 case '*':
 *                 case ',':
 *                 case ';':
 *                 case '+':
 *                 case '%':
 *                 case '?':
 *                 case '$':
 *                 case '[':
 *                 case ']':
 *                 case '!':
 *                 case '<':
 *                 case '>':
 *                 case '|':
 *                 case '=':
 *                 case ':':
 *                     tokens.addToken(input.index(), input.index() + 1, SYMBOL);
 *                     break;
 *                 case '.':
 *                     tokens.addToken(input.index(), input.index() + 1, DECIMAL);
 *                     break;
 *                 case '\"':
 *                     int startIndex = input.index();
 *                     Position startingPosition = input.position();
 *                     boolean foundClosingQuote = false;
 *                     while (input.hasNext()) {
 *                         c = input.next();
 *                         if (c == '\\' && input.isNext('"')) {
 *                             c = input.next(); // consume the " character since it is escaped
 *                         } else if (c == '"') {
 *                             foundClosingQuote = true;
 *                             break;
 *                         }
 *                     }
 *                     if (!foundClosingQuote) {
 *                         throw new ParsingException(startingPosition, "No matching closing double quote found");
 *                     }
 *                     int endIndex = input.index() + 1; // beyond last character read
 *                     tokens.addToken(startIndex, endIndex, DOUBLE_QUOTED_STRING);
 *                     break;
 *                 case '\'':
 *                     startIndex = input.index();
 *                     startingPosition = input.position();
 *                     foundClosingQuote = false;
 *                     while (input.hasNext()) {
 *                         c =
input.next(); 299 * if (c == '\\' && input.isNext('\'')) { 300 * c = input.next(); // consume the ' character since it is escaped 301 * } else if (c == '\'') { 302 * foundClosingQuote = true; 303 * break; 304 * } 305 * } 306 * if (!foundClosingQuote) { 307 * throw new ParsingException(startingPosition, "No matching closing single quote found"); 308 * } 309 * endIndex = input.index() + 1; // beyond last character read 310 * tokens.addToken(startIndex, endIndex, SINGLE_QUOTED_STRING); 311 * break; 312 * case '/': 313 * startIndex = input.index(); 314 * if (input.isNext('/')) { 315 * // End-of-line comment ... 316 * boolean foundLineTerminator = false; 317 * while (input.hasNext()) { 318 * c = input.next(); 319 * if (c == '\n' || c == '\r') { 320 * foundLineTerminator = true; 321 * break; 322 * } 323 * } 324 * endIndex = input.index(); // the token won't include the '\n' or '\r' character(s) 325 * if (!foundLineTerminator) ++endIndex; // must point beyond last char 326 * if (c == '\r' && input.isNext('\n')) input.next(); 327 * if (useComments) { 328 * tokens.addToken(startIndex, endIndex, COMMENT); 329 * } 330 * } else if (input.isNext('*')) { 331 * // Multi-line comment ... 332 * while (input.hasNext() && !input.isNext('*', '/')) { 333 * c = input.next(); 334 * } 335 * if (input.hasNext()) input.next(); // consume the '*' 336 * if (input.hasNext()) input.next(); // consume the '/' 337 * if (useComments) { 338 * endIndex = input.index() + 1; // the token will include the '/' and '*' characters 339 * tokens.addToken(startIndex, endIndex, COMMENT); 340 * } 341 * } else { 342 * // just a regular slash ... 
 *             tokens.addToken(startIndex, startIndex + 1, SYMBOL);
 *         }
 *         break;
 *     default:
 *         startIndex = input.index();
 *         // Read until another whitespace/symbol/decimal/slash is found
 *         while (input.hasNext() && !(input.isNextWhitespace() || input.isNextAnyOf("/.-(){}*,;+%?$[]!<>|=:"))) {
 *             c = input.next();
 *         }
 *         endIndex = input.index() + 1; // beyond last character that was included
 *         tokens.addToken(startIndex, endIndex, WORD);
 *             }
 *         }
 *     }
 * }
 * </pre>
 * Tokenizers with exactly this behavior can actually be created using the {@link #basicTokenizer(boolean)} method. So while this very
 * basic implementation is not meant to be used in all situations, it may be useful in some situations.
 * </p>
 */
@NotThreadSafe
public class TokenStream {

    /**
     * A constant that can be used with the {@link #matches(String)}, {@link #matches(String, String...)},
     * {@link #consume(String)}, {@link #consume(String, String...)}, {@link #canConsume(String)} and
     * {@link #canConsume(String, String...)} methods to signal that any value is allowed to be matched.
     * <p>
     * Note that this exact instance must be used; an equivalent string will not work.
     * </p>
     */
    public static final String ANY_VALUE = "any value";
    /**
     * A constant that can be used with the {@link #matches(int)}, {@link #matches(int, int...)}, {@link #consume(int)}, and
     * {@link #canConsume(int)} methods to signal that any token type is allowed to be matched.
     */
    public static final int ANY_TYPE = Integer.MIN_VALUE;

    // The input content exactly as supplied to the constructor
    protected final String inputString;
    // Uppercased copy of the input used for case-insensitive matching; the same instance as inputString when case-sensitive
    protected final String inputUppercased;
    private final char[] inputContent;
    private final boolean caseSensitive;
    private final Tokenizer tokenizer;
    // Lazily populated the first time start() is called; null until then
    private List<Token> tokens;
    /**
     * This class navigates the Token objects using this iterator. However, because it very often needs to access the
     * "current token" in the "consume(...)" and "canConsume(...)" and "matches(...)" methods, the class caches a "current token"
     * and makes this iterator point to the 2nd token.
     *
     * <pre>
     *    T1  T2  T3  T4  T5
     *    ˆ   ˆ   ˆ
     *    |   |   |
     *    |   |   +- The position of the tokenIterator, where tokenIterator.hasNext() will return T3
     *    |   +---- The token referenced by currentToken
     *    +-------- The logical position of the TokenStream object, where the "consume()" would return T2
     * </pre>
     */
    private ListIterator<Token> tokenIterator;
    private Token currentToken;
    private boolean completed;

    /**
     * Create a token stream for the supplied content.
     *
     * @param content the content to be tokenized; may not be null
     * @param tokenizer the tokenizer that will break the content into {@link Token} objects; may not be null
     * @param caseSensitive true if matching should use each token's actual value, or false if matching should behave as if
     *        every token value were uppercased (see the class-level Javadoc on case sensitivity)
     */
    public TokenStream( String content,
                        Tokenizer tokenizer,
                        boolean caseSensitive ) {
        CheckArg.isNotNull(content, "content");
        CheckArg.isNotNull(tokenizer, "tokenizer");
        this.inputString = content;
        this.inputContent = content.toCharArray();
        this.caseSensitive = caseSensitive;
        // When case-insensitive, matching is performed against this uppercased copy of the entire input
        this.inputUppercased = caseSensitive ? inputString : content.toUpperCase();
        this.tokenizer = tokenizer;
    }

    /**
     * Begin the token stream, including (if required) the tokenization of the input content.
     *
     * @return this object for easy method chaining; never null
     * @throws ParsingException if an error occurs during tokenization of the content
     */
    public TokenStream start() throws ParsingException {
        // Create the tokens, but only on the first call; subsequent calls reuse the token list and just reset the iterator ...
        if (tokens == null) {
            TokenFactory tokenFactory = caseSensitive ? new CaseSensitiveTokenFactory() : new CaseInsensitiveTokenFactory();
            CharacterStream characterStream = new CharacterArrayStream(inputContent);
            tokenizer.tokenize(characterStream, tokenFactory);
            this.tokens = initializeTokens(tokenFactory.getTokens());
        }

        // Create the iterator ...
        tokenIterator = this.tokens.listIterator();
        moveToNextToken();
        return this;
    }

    /**
     * Method to allow subclasses to preprocess the set of tokens and return the correct tokens to use. The default behavior is to
     * simply return the supplied tokens.
     *
     * @param tokens the tokens produced by the tokenizer; never null
     * @return list of tokens.
     */
    protected List<Token> initializeTokens( List<Token> tokens ) {
        return tokens;
    }

    /**
     * Method to allow tokens to be re-used from the start without re-tokenizing content.
     */
    public void rewind() {
        // recreate the iterator (the token list itself is retained) ...
        tokenIterator = this.tokens.listIterator();
        completed = false;
        currentToken = null;
        moveToNextToken();
    }

    /**
     * Get the position of the previous token.
     *
     * @return the previous token's position; never null
     * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
     * @throws NoSuchElementException if there is no previous token
     */
    public Position previousPosition() {
        return previousToken().position();
    }

    /**
     * Get the position of the next (or current) token.
     *
     * @return the current token's position; never null
     * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
     * @throws NoSuchElementException if there is no next (or current) token
     */
    public Position nextPosition() {
        return currentToken().position();
    }

    /**
     * Convert the value of this token to an integer, return it, and move to the next token.
     *
     * @return the current token's value, converted to an integer
     * @throws ParsingException if there is no such token to consume, or if the token cannot be converted to an integer
     * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
     */
    public int consumeInteger() throws ParsingException, IllegalStateException {
        if (completed) throwNoMoreContent();
        // Get the value from the current token ...
        String value = currentToken().value();
        try {
            int result = Integer.parseInt(value);
            // Only advance once parsing succeeded, so a failed conversion leaves the stream position unchanged
            moveToNextToken();
            return result;
        } catch (NumberFormatException e) {
            Position position = currentToken().position();
            String msg = CommonI18n.expectingValidIntegerAtLineAndColumn.text(value, position.getLine(), position.getColumn());
            throw new ParsingException(position, msg);
        }
    }

    /**
     * Convert the value of this token to a long, return it, and move to the next token.
     *
     * @return the current token's value, converted to a long
     * @throws ParsingException if there is no such token to consume, or if the token cannot be converted to a long
     * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
     */
    public long consumeLong() throws ParsingException, IllegalStateException {
        if (completed) throwNoMoreContent();
        // Get the value from the current token ...
        String value = currentToken().value();
        try {
            long result = Long.parseLong(value);
            // Only advance once parsing succeeded, so a failed conversion leaves the stream position unchanged
            moveToNextToken();
            return result;
        } catch (NumberFormatException e) {
            Position position = currentToken().position();
            String msg = CommonI18n.expectingValidLongAtLineAndColumn.text(value, position.getLine(), position.getColumn());
            throw new ParsingException(position, msg);
        }
    }

    /**
     * Convert the value of this token to a boolean, return it, and move to the next token.
528 * 529 * @return the current token's value, converted to an integer 530 * @throws ParsingException if there is no such token to consume, or if the token cannot be converted to an integer 531 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 532 */ 533 public boolean consumeBoolean() throws ParsingException, IllegalStateException { 534 if (completed) throwNoMoreContent(); 535 // Get the value from the current token ... 536 String value = currentToken().value(); 537 try { 538 boolean result = Boolean.parseBoolean(value); 539 moveToNextToken(); 540 return result; 541 } catch (NumberFormatException e) { 542 Position position = currentToken().position(); 543 String msg = CommonI18n.expectingValidBooleanAtLineAndColumn.text(value, position.getLine(), position.getColumn()); 544 throw new ParsingException(position, msg); 545 } 546 } 547 548 /** 549 * Return the value of this token and move to the next token. 550 * 551 * @return the value of the current token 552 * @throws ParsingException if there is no such token to consume 553 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 554 */ 555 public String consume() throws ParsingException, IllegalStateException { 556 if (completed) throwNoMoreContent(); 557 // Get the value from the current token ... 558 String result = currentToken().value(); 559 moveToNextToken(); 560 return result; 561 } 562 563 protected void throwNoMoreContent() throws ParsingException { 564 String msg = CommonI18n.noMoreContent.text(); 565 Position pos = tokens.isEmpty() ? new Position(-1, 1, 0) : tokens.get(tokens.size() - 1).position(); 566 throw new ParsingException(pos, msg); 567 } 568 569 /** 570 * Attempt to consume this current token as long as it matches the expected value, or throw an exception if the token does not 571 * match. 
     * <p>
     * The {@link #ANY_VALUE ANY_VALUE} constant can be used in the expected values as a wildcard.
     * </p>
     *
     * @param expected the expected value of the current token
     * @throws ParsingException if the current token doesn't match the supplied value
     * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
     */
    public void consume( String expected ) throws ParsingException, IllegalStateException {
        if (completed) {
            String msg = CommonI18n.noMoreContentButWasExpectingToken.text(expected);
            throw new ParsingException(tokens.get(tokens.size() - 1).position(), msg);
        }
        // Get the value from the current token ...
        // (the identity comparison with ANY_VALUE is intentional: only that exact instance acts as a wildcard)
        if (expected != ANY_VALUE && !currentToken().matches(expected)) {
            String found = currentToken().value();
            Position pos = currentToken().position();
            String fragment = generateFragment();
            String msg = CommonI18n.unexpectedToken.text(expected, found, pos.getLine(), pos.getColumn(), fragment);
            throw new ParsingException(pos, msg);
        }
        moveToNextToken();
    }

    /**
     * Attempt to consume this current token as long as it matches the expected character, or throw an exception if the token does
     * not match.
     *
     * @param expected the expected character of the current token
     * @throws ParsingException if the current token doesn't match the supplied value
     * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
     */
    public void consume( char expected ) throws ParsingException, IllegalStateException {
        if (completed) {
            String msg = CommonI18n.noMoreContentButWasExpectingCharacter.text(expected);
            throw new ParsingException(tokens.get(tokens.size() - 1).position(), msg);
        }
        // Get the value from the current token ...
        if (!currentToken().matches(expected)) {
            String found = currentToken().value();
            Position pos = currentToken().position();
            String fragment = generateFragment();
            String msg = CommonI18n.unexpectedCharacter.text(expected, found, pos.getLine(), pos.getColumn(), fragment);
            throw new ParsingException(pos, msg);
        }
        moveToNextToken();
    }

    /**
     * Attempt to consume this current token as long as it matches the expected token type, or throw an exception if the token
     * does not match.
     * <p>
     * The {@link #ANY_TYPE ANY_TYPE} constant can be used in the expected values as a wildcard.
     * </p>
     *
     * @param expectedType the expected token type of the current token
     * @throws ParsingException if the current token doesn't match the supplied value
     * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
     */
    public void consume( int expectedType ) throws ParsingException, IllegalStateException {
        if (completed) {
            String msg = CommonI18n.noMoreContentButWasExpectingTokenType.text(expectedType);
            throw new ParsingException(tokens.get(tokens.size() - 1).position(), msg);
        }
        // Get the value from the current token ...
        // (the identity comparison with ANY_TYPE is intentional: only that exact constant acts as a wildcard)
        if (expectedType != ANY_TYPE && currentToken().type() != expectedType) {
            String found = currentToken().value();
            Position pos = currentToken().position();
            String fragment = generateFragment();
            String msg = CommonI18n.unexpectedTokenType.text(expectedType, found, pos.getLine(), pos.getColumn(), fragment);
            throw new ParsingException(pos, msg);
        }
        moveToNextToken();
    }

    /**
     * Attempt to consume this current token as the next tokens as long as they match the expected values, or throw an exception
     * if the token does not match.
     * <p>
     * The {@link #ANY_VALUE ANY_VALUE} constant can be used in the expected values as a wildcard.
652 * </p> 653 * 654 * @param expected the expected value of the current token 655 * @param expectedForNextTokens the expected values fo the following tokens 656 * @throws ParsingException if the current token doesn't match the supplied value 657 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 658 */ 659 public void consume( String expected, 660 String... expectedForNextTokens ) throws ParsingException, IllegalStateException { 661 consume(expected); 662 for (String nextExpected : expectedForNextTokens) { 663 consume(nextExpected); 664 } 665 } 666 667 /** 668 * Attempt to consume this current token as the next tokens as long as they match the expected values, or throw an exception 669 * if the token does not match. 670 * <p> 671 * The {@link #ANY_VALUE ANY_VALUE} constant can be used in the expected values as a wildcard. 672 * </p> 673 * 674 * @param nextTokens the expected values for the next tokens 675 * @throws ParsingException if the current token doesn't match the supplied value 676 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 677 */ 678 public void consume( String[] nextTokens ) throws ParsingException, IllegalStateException { 679 for (String nextExpected : nextTokens) { 680 consume(nextExpected); 681 } 682 } 683 684 /** 685 * Attempt to consume this current token as the next tokens as long as they match the expected values, or throw an exception 686 * if the token does not match. 687 * <p> 688 * The {@link #ANY_VALUE ANY_VALUE} constant can be used in the expected values as a wildcard. 
     * </p>
     * 
     * @param nextTokens the expected values for the next tokens
     * @throws ParsingException if the current token doesn't match the supplied value
     * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
     */
    public void consume( Iterable<String> nextTokens ) throws ParsingException, IllegalStateException {
        for (String nextExpected : nextTokens) {
            consume(nextExpected);
        }
    }

    /**
     * Attempt to consume this current token if it matches the expected value, and return whether this method was indeed able to
     * consume the token.
     * <p>
     * The {@link #ANY_VALUE ANY_VALUE} constant can be used in the expected value as a wildcard.
     * </p>
     * 
     * @param expected the expected value of the current token
     * @return true if the current token did match and was consumed, or false if the current token did not match and therefore was
     *         not consumed
     * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
     */
    public boolean canConsume( String expected ) throws IllegalStateException {
        if (!matches(expected)) return false;
        moveToNextToken();
        return true;
    }

    /**
     * Attempt to consume this current token if it matches the expected value, and return whether this method was indeed able to
     * consume the token.
     * 
     * @param expected the expected value of the current token
     * @return true if the current token did match and was consumed, or false if the current token did not match and therefore was
     *         not consumed
     * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
     */
    public boolean canConsume( char expected ) throws IllegalStateException {
        if (!matches(expected)) return false;
        moveToNextToken();
        return true;
    }

    /**
     * Attempt to consume this current token if it matches the expected token type, and return whether this method was indeed able
     * to consume the token.
     * <p>
     * The {@link #ANY_TYPE ANY_TYPE} constant can be used in the expected type as a wildcard.
     * </p>
     * 
     * @param expectedType the expected token type of the current token
     * @return true if the current token did match and was consumed, or false if the current token did not match and therefore was
     *         not consumed
     * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
     */
    public boolean canConsume( int expectedType ) throws IllegalStateException {
        if (!matches(expectedType)) return false;
        moveToNextToken();
        return true;
    }

    /**
     * Attempt to consume this current token and the next tokens if and only if they match the expected values, and return whether
     * this method was indeed able to consume all of the supplied tokens.
     * <p>
     * This is <i>not</i> the same as calling {@link #canConsume(String)} for each of the supplied arguments, since this method
     * ensures that <i>all</i> of the supplied values can be consumed.
     * </p>
     * <p>
     * This method <i>is</i> equivalent to calling the following:
     * 
     * <pre>
     * 
     * if (tokens.matches(currentExpected, expectedForNextTokens)) {
     *     tokens.consume(currentExpected, expectedForNextTokens);
     * }
     * 
     * </pre>
     * 
     * </p>
     * <p>
     * The {@link #ANY_VALUE ANY_VALUE} constant can be used in the expected values as a wildcard.
     * </p>
     * 
     * @param currentExpected the expected value of the current token
     * @param expectedForNextTokens the expected values of the following tokens
     * @return true if the tokens did match and were consumed, or false if they did not match and therefore were not consumed
     * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
     */
    public boolean canConsume( String currentExpected,
                               String... expectedForNextTokens ) throws IllegalStateException {
        if (completed) return false;
        // Start a look-ahead iterator positioned at the current token (previousIndex() points at it) ...
        ListIterator<Token> iter = tokens.listIterator(tokenIterator.previousIndex());
        if (!iter.hasNext()) return false;
        Token token = iter.next();
        // Note: '==' is intentional; ANY_VALUE is a sentinel matched by identity ...
        if (currentExpected != ANY_VALUE && !token.matches(currentExpected)) return false;
        for (String nextExpected : expectedForNextTokens) {
            if (!iter.hasNext()) return false;
            token = iter.next();
            if (nextExpected == ANY_VALUE) continue;
            if (!token.matches(nextExpected)) return false;
        }
        // Everything matched, so commit the look-ahead iterator as the new stream position ...
        this.tokenIterator = iter;
        this.currentToken = tokenIterator.hasNext() ? tokenIterator.next() : null;
        this.completed = this.currentToken == null;
        return true;
    }

    /**
     * Attempt to consume this current token and the next tokens if and only if they match the expected values, and return whether
     * this method was indeed able to consume all of the supplied tokens.
     * <p>
     * This is <i>not</i> the same as calling {@link #canConsume(String)} for each of the supplied arguments, since this method
     * ensures that <i>all</i> of the supplied values can be consumed.
     * </p>
     * <p>
     * This method <i>is</i> equivalent to calling the following:
     * 
     * <pre>
     * 
     * if (tokens.matches(currentExpected, expectedForNextTokens)) {
     *     tokens.consume(currentExpected, expectedForNextTokens);
     * }
     * 
     * </pre>
     * 
     * </p>
     * <p>
     * The {@link #ANY_VALUE ANY_VALUE} constant can be used in the expected values as a wildcard.
     * </p>
     * 
     * @param nextTokens the expected values of the next tokens
     * @return true if the tokens did match and were consumed, or false otherwise
     * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
     */
    public boolean canConsume( String[] nextTokens ) throws IllegalStateException {
        if (completed) return false;
        // Look ahead from the current token without moving the stream ...
        ListIterator<Token> iter = tokens.listIterator(tokenIterator.previousIndex());
        Token token = null;
        for (String nextExpected : nextTokens) {
            if (!iter.hasNext()) return false;
            token = iter.next();
            if (nextExpected == ANY_VALUE) continue; // identity comparison is intentional for the sentinel
            if (!token.matches(nextExpected)) return false;
        }
        // Everything matched, so commit the new position ...
        this.tokenIterator = iter;
        this.currentToken = tokenIterator.hasNext() ? tokenIterator.next() : null;
        this.completed = this.currentToken == null;
        return true;
    }

    /**
     * Attempt to consume this current token and the next tokens if and only if they match the expected values, and return whether
     * this method was indeed able to consume all of the supplied tokens.
     * <p>
     * This is <i>not</i> the same as calling {@link #canConsume(String)} for each of the supplied arguments, since this method
     * ensures that <i>all</i> of the supplied values can be consumed.
     * </p>
     * <p>
     * This method <i>is</i> equivalent to calling the following:
     * 
     * <pre>
     * 
     * if (tokens.matches(currentExpected, expectedForNextTokens)) {
     *     tokens.consume(currentExpected, expectedForNextTokens);
     * }
     * 
     * </pre>
     * 
     * </p>
     * <p>
     * The {@link #ANY_VALUE ANY_VALUE} constant can be used in the expected values as a wildcard.
     * </p>
     * 
     * @param nextTokens the expected values of the next tokens
     * @return true if the tokens did match and were consumed, or false otherwise
     * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
     */
    public boolean canConsume( Iterable<String> nextTokens ) throws IllegalStateException {
        if (completed) return false;
        // Look ahead from the current token without moving the stream ...
        ListIterator<Token> iter = tokens.listIterator(tokenIterator.previousIndex());
        Token token = null;
        for (String nextExpected : nextTokens) {
            if (!iter.hasNext()) return false;
            token = iter.next();
            if (nextExpected == ANY_VALUE) continue; // identity comparison is intentional for the sentinel
            if (!token.matches(nextExpected)) return false;
        }
        // Everything matched, so commit the new position ...
        this.tokenIterator = iter;
        this.currentToken = tokenIterator.hasNext() ? tokenIterator.next() : null;
        this.completed = this.currentToken == null;
        return true;
    }

    /**
     * Attempt to consume the next token if it matches one of the supplied values.
     * 
     * @param firstOption the first option for the value of the current token
     * @param additionalOptions the additional options for the value of the current token
     * @return true if the current token's value did match one of the supplied options, or false otherwise
     * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
     */
    public boolean canConsumeAnyOf( String firstOption,
                                    String... additionalOptions ) throws IllegalStateException {
        if (completed) return false;
        if (canConsume(firstOption)) return true;
        for (String nextOption : additionalOptions) {
            if (canConsume(nextOption)) return true;
        }
        return false;
    }

    /**
     * Attempt to consume the next token if it matches one of the supplied values.
     * 
     * @param options the options for the value of the current token
     * @return true if the current token's value did match one of the supplied options, or false otherwise
     * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
     */
    public boolean canConsumeAnyOf( String[] options ) throws IllegalStateException {
        if (completed) return false;
        for (String option : options) {
            if (canConsume(option)) return true;
        }
        return false;
    }

    /**
     * Attempt to consume the next token if it matches one of the supplied values.
     * 
     * @param options the options for the value of the current token
     * @return true if the current token's value did match one of the supplied options, or false otherwise
     * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
     */
    public boolean canConsumeAnyOf( Iterable<String> options ) throws IllegalStateException {
        if (completed) return false;
        for (String option : options) {
            if (canConsume(option)) return true;
        }
        return false;
    }

    /**
     * Attempt to consume the next token if it matches one of the supplied types.
     * 
     * @param firstTypeOption the first option for the type of the current token
     * @param additionalTypeOptions the additional options for the type of the current token
     * @return true if the current token's type matched one of the supplied options, or false otherwise
     * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
     */
    public boolean canConsumeAnyOf( int firstTypeOption,
                                    int... additionalTypeOptions ) throws IllegalStateException {
        if (completed) return false;
        if (canConsume(firstTypeOption)) return true;
        for (int nextTypeOption : additionalTypeOptions) {
            if (canConsume(nextTypeOption)) return true;
        }
        return false;
    }

    /**
     * Attempt to consume the next token if it matches one of the supplied types.
     * 
     * @param typeOptions the options for the type of the current token
     * @return true if the current token's type matched one of the supplied options, or false otherwise
     * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
     */
    public boolean canConsumeAnyOf( int[] typeOptions ) throws IllegalStateException {
        if (completed) return false;
        for (int nextTypeOption : typeOptions) {
            if (canConsume(nextTypeOption)) return true;
        }
        return false;
    }

    /**
     * Determine if the current token matches the expected value.
     * <p>
     * The {@link #ANY_VALUE ANY_VALUE} constant can be used as a wildcard.
     * </p>
     * 
     * @param expected the expected value of the current token
     * @return true if the current token did match, or false if the current token did not match
     * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
     */
    public boolean matches( String expected ) throws IllegalStateException {
        // Identity comparison with the ANY_VALUE sentinel is intentional ...
        return !completed && (expected == ANY_VALUE || currentToken().matches(expected));
    }

    /**
     * Determine if the current token matches the expected value.
     * 
     * @param expected the expected value of the current token
     * @return true if the current token did match, or false if the current token did not match
     * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
     */
    public boolean matches( char expected ) throws IllegalStateException {
        return !completed && currentToken().matches(expected);
    }

    /**
     * Determine if the current token matches the expected token type.
     * 
     * @param expectedType the expected token type of the current token
     * @return true if the current token did match, or false if the current token did not match
     * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
     */
    public boolean matches( int expectedType ) throws IllegalStateException {
        return !completed && currentToken().matches(expectedType);
    }

    /**
     * Determine if the next few tokens match the expected values.
     * <p>
     * The {@link #ANY_VALUE ANY_VALUE} constant can be used in the expected values as a wildcard.
     * </p>
     * 
     * @param currentExpected the expected value of the current token
     * @param expectedForNextTokens the expected values for the following tokens
     * @return true if the tokens did match, or false otherwise
     * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
     */
    public boolean matches( String currentExpected,
                            String... expectedForNextTokens ) throws IllegalStateException {
        if (completed) return false;
        // Look ahead from the current token; this does not modify the stream position ...
        ListIterator<Token> iter = tokens.listIterator(tokenIterator.previousIndex());
        if (!iter.hasNext()) return false;
        Token token = iter.next();
        // Note: '==' is intentional; ANY_VALUE is a sentinel matched by identity ...
        if (currentExpected != ANY_VALUE && !token.matches(currentExpected)) return false;
        for (String nextExpected : expectedForNextTokens) {
            if (!iter.hasNext()) return false;
            token = iter.next();
            if (nextExpected == ANY_VALUE) continue;
            if (!token.matches(nextExpected)) return false;
        }
        return true;
    }

    /**
     * Determine if the next few tokens match the expected values.
     * <p>
     * The {@link #ANY_VALUE ANY_VALUE} constant can be used in the expected values as a wildcard.
     * </p>
     * 
     * @param nextTokens the expected value of the next tokens
     * @return true if the tokens did match, or false otherwise
     * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
     */
    public boolean matches( String[] nextTokens ) throws IllegalStateException {
        if (completed) return false;
        // Look ahead from the current token; this does not modify the stream position ...
        ListIterator<Token> iter = tokens.listIterator(tokenIterator.previousIndex());
        Token token = null;
        for (String nextExpected : nextTokens) {
            if (!iter.hasNext()) return false;
            token = iter.next();
            if (nextExpected == ANY_VALUE) continue;
            if (!token.matches(nextExpected)) return false;
        }
        return true;
    }

    /**
     * Determine if the next few tokens match the expected values.
     * <p>
     * The {@link #ANY_VALUE ANY_VALUE} constant can be used in the expected values as a wildcard.
     * </p>
     * 
     * @param nextTokens the expected value of the next tokens
     * @return true if the tokens did match, or false otherwise
     * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
     */
    public boolean matches( Iterable<String> nextTokens ) throws IllegalStateException {
        if (completed) return false;
        // Look ahead from the current token; this does not modify the stream position ...
        ListIterator<Token> iter = tokens.listIterator(tokenIterator.previousIndex());
        Token token = null;
        for (String nextExpected : nextTokens) {
            if (!iter.hasNext()) return false;
            token = iter.next();
            if (nextExpected == ANY_VALUE) continue;
            if (!token.matches(nextExpected)) return false;
        }
        return true;
    }

    /**
     * Determine if the next few tokens have the supplied types.
     * <p>
     * The {@link #ANY_TYPE ANY_TYPE} constant can be used in the expected values as a wildcard.
1082 * </p> 1083 * 1084 * @param currentExpectedType the expected type of the current token 1085 * @param expectedTypeForNextTokens the expected type for the following tokens 1086 * @return true if the tokens did match, or false otherwise 1087 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 1088 */ 1089 public boolean matches( int currentExpectedType, 1090 int... expectedTypeForNextTokens ) throws IllegalStateException { 1091 if (completed) return false; 1092 ListIterator<Token> iter = tokens.listIterator(tokenIterator.previousIndex()); 1093 if (!iter.hasNext()) return false; 1094 Token token = iter.next(); 1095 if (currentExpectedType != ANY_TYPE && currentToken().type() != currentExpectedType) return false; 1096 for (int nextExpectedType : expectedTypeForNextTokens) { 1097 if (!iter.hasNext()) return false; 1098 token = iter.next(); 1099 if (nextExpectedType == ANY_TYPE) continue; 1100 if (token.type() != nextExpectedType) return false; 1101 } 1102 return true; 1103 } 1104 1105 /** 1106 * Determine if the next few tokens have the supplied types. 1107 * <p> 1108 * The {@link #ANY_TYPE ANY_TYPE} constant can be used in the expected values as a wildcard. 
     * </p>
     * 
     * @param typesForNextTokens the expected type for each of the next tokens
     * @return true if the tokens did match, or false otherwise
     * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
     */
    public boolean matches( int[] typesForNextTokens ) throws IllegalStateException {
        if (completed) return false;
        // Look ahead from the current token; this does not modify the stream position ...
        ListIterator<Token> iter = tokens.listIterator(tokenIterator.previousIndex());
        Token token = null;
        for (int nextExpectedType : typesForNextTokens) {
            if (!iter.hasNext()) return false;
            token = iter.next();
            if (nextExpectedType == ANY_TYPE) continue;
            if (!token.matches(nextExpectedType)) return false;
        }
        return true;
    }

    /**
     * Determine if the next token matches one of the supplied values.
     * 
     * @param firstOption the first option for the value of the current token
     * @param additionalOptions the additional options for the value of the current token
     * @return true if the current token's value did match one of the supplied options, or false otherwise
     * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
     */
    public boolean matchesAnyOf( String firstOption,
                                 String... additionalOptions ) throws IllegalStateException {
        if (completed) return false;
        Token current = currentToken();
        if (current.matches(firstOption)) return true;
        for (String nextOption : additionalOptions) {
            if (current.matches(nextOption)) return true;
        }
        return false;
    }

    /**
     * Determine if the next token matches one of the supplied values.
     * 
     * @param options the options for the value of the current token
     * @return true if the current token's value did match one of the supplied options, or false otherwise
     * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
     */
    public boolean matchesAnyOf( String[] options ) throws IllegalStateException {
        if (completed) return false;
        Token current = currentToken();
        for (String option : options) {
            if (current.matches(option)) return true;
        }
        return false;
    }

    /**
     * Determine if the next token matches one of the supplied values.
     * 
     * @param options the options for the value of the current token
     * @return true if the current token's value did match one of the supplied options, or false otherwise
     * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
     */
    public boolean matchesAnyOf( Iterable<String> options ) throws IllegalStateException {
        if (completed) return false;
        Token current = currentToken();
        for (String option : options) {
            if (current.matches(option)) return true;
        }
        return false;
    }

    /**
     * Determine if the current token has one of the supplied types.
     * 
     * @param firstTypeOption the first option for the type of the current token
     * @param additionalTypeOptions the additional options for the type of the current token
     * @return true if the current token's type matched one of the supplied options, or false otherwise
     * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
     */
    public boolean matchesAnyOf( int firstTypeOption,
                                 int... additionalTypeOptions ) throws IllegalStateException {
        if (completed) return false;
        int currentType = currentToken().type();
        if (currentType == firstTypeOption) return true;
        for (int nextTypeOption : additionalTypeOptions) {
            if (currentType == nextTypeOption) return true;
        }
        return false;
    }

    /**
     * Determine if the current token has one of the supplied types.
     * 
     * @param typeOptions the options for the type of the current token
     * @return true if the current token's type matched one of the supplied options, or false otherwise
     * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
     */
    public boolean matchesAnyOf( int[] typeOptions ) throws IllegalStateException {
        if (completed) return false;
        int currentType = currentToken().type();
        for (int nextTypeOption : typeOptions) {
            if (currentType == nextTypeOption) return true;
        }
        return false;
    }

    /**
     * Determine if this stream has another token to be consumed.
     * 
     * @return true if there is another token ready for consumption, or false otherwise
     * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
     */
    public boolean hasNext() {
        if (tokenIterator == null) {
            throw new IllegalStateException(CommonI18n.startMethodMustBeCalledBeforeNext.text());
        }
        return !completed;
    }

    /**
     * {@inheritDoc}
     * 
     * @see java.lang.Object#toString()
     */
    @Override
    public String toString() {
        // NOTE(review): assumes start() has already been called; tokenIterator is null before that — confirm callers
        ListIterator<Token> iter = tokens.listIterator(tokenIterator.previousIndex());
        StringBuilder sb = new StringBuilder();
        if (iter.hasNext()) {
            sb.append(iter.next());
            int count = 1;
            while (iter.hasNext()) {
                // Cap the output at roughly 20 tokens, then elide the rest ...
                if (count > 20) {
                    sb.append(" ...");
                    break;
                }
                sb.append(" ");
                ++count;
                sb.append(iter.next());
            }
        }
        return sb.toString();
    }

    // Advance the stream by one token, marking it completed when the tokens are exhausted.
    private void moveToNextToken() {
        // And move the currentToken to the next token ...
        if (!tokenIterator.hasNext()) {
            completed = true;
            currentToken = null;
        } else {
            currentToken = tokenIterator.next();
        }
    }

    /**
     * Get the current token.
     * 
     * @return the current token; never null
     * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
     * @throws NoSuchElementException if there are no more tokens
     */
    final Token currentToken() throws IllegalStateException, NoSuchElementException {
        if (currentToken == null) {
            // Either the stream has been fully consumed, or start() was never called ...
            if (completed) {
                throw new NoSuchElementException(CommonI18n.noMoreContent.text());
            }
            throw new IllegalStateException(CommonI18n.startMethodMustBeCalledBeforeConsumingOrMatching.text());
        }
        assert currentToken != null;
        return currentToken;
    }

    /**
     * Gets the content string starting at the first position (inclusive) and continuing up to the end position (exclusive).
     * 
     * @param starting the position marking the beginning of the desired content string.
     * @param end the position located directly after the returned content string; can be null, which means end of content
     * @return the content string; never null
     * @throws IllegalArgumentException if the starting position is not before the end position
     */
    public String getContentBetween( Position starting,
                                     Position end ) {
        CheckArg.isNotNull(starting, "starting");

        int startIndex = starting.getIndexInContent();
        // A null end position means "through the end of the content" ...
        int endIndex = inputString.length();
        if (end != null) {
            endIndex = end.getIndexInContent();
        }

        if (startIndex >= endIndex) {
            throw new IllegalArgumentException(CommonI18n.endPositionMustBeGreaterThanStartingPosition.text(startIndex, endIndex));
        }

        return inputString.substring(startIndex, endIndex);
    }

    /**
     * Get the previous token. This does not modify the state.
     * 
     * @return the previous token; never null
     * @throws IllegalStateException if this method was called before the stream was {@link #start() started}
     * @throws NoSuchElementException if there is no previous token
     */
    final Token previousToken() throws IllegalStateException, NoSuchElementException {
        if (currentToken == null) {
            if (completed) {
                if (tokens.isEmpty()) {
                    throw new NoSuchElementException(CommonI18n.noMoreContent.text());
                }
                // The stream is exhausted, so the previous token is the very last one ...
                return tokens.get(tokens.size() - 1);
            }
            throw new IllegalStateException(CommonI18n.startMethodMustBeCalledBeforeConsumingOrMatching.text());
        }
        // previousIndex() points at the current token; index 0 means there is nothing before it ...
        if (tokenIterator.previousIndex() == 0) {
            throw new NoSuchElementException(CommonI18n.noMoreContent.text());
        }
        return tokens.get(tokenIterator.previousIndex() - 1);
    }

    // Build a highlighted fragment of the input around the current token, for use in error messages.
    String generateFragment() {
        // Find the current position ...
        assert currentToken != null;
        int startIndex = currentToken.startIndex();
        return generateFragment(inputString, startIndex, 20, " ===>> ");
    }

    /**
     * Utility method to generate a highlighted fragment of a particular point in the stream.
     * 
     * @param content the content from which the fragment should be taken; may not be null
     * @param indexOfProblem the index of the problem point that should be highlighted; must be a valid index in the content
     * @param charactersToIncludeBeforeAndAfter the maximum number of characters before and after the problem point to include in
     *        the fragment
     * @param highlightText the text that should be included in the fragment at the problem point to highlight the location, or an
     *        empty string if there should be no highlighting
     * @return the highlighted fragment; never null
     */
    static String generateFragment( String content,
                                    int indexOfProblem,
                                    int charactersToIncludeBeforeAndAfter,
                                    String highlightText ) {
        assert content != null;
        assert indexOfProblem < content.length();
        // Find the substring that immediately precedes the current position ...
        int beforeStart = Math.max(0, indexOfProblem - charactersToIncludeBeforeAndAfter);
        String before = content.substring(beforeStart, indexOfProblem);

        // Find the substring that immediately follows the current position ...
        int afterEnd = Math.min(indexOfProblem + charactersToIncludeBeforeAndAfter, content.length());
        String after = content.substring(indexOfProblem, afterEnd);

        // A null highlight is treated the same as an empty one ...
        return before + (highlightText != null ? highlightText : "") + after;
    }

    /**
     * Interface for a Tokenizer component responsible for processing the characters in a {@link CharacterStream} and constructing
     * the appropriate {@link Token} objects.
     */
    public static interface Tokenizer {
        /**
         * Process the supplied characters and construct the appropriate {@link Token} objects.
         * 
         * @param input the character input stream; never null
         * @param tokens the factory for {@link Token} objects, which records the order in which the tokens are created
         * @throws ParsingException if there is an error while processing the character stream (e.g., a quote is not closed, etc.)
         */
        void tokenize( CharacterStream input,
                       Tokens tokens ) throws ParsingException;
    }

    /**
     * Interface used by a {@link Tokenizer} to iterate through the characters in the content input to the {@link TokenStream}.
     */
    public static interface CharacterStream {

        /**
         * Determine if there is another character available in this stream.
         * 
         * @return true if there is another character (and {@link #next()} can be called), or false otherwise
         */
        boolean hasNext();

        /**
         * Obtain the next character value, and advance the stream.
         * 
         * @return the next character
         * @throws NoSuchElementException if there is no {@link #hasNext() next character}
         */
        char next();

        /**
         * Get the index for the last character returned from {@link #next()}.
         * 
         * @return the index of the last character returned
         */
        int index();

        /**
         * Get the position for the last character returned from {@link #next()}.
         * 
         * @param startIndex the index at which the resulting position starts; presumably the index of the token's first
         *        character — confirm with implementations
         * @return the position of the last character returned; never null
         */
        Position position( int startIndex );

        /**
         * Determine if the next character on the stream is a {@link Character#isWhitespace(char) whitespace character}. This
         * method does <i>not</i> advance the stream.
         * 
         * @return true if there is a {@link #next() next} character and it is a whitespace character, or false otherwise
         */
        boolean isNextWhitespace();

        /**
         * Determine if the next character on the stream is a {@link Character#isLetterOrDigit(char) letter or digit}. This method
         * does <i>not</i> advance the stream.
         * 
         * @return true if there is a {@link #next() next} character and it is a letter or digit, or false otherwise
         */
        boolean isNextLetterOrDigit();

        /**
         * Determine if the next character on the stream is a {@link XmlCharacters#isValid(int) valid XML character}. This method
         * does <i>not</i> advance the stream.
         * 
         * @return true if there is a {@link #next() next} character and it is a valid XML character, or false otherwise
         */
        boolean isNextValidXmlCharacter();

        /**
         * Determine if the next character on the stream is a {@link XmlCharacters#isValidName(int) valid XML Name character}.
         * This method does <i>not</i> advance the stream.
         * 
         * @return true if there is a {@link #next() next} character and it is a valid XML Name character, or false otherwise
         */
        boolean isNextValidXmlNameCharacter();

        /**
         * Determine if the next character on the stream is a {@link XmlCharacters#isValidNcName(int) valid XML NCName character}.
         * This method does <i>not</i> advance the stream.
         * 
         * @return true if there is a {@link #next() next} character and it is a valid XML NCName character, or false otherwise
         */
        boolean isNextValidXmlNcNameCharacter();

        /**
         * Determine if the next character on the stream is the supplied value. This method does <i>not</i> advance the stream.
         * 
         * @param c the character value to compare to the next character on the stream
         * @return true if there is a {@link #next() next} character and it is the supplied character, or false otherwise
         */
        boolean isNext( char c );

        /**
         * Determine if the next two characters on the stream match the supplied values. This method does <i>not</i> advance the
         * stream.
         * 
         * @param nextChar the character value to compare to the next character on the stream
         * @param followingChar the character value to compare to the character immediately after the next character on the stream
         * @return true if there are at least two characters left on the stream and the first matches <code>nextChar</code> and
         *         the second matches <code>followingChar</code>
         */
        boolean isNext( char nextChar,
                        char followingChar );

        /**
         * Determine if the next three characters on the stream match the supplied values. This method does <i>not</i> advance the
         * stream.
         * 
         * @param nextChar the character value to compare to the next character on the stream
         * @param nextChar2 the character value to compare to the second character on the stream
         * @param nextChar3 the character value to compare to the third character on the stream
         * @return true if there are at least three characters left on the stream and the first matches <code>nextChar</code>,
         *         the second matches <code>nextChar2</code>, and the third matches <code>nextChar3</code>
         */
        boolean isNext( char nextChar,
                        char nextChar2,
                        char nextChar3 );

        /**
         * Determine if the next character on the stream matches one of the supplied characters. This method does <i>not</i>
         * advance the stream.
         * 
         * @param characters the characters to match
         * @return true if there is a {@link #next() next} character and it does match one of the supplied characters, or false
         *         otherwise
         */
        boolean isNextAnyOf( char[] characters );

        /**
         * Determine if the next character on the stream matches one of the supplied characters. This method does <i>not</i>
         * advance the stream.
         * 
         * @param characters the characters to match
         * @return true if there is a {@link #next() next} character and it does match one of the supplied characters, or false
         *         otherwise
         */
        boolean isNextAnyOf( String characters );

    }

    /**
     * A factory for Token objects, used by a {@link Tokenizer} to create tokens in the correct order.
     */
    public static interface Tokens {
        /**
         * Create a single-character token at the supplied index in the character stream. The token type is set to 0, meaning this
         * is equivalent to calling <code>addToken(index,index+1)</code> or <code>addToken(index,index+1,0)</code>.
         * 
         * @param position the position (line and column numbers) of this new token; may not be null
         * @param index the index of the character to appear in the token; must be a valid index in the stream
         */
        void addToken( Position position,
                       int index );

        /**
         * Create a single- or multi-character token with the characters in the range given by the starting and ending index in
         * the character stream. The character at the ending index is <i>not</i> included in the token (as this is standard
         * practice when using 0-based indexes). The token type is set to 0, meaning this is equivalent to calling <code>
         * addToken(startIndex,endIndex,0)</code> .
1528 * 1529 * @param position the position (line and column numbers) of this new token; may not be null 1530 * @param startIndex the index of the first character to appear in the token; must be a valid index in the stream 1531 * @param endIndex the index just past the last character to appear in the token; must be a valid index in the stream 1532 */ 1533 void addToken( Position position, 1534 int startIndex, 1535 int endIndex ); 1536 1537 /** 1538 * Create a single- or multi-character token with the supplied type and with the characters in the range given by the 1539 * starting and ending index in the character stream. The character at the ending index is <i>not</i> included in the 1540 * token (as this is standard practice when using 0-based indexes). 1541 * 1542 * @param position the position (line and column numbers) of this new token; may not be null 1543 * @param startIndex the index of the first character to appear in the token; must be a valid index in the stream 1544 * @param endIndex the index just past the last character to appear in the token; must be a valid index in the stream 1545 * @param type the type of the token 1546 */ 1547 void addToken( Position position, 1548 int startIndex, 1549 int endIndex, 1550 int type ); 1551 } 1552 1553 /** 1554 * The interface defining a token, which references the characters in the actual input character stream. 1555 * 1556 * @see CaseSensitiveTokenFactory 1557 * @see CaseInsensitiveTokenFactory 1558 */ 1559 @Immutable 1560 public interface Token { 1561 /** 1562 * Get the value of the token, in actual case. 1563 * 1564 * @return the value 1565 */ 1566 String value(); 1567 1568 /** 1569 * Determine if the token matches the supplied string. 1570 * 1571 * @param expected the expected value 1572 * @return true if the token's value matches the supplied value, or false otherwise 1573 */ 1574 boolean matches( String expected ); 1575 1576 /** 1577 * Determine if the token matches the supplied character. 
1578 * 1579 * @param expected the expected character value 1580 * @return true if the token's value matches the supplied character value, or false otherwise 1581 */ 1582 boolean matches( char expected ); 1583 1584 /** 1585 * Determine if the token matches the supplied type. 1586 * 1587 * @param expectedType the expected integer type 1588 * @return true if the token's value matches the supplied integer type, or false otherwise 1589 */ 1590 boolean matches( int expectedType ); 1591 1592 /** 1593 * Get the type of the token. 1594 * 1595 * @return the token's type 1596 */ 1597 int type(); 1598 1599 /** 1600 * Get the index in the raw stream for the first character in the token. 1601 * 1602 * @return the starting index of the token 1603 */ 1604 int startIndex(); 1605 1606 /** 1607 * Get the index in the raw stream past the last character in the token. 1608 * 1609 * @return the ending index of the token, which is past the last character 1610 */ 1611 int endIndex(); 1612 1613 /** 1614 * Get the length of the token, which is equivalent to <code>endIndex() - startIndex()</code>. 1615 * 1616 * @return the length 1617 */ 1618 int length(); 1619 1620 /** 1621 * Get the position of this token, which includes the line number and column number of the first character in the token. 1622 * 1623 * @return the position; never null 1624 */ 1625 Position position(); 1626 1627 /** 1628 * Bitmask ORed with existing type value. 1629 * 1630 * @param typeMask 1631 * @return copy of Token with new type 1632 */ 1633 Token withType( int typeMask ); 1634 } 1635 1636 /** 1637 * An immutable {@link Token} that implements matching using case-sensitive logic. 
1638 */ 1639 @Immutable 1640 protected class CaseSensitiveToken implements Token { 1641 private final int startIndex; 1642 private final int endIndex; 1643 private final int type; 1644 private final Position position; 1645 1646 public CaseSensitiveToken( int startIndex, 1647 int endIndex, 1648 int type, 1649 Position position ) { 1650 this.startIndex = startIndex; 1651 this.endIndex = endIndex; 1652 this.type = type; 1653 this.position = position; 1654 } 1655 1656 /** 1657 * {@inheritDoc} 1658 * 1659 * @see org.modeshape.common.text.TokenStream.Token#withType(int) 1660 */ 1661 public Token withType( int typeMask ) { 1662 int type = this.type | typeMask; 1663 return new CaseSensitiveToken(startIndex, endIndex, type, position); 1664 } 1665 1666 /** 1667 * {@inheritDoc} 1668 * 1669 * @see org.modeshape.common.text.TokenStream.Token#type() 1670 */ 1671 public final int type() { 1672 return type; 1673 } 1674 1675 /** 1676 * {@inheritDoc} 1677 * 1678 * @see org.modeshape.common.text.TokenStream.Token#startIndex() 1679 */ 1680 public final int startIndex() { 1681 return startIndex; 1682 } 1683 1684 /** 1685 * {@inheritDoc} 1686 * 1687 * @see org.modeshape.common.text.TokenStream.Token#endIndex() 1688 */ 1689 public final int endIndex() { 1690 return endIndex; 1691 } 1692 1693 /** 1694 * {@inheritDoc} 1695 * 1696 * @see org.modeshape.common.text.TokenStream.Token#length() 1697 */ 1698 public final int length() { 1699 return endIndex - startIndex; 1700 } 1701 1702 /** 1703 * {@inheritDoc} 1704 * 1705 * @see org.modeshape.common.text.TokenStream.Token#matches(char) 1706 */ 1707 public final boolean matches( char expected ) { 1708 return length() == 1 && matchString().charAt(startIndex) == expected; 1709 } 1710 1711 /** 1712 * {@inheritDoc} 1713 * 1714 * @see org.modeshape.common.text.TokenStream.Token#matches(java.lang.String) 1715 */ 1716 public final boolean matches( String expected ) { 1717 return matchString().substring(startIndex, endIndex).equals(expected); 1718 } 1719 
1720 /** 1721 * {@inheritDoc} 1722 * 1723 * @see org.modeshape.common.text.TokenStream.Token#matches(int) 1724 */ 1725 public final boolean matches( int expectedType ) { 1726 return expectedType == ANY_TYPE || (currentToken().type() & expectedType) == expectedType; 1727 } 1728 1729 /** 1730 * {@inheritDoc} 1731 * 1732 * @see org.modeshape.common.text.TokenStream.Token#value() 1733 */ 1734 public final String value() { 1735 return inputString.substring(startIndex, endIndex); 1736 } 1737 1738 /** 1739 * {@inheritDoc} 1740 * 1741 * @see org.modeshape.common.text.TokenStream.Token#position() 1742 */ 1743 public Position position() { 1744 return position; 1745 } 1746 1747 protected String matchString() { 1748 return inputString; 1749 } 1750 1751 /** 1752 * {@inheritDoc} 1753 * 1754 * @see java.lang.Object#toString() 1755 */ 1756 @Override 1757 public String toString() { 1758 return value(); 1759 } 1760 } 1761 1762 @Immutable 1763 protected class CaseInsensitiveToken extends CaseSensitiveToken { 1764 public CaseInsensitiveToken( int startIndex, 1765 int endIndex, 1766 int type, 1767 Position position ) { 1768 super(startIndex, endIndex, type, position); 1769 } 1770 1771 /** 1772 * {@inheritDoc} 1773 * 1774 * @see org.modeshape.common.text.TokenStream.CaseSensitiveToken#matchString() 1775 */ 1776 @Override 1777 protected String matchString() { 1778 return inputUppercased; 1779 } 1780 1781 /** 1782 * {@inheritDoc} 1783 * 1784 * @see org.modeshape.common.text.TokenStream.Token#withType(int) 1785 */ 1786 @Override 1787 public Token withType( int typeMask ) { 1788 int type = this.type() | typeMask; 1789 return new CaseInsensitiveToken(startIndex(), endIndex(), type, position()); 1790 } 1791 } 1792 1793 protected abstract class TokenFactory implements Tokens { 1794 protected final List<Token> tokens = new ArrayList<Token>(); 1795 1796 /** 1797 * {@inheritDoc} 1798 * 1799 * @see org.modeshape.common.text.TokenStream.Tokens#addToken(Position, int) 1800 */ 1801 public void 
addToken( Position position, 1802 int index ) { 1803 addToken(position, index, index + 1, 0); 1804 } 1805 1806 /** 1807 * {@inheritDoc} 1808 * 1809 * @see org.modeshape.common.text.TokenStream.Tokens#addToken(Position, int, int) 1810 */ 1811 public final void addToken( Position position, 1812 int startIndex, 1813 int endIndex ) { 1814 addToken(position, startIndex, endIndex, 0); 1815 } 1816 1817 /** 1818 * @return tokens 1819 */ 1820 public List<Token> getTokens() { 1821 return tokens; 1822 } 1823 } 1824 1825 public class CaseSensitiveTokenFactory extends TokenFactory { 1826 /** 1827 * {@inheritDoc} 1828 * 1829 * @see org.modeshape.common.text.TokenStream.TokenFactory#addToken(Position,int, int, int) 1830 */ 1831 public void addToken( Position position, 1832 int startIndex, 1833 int endIndex, 1834 int type ) { 1835 tokens.add(new CaseSensitiveToken(startIndex, endIndex, type, position)); 1836 } 1837 } 1838 1839 public class CaseInsensitiveTokenFactory extends TokenFactory { 1840 /** 1841 * {@inheritDoc} 1842 * 1843 * @see org.modeshape.common.text.TokenStream.TokenFactory#addToken(Position,int, int, int) 1844 */ 1845 public void addToken( Position position, 1846 int startIndex, 1847 int endIndex, 1848 int type ) { 1849 tokens.add(new CaseInsensitiveToken(startIndex, endIndex, type, position)); 1850 } 1851 } 1852 1853 /** 1854 * An implementation of {@link CharacterStream} that works with a single character array. 
1855 */ 1856 public static final class CharacterArrayStream implements CharacterStream { 1857 private final char[] content; 1858 private int lastIndex = -1; 1859 private final int maxIndex; 1860 private int lineNumber = 1; 1861 private int columnNumber = 0; 1862 private boolean nextCharMayBeLineFeed; 1863 1864 public CharacterArrayStream( char[] content ) { 1865 this.content = content; 1866 this.maxIndex = content.length - 1; 1867 } 1868 1869 /** 1870 * {@inheritDoc} 1871 * 1872 * @see org.modeshape.common.text.TokenStream.CharacterStream#hasNext() 1873 */ 1874 public boolean hasNext() { 1875 return lastIndex < maxIndex; 1876 } 1877 1878 /** 1879 * {@inheritDoc} 1880 * 1881 * @see org.modeshape.common.text.TokenStream.CharacterStream#index() 1882 */ 1883 public int index() { 1884 return lastIndex; 1885 } 1886 1887 /** 1888 * {@inheritDoc} 1889 * 1890 * @param startIndex 1891 * @return the position of the token. never null 1892 * @see org.modeshape.common.text.TokenStream.CharacterStream#position(int) 1893 */ 1894 public Position position( int startIndex ) { 1895 return new Position(startIndex, lineNumber, columnNumber); 1896 } 1897 1898 /** 1899 * {@inheritDoc} 1900 * 1901 * @see org.modeshape.common.text.TokenStream.CharacterStream#next() 1902 */ 1903 public char next() { 1904 if (lastIndex >= maxIndex) { 1905 throw new NoSuchElementException(); 1906 } 1907 char result = content[++lastIndex]; 1908 ++columnNumber; 1909 if (result == '\r') { 1910 nextCharMayBeLineFeed = true; 1911 ++lineNumber; 1912 columnNumber = 0; 1913 } else if (result == '\n') { 1914 if (!nextCharMayBeLineFeed) ++lineNumber; 1915 columnNumber = 0; 1916 } else if (nextCharMayBeLineFeed) { 1917 nextCharMayBeLineFeed = false; 1918 } 1919 return result; 1920 } 1921 1922 /** 1923 * {@inheritDoc} 1924 * 1925 * @see org.modeshape.common.text.TokenStream.CharacterStream#isNext(char) 1926 */ 1927 public boolean isNext( char c ) { 1928 int nextIndex = lastIndex + 1; 1929 return nextIndex <= maxIndex && 
content[nextIndex] == c; 1930 } 1931 1932 /** 1933 * {@inheritDoc} 1934 * 1935 * @see org.modeshape.common.text.TokenStream.CharacterStream#isNext(char, char) 1936 */ 1937 public boolean isNext( char nextChar1, 1938 char nextChar2 ) { 1939 int nextIndex1 = lastIndex + 1; 1940 int nextIndex2 = lastIndex + 2; 1941 return nextIndex2 <= maxIndex && content[nextIndex1] == nextChar1 && content[nextIndex2] == nextChar2; 1942 } 1943 1944 /** 1945 * {@inheritDoc} 1946 * 1947 * @see org.modeshape.common.text.TokenStream.CharacterStream#isNext(char, char, char) 1948 */ 1949 public boolean isNext( char nextChar1, 1950 char nextChar2, 1951 char nextChar3 ) { 1952 int nextIndex1 = lastIndex + 1; 1953 int nextIndex2 = lastIndex + 2; 1954 int nextIndex3 = lastIndex + 3; 1955 return nextIndex3 <= maxIndex && content[nextIndex1] == nextChar1 && content[nextIndex2] == nextChar2 1956 && content[nextIndex3] == nextChar3; 1957 } 1958 1959 /** 1960 * {@inheritDoc} 1961 * 1962 * @see org.modeshape.common.text.TokenStream.CharacterStream#isNextAnyOf(char[]) 1963 */ 1964 public boolean isNextAnyOf( char[] characters ) { 1965 int nextIndex = lastIndex + 1; 1966 if (nextIndex <= maxIndex) { 1967 char nextChar = content[lastIndex + 1]; 1968 for (char c : characters) { 1969 if (c == nextChar) return true; 1970 } 1971 } 1972 return false; 1973 } 1974 1975 /** 1976 * {@inheritDoc} 1977 * 1978 * @see org.modeshape.common.text.TokenStream.CharacterStream#isNextAnyOf(java.lang.String) 1979 */ 1980 public boolean isNextAnyOf( String characters ) { 1981 int nextIndex = lastIndex + 1; 1982 if (nextIndex <= maxIndex) { 1983 char nextChar = content[lastIndex + 1]; 1984 if (characters.indexOf(nextChar) != -1) return true; 1985 } 1986 return false; 1987 } 1988 1989 /** 1990 * {@inheritDoc} 1991 * 1992 * @see org.modeshape.common.text.TokenStream.CharacterStream#isNextWhitespace() 1993 */ 1994 public boolean isNextWhitespace() { 1995 int nextIndex = lastIndex + 1; 1996 return nextIndex <= maxIndex && 
Character.isWhitespace(content[nextIndex]); 1997 } 1998 1999 /** 2000 * {@inheritDoc} 2001 * 2002 * @see org.modeshape.common.text.TokenStream.CharacterStream#isNextLetterOrDigit() 2003 */ 2004 public boolean isNextLetterOrDigit() { 2005 int nextIndex = lastIndex + 1; 2006 return nextIndex <= maxIndex && Character.isLetterOrDigit(content[nextIndex]); 2007 } 2008 2009 /** 2010 * {@inheritDoc} 2011 * 2012 * @see org.modeshape.common.text.TokenStream.CharacterStream#isNextValidXmlCharacter() 2013 */ 2014 public boolean isNextValidXmlCharacter() { 2015 int nextIndex = lastIndex + 1; 2016 return nextIndex <= maxIndex && XmlCharacters.isValid(content[nextIndex]); 2017 } 2018 2019 /** 2020 * {@inheritDoc} 2021 * 2022 * @see org.modeshape.common.text.TokenStream.CharacterStream#isNextValidXmlNameCharacter() 2023 */ 2024 public boolean isNextValidXmlNameCharacter() { 2025 int nextIndex = lastIndex + 1; 2026 return nextIndex <= maxIndex && XmlCharacters.isValidName(content[nextIndex]); 2027 } 2028 2029 /** 2030 * {@inheritDoc} 2031 * 2032 * @see org.modeshape.common.text.TokenStream.CharacterStream#isNextValidXmlNcNameCharacter() 2033 */ 2034 public boolean isNextValidXmlNcNameCharacter() { 2035 int nextIndex = lastIndex + 1; 2036 return nextIndex <= maxIndex && XmlCharacters.isValidNcName(content[nextIndex]); 2037 } 2038 } 2039 2040 /** 2041 * Obtain a basic {@link Tokenizer} implementation that ignores whitespace but includes tokens for individual symbols, the 2042 * period ('.'), single-quoted strings, double-quoted strings, whitespace-delimited words, and optionally comments. 2043 * <p> 2044 * Note that the resulting Tokenizer may not be appropriate in many situations, but is provided merely as a convenience for 2045 * those situations that happen to be able to use it. 
     * </p>
     *
     * @param includeComments true if the comments should be retained and be included in the token stream, or false if comments
     *        should be stripped and not included in the token stream
     * @return the tokenizer; never null
     */
    public static BasicTokenizer basicTokenizer( boolean includeComments ) {
        return new BasicTokenizer(includeComments);
    }

    /**
     * A basic {@link Tokenizer} implementation that ignores whitespace but includes tokens for individual symbols, the period
     * ('.'), single-quoted strings, double-quoted strings, whitespace-delimited words, and optionally comments.
     * <p>
     * Note this Tokenizer may not be appropriate in many situations, but is provided merely as a convenience for those
     * situations that happen to be able to use it.
     * </p>
     */
    public static class BasicTokenizer implements Tokenizer {
        /**
         * The {@link Token#type() token type} for tokens that represent an unquoted string containing a character sequence made
         * up of non-whitespace and non-symbol characters.
         */
        public static final int WORD = 1;
        /**
         * The {@link Token#type() token type} for tokens that consist of an individual "symbol" character. The set of characters
         * includes: <code>-(){}*,;+%?$[]!&lt;&gt;|=:</code>
         */
        public static final int SYMBOL = 2;
        /**
         * The {@link Token#type() token type} for tokens that consist of an individual '.' character.
         */
        public static final int DECIMAL = 4;
        /**
         * The {@link Token#type() token type} for tokens that consist of all the characters within single-quotes. Single quote
         * characters are included if they are preceded (escaped) by a '\' character.
         */
        public static final int SINGLE_QUOTED_STRING = 8;
        /**
         * The {@link Token#type() token type} for tokens that consist of all the characters within double-quotes. Double quote
         * characters are included if they are preceded (escaped) by a '\' character.
         */
        public static final int DOUBLE_QUOTED_STRING = 16;
        /**
         * The {@link Token#type() token type} for tokens that consist of all the characters between "/*" and "&#42;/" or between
         * "//" and the next line terminator (e.g., '\n', '\r' or "\r\n").
         */
        public static final int COMMENT = 32;

        // Whether comment tokens should be emitted into the token stream (true) or silently discarded (false)
        private final boolean useComments;

        protected BasicTokenizer( boolean useComments ) {
            this.useComments = useComments;
        }

        /**
         * {@inheritDoc}
         *
         * @see org.modeshape.common.text.TokenStream.Tokenizer#tokenize(CharacterStream, Tokens)
         */
        public void tokenize( CharacterStream input,
                              Tokens tokens ) throws ParsingException {
            while (input.hasNext()) {
                char c = input.next();
                switch (c) {
                    case ' ':
                    case '\t':
                    case '\n':
                    case '\r':
                        // Just skip these whitespace characters ...
                        break;
                    case '-':
                    case '(':
                    case ')':
                    case '{':
                    case '}':
                    case '*':
                    case ',':
                    case ';':
                    case '+':
                    case '%':
                    case '?':
                    case '$':
                    case '[':
                    case ']':
                    case '!':
                    case '<':
                    case '>':
                    case '|':
                    case '=':
                    case ':':
                        // Each symbol character becomes its own single-character SYMBOL token
                        tokens.addToken(input.position(input.index()), input.index(), input.index() + 1, SYMBOL);
                        break;
                    case '.':
                        tokens.addToken(input.position(input.index()), input.index(), input.index() + 1, DECIMAL);
                        break;
                    case '\"':
                        // Read to the matching close quote; the token includes both quote characters
                        int startIndex = input.index();
                        Position startingPosition = input.position(startIndex);
                        boolean foundClosingQuote = false;
                        while (input.hasNext()) {
                            c = input.next();
                            // NOTE(review): a backslash immediately before a quote always escapes the quote, so a
                            // sequence like backslash-backslash-quote is NOT treated as a terminator — confirm intended
                            if (c == '\\' && input.isNext('"')) {
                                c = input.next(); // consume the " character since it is escaped
                            } else if (c == '"') {
                                foundClosingQuote = true;
                                break;
                            }
                        }
                        if (!foundClosingQuote) {
                            String msg = CommonI18n.noMatchingDoubleQuoteFound.text(startingPosition.getLine(),
                                                                                    startingPosition.getColumn());
                            throw new ParsingException(startingPosition, msg);
                        }
                        int endIndex = input.index() + 1; // beyond last character read
                        tokens.addToken(startingPosition, startIndex, endIndex, DOUBLE_QUOTED_STRING);
                        break;
                    case '\'':
                        // Same algorithm as the double-quote case, for single-quoted strings
                        startIndex = input.index();
                        startingPosition = input.position(startIndex);
                        foundClosingQuote = false;
                        while (input.hasNext()) {
                            c = input.next();
                            if (c == '\\' && input.isNext('\'')) {
                                c = input.next(); // consume the ' character since it is escaped
                            } else if (c == '\'') {
                                foundClosingQuote = true;
                                break;
                            }
                        }
                        if (!foundClosingQuote) {
                            String msg = CommonI18n.noMatchingSingleQuoteFound.text(startingPosition.getLine(),
                                                                                    startingPosition.getColumn());
                            throw new ParsingException(startingPosition, msg);
                        }
                        endIndex = input.index() + 1; // beyond last character read
                        tokens.addToken(startingPosition, startIndex, endIndex, SINGLE_QUOTED_STRING);
                        break;
                    case '/':
                        // Could be a line comment ("//..."), a block comment ("/*...*/"), or a bare '/' symbol
                        startIndex = input.index();
                        startingPosition = input.position(startIndex);
                        if (input.isNext('/')) {
                            // End-of-line comment ...
                            boolean foundLineTerminator = false;
                            while (input.hasNext()) {
                                c = input.next();
                                if (c == '\n' || c == '\r') {
                                    foundLineTerminator = true;
                                    break;
                                }
                            }
                            endIndex = input.index(); // the token won't include the '\n' or '\r' character(s)
                            if (!foundLineTerminator) ++endIndex; // must point beyond last char
                            if (c == '\r' && input.isNext('\n')) input.next(); // consume the '\n' of a "\r\n" pair
                            if (useComments) {
                                tokens.addToken(startingPosition, startIndex, endIndex, COMMENT);
                            }
                        } else if (input.isNext('*')) {
                            // Multi-line comment ... (an unterminated comment simply runs to the end of the input)
                            while (input.hasNext() && !input.isNext('*', '/')) {
                                c = input.next();
                            }
                            if (input.hasNext()) input.next(); // consume the '*'
                            if (input.hasNext()) input.next(); // consume the '/'
                            if (useComments) {
                                endIndex = input.index() + 1; // the token will include the '/' and '*' characters
                                tokens.addToken(startingPosition, startIndex, endIndex, COMMENT);
                            }
                        } else {
                            // just a regular slash ...
                            tokens.addToken(startingPosition, startIndex, startIndex + 1, SYMBOL);
                        }
                        break;
                    default:
                        // Anything else begins a WORD token
                        startIndex = input.index();
                        startingPosition = input.position(startIndex);
                        // Read until another whitespace/symbol/decimal/slash is found
                        while (input.hasNext() && !(input.isNextWhitespace() || input.isNextAnyOf("/.-(){}*,;+%?$[]!<>|=:"))) {
                            c = input.next();
                        }
                        endIndex = input.index() + 1; // beyond last character that was included
                        tokens.addToken(startingPosition, startIndex, endIndex, WORD);
                }
            }
        }
    }
}