/*
 * ModeShape (http://www.modeshape.org)
 * See the COPYRIGHT.txt file distributed with this work for information
 * regarding copyright ownership.  Some portions may be licensed
 * to Red Hat, Inc. under one or more contributor license agreements.
 * See the AUTHORS.txt file in the distribution for a full listing of
 * individual contributors.
 *
 * ModeShape is free software. Unless otherwise indicated, all code in ModeShape
 * is licensed to you under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * ModeShape is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this software; if not, write to the Free
 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
 */
24  package org.modeshape.sequencer.ddl;
25  
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import org.modeshape.common.CommonI18n;
import org.modeshape.common.text.ParsingException;
import org.modeshape.common.text.Position;
import org.modeshape.common.text.TokenStream;
35  
36  /**
37   * A TokenStream implementation designed around requirements for tokenizing and parsing DDL statements.
38   * <p>
39   * Because of the complexity of DDL, it was necessary to extend {@link TokenStream} in order to override the basic tokenizer to
40   * tokenize the in-line comments prefixed with "--". In addition, because there is not a default ddl command (or statement)
41   * terminator, an override method was added to {@link TokenStream} to allow re-tokenizing the initial tokens to re-type the
42   * tokens, remove tokens, or any other operation to simplify parsing.
43   * </p>
44   * <p>
45   * In this case, both reserved words (or key words) and statement start phrases can be registered prior to the {@link TokenStream}
46   * 's start() method. Any resulting tokens that match the registered string values will be re-typed to identify them as key words
47   * (DdlTokenizer.KEYWORD) or statement start phrases (DdlTokenizer.STATEMENT_KEY).
48   * </p>
49   */
50  public class DdlTokenStream extends TokenStream {
51  
52      protected List<String[]> registeredStatementStartPhrases = new ArrayList<String[]>();
53  
54      protected Set<String> registeredKeyWords = new HashSet<String>();
55  
56      private Position currentMarkedPosition = Position.EMPTY_CONTENT_POSITION;
57  
58      /**
59       * {@inheritDoc}
60       * 
61       * @see org.modeshape.common.text.TokenStream#initializeTokens(java.util.List)
62       */
63      @Override
64      protected List<Token> initializeTokens( List<Token> tokens ) {
65          // THIS IS WHERE WE DO THE WORK OF PRE-PARSING TOKENS AND REPLACING KEYWORDS AND STATEMENT STARTS WITH
66          // APPLICABLE TOKEN TYPE BITMASK VALUES
67          // MyClass[] array = (MyClass[])list.toArray(new MyClass[list.size()]);
68  
69          Token[] tokensArray = tokens.toArray(new Token[tokens.size()]);
70          List<Token> reTypedTokens = new ArrayList<Token>(tokens.size());
71  
72          for (int i = 0; i < tokensArray.length; i++) {
73              boolean isStatementStart = false;
74              if (isKeyWord(tokensArray[i].value())) {
75                  Token retypedToken = tokensArray[i].withType(DdlTokenizer.KEYWORD);
76                  // Now we check to see if this keyword begins a registered statement start
77  
78                  // Keep track of token increment (# of tokens for a phrase)
79                  // Need to increment iterator (i) in case phrases like "ALTER ROLLBACK" appear. ROLLBACK is also a statement
80                  // start phrase and we need to walk ignore ROLLBACK in this case.
81                  int tokenIncrement = 0;
82                  for (String[] nextStmtStart : registeredStatementStartPhrases) {
83                      boolean matches = true;
84  
85                      for (int j = 0; j < nextStmtStart.length; j++) {
86                          if (matches) {
87                              matches = nextStmtStart[j].equalsIgnoreCase(tokensArray[i + j].value())
88                                        || nextStmtStart[j].equals(ANY_VALUE);
89                          }
90                      }
91                      if (matches) {
92                          isStatementStart = true;
93                          tokenIncrement = nextStmtStart.length - 1;
94                          break;
95                      }
96                  }
97                  if (isStatementStart) {
98                      retypedToken = retypedToken.withType(DdlTokenizer.STATEMENT_KEY);
99                  }
100                 reTypedTokens.add(retypedToken);
101 
102                 if (isStatementStart) {
103                     // Copy any additional tokens used in the phrase
104                     for (int k = 0; k < tokenIncrement; k++) {
105                         i++;
106                         reTypedTokens.add(tokensArray[i]);
107                     }
108                 }
109             } else {
110                 reTypedTokens.add(tokensArray[i]);
111             }
112 
113         }
114 
115         return reTypedTokens;
116     }
117 
118     /**
119      * @param content
120      * @param tokenizer
121      * @param caseSensitive
122      */
123     public DdlTokenStream( String content,
124                            Tokenizer tokenizer,
125                            boolean caseSensitive ) {
126         super(content, tokenizer, caseSensitive);
127     }
128 
129     /**
130      * Register a phrase representing the start of a DDL statement
131      * <p>
132      * Examples would be: {"CREATE", "TABLE"} {"CREATE", "OR", "REPLACE", "VIEW"}
133      * </p>
134      * see {@link DdlConstants} for the default SQL 92 representations.
135      * 
136      * @param phrase
137      */
138     public void registerStatementStartPhrase( String[] phrase ) {
139         registeredStatementStartPhrases.add(phrase);
140     }
141 
142     public void registerStatementStartPhrase( String[][] phrases ) {
143         for (String[] phrase : phrases) {
144             registeredStatementStartPhrases.add(phrase);
145         }
146     }
147 
148     /**
149      * Register a single key word.
150      * 
151      * @param keyWord
152      */
153     public void registerKeyWord( String keyWord ) {
154         registeredKeyWords.add(keyWord);
155     }
156 
157     /**
158      * Register an {@link List} of key words.
159      * 
160      * @param keyWords
161      */
162     public void registerKeyWords( List<String> keyWords ) {
163         registeredKeyWords.addAll(keyWords);
164     }
165 
166     /**
167      * Register an array of key words.
168      * 
169      * @param keyWords
170      */
171     public void registerKeyWords( String[] keyWords ) {
172         registeredKeyWords.addAll(Arrays.asList(keyWords));
173     }
174 
175     /**
176      * @param word
177      * @return is Key Word
178      */
179     protected boolean isKeyWord( String word ) {
180         return registeredKeyWords.contains(word.toUpperCase());
181     }
182 
183     /**
184      * Method to determine if the next token is of type {@link DdlTokenizer} KEYWORD.
185      * 
186      * @return is Key Word
187      */
188     public boolean isNextKeyWord() {
189         return this.matches(DdlTokenizer.KEYWORD);
190     }
191 
192     /**
193      * Method to determine if next tokens match a registered statement start phrase.
194      * 
195      * @return true if next tokens match a registered statement start phrase
196      */
197     public boolean isNextStatementStart() {
198         boolean result = false;
199 
200         if (isNextKeyWord()) {
201             for (String[] nextStmtStart : registeredStatementStartPhrases) {
202                 if (this.matches(nextStmtStart)) {
203                     return true;
204                 }
205             }
206         }
207 
208         return result;
209     }
210 
211     /**
212      * Marks the current position (line & column number) of the currentToken
213      */
214     public void mark() {
215         if (this.hasNext()) {
216             currentMarkedPosition = this.nextPosition();
217         } else {
218             currentMarkedPosition = null;
219         }
220 
221     }
222 
223     /**
224      * Returns the string content for characters bounded by the previous marked position and the position of the currentToken
225      * (inclusive). Method also marks() the new position the the currentToken.
226      * 
227      * @return the string content for characters bounded by the previous marked position and the position of the currentToken
228      *         (inclusive).
229      */
230     public String getMarkedContent() {
231         Position startPosition = new Position(currentMarkedPosition.getIndexInContent(), currentMarkedPosition.getLine(),
232                                               currentMarkedPosition.getColumn());
233 
234         mark();
235 
236         return getContentBetween(startPosition, currentMarkedPosition);
237     }
238 
239     /**
240      * Obtain a ddl {@link DdlTokenizer} implementation that ignores whitespace but includes tokens for individual symbols, the
241      * period ('.'), single-quoted strings, double-quoted strings, whitespace-delimited words, and optionally comments.
242      * <p>
243      * Note that the resulting Tokenizer may not be appropriate in many situations, but is provided merely as a convenience for
244      * those situations that happen to be able to use it.
245      * </p>
246      * 
247      * @param includeComments true if the comments should be retained and be included in the token stream, or false if comments
248      *        should be stripped and not included in the token stream
249      * @return the tokenizer; never null
250      */
251     public static DdlTokenizer ddlTokenizer( boolean includeComments ) {
252         return new DdlTokenizer(includeComments);
253     }
254 
255     public static class DdlTokenizer implements Tokenizer {
256         public static final String PARSER_ID = "PARSER_ID";
257 
258         /**
259          * The {@link TokenStream.Token#type() token type} for tokens that represent an unquoted string containing a character
260          * sequence made up of non-whitespace and non-symbol characters.
261          */
262         public static final int WORD = 1;
263         /**
264          * The {@link TokenStream.Token#type() token type} for tokens that consist of an individual "symbol" character. The set of
265          * characters includes: <code>-(){}*,;+%?$[]!<>|=:</code>
266          */
267         public static final int SYMBOL = 2;
268         /**
269          * The {@link TokenStream.Token#type() token type} for tokens that consist of an individual '.' character.
270          */
271         public static final int DECIMAL = 4;
272         /**
273          * The {@link TokenStream.Token#type() token type} for tokens that consist of all the characters within single-quotes.
274          * Single quote characters are included if they are preceded (escaped) by a '\' character.
275          */
276         public static final int SINGLE_QUOTED_STRING = 8;
277         /**
278          * The {@link TokenStream.Token#type() token type} for tokens that consist of all the characters within double-quotes.
279          * Double quote characters are included if they are preceded (escaped) by a '\' character.
280          */
281         public static final int DOUBLE_QUOTED_STRING = 16;
282         /**
283          * The {@link TokenStream.Token#type() token type} for tokens that consist of all the characters between "/*" and
284          * "&#42;/", between "//" and the next line terminator (e.g., '\n', '\r' or "\r\n"), or between "--" and the next line
285          * terminator (e.g., '\n', '\r' or "\r\n").
286          */
287         public static final int COMMENT = 32;
288 
289         private final boolean useComments;
290 
291         /**
292          * The {@link TokenStream.Token#type() token type} for tokens that represent key words or reserved words for a given DDL
293          * dialect.
294          * <p>
295          * Examples would be: "CREATE", "TABLE", "ALTER", "SCHEMA", "DROP", etc...
296          * </p>
297          * see {@link DdlConstants} for the default SQL 92 representations.
298          */
299         public static final int KEYWORD = 64;
300 
301         /**
302          * The {@link TokenStream.Token#type() token type} for tokens that represent the start of a DDL statement.
303          * <p>
304          * Examples would be: {"CREATE", "TABLE"} {"CREATE", "OR", "REPLACE", "VIEW"}
305          * </p>
306          * see {@link DdlConstants} for the default SQL 92 representations.
307          */
308         public static final int STATEMENT_KEY = 128;
309 
310         public DdlTokenizer( boolean useComments ) {
311             this.useComments = useComments;
312         }
313 
314         /**
315          * @return useComments
316          */
317         public boolean includeComments() {
318             return useComments;
319         }
320 
321         /**
322          * {@inheritDoc}
323          * 
324          * @see org.modeshape.common.text.TokenStream.Tokenizer#tokenize(CharacterStream, Tokens)
325          */
326         public void tokenize( CharacterStream input,
327                               Tokens tokens ) throws ParsingException {
328             int startIndex;
329             int endIndex;
330             while (input.hasNext()) {
331                 char c = input.next();
332                 switch (c) {
333                     case ' ':
334                     case '\t':
335                     case '\n':
336                     case '\r':
337                         // Just skip these whitespace characters ...
338                         break;
339                     // ==============================================================================================
340                     // DDL Comments token = "--"
341                     // ==============================================================================================
342                     case '-': {
343                         startIndex = input.index();
344                         Position startPosition = input.position(startIndex);
345                         if (input.isNext('-')) {
346                             // -- END OF LINE comment ...
347                             boolean foundLineTerminator = false;
348                             while (input.hasNext()) {
349                                 c = input.next();
350                                 if (c == '\n' || c == '\r') {
351                                     foundLineTerminator = true;
352                                     break;
353                                 }
354                             }
355                             endIndex = input.index(); // the token won't include the '\n' or '\r' character(s)
356                             if (!foundLineTerminator) ++endIndex; // must point beyond last char
357                             if (c == '\r' && input.isNext('\n')) input.next();
358 
359                             // Check for PARSER_ID
360 
361                             if (useComments) {
362                                 tokens.addToken(startPosition, startIndex, endIndex, COMMENT);
363                             }
364 
365                         } else {
366                             // just a regular dash ...
367                             tokens.addToken(startPosition, startIndex, startIndex + 1, SYMBOL);
368                         }
369                         break;
370                     }
371                         // ==============================================================================================
372                     case '(':
373                     case ')':
374                     case '{':
375                     case '}':
376                     case '*':
377                     case ',':
378                     case ';':
379                     case '+':
380                     case '%':
381                     case '?':
382                     case '$':
383                     case '[':
384                     case ']':
385                     case '!':
386                     case '<':
387                     case '>':
388                     case '|':
389                     case '=':
390                     case ':':
391                         tokens.addToken(input.position(input.index()), input.index(), input.index() + 1, SYMBOL);
392                         break;
393                     case '.':
394                         tokens.addToken(input.position(input.index()), input.index(), input.index() + 1, DECIMAL);
395                         break;
396                     case '\"':
397                         startIndex = input.index();
398                         Position startingPosition = input.position(startIndex);
399                         boolean foundClosingQuote = false;
400                         while (input.hasNext()) {
401                             c = input.next();
402                             if (c == '\\' && input.isNext('"')) {
403                                 c = input.next(); // consume the ' character since it is escaped
404                             } else if (c == '"') {
405                                 foundClosingQuote = true;
406                                 break;
407                             }
408                         }
409                         if (!foundClosingQuote) {
410                             String msg = CommonI18n.noMatchingDoubleQuoteFound.text(startingPosition.getLine(),
411                                                                                     startingPosition.getColumn());
412                             throw new ParsingException(startingPosition, msg);
413                         }
414                         endIndex = input.index() + 1; // beyond last character read
415                         tokens.addToken(startingPosition, startIndex, endIndex, DOUBLE_QUOTED_STRING);
416                         break;
417                     case '\u2019': // '’':
418                     case '\'':
419                         char quoteChar = c;
420                         startIndex = input.index();
421                         startingPosition = input.position(startIndex);
422                         foundClosingQuote = false;
423                         while (input.hasNext()) {
424                             c = input.next();
425                             if (c == '\\' && input.isNext(quoteChar)) {
426                                 c = input.next(); // consume the ' character since it is escaped
427                             } else if (c == quoteChar) {
428                                 foundClosingQuote = true;
429                                 break;
430                             }
431                         }
432                         if (!foundClosingQuote) {
433                             String msg = CommonI18n.noMatchingSingleQuoteFound.text(startingPosition.getLine(),
434                                                                                     startingPosition.getColumn());
435                             throw new ParsingException(startingPosition, msg);
436                         }
437                         endIndex = input.index() + 1; // beyond last character read
438                         tokens.addToken(startingPosition, startIndex, endIndex, SINGLE_QUOTED_STRING);
439                         break;
440                     case '/':
441                         startIndex = input.index();
442                         startingPosition = input.position(startIndex);
443                         if (input.isNext('/')) {
444                             // End-of-line comment ...
445                             boolean foundLineTerminator = false;
446                             while (input.hasNext()) {
447                                 c = input.next();
448                                 if (c == '\n' || c == '\r') {
449                                     foundLineTerminator = true;
450                                     break;
451                                 }
452                             }
453                             endIndex = input.index(); // the token won't include the '\n' or '\r' character(s)
454                             if (!foundLineTerminator) ++endIndex; // must point beyond last char
455                             if (c == '\r' && input.isNext('\n')) input.next();
456                             if (useComments) {
457                                 tokens.addToken(startingPosition, startIndex, endIndex, COMMENT);
458                             }
459 
460                         } else if (input.isNext('*')) {
461                             // Multi-line comment ...
462                             while (input.hasNext() && !input.isNext('*', '/')) {
463                                 c = input.next();
464                             }
465                             if (input.hasNext()) input.next(); // consume the '*'
466                             if (input.hasNext()) input.next(); // consume the '/'
467 
468                             endIndex = input.index() + 1; // the token will include the '/' and '*' characters
469                             if (useComments) {
470                                 tokens.addToken(startingPosition, startIndex, endIndex, COMMENT);
471                             }
472 
473                         } else {
474                             // just a regular slash ...
475                             tokens.addToken(startingPosition, startIndex, startIndex + 1, SYMBOL);
476                         }
477                         break;
478                     default:
479                         startIndex = input.index();
480                         Position startPosition = input.position(startIndex);
481                         // Read until another whitespace/symbol/decimal/slash is found
482                         while (input.hasNext() && !(input.isNextWhitespace() || input.isNextAnyOf("/.-(){}*,;+%?$[]!<>|=:"))) {
483                             c = input.next();
484                         }
485                         endIndex = input.index() + 1; // beyond last character that was included
486                         tokens.addToken(startPosition, startIndex, endIndex, WORD);
487                 }
488             }
489         }
490     }
491 }