1 /*
2 * ModeShape (http://www.modeshape.org)
3 * See the COPYRIGHT.txt file distributed with this work for information
4 * regarding copyright ownership. Some portions may be licensed
5 * to Red Hat, Inc. under one or more contributor license agreements.
6 * See the AUTHORS.txt file in the distribution for a full listing of
7 * individual contributors.
8 *
9 * ModeShape is free software. Unless otherwise indicated, all code in ModeShape
10 * is licensed to you under the terms of the GNU Lesser General Public License as
11 * published by the Free Software Foundation; either version 2.1 of
12 * the License, or (at your option) any later version.
13 *
14 * ModeShape is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this software; if not, write to the Free
21 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
22 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
23 */
24 package org.modeshape.sequencer.ddl;
25
26 import java.util.ArrayList;
27 import java.util.Arrays;
28 import java.util.HashSet;
29 import java.util.List;
30 import java.util.Set;
31 import org.modeshape.common.CommonI18n;
32 import org.modeshape.common.text.ParsingException;
33 import org.modeshape.common.text.Position;
34 import org.modeshape.common.text.TokenStream;
35
36 /**
37 * A TokenStream implementation designed around requirements for tokenizing and parsing DDL statements.
38 * <p>
39 * Because of the complexity of DDL, it was necessary to extend {@link TokenStream} in order to override the basic tokenizer to
40 * tokenize the in-line comments prefixed with "--". In addition, because there is not a default ddl command (or statement)
41 * terminator, an override method was added to {@link TokenStream} to allow re-tokenizing the initial tokens to re-type the
42 * tokens, remove tokens, or any other operation to simplify parsing.
43 * </p>
44 * <p>
 * Both reserved words (key words) and statement start phrases can be registered before the {@link TokenStream}'s
 * <code>start()</code> method is called. Any resulting tokens that match the registered string values are re-typed to identify
 * them as key words ({@link DdlTokenizer#KEYWORD}) or statement start phrases ({@link DdlTokenizer#STATEMENT_KEY}).
48 * </p>
49 */
50 public class DdlTokenStream extends TokenStream {
51
52 protected List<String[]> registeredStatementStartPhrases = new ArrayList<String[]>();
53
54 protected Set<String> registeredKeyWords = new HashSet<String>();
55
56 private Position currentMarkedPosition = Position.EMPTY_CONTENT_POSITION;
57
58 /**
59 * {@inheritDoc}
60 *
61 * @see org.modeshape.common.text.TokenStream#initializeTokens(java.util.List)
62 */
63 @Override
64 protected List<Token> initializeTokens( List<Token> tokens ) {
65 // THIS IS WHERE WE DO THE WORK OF PRE-PARSING TOKENS AND REPLACING KEYWORDS AND STATEMENT STARTS WITH
66 // APPLICABLE TOKEN TYPE BITMASK VALUES
67 // MyClass[] array = (MyClass[])list.toArray(new MyClass[list.size()]);
68
69 Token[] tokensArray = tokens.toArray(new Token[tokens.size()]);
70 List<Token> reTypedTokens = new ArrayList<Token>(tokens.size());
71
72 for (int i = 0; i < tokensArray.length; i++) {
73 boolean isStatementStart = false;
74 if (isKeyWord(tokensArray[i].value())) {
75 Token retypedToken = tokensArray[i].withType(DdlTokenizer.KEYWORD);
76 // Now we check to see if this keyword begins a registered statement start
77
78 // Keep track of token increment (# of tokens for a phrase)
79 // Need to increment iterator (i) in case phrases like "ALTER ROLLBACK" appear. ROLLBACK is also a statement
80 // start phrase and we need to walk ignore ROLLBACK in this case.
81 int tokenIncrement = 0;
82 for (String[] nextStmtStart : registeredStatementStartPhrases) {
83 boolean matches = true;
84
85 for (int j = 0; j < nextStmtStart.length; j++) {
86 if (matches) {
87 matches = nextStmtStart[j].equalsIgnoreCase(tokensArray[i + j].value())
88 || nextStmtStart[j].equals(ANY_VALUE);
89 }
90 }
91 if (matches) {
92 isStatementStart = true;
93 tokenIncrement = nextStmtStart.length - 1;
94 break;
95 }
96 }
97 if (isStatementStart) {
98 retypedToken = retypedToken.withType(DdlTokenizer.STATEMENT_KEY);
99 }
100 reTypedTokens.add(retypedToken);
101
102 if (isStatementStart) {
103 // Copy any additional tokens used in the phrase
104 for (int k = 0; k < tokenIncrement; k++) {
105 i++;
106 reTypedTokens.add(tokensArray[i]);
107 }
108 }
109 } else {
110 reTypedTokens.add(tokensArray[i]);
111 }
112
113 }
114
115 return reTypedTokens;
116 }
117
/**
 * Creates a DDL token stream. All three arguments are handed unchanged to the {@link TokenStream} constructor.
 *
 * @param content the content handed to {@link TokenStream}
 * @param tokenizer the tokenizer handed to {@link TokenStream}
 * @param caseSensitive the case-sensitivity flag handed to {@link TokenStream}
 */
public DdlTokenStream( String content, Tokenizer tokenizer, boolean caseSensitive ) {
    super(content, tokenizer, caseSensitive);
}
128
129 /**
130 * Register a phrase representing the start of a DDL statement
131 * <p>
132 * Examples would be: {"CREATE", "TABLE"} {"CREATE", "OR", "REPLACE", "VIEW"}
133 * </p>
134 * see {@link DdlConstants} for the default SQL 92 representations.
135 *
136 * @param phrase
137 */
138 public void registerStatementStartPhrase( String[] phrase ) {
139 registeredStatementStartPhrases.add(phrase);
140 }
141
142 public void registerStatementStartPhrase( String[][] phrases ) {
143 for (String[] phrase : phrases) {
144 registeredStatementStartPhrases.add(phrase);
145 }
146 }
147
148 /**
149 * Register a single key word.
150 *
151 * @param keyWord
152 */
153 public void registerKeyWord( String keyWord ) {
154 registeredKeyWords.add(keyWord);
155 }
156
157 /**
158 * Register an {@link List} of key words.
159 *
160 * @param keyWords
161 */
162 public void registerKeyWords( List<String> keyWords ) {
163 registeredKeyWords.addAll(keyWords);
164 }
165
166 /**
167 * Register an array of key words.
168 *
169 * @param keyWords
170 */
171 public void registerKeyWords( String[] keyWords ) {
172 registeredKeyWords.addAll(Arrays.asList(keyWords));
173 }
174
175 /**
176 * @param word
177 * @return is Key Word
178 */
179 protected boolean isKeyWord( String word ) {
180 return registeredKeyWords.contains(word.toUpperCase());
181 }
182
183 /**
184 * Method to determine if the next token is of type {@link DdlTokenizer} KEYWORD.
185 *
186 * @return is Key Word
187 */
188 public boolean isNextKeyWord() {
189 return this.matches(DdlTokenizer.KEYWORD);
190 }
191
192 /**
193 * Method to determine if next tokens match a registered statement start phrase.
194 *
195 * @return true if next tokens match a registered statement start phrase
196 */
197 public boolean isNextStatementStart() {
198 boolean result = false;
199
200 if (isNextKeyWord()) {
201 for (String[] nextStmtStart : registeredStatementStartPhrases) {
202 if (this.matches(nextStmtStart)) {
203 return true;
204 }
205 }
206 }
207
208 return result;
209 }
210
211 /**
212 * Marks the current position (line & column number) of the currentToken
213 */
214 public void mark() {
215 if (this.hasNext()) {
216 currentMarkedPosition = this.nextPosition();
217 } else {
218 currentMarkedPosition = null;
219 }
220
221 }
222
223 /**
224 * Returns the string content for characters bounded by the previous marked position and the position of the currentToken
225 * (inclusive). Method also marks() the new position the the currentToken.
226 *
227 * @return the string content for characters bounded by the previous marked position and the position of the currentToken
228 * (inclusive).
229 */
230 public String getMarkedContent() {
231 Position startPosition = new Position(currentMarkedPosition.getIndexInContent(), currentMarkedPosition.getLine(),
232 currentMarkedPosition.getColumn());
233
234 mark();
235
236 return getContentBetween(startPosition, currentMarkedPosition);
237 }
238
239 /**
240 * Obtain a ddl {@link DdlTokenizer} implementation that ignores whitespace but includes tokens for individual symbols, the
241 * period ('.'), single-quoted strings, double-quoted strings, whitespace-delimited words, and optionally comments.
242 * <p>
243 * Note that the resulting Tokenizer may not be appropriate in many situations, but is provided merely as a convenience for
244 * those situations that happen to be able to use it.
245 * </p>
246 *
247 * @param includeComments true if the comments should be retained and be included in the token stream, or false if comments
248 * should be stripped and not included in the token stream
249 * @return the tokenizer; never null
250 */
251 public static DdlTokenizer ddlTokenizer( boolean includeComments ) {
252 return new DdlTokenizer(includeComments);
253 }
254
255 public static class DdlTokenizer implements Tokenizer {
256 public static final String PARSER_ID = "PARSER_ID";
257
258 /**
259 * The {@link TokenStream.Token#type() token type} for tokens that represent an unquoted string containing a character
260 * sequence made up of non-whitespace and non-symbol characters.
261 */
262 public static final int WORD = 1;
263 /**
264 * The {@link TokenStream.Token#type() token type} for tokens that consist of an individual "symbol" character. The set of
265 * characters includes: <code>-(){}*,;+%?$[]!<>|=:</code>
266 */
267 public static final int SYMBOL = 2;
268 /**
269 * The {@link TokenStream.Token#type() token type} for tokens that consist of an individual '.' character.
270 */
271 public static final int DECIMAL = 4;
272 /**
273 * The {@link TokenStream.Token#type() token type} for tokens that consist of all the characters within single-quotes.
274 * Single quote characters are included if they are preceded (escaped) by a '\' character.
275 */
276 public static final int SINGLE_QUOTED_STRING = 8;
277 /**
278 * The {@link TokenStream.Token#type() token type} for tokens that consist of all the characters within double-quotes.
279 * Double quote characters are included if they are preceded (escaped) by a '\' character.
280 */
281 public static final int DOUBLE_QUOTED_STRING = 16;
282 /**
283 * The {@link TokenStream.Token#type() token type} for tokens that consist of all the characters between "/*" and
284 * "*/", between "//" and the next line terminator (e.g., '\n', '\r' or "\r\n"), or between "--" and the next line
285 * terminator (e.g., '\n', '\r' or "\r\n").
286 */
287 public static final int COMMENT = 32;
288
289 private final boolean useComments;
290
291 /**
292 * The {@link TokenStream.Token#type() token type} for tokens that represent key words or reserved words for a given DDL
293 * dialect.
294 * <p>
295 * Examples would be: "CREATE", "TABLE", "ALTER", "SCHEMA", "DROP", etc...
296 * </p>
297 * see {@link DdlConstants} for the default SQL 92 representations.
298 */
299 public static final int KEYWORD = 64;
300
301 /**
302 * The {@link TokenStream.Token#type() token type} for tokens that represent the start of a DDL statement.
303 * <p>
304 * Examples would be: {"CREATE", "TABLE"} {"CREATE", "OR", "REPLACE", "VIEW"}
305 * </p>
306 * see {@link DdlConstants} for the default SQL 92 representations.
307 */
308 public static final int STATEMENT_KEY = 128;
309
/**
 * Creates a tokenizer.
 *
 * @param useComments true if comment tokens should be added to the token stream, or false if comments should be dropped
 */
public DdlTokenizer( final boolean useComments ) {
    this.useComments = useComments;
}
313
314 /**
315 * @return useComments
316 */
317 public boolean includeComments() {
318 return useComments;
319 }
320
321 /**
322 * {@inheritDoc}
323 *
324 * @see org.modeshape.common.text.TokenStream.Tokenizer#tokenize(CharacterStream, Tokens)
325 */
326 public void tokenize( CharacterStream input,
327 Tokens tokens ) throws ParsingException {
328 int startIndex;
329 int endIndex;
330 while (input.hasNext()) {
331 char c = input.next();
332 switch (c) {
333 case ' ':
334 case '\t':
335 case '\n':
336 case '\r':
337 // Just skip these whitespace characters ...
338 break;
339 // ==============================================================================================
340 // DDL Comments token = "--"
341 // ==============================================================================================
342 case '-': {
343 startIndex = input.index();
344 Position startPosition = input.position(startIndex);
345 if (input.isNext('-')) {
346 // -- END OF LINE comment ...
347 boolean foundLineTerminator = false;
348 while (input.hasNext()) {
349 c = input.next();
350 if (c == '\n' || c == '\r') {
351 foundLineTerminator = true;
352 break;
353 }
354 }
355 endIndex = input.index(); // the token won't include the '\n' or '\r' character(s)
356 if (!foundLineTerminator) ++endIndex; // must point beyond last char
357 if (c == '\r' && input.isNext('\n')) input.next();
358
359 // Check for PARSER_ID
360
361 if (useComments) {
362 tokens.addToken(startPosition, startIndex, endIndex, COMMENT);
363 }
364
365 } else {
366 // just a regular dash ...
367 tokens.addToken(startPosition, startIndex, startIndex + 1, SYMBOL);
368 }
369 break;
370 }
371 // ==============================================================================================
372 case '(':
373 case ')':
374 case '{':
375 case '}':
376 case '*':
377 case ',':
378 case ';':
379 case '+':
380 case '%':
381 case '?':
382 case '$':
383 case '[':
384 case ']':
385 case '!':
386 case '<':
387 case '>':
388 case '|':
389 case '=':
390 case ':':
391 tokens.addToken(input.position(input.index()), input.index(), input.index() + 1, SYMBOL);
392 break;
393 case '.':
394 tokens.addToken(input.position(input.index()), input.index(), input.index() + 1, DECIMAL);
395 break;
396 case '\"':
397 startIndex = input.index();
398 Position startingPosition = input.position(startIndex);
399 boolean foundClosingQuote = false;
400 while (input.hasNext()) {
401 c = input.next();
402 if (c == '\\' && input.isNext('"')) {
403 c = input.next(); // consume the ' character since it is escaped
404 } else if (c == '"') {
405 foundClosingQuote = true;
406 break;
407 }
408 }
409 if (!foundClosingQuote) {
410 String msg = CommonI18n.noMatchingDoubleQuoteFound.text(startingPosition.getLine(),
411 startingPosition.getColumn());
412 throw new ParsingException(startingPosition, msg);
413 }
414 endIndex = input.index() + 1; // beyond last character read
415 tokens.addToken(startingPosition, startIndex, endIndex, DOUBLE_QUOTED_STRING);
416 break;
417 case '\u2019': // '’':
418 case '\'':
419 char quoteChar = c;
420 startIndex = input.index();
421 startingPosition = input.position(startIndex);
422 foundClosingQuote = false;
423 while (input.hasNext()) {
424 c = input.next();
425 if (c == '\\' && input.isNext(quoteChar)) {
426 c = input.next(); // consume the ' character since it is escaped
427 } else if (c == quoteChar) {
428 foundClosingQuote = true;
429 break;
430 }
431 }
432 if (!foundClosingQuote) {
433 String msg = CommonI18n.noMatchingSingleQuoteFound.text(startingPosition.getLine(),
434 startingPosition.getColumn());
435 throw new ParsingException(startingPosition, msg);
436 }
437 endIndex = input.index() + 1; // beyond last character read
438 tokens.addToken(startingPosition, startIndex, endIndex, SINGLE_QUOTED_STRING);
439 break;
440 case '/':
441 startIndex = input.index();
442 startingPosition = input.position(startIndex);
443 if (input.isNext('/')) {
444 // End-of-line comment ...
445 boolean foundLineTerminator = false;
446 while (input.hasNext()) {
447 c = input.next();
448 if (c == '\n' || c == '\r') {
449 foundLineTerminator = true;
450 break;
451 }
452 }
453 endIndex = input.index(); // the token won't include the '\n' or '\r' character(s)
454 if (!foundLineTerminator) ++endIndex; // must point beyond last char
455 if (c == '\r' && input.isNext('\n')) input.next();
456 if (useComments) {
457 tokens.addToken(startingPosition, startIndex, endIndex, COMMENT);
458 }
459
460 } else if (input.isNext('*')) {
461 // Multi-line comment ...
462 while (input.hasNext() && !input.isNext('*', '/')) {
463 c = input.next();
464 }
465 if (input.hasNext()) input.next(); // consume the '*'
466 if (input.hasNext()) input.next(); // consume the '/'
467
468 endIndex = input.index() + 1; // the token will include the '/' and '*' characters
469 if (useComments) {
470 tokens.addToken(startingPosition, startIndex, endIndex, COMMENT);
471 }
472
473 } else {
474 // just a regular slash ...
475 tokens.addToken(startingPosition, startIndex, startIndex + 1, SYMBOL);
476 }
477 break;
478 default:
479 startIndex = input.index();
480 Position startPosition = input.position(startIndex);
481 // Read until another whitespace/symbol/decimal/slash is found
482 while (input.hasNext() && !(input.isNextWhitespace() || input.isNextAnyOf("/.-(){}*,;+%?$[]!<>|=:"))) {
483 c = input.next();
484 }
485 endIndex = input.index() + 1; // beyond last character that was included
486 tokens.addToken(startPosition, startIndex, endIndex, WORD);
487 }
488 }
489 }
490 }
491 }