001 /*
002 * JBoss DNA (http://www.jboss.org/dna)
003 * See the COPYRIGHT.txt file distributed with this work for information
004 * regarding copyright ownership. Some portions may be licensed
005 * to Red Hat, Inc. under one or more contributor license agreements.
006 * See the AUTHORS.txt file in the distribution for a full listing of
007 * individual contributors.
008 *
009 * JBoss DNA is free software. Unless otherwise indicated, all code in JBoss DNA
010 * is licensed to you under the terms of the GNU Lesser General Public License as
011 * published by the Free Software Foundation; either version 2.1 of
012 * the License, or (at your option) any later version.
013 *
014 * JBoss DNA is distributed in the hope that it will be useful,
015 * but WITHOUT ANY WARRANTY; without even the implied warranty of
016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
017 * Lesser General Public License for more details.
018 *
019 * You should have received a copy of the GNU Lesser General Public
020 * License along with this software; if not, write to the Free
021 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
022 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
023 */
024 package org.jboss.dna.graph.property;
025
026 import java.io.Serializable;
027 import java.util.regex.Pattern;
028 import java.util.regex.PatternSyntaxException;
029 import net.jcip.annotations.Immutable;
030 import org.jboss.dna.common.util.CheckArg;
031 import org.jboss.dna.common.util.HashCode;
032 import org.jboss.dna.graph.GraphI18n;
033
034 /**
035 * An expression that defines an acceptable path using a regular-expression-like language. Path expressions can be used to
036 * represent node paths or properties.
037 * <p>
038 * Path expressions consist of two parts: a selection criteria (or an input path) and an output path:
039 * </p>
040 *
041 * <pre>
042 * inputPath => outputPath
043 * </pre>
044 * <p>
045 * The <i>inputPath</i> part defines an expression for the path of a node that is to be sequenced. Input paths consist of '
046 * <code>/</code>' separated segments, where each segment represents a pattern for a single node's name (including the
047 * same-name-sibling indexes) and '<code>@</code>' signifies a property name.
048 * </p>
049 * <p>
050 * Let's first look at some simple examples:
051 * </p>
052 * <table>
053 * <tr>
054 * <th>Input Path</th>
055 * <th>Description</th>
056 * </tr>
057 * <tr>
058 * <td>/a/b</td>
059 * <td>Match node "<code>b</code>" that is a child of the top level node "<code>a</code>". Neither node may have any
060 * same-name-sibilings.</td>
061 * </tr>
062 * <tr>
063 * <td>/a/*</td>
064 * <td>Match any child node of the top level node "<code>a</code>".</td>
065 * </tr>
066 * <tr>
067 * <td>/a/*.txt</td>
068 * <td>Match any child node of the top level node "<code>a</code>" that also has a name ending in "<code>.txt</code>".</td>
069 * </tr>
070 * <tr>
071 * <td>/a/b@c</td>
072 * <td>Match the property "<code>c</code>" of node "<code>/a/b</code>".</td>
073 * </tr>
074 * <tr>
075 * <td>/a/b[2]</td>
076 * <td>The second child named "<code>b</code>" below the top level node "<code>a</code>".</td>
077 * </tr>
078 * <tr>
079 * <td>/a/b[2,3,4]</td>
080 * <td>The second, third or fourth child named "<code>b</code>" below the top level node "<code>a</code>".</td>
081 * </tr>
082 * <tr>
083 * <td>/a/b[*]</td>
084 * <td>Any (and every) child named "<code>b</code>" below the top level node "<code>a</code>".</td>
085 * </tr>
086 * <tr>
087 * <td>//a/b</td>
088 * <td>Any node named "<code>b</code>" that exists below a node named "<code>a</code>", regardless of where node "<code>a</code>"
089 * occurs. Again, neither node may have any same-name-sibilings.</td>
090 * </tr>
091 * </table>
092 * <p>
093 * With these simple examples, you can probably discern the most important rules. First, the '<code>*</code>' is a wildcard
094 * character that matches any character or sequence of characters in a node's name (or index if appearing in between square
095 * brackets), and can be used in conjunction with other characters (e.g., "<code>*.txt</code>").
096 * </p>
097 * <p>
098 * Second, square brackets (i.e., '<code>[</code>' and '<code>]</code>') are used to match a node's same-name-sibiling index. You
099 * can put a single non-negative number or a comma-separated list of non-negative numbers. Use '0' to match a node that has no
100 * same-name-sibilings, or any positive number to match the specific same-name-sibling.
101 * </p>
102 * <p>
103 * Third, combining two delimiters (e.g., "<code>//</code>") matches any sequence of nodes, regardless of what their names are or
104 * how many nodes. Often used with other patterns to identify nodes at any level matching other patterns. Three or more sequential
105 * slash characters are treated as two.
106 * </p>
107 * <p>
108 * Many input paths can be created using just these simple rules. However, input paths can be more complicated. Here are some more
109 * examples:
110 * </p>
111 * <table>
112 * <tr>
113 * <th>Input Path</th>
114 * <th>Description</th>
115 * </tr>
116 * <tr>
117 * <td>/a/(b|c|d)</td>
118 * <td>Match children of the top level node "<code>a</code>" that are named "<code>a</code>", "<code>b</code>" or "<code>c</code>
119 * ". None of the nodes may have same-name-sibling indexes.</td>
120 * </tr>
121 * <tr>
122 * <td>/a/b[c/d]</td>
123 * <td>Match node "<code>b</code>" child of the top level node "<code>a</code>", when node "<code>b</code>" has a child named "
124 * <code>c</code>", and "<code>c</code>" has a child named "<code>d</code>". Node "<code>b</code>
125 * " is the selected node, while nodes "<code>b</code>" and "<code>b</code>" are used as criteria but are not selected.</td>
126 * </tr>
127 * <tr>
128 * <td>/a(/(b|c|d|)/e)[f/g/@something]</td>
129 * <td>Match node "<code>/a/b/e</code>", "<code>/a/c/e</code>", "<code>/a/d/e</code>", or "<code>/a/e</code>
130 * " when they also have a child "<code>f</code>" that itself has a child "<code>g</code>" with property "<code>something</code>".
131 * None of the nodes may have same-name-sibling indexes.</td>
132 * </tr>
133 * </table>
134 * <p>
135 * These examples show a few more advanced rules. Parentheses (i.e., '<code>(</code>' and '<code>)</code>') can be used to define
136 * a set of options for names, as shown in the first and third rules. Whatever part of the selected node's path appears between
137 * the parentheses is captured for use within the output path. Thus, the first input path in the previous table would match node "
138 * <code>/a/b</code>", and "b" would be captured and could be used within the output path using "<code>$1</code>", where the
139 * number used in the output path identifies the parentheses.
140 * </p>
141 * <p>
142 * Square brackets can also be used to specify criteria on a node's properties or children. Whatever appears in between the square
143 * brackets does not appear in the selected node.
144 * </p>
145 *
146 * @author Randall Hauch
147 */
148 @Immutable
149 public class PathExpression implements Serializable {
150
151 /**
152 * Initial version
153 */
154 private static final long serialVersionUID = 1L;
155
156 /**
157 * Compile the supplied expression and return the resulting path expression instance.
158 *
159 * @param expression the expression
160 * @return the path expression; never null
161 * @throws IllegalArgumentException if the expression is null
162 * @throws InvalidPathExpressionException if the expression is blank or is not a valid expression
163 */
164 public static final PathExpression compile( String expression ) throws InvalidPathExpressionException {
165 return new PathExpression(expression);
166 }
167
168 private static final String SEQUENCE_PATTERN_STRING = "\\[(\\d+(?:,\\d+)*)\\]"; // \[(\d+(,\d+)*)\]
169 private static final Pattern SEQUENCE_PATTERN = Pattern.compile(SEQUENCE_PATTERN_STRING);
170
171 /**
172 * Regular expression used to find unusable XPath predicates within an expression. This pattern results in unusable predicates
173 * in group 1. Note that some predicates may be valid at the end but not valid elsewhere.
174 * <p>
175 * Currently, only index-like predicates (including sequences) are allowed everywhere. Predicates with paths and properties
176 * are allowed only as the last predicate. Predicates with any operators are unused.
177 * </p>
178 * <p>
179 * Nested predicates are not currently allowed.
180 * </p>
181 */
182 // \[(?:(?:\d+(?:,\d+)*)|\*)\]|(?:\[[^\]\+\-\*=\!><'"\s]+\])$|(\[[^\]]+\])
183 private static final String UNUSABLE_PREDICATE_PATTERN_STRING = "\\[(?:(?:\\d+(?:,\\d+)*)|\\*)\\]|(?:\\[[^\\]\\+\\-\\*=\\!><'\"\\s]+\\])$|(\\[[^\\]]+\\])";
184 private static final Pattern UNUSABLE_PREDICATE_PATTERN = Pattern.compile(UNUSABLE_PREDICATE_PATTERN_STRING);
185
186 /**
187 * Regular expression used to find all XPath predicates except index and sequence patterns. This pattern results in the
188 * predicates to be removed in group 1.
189 */
190 // \[(?:(?:\d+(?:,\d+)*)|\*)\]|(\[[^\]]+\])
191 private static final String NON_INDEX_PREDICATE_PATTERN_STRING = "\\[(?:(?:\\d+(?:,\\d+)*)|\\*)\\]|(\\[[^\\]]+\\])";
192 private static final Pattern NON_INDEX_PREDICATE_PATTERN = Pattern.compile(NON_INDEX_PREDICATE_PATTERN_STRING);
193
194 private final String expression;
195 private final Pattern matchPattern;
196 private final Pattern selectPattern;
197
198 /**
199 * Create the supplied expression.
200 *
201 * @param expression the expression
202 * @throws IllegalArgumentException if the expression is null
203 * @throws InvalidPathExpressionException if the expression is blank or is not a valid expression
204 */
205 public PathExpression( String expression ) throws InvalidPathExpressionException {
206 CheckArg.isNotNull(expression, "path expression");
207 this.expression = expression.trim();
208 if (this.expression.length() == 0) {
209 throw new InvalidPathExpressionException(GraphI18n.pathExpressionMayNotBeBlank.text());
210 }
211 // Build the match pattern, which determines whether a path matches the condition ...
212 String matchString = this.expression;
213 try {
214 matchString = removeUnusedPredicates(matchString);
215 matchString = replaceXPathPatterns(matchString);
216 this.matchPattern = Pattern.compile(matchString, Pattern.CASE_INSENSITIVE);
217 } catch (PatternSyntaxException e) {
218 String msg = GraphI18n.pathExpressionHasInvalidMatch.text(matchString, this.expression);
219 throw new InvalidPathExpressionException(msg, e);
220 }
221 // Build the select pattern, which determines the path that will be selected ...
222 String selectString = this.expression;
223 try {
224 selectString = removeAllPredicatesExceptIndexes(selectString);
225 selectString = replaceXPathPatterns(selectString);
226 selectString = "(" + selectString + ").*"; // group 1 will have selected path ...
227 this.selectPattern = Pattern.compile(selectString, Pattern.CASE_INSENSITIVE);
228 } catch (PatternSyntaxException e) {
229 String msg = GraphI18n.pathExpressionHasInvalidSelect.text(selectString, this.expression);
230 throw new InvalidPathExpressionException(msg, e);
231 }
232 }
233
234 /**
235 * @return expression
236 */
237 public String getExpression() {
238 return expression;
239 }
240
241 /**
242 * Replace certain XPath patterns that are not used or understood.
243 *
244 * @param expression the input regular expressions string; may not be null
245 * @return the regular expression with all unused XPath patterns removed; never null
246 */
247 protected String removeUnusedPredicates( String expression ) {
248 assert expression != null;
249 java.util.regex.Matcher matcher = UNUSABLE_PREDICATE_PATTERN.matcher(expression);
250 StringBuffer sb = new StringBuffer();
251 if (matcher.find()) {
252 do {
253 // Remove those predicates that show up in group 1 ...
254 String predicateStr = matcher.group(0);
255 String unusablePredicateStr = matcher.group(1);
256 if (unusablePredicateStr != null) {
257 predicateStr = "";
258 }
259 matcher.appendReplacement(sb, predicateStr);
260 } while (matcher.find());
261 matcher.appendTail(sb);
262 expression = sb.toString();
263 }
264 return expression;
265 }
266
267 /**
268 * Remove all XPath predicates from the supplied regular expression string.
269 *
270 * @param expression the input regular expressions string; may not be null
271 * @return the regular expression with all XPath predicates removed; never null
272 */
273 protected String removeAllPredicatesExceptIndexes( String expression ) {
274 assert expression != null;
275 java.util.regex.Matcher matcher = NON_INDEX_PREDICATE_PATTERN.matcher(expression);
276 StringBuffer sb = new StringBuffer();
277 if (matcher.find()) {
278 do {
279 // Remove those predicates that show up in group 1 ...
280 String predicateStr = matcher.group(0);
281 String unusablePredicateStr = matcher.group(1);
282 if (unusablePredicateStr != null) {
283 predicateStr = "";
284 }
285 matcher.appendReplacement(sb, predicateStr);
286 } while (matcher.find());
287 matcher.appendTail(sb);
288 expression = sb.toString();
289 }
290 return expression;
291 }
292
293 /**
294 * Replace certain XPath patterns, including some predicates, with substrings that are compatible with regular expressions.
295 *
296 * @param expression the input regular expressions string; may not be null
297 * @return the regular expression with XPath patterns replaced with regular expression fragments; never null
298 */
299 protected String replaceXPathPatterns( String expression ) {
300 assert expression != null;
301 // replace 2 or more sequential '|' characters in an OR expression
302 expression = expression.replaceAll("[\\|]{2,}", "|");
303 // if there is an empty expression in an OR expression, make the whole segment optional ...
304 // (e.g., "/a/b/(c|)/d" => "a/b(/(c))?/d"
305 expression = expression.replaceAll("/(\\([^|]+)(\\|){2,}([^)]+\\))", "(/$1$2$3)?");
306 expression = expression.replaceAll("/\\(\\|+([^)]+)\\)", "(?:/($1))?");
307 expression = expression.replaceAll("/\\((([^|]+)(\\|[^|]+)*)\\|+\\)", "(?:/($1))?");
308
309 // // Allow any path (that doesn't contain an explicit counter) to contain a counter,
310 // // done by replacing any '/' or '|' that isn't preceded by ']' or '*' or '/' or '(' with '(\[\d+\])?/'...
311 // input = input.replaceAll("(?<=[^\\]\\*/(])([/|])", "(?:\\\\[\\\\d+\\\\])?$1");
312
313 // Does the path contain any '[]' or '[*]' or '[0]' or '[n]' (where n is any positive integers)...
314 // '[*]/' => '(\[\d+\])?/'
315 expression = expression.replaceAll("\\[\\]", "(?:\\\\[\\\\d+\\\\])?"); // index is optional
316 // '[]/' => '(\[\d+\])?/'
317 expression = expression.replaceAll("\\[[*]\\]", "(?:\\\\[\\\\d+\\\\])?"); // index is optional
318 // '[0]/' => '(\[0\])?/'
319 expression = expression.replaceAll("\\[0\\]", "(?:\\\\[0\\\\])?"); // index is optional
320 // '[n]/' => '\[n\]/'
321 expression = expression.replaceAll("\\[([1-9]\\d*)\\]", "\\\\[$1\\\\]"); // index is required
322
323 // Change any other end predicates to not be wrapped by braces but to begin with a slash ...
324 // ...'[x]' => ...'/x'
325 expression = expression.replaceAll("(?<!\\\\)\\[([^\\]]*)\\]$", "/$1");
326
327 // Replace all '[n,m,o,p]' type sequences with '[(n|m|o|p)]'
328 java.util.regex.Matcher matcher = SEQUENCE_PATTERN.matcher(expression);
329 StringBuffer sb = new StringBuffer();
330 boolean result = matcher.find();
331 if (result) {
332 do {
333 String sequenceStr = matcher.group(1);
334 boolean optional = false;
335 if (sequenceStr.startsWith("0,")) {
336 sequenceStr = sequenceStr.replaceFirst("^0,", "");
337 optional = true;
338 }
339 if (sequenceStr.endsWith(",0")) {
340 sequenceStr = sequenceStr.replaceFirst(",0$", "");
341 optional = true;
342 }
343 if (sequenceStr.contains(",0,")) {
344 sequenceStr = sequenceStr.replaceAll(",0,", ",");
345 optional = true;
346 }
347 sequenceStr = sequenceStr.replaceAll(",", "|");
348 String replacement = "\\\\[(?:" + sequenceStr + ")\\\\]";
349 if (optional) {
350 replacement = "(?:" + replacement + ")?";
351 }
352 matcher.appendReplacement(sb, replacement);
353 result = matcher.find();
354 } while (result);
355 matcher.appendTail(sb);
356 expression = sb.toString();
357 }
358
359 // Order is important here
360 expression = expression.replaceAll("[*]([^/(\\\\])", "[^/$1]*$1"); // '*' not followed by '/', '\\', or '('
361 expression = expression.replaceAll("(?<!\\[\\^/\\])[*]", "[^/]*");
362 expression = expression.replaceAll("[/]{2,}$", "(?:/[^/]*)*"); // ending '//'
363 expression = expression.replaceAll("[/]{2,}", "(?:/[^/]*)*/"); // other '//'
364 return expression;
365 }
366
367 /**
368 * @return the expression
369 */
370 public String getSelectExpression() {
371 return this.expression;
372 }
373
374 /**
375 * {@inheritDoc}
376 */
377 @Override
378 public int hashCode() {
379 return this.expression.hashCode();
380 }
381
382 /**
383 * {@inheritDoc}
384 */
385 @Override
386 public boolean equals( Object obj ) {
387 if (obj == this) return true;
388 if (obj instanceof PathExpression) {
389 PathExpression that = (PathExpression)obj;
390 if (!this.expression.equalsIgnoreCase(that.expression)) return false;
391 return true;
392 }
393 return false;
394 }
395
396 /**
397 * {@inheritDoc}
398 */
399 @Override
400 public String toString() {
401 return this.expression;
402 }
403
404 /**
405 * @param absolutePath
406 * @return the matcher
407 */
408 public Matcher matcher( String absolutePath ) {
409 // Determine if the input path match the select expression ...
410 String originalAbsolutePath = absolutePath;
411 // if (!absolutePath.endsWith("/")) absolutePath = absolutePath + "/";
412 // Remove all trailing '/' ...
413 absolutePath = absolutePath.replaceAll("/+$", "");
414
415 // See if the supplied absolute path matches the pattern ...
416 final java.util.regex.Matcher matcher = this.matchPattern.matcher(absolutePath);
417 if (!matcher.matches()) {
418 // No match, so return immediately ...
419 return new Matcher(matcher, originalAbsolutePath, null);
420 }
421
422 // The absolute path does match the pattern, so use the select pattern and try to grab the selected path ...
423 final java.util.regex.Matcher selectMatcher = this.selectPattern.matcher(absolutePath);
424 if (!selectMatcher.matches()) {
425 // Nothing can be selected, so return immediately ...
426 return new Matcher(matcher, null, null);
427 }
428 // Grab the selected path ...
429 String selectedPath = selectMatcher.group(1);
430
431 // Remove the trailing '/@property' ...
432 selectedPath = selectedPath.replaceAll("/@[^/\\[\\]]+$", "");
433
434 return new Matcher(matcher, originalAbsolutePath, selectedPath);
435 }
436
437 @Immutable
438 public static class Matcher {
439
440 private final String inputPath;
441 private final String selectedPath;
442 private final java.util.regex.Matcher inputMatcher;
443 private final int hc;
444
445 protected Matcher( java.util.regex.Matcher inputMatcher,
446 String inputPath,
447 String selectedPath ) {
448 this.inputMatcher = inputMatcher;
449 this.inputPath = inputPath;
450 this.selectedPath = selectedPath;
451 this.hc = HashCode.compute(this.inputPath, this.selectedPath);
452 }
453
454 public boolean matches() {
455 return this.selectedPath != null;
456 }
457
458 /**
459 * @return inputPath
460 */
461 public String getInputPath() {
462 return this.inputPath;
463 }
464
465 /**
466 * @return selectPattern
467 */
468 public String getSelectedNodePath() {
469 return this.selectedPath;
470 }
471
472 public int groupCount() {
473 return this.inputMatcher.groupCount();
474 }
475
476 public String group( int groupNumber ) {
477 return this.inputMatcher.group(groupNumber);
478 }
479
480 /**
481 * {@inheritDoc}
482 */
483 @Override
484 public int hashCode() {
485 return this.hc;
486 }
487
488 /**
489 * {@inheritDoc}
490 */
491 @Override
492 public boolean equals( Object obj ) {
493 if (obj == this) return true;
494 if (obj instanceof PathExpression.Matcher) {
495 PathExpression.Matcher that = (PathExpression.Matcher)obj;
496 if (!this.inputPath.equalsIgnoreCase(that.inputPath)) return false;
497 if (!this.selectedPath.equalsIgnoreCase(that.selectedPath)) return false;
498 return true;
499 }
500 return false;
501 }
502
503 /**
504 * {@inheritDoc}
505 */
506 @Override
507 public String toString() {
508 return this.selectedPath;
509 }
510 }
511
512 /**
513 * Regular expression used to determine if the expression matches any single-level wildcard.
514 */
515 // /*(?:[*.](?:\[\*?\])?/*)*
516 private static final String ANYTHING_PATTERN_STRING = "/*(?:[*.](?:\\[\\*?\\])?/*)*";
517 private static final Pattern ANYTHING_PATTERN = Pattern.compile(ANYTHING_PATTERN_STRING);
518
519 /**
520 * Return whether this expression matches anything and therefore is not restrictive. These include expressions of any nodes ("
521 * <code>/</code>"), any sequence of nodes ("<code>//</code>"), the self reference ("<code>.</code>"), or wildcard ("
522 * <code>*</code>", "<code>*[]</code>" or "<code>*[*]</code>"). Combinations of these individual expressions are also
523 * considered to match anything.
524 *
525 * @return true if the expression matches anything, or false otherwise
526 */
527 public boolean matchesAnything() {
528 return ANYTHING_PATTERN.matcher(expression).matches();
529 }
530
531 public static PathExpression all() {
532 return ALL_PATHS_EXPRESSION;
533 }
534
535 private static final PathExpression ALL_PATHS_EXPRESSION = PathExpression.compile("//");
536
537 }