001 /*
002 * JBoss, Home of Professional Open Source.
003 * Copyright 2008, Red Hat Middleware LLC, and individual contributors
004 * as indicated by the @author tags. See the copyright.txt file in the
005 * distribution for a full listing of individual contributors.
006 *
007 * This is free software; you can redistribute it and/or modify it
008 * under the terms of the GNU Lesser General Public License as
009 * published by the Free Software Foundation; either version 2.1 of
010 * the License, or (at your option) any later version.
011 *
012 * This software is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015 * Lesser General Public License for more details.
016 *
017 * You should have received a copy of the GNU Lesser General Public
018 * License along with this software; if not, write to the Free
019 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
020 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
021 */
022 package org.jboss.dna.graph.properties;
023
024 import java.io.Serializable;
025 import java.util.regex.Pattern;
026 import java.util.regex.PatternSyntaxException;
027 import net.jcip.annotations.Immutable;
028 import org.jboss.dna.common.util.CheckArg;
029 import org.jboss.dna.common.util.HashCode;
030 import org.jboss.dna.graph.GraphI18n;
031
032 /**
033 * An expression that defines an acceptable path using a regular-expression-like language. Path expressions can be used to
034 * represent node paths or properties.
035 * <p>
036 * Path expressions consist of two parts: a selection criteria (or an input path) and an output path:
037 * </p>
038 *
039 * <pre>
040 * inputPath => outputPath
041 * </pre>
042 * <p>
043 * The <i>inputPath</i> part defines an expression for the path of a node that is to be sequenced. Input paths consist of '
044 * <code>/</code>' separated segments, where each segment represents a pattern for a single node's name (including the
045 * same-name-sibling indexes) and '<code>@</code>' signifies a property name.
046 * </p>
047 * <p>
048 * Let's first look at some simple examples:
049 * </p>
050 * <table>
051 * <tr>
052 * <th>Input Path</th>
053 * <th>Description</th>
054 * </tr>
055 * <tr>
056 * <td>/a/b</td>
057 * <td>Match node "<code>b</code>" that is a child of the top level node "<code>a</code>". Neither node may have any
058 * same-name-sibilings.</td>
059 * </tr>
060 * <tr>
061 * <td>/a/*</td>
062 * <td>Match any child node of the top level node "<code>a</code>".</td>
063 * </tr>
064 * <tr>
065 * <td>/a/*.txt</td>
066 * <td>Match any child node of the top level node "<code>a</code>" that also has a name ending in "<code>.txt</code>".</td>
067 * </tr>
068 * <tr>
069 * <td>/a/b@c</td>
070 * <td>Match the property "<code>c</code>" of node "<code>/a/b</code>".</td>
071 * </tr>
072 * <tr>
073 * <td>/a/b[2]</td>
074 * <td>The second child named "<code>b</code>" below the top level node "<code>a</code>".</td>
075 * </tr>
076 * <tr>
077 * <td>/a/b[2,3,4]</td>
078 * <td>The second, third or fourth child named "<code>b</code>" below the top level node "<code>a</code>".</td>
079 * </tr>
080 * <tr>
081 * <td>/a/b[*]</td>
082 * <td>Any (and every) child named "<code>b</code>" below the top level node "<code>a</code>".</td>
083 * </tr>
084 * <tr>
085 * <td>//a/b</td>
086 * <td>Any node named "<code>b</code>" that exists below a node named "<code>a</code>", regardless of where node "<code>a</code>"
087 * occurs. Again, neither node may have any same-name-sibilings.</td>
088 * </tr>
089 * </table>
090 * <p>
091 * With these simple examples, you can probably discern the most important rules. First, the '<code>*</code>' is a wildcard
092 * character that matches any character or sequence of characters in a node's name (or index if appearing in between square
093 * brackets), and can be used in conjunction with other characters (e.g., "<code>*.txt</code>").
094 * </p>
095 * <p>
096 * Second, square brackets (i.e., '<code>[</code>' and '<code>]</code>') are used to match a node's same-name-sibiling index. You
097 * can put a single non-negative number or a comma-separated list of non-negative numbers. Use '0' to match a node that has no
098 * same-name-sibilings, or any positive number to match the specific same-name-sibling.
099 * </p>
100 * <p>
101 * Third, combining two delimiters (e.g., "<code>//</code>") matches any sequence of nodes, regardless of what their names are or
102 * how many nodes. Often used with other patterns to identify nodes at any level matching other patterns. Three or more sequential
103 * slash characters are treated as two.
104 * </p>
105 * <p>
106 * Many input paths can be created using just these simple rules. However, input paths can be more complicated. Here are some more
107 * examples:
108 * </p>
109 * <table>
110 * <tr>
111 * <th>Input Path</th>
112 * <th>Description</th>
113 * </tr>
114 * <tr>
115 * <td>/a/(b|c|d)</td>
116 * <td>Match children of the top level node "<code>a</code>" that are named "<code>a</code>", "<code>b</code>" or "<code>c</code>
117 * ". None of the nodes may have same-name-sibling indexes.</td>
118 * </tr>
119 * <tr>
120 * <td>/a/b[c/d]</td>
121 * <td>Match node "<code>b</code>" child of the top level node "<code>a</code>", when node "<code>b</code>" has a child named "
122 * <code>c</code>", and "<code>c</code>" has a child named "<code>d</code>". Node "<code>b</code>
123 * " is the selected node, while nodes "<code>b</code>" and "<code>b</code>" are used as criteria but are not selected.</td>
124 * </tr>
125 * <tr>
126 * <td>/a(/(b|c|d|)/e)[f/g/@something]</td>
127 * <td>Match node "<code>/a/b/e</code>", "<code>/a/c/e</code>", "<code>/a/d/e</code>", or "<code>/a/e</code>
128 * " when they also have a child "<code>f</code>" that itself has a child "<code>g</code>" with property "<code>something</code>".
129 * None of the nodes may have same-name-sibling indexes.</td>
130 * </tr>
131 * </table>
132 * <p>
133 * These examples show a few more advanced rules. Parentheses (i.e., '<code>(</code>' and '<code>)</code>') can be used to define
134 * a set of options for names, as shown in the first and third rules. Whatever part of the selected node's path appears between
135 * the parentheses is captured for use within the output path. Thus, the first input path in the previous table would match node "
136 * <code>/a/b</code>", and "b" would be captured and could be used within the output path using "<code>$1</code>", where the
137 * number used in the output path identifies the parentheses.
138 * </p>
139 * <p>
140 * Square brackets can also be used to specify criteria on a node's properties or children. Whatever appears in between the square
141 * brackets does not appear in the selected node.
142 * </p>
143 *
144 * @author Randall Hauch
145 */
146 @Immutable
147 public class PathExpression implements Serializable {
148
149 /**
150 * Initial version
151 */
152 private static final long serialVersionUID = 1L;
153
154 /**
155 * Compile the supplied expression and return the resulting path expression instance.
156 *
157 * @param expression the expression
158 * @return the path expression; never null
159 * @throws IllegalArgumentException if the expression is null
160 * @throws InvalidPathExpressionException if the expression is blank or is not a valid expression
161 */
162 public static final PathExpression compile( String expression ) throws InvalidPathExpressionException {
163 return new PathExpression(expression);
164 }
165
166 private static final String SEQUENCE_PATTERN_STRING = "\\[(\\d+(?:,\\d+)*)\\]"; // \[(\d+(,\d+)*)\]
167 private static final Pattern SEQUENCE_PATTERN = Pattern.compile(SEQUENCE_PATTERN_STRING);
168
169 /**
170 * Regular expression used to find unusable XPath predicates within an expression. This pattern results in unusable predicates
171 * in group 1. Note that some predicates may be valid at the end but not valid elsewhere.
172 * <p>
173 * Currently, only index-like predicates (including sequences) are allowed everywhere. Predicates with paths and properties
174 * are allowed only as the last predicate. Predicates with any operators are unused.
175 * </p>
176 * <p>
177 * Nested predicates are not currently allowed.
178 * </p>
179 */
180 // \[(?:(?:\d+(?:,\d+)*)|\*)\]|(?:\[[^\]\+\-\*=\!><'"\s]+\])$|(\[[^\]]+\])
181 private static final String UNUSABLE_PREDICATE_PATTERN_STRING = "\\[(?:(?:\\d+(?:,\\d+)*)|\\*)\\]|(?:\\[[^\\]\\+\\-\\*=\\!><'\"\\s]+\\])$|(\\[[^\\]]+\\])";
182 private static final Pattern UNUSABLE_PREDICATE_PATTERN = Pattern.compile(UNUSABLE_PREDICATE_PATTERN_STRING);
183
184 /**
185 * Regular expression used to find all XPath predicates except index and sequence patterns. This pattern results in the
186 * predicates to be removed in group 1.
187 */
188 // \[(?:(?:\d+(?:,\d+)*)|\*)\]|(\[[^\]]+\])
189 private static final String NON_INDEX_PREDICATE_PATTERN_STRING = "\\[(?:(?:\\d+(?:,\\d+)*)|\\*)\\]|(\\[[^\\]]+\\])";
190 private static final Pattern NON_INDEX_PREDICATE_PATTERN = Pattern.compile(NON_INDEX_PREDICATE_PATTERN_STRING);
191
192 private final String expression;
193 private final Pattern matchPattern;
194 private final Pattern selectPattern;
195
196 /**
197 * Create the supplied expression.
198 *
199 * @param expression the expression
200 * @throws IllegalArgumentException if the expression is null
201 * @throws InvalidPathExpressionException if the expression is blank or is not a valid expression
202 */
203 public PathExpression( String expression ) throws InvalidPathExpressionException {
204 CheckArg.isNotNull(expression, "path expression");
205 this.expression = expression.trim();
206 if (this.expression.length() == 0) {
207 throw new InvalidPathExpressionException(GraphI18n.pathExpressionMayNotBeBlank.text());
208 }
209 // Build the match pattern, which determines whether a path matches the condition ...
210 String matchString = this.expression;
211 try {
212 matchString = removeUnusedPredicates(matchString);
213 matchString = replaceXPathPatterns(matchString);
214 this.matchPattern = Pattern.compile(matchString, Pattern.CASE_INSENSITIVE);
215 } catch (PatternSyntaxException e) {
216 String msg = GraphI18n.pathExpressionHasInvalidMatch.text(matchString, this.expression);
217 throw new InvalidPathExpressionException(msg, e);
218 }
219 // Build the select pattern, which determines the path that will be selected ...
220 String selectString = this.expression;
221 try {
222 selectString = removeAllPredicatesExceptIndexes(selectString);
223 selectString = replaceXPathPatterns(selectString);
224 selectString = "(" + selectString + ").*"; // group 1 will have selected path ...
225 this.selectPattern = Pattern.compile(selectString, Pattern.CASE_INSENSITIVE);
226 } catch (PatternSyntaxException e) {
227 String msg = GraphI18n.pathExpressionHasInvalidSelect.text(selectString, this.expression);
228 throw new InvalidPathExpressionException(msg, e);
229 }
230 }
231
232 /**
233 * @return expression
234 */
235 public String getExpression() {
236 return expression;
237 }
238
239 /**
240 * Replace certain XPath patterns that are not used or understood.
241 *
242 * @param expression the input regular expressions string; may not be null
243 * @return the regular expression with all unused XPath patterns removed; never null
244 */
245 protected String removeUnusedPredicates( String expression ) {
246 assert expression != null;
247 java.util.regex.Matcher matcher = UNUSABLE_PREDICATE_PATTERN.matcher(expression);
248 StringBuffer sb = new StringBuffer();
249 if (matcher.find()) {
250 do {
251 // Remove those predicates that show up in group 1 ...
252 String predicateStr = matcher.group(0);
253 String unusablePredicateStr = matcher.group(1);
254 if (unusablePredicateStr != null) {
255 predicateStr = "";
256 }
257 matcher.appendReplacement(sb, predicateStr);
258 } while (matcher.find());
259 matcher.appendTail(sb);
260 expression = sb.toString();
261 }
262 return expression;
263 }
264
265 /**
266 * Remove all XPath predicates from the supplied regular expression string.
267 *
268 * @param expression the input regular expressions string; may not be null
269 * @return the regular expression with all XPath predicates removed; never null
270 */
271 protected String removeAllPredicatesExceptIndexes( String expression ) {
272 assert expression != null;
273 java.util.regex.Matcher matcher = NON_INDEX_PREDICATE_PATTERN.matcher(expression);
274 StringBuffer sb = new StringBuffer();
275 if (matcher.find()) {
276 do {
277 // Remove those predicates that show up in group 1 ...
278 String predicateStr = matcher.group(0);
279 String unusablePredicateStr = matcher.group(1);
280 if (unusablePredicateStr != null) {
281 predicateStr = "";
282 }
283 matcher.appendReplacement(sb, predicateStr);
284 } while (matcher.find());
285 matcher.appendTail(sb);
286 expression = sb.toString();
287 }
288 return expression;
289 }
290
291 /**
292 * Replace certain XPath patterns, including some predicates, with substrings that are compatible with regular expressions.
293 *
294 * @param expression the input regular expressions string; may not be null
295 * @return the regular expression with XPath patterns replaced with regular expression fragments; never null
296 */
297 protected String replaceXPathPatterns( String expression ) {
298 assert expression != null;
299 // replace 2 or more sequential '|' characters in an OR expression
300 expression = expression.replaceAll("[\\|]{2,}", "|");
301 // if there is an empty expression in an OR expression, make the whole segment optional ...
302 // (e.g., "/a/b/(c|)/d" => "a/b(/(c))?/d"
303 expression = expression.replaceAll("/(\\([^|]+)(\\|){2,}([^)]+\\))", "(/$1$2$3)?");
304 expression = expression.replaceAll("/\\(\\|+([^)]+)\\)", "(?:/($1))?");
305 expression = expression.replaceAll("/\\((([^|]+)(\\|[^|]+)*)\\|+\\)", "(?:/($1))?");
306
307 // // Allow any path (that doesn't contain an explicit counter) to contain a counter,
308 // // done by replacing any '/' or '|' that isn't preceded by ']' or '*' or '/' or '(' with '(\[\d+\])?/'...
309 // input = input.replaceAll("(?<=[^\\]\\*/(])([/|])", "(?:\\\\[\\\\d+\\\\])?$1");
310
311 // Does the path contain any '[]' or '[*]' or '[0]' or '[n]' (where n is any positive integers)...
312 // '[*]/' => '(\[\d+\])?/'
313 expression = expression.replaceAll("\\[\\]", "(?:\\\\[\\\\d+\\\\])?"); // index is optional
314 // '[]/' => '(\[\d+\])?/'
315 expression = expression.replaceAll("\\[[*]\\]", "(?:\\\\[\\\\d+\\\\])?"); // index is optional
316 // '[0]/' => '(\[0\])?/'
317 expression = expression.replaceAll("\\[0\\]", "(?:\\\\[0\\\\])?"); // index is optional
318 // '[n]/' => '\[n\]/'
319 expression = expression.replaceAll("\\[([1-9]\\d*)\\]", "\\\\[$1\\\\]"); // index is required
320
321 // Change any other end predicates to not be wrapped by braces but to begin with a slash ...
322 // ...'[x]' => ...'/x'
323 expression = expression.replaceAll("(?<!\\\\)\\[([^\\]]*)\\]$", "/$1");
324
325 // Replace all '[n,m,o,p]' type sequences with '[(n|m|o|p)]'
326 java.util.regex.Matcher matcher = SEQUENCE_PATTERN.matcher(expression);
327 StringBuffer sb = new StringBuffer();
328 boolean result = matcher.find();
329 if (result) {
330 do {
331 String sequenceStr = matcher.group(1);
332 boolean optional = false;
333 if (sequenceStr.startsWith("0,")) {
334 sequenceStr = sequenceStr.replaceFirst("^0,", "");
335 optional = true;
336 }
337 if (sequenceStr.endsWith(",0")) {
338 sequenceStr = sequenceStr.replaceFirst(",0$", "");
339 optional = true;
340 }
341 if (sequenceStr.contains(",0,")) {
342 sequenceStr = sequenceStr.replaceAll(",0,", ",");
343 optional = true;
344 }
345 sequenceStr = sequenceStr.replaceAll(",", "|");
346 String replacement = "\\\\[(?:" + sequenceStr + ")\\\\]";
347 if (optional) {
348 replacement = "(?:" + replacement + ")?";
349 }
350 matcher.appendReplacement(sb, replacement);
351 result = matcher.find();
352 } while (result);
353 matcher.appendTail(sb);
354 expression = sb.toString();
355 }
356
357 // Order is important here
358 expression = expression.replaceAll("[*]([^/(\\\\])", "[^/$1]*$1"); // '*' not followed by '/', '\\', or '('
359 expression = expression.replaceAll("(?<!\\[\\^/\\])[*]", "[^/]*");
360 expression = expression.replaceAll("[/]{2,}$", "(?:/[^/]*)*"); // ending '//'
361 expression = expression.replaceAll("[/]{2,}", "(?:/[^/]*)*/"); // other '//'
362 return expression;
363 }
364
365 /**
366 * @return the expression
367 */
368 public String getSelectExpression() {
369 return this.expression;
370 }
371
372 /**
373 * {@inheritDoc}
374 */
375 @Override
376 public int hashCode() {
377 return this.expression.hashCode();
378 }
379
380 /**
381 * {@inheritDoc}
382 */
383 @Override
384 public boolean equals( Object obj ) {
385 if (obj == this) return true;
386 if (obj instanceof PathExpression) {
387 PathExpression that = (PathExpression)obj;
388 if (!this.expression.equalsIgnoreCase(that.expression)) return false;
389 return true;
390 }
391 return false;
392 }
393
394 /**
395 * {@inheritDoc}
396 */
397 @Override
398 public String toString() {
399 return this.expression;
400 }
401
402 /**
403 * @param absolutePath
404 * @return the matcher
405 */
406 public Matcher matcher( String absolutePath ) {
407 // Determine if the input path match the select expression ...
408 String originalAbsolutePath = absolutePath;
409 // if (!absolutePath.endsWith("/")) absolutePath = absolutePath + "/";
410 // Remove all trailing '/' ...
411 absolutePath = absolutePath.replaceAll("/+$", "");
412
413 // See if the supplied absolute path matches the pattern ...
414 final java.util.regex.Matcher matcher = this.matchPattern.matcher(absolutePath);
415 if (!matcher.matches()) {
416 // No match, so return immediately ...
417 return new Matcher(matcher, originalAbsolutePath, null);
418 }
419
420 // The absolute path does match the pattern, so use the select pattern and try to grab the selected path ...
421 final java.util.regex.Matcher selectMatcher = this.selectPattern.matcher(absolutePath);
422 if (!selectMatcher.matches()) {
423 // Nothing can be selected, so return immediately ...
424 return new Matcher(matcher, null, null);
425 }
426 // Grab the selected path ...
427 String selectedPath = selectMatcher.group(1);
428
429 // Remove the trailing '/@property' ...
430 selectedPath = selectedPath.replaceAll("/@[^/\\[\\]]+$", "");
431
432 return new Matcher(matcher, originalAbsolutePath, selectedPath);
433 }
434
435 @Immutable
436 public static class Matcher {
437
438 private final String inputPath;
439 private final String selectedPath;
440 private final java.util.regex.Matcher inputMatcher;
441 private final int hc;
442
443 protected Matcher( java.util.regex.Matcher inputMatcher,
444 String inputPath,
445 String selectedPath ) {
446 this.inputMatcher = inputMatcher;
447 this.inputPath = inputPath;
448 this.selectedPath = selectedPath;
449 this.hc = HashCode.compute(this.inputPath, this.selectedPath);
450 }
451
452 public boolean matches() {
453 return this.selectedPath != null;
454 }
455
456 /**
457 * @return inputPath
458 */
459 public String getInputPath() {
460 return this.inputPath;
461 }
462
463 /**
464 * @return selectPattern
465 */
466 public String getSelectedNodePath() {
467 return this.selectedPath;
468 }
469
470 public int groupCount() {
471 return this.inputMatcher.groupCount();
472 }
473
474 public String group( int groupNumber ) {
475 return this.inputMatcher.group(groupNumber);
476 }
477
478 /**
479 * {@inheritDoc}
480 */
481 @Override
482 public int hashCode() {
483 return this.hc;
484 }
485
486 /**
487 * {@inheritDoc}
488 */
489 @Override
490 public boolean equals( Object obj ) {
491 if (obj == this) return true;
492 if (obj instanceof PathExpression.Matcher) {
493 PathExpression.Matcher that = (PathExpression.Matcher)obj;
494 if (!this.inputPath.equalsIgnoreCase(that.inputPath)) return false;
495 if (!this.selectedPath.equalsIgnoreCase(that.selectedPath)) return false;
496 return true;
497 }
498 return false;
499 }
500
501 /**
502 * {@inheritDoc}
503 */
504 @Override
505 public String toString() {
506 return this.selectedPath;
507 }
508 }
509
510 /**
511 * Regular expression used to determine if the expression matches any single-level wildcard.
512 */
513 // /*(?:[*.](?:\[\*?\])?/*)*
514 private static final String ANYTHING_PATTERN_STRING = "/*(?:[*.](?:\\[\\*?\\])?/*)*";
515 private static final Pattern ANYTHING_PATTERN = Pattern.compile(ANYTHING_PATTERN_STRING);
516
517 /**
518 * Return whether this expression matches anything and therefore is not restrictive. These include expressions of any nodes ("
519 * <code>/</code>"), any sequence of nodes ("<code>//</code>"), the self reference ("<code>.</code>"), or wildcard ("
520 * <code>*</code>", "<code>*[]</code>" or "<code>*[*]</code>"). Combinations of these individual expressions are also
521 * considered to match anything.
522 *
523 * @return true if the expression matches anything, or false otherwise
524 */
525 public boolean matchesAnything() {
526 return ANYTHING_PATTERN.matcher(expression).matches();
527 }
528
529 public static PathExpression all() {
530 return ALL_PATHS_EXPRESSION;
531 }
532
533 private static final PathExpression ALL_PATHS_EXPRESSION = PathExpression.compile("//");
534
535 }