001 /* 002 * JBoss DNA (http://www.jboss.org/dna) 003 * See the COPYRIGHT.txt file distributed with this work for information 004 * regarding copyright ownership. Some portions may be licensed 005 * to Red Hat, Inc. under one or more contributor license agreements. 006 * See the AUTHORS.txt file in the distribution for a full listing of 007 * individual contributors. 008 * 009 * JBoss DNA is free software. Unless otherwise indicated, all code in JBoss DNA 010 * is licensed to you under the terms of the GNU Lesser General Public License as 011 * published by the Free Software Foundation; either version 2.1 of 012 * the License, or (at your option) any later version. 013 * 014 * JBoss DNA is distributed in the hope that it will be useful, 015 * but WITHOUT ANY WARRANTY; without even the implied warranty of 016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 017 * Lesser General Public License for more details. 018 * 019 * You should have received a copy of the GNU Lesser General Public 020 * License along with this software; if not, write to the Free 021 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 022 * 02110-1301 USA, or see the FSF site: http://www.fsf.org. 023 */ 024 package org.jboss.dna.graph.property; 025 026 import java.io.Serializable; 027 import java.util.regex.Pattern; 028 import java.util.regex.PatternSyntaxException; 029 import net.jcip.annotations.Immutable; 030 import org.jboss.dna.common.util.CheckArg; 031 import org.jboss.dna.common.util.HashCode; 032 import org.jboss.dna.graph.GraphI18n; 033 034 /** 035 * An expression that defines an acceptable path using a regular-expression-like language. Path expressions can be used to 036 * represent node paths or properties. 037 * <p> 038 * Path expressions consist of two parts: a selection criteria (or an input path) and an output path: 039 * </p> 040 * 041 * <pre> 042 * inputPath => outputPath 043 * </pre> 044 * <p> 045 * The <i>inputPath</i> part defines an expression for the path of a node that is to be sequenced. Input paths consist of ' 046 * <code>/</code>' separated segments, where each segment represents a pattern for a single node's name (including the 047 * same-name-sibling indexes) and '<code>@</code>' signifies a property name. 048 * </p> 049 * <p> 050 * Let's first look at some simple examples: 051 * </p> 052 * <table> 053 * <tr> 054 * <th>Input Path</th> 055 * <th>Description</th> 056 * </tr> 057 * <tr> 058 * <td>/a/b</td> 059 * <td>Match node "<code>b</code>" that is a child of the top level node "<code>a</code>". Neither node may have any 060 * same-name-sibilings.</td> 061 * </tr> 062 * <tr> 063 * <td>/a/*</td> 064 * <td>Match any child node of the top level node "<code>a</code>".</td> 065 * </tr> 066 * <tr> 067 * <td>/a/*.txt</td> 068 * <td>Match any child node of the top level node "<code>a</code>" that also has a name ending in "<code>.txt</code>".</td> 069 * </tr> 070 * <tr> 071 * <td>/a/b@c</td> 072 * <td>Match the property "<code>c</code>" of node "<code>/a/b</code>".</td> 073 * </tr> 074 * <tr> 075 * <td>/a/b[2]</td> 076 * <td>The second child named "<code>b</code>" below the top level node "<code>a</code>".</td> 077 * </tr> 078 * <tr> 079 * <td>/a/b[2,3,4]</td> 080 * <td>The second, third or fourth child named "<code>b</code>" below the top level node "<code>a</code>".</td> 081 * </tr> 082 * <tr> 083 * <td>/a/b[*]</td> 084 * <td>Any (and every) child named "<code>b</code>" below the top level node "<code>a</code>".</td> 085 * </tr> 086 * <tr> 087 * <td>//a/b</td> 088 * <td>Any node named "<code>b</code>" that exists below a node named "<code>a</code>", regardless of where node "<code>a</code>" 089 * occurs. Again, neither node may have any same-name-sibilings.</td> 090 * </tr> 091 * </table> 092 * <p> 093 * With these simple examples, you can probably discern the most important rules. First, the '<code>*</code>' is a wildcard 094 * character that matches any character or sequence of characters in a node's name (or index if appearing in between square 095 * brackets), and can be used in conjunction with other characters (e.g., "<code>*.txt</code>"). 096 * </p> 097 * <p> 098 * Second, square brackets (i.e., '<code>[</code>' and '<code>]</code>') are used to match a node's same-name-sibiling index. You 099 * can put a single non-negative number or a comma-separated list of non-negative numbers. Use '0' to match a node that has no 100 * same-name-sibilings, or any positive number to match the specific same-name-sibling. 101 * </p> 102 * <p> 103 * Third, combining two delimiters (e.g., "<code>//</code>") matches any sequence of nodes, regardless of what their names are or 104 * how many nodes. Often used with other patterns to identify nodes at any level matching other patterns. Three or more sequential 105 * slash characters are treated as two. 106 * </p> 107 * <p> 108 * Many input paths can be created using just these simple rules. However, input paths can be more complicated. Here are some more 109 * examples: 110 * </p> 111 * <table> 112 * <tr> 113 * <th>Input Path</th> 114 * <th>Description</th> 115 * </tr> 116 * <tr> 117 * <td>/a/(b|c|d)</td> 118 * <td>Match children of the top level node "<code>a</code>" that are named "<code>a</code>", "<code>b</code>" or "<code>c</code> 119 * ". None of the nodes may have same-name-sibling indexes.</td> 120 * </tr> 121 * <tr> 122 * <td>/a/b[c/d]</td> 123 * <td>Match node "<code>b</code>" child of the top level node "<code>a</code>", when node "<code>b</code>" has a child named " 124 * <code>c</code>", and "<code>c</code>" has a child named "<code>d</code>". Node "<code>b</code> 125 * " is the selected node, while nodes "<code>b</code>" and "<code>b</code>" are used as criteria but are not selected.</td> 126 * </tr> 127 * <tr> 128 * <td>/a(/(b|c|d|)/e)[f/g/@something]</td> 129 * <td>Match node "<code>/a/b/e</code>", "<code>/a/c/e</code>", "<code>/a/d/e</code>", or "<code>/a/e</code> 130 * " when they also have a child "<code>f</code>" that itself has a child "<code>g</code>" with property "<code>something</code>". 131 * None of the nodes may have same-name-sibling indexes.</td> 132 * </tr> 133 * </table> 134 * <p> 135 * These examples show a few more advanced rules. Parentheses (i.e., '<code>(</code>' and '<code>)</code>') can be used to define 136 * a set of options for names, as shown in the first and third rules. Whatever part of the selected node's path appears between 137 * the parentheses is captured for use within the output path. Thus, the first input path in the previous table would match node " 138 * <code>/a/b</code>", and "b" would be captured and could be used within the output path using "<code>$1</code>", where the 139 * number used in the output path identifies the parentheses. 140 * </p> 141 * <p> 142 * Square brackets can also be used to specify criteria on a node's properties or children. Whatever appears in between the square 143 * brackets does not appear in the selected node. 144 * </p> 145 * 146 * @author Randall Hauch 147 */ 148 @Immutable 149 public class PathExpression implements Serializable { 150 151 /** 152 * Initial version 153 */ 154 private static final long serialVersionUID = 1L; 155 156 /** 157 * Compile the supplied expression and return the resulting path expression instance. 158 * 159 * @param expression the expression 160 * @return the path expression; never null 161 * @throws IllegalArgumentException if the expression is null 162 * @throws InvalidPathExpressionException if the expression is blank or is not a valid expression 163 */ 164 public static final PathExpression compile( String expression ) throws InvalidPathExpressionException { 165 return new PathExpression(expression); 166 } 167 168 private static final String SEQUENCE_PATTERN_STRING = "\\[(\\d+(?:,\\d+)*)\\]"; // \[(\d+(,\d+)*)\] 169 private static final Pattern SEQUENCE_PATTERN = Pattern.compile(SEQUENCE_PATTERN_STRING); 170 171 /** 172 * Regular expression used to find unusable XPath predicates within an expression. This pattern results in unusable predicates 173 * in group 1. Note that some predicates may be valid at the end but not valid elsewhere. 174 * <p> 175 * Currently, only index-like predicates (including sequences) are allowed everywhere. Predicates with paths and properties 176 * are allowed only as the last predicate. Predicates with any operators are unused. 177 * </p> 178 * <p> 179 * Nested predicates are not currently allowed. 180 * </p> 181 */ 182 // \[(?:(?:\d+(?:,\d+)*)|\*)\]|(?:\[[^\]\+\-\*=\!><'"\s]+\])$|(\[[^\]]+\]) 183 private static final String UNUSABLE_PREDICATE_PATTERN_STRING = "\\[(?:(?:\\d+(?:,\\d+)*)|\\*)\\]|(?:\\[[^\\]\\+\\-\\*=\\!><'\"\\s]+\\])$|(\\[[^\\]]+\\])"; 184 private static final Pattern UNUSABLE_PREDICATE_PATTERN = Pattern.compile(UNUSABLE_PREDICATE_PATTERN_STRING); 185 186 /** 187 * Regular expression used to find all XPath predicates except index and sequence patterns. This pattern results in the 188 * predicates to be removed in group 1. 189 */ 190 // \[(?:(?:\d+(?:,\d+)*)|\*)\]|(\[[^\]]+\]) 191 private static final String NON_INDEX_PREDICATE_PATTERN_STRING = "\\[(?:(?:\\d+(?:,\\d+)*)|\\*)\\]|(\\[[^\\]]+\\])"; 192 private static final Pattern NON_INDEX_PREDICATE_PATTERN = Pattern.compile(NON_INDEX_PREDICATE_PATTERN_STRING); 193 194 private final String expression; 195 private final Pattern matchPattern; 196 private final Pattern selectPattern; 197 198 /** 199 * Create the supplied expression. 200 * 201 * @param expression the expression 202 * @throws IllegalArgumentException if the expression is null 203 * @throws InvalidPathExpressionException if the expression is blank or is not a valid expression 204 */ 205 public PathExpression( String expression ) throws InvalidPathExpressionException { 206 CheckArg.isNotNull(expression, "path expression"); 207 this.expression = expression.trim(); 208 if (this.expression.length() == 0) { 209 throw new InvalidPathExpressionException(GraphI18n.pathExpressionMayNotBeBlank.text()); 210 } 211 // Build the match pattern, which determines whether a path matches the condition ... 212 String matchString = this.expression; 213 try { 214 matchString = removeUnusedPredicates(matchString); 215 matchString = replaceXPathPatterns(matchString); 216 this.matchPattern = Pattern.compile(matchString, Pattern.CASE_INSENSITIVE); 217 } catch (PatternSyntaxException e) { 218 String msg = GraphI18n.pathExpressionHasInvalidMatch.text(matchString, this.expression); 219 throw new InvalidPathExpressionException(msg, e); 220 } 221 // Build the select pattern, which determines the path that will be selected ... 222 String selectString = this.expression; 223 try { 224 selectString = removeAllPredicatesExceptIndexes(selectString); 225 selectString = replaceXPathPatterns(selectString); 226 selectString = "(" + selectString + ").*"; // group 1 will have selected path ... 227 this.selectPattern = Pattern.compile(selectString, Pattern.CASE_INSENSITIVE); 228 } catch (PatternSyntaxException e) { 229 String msg = GraphI18n.pathExpressionHasInvalidSelect.text(selectString, this.expression); 230 throw new InvalidPathExpressionException(msg, e); 231 } 232 } 233 234 /** 235 * @return expression 236 */ 237 public String getExpression() { 238 return expression; 239 } 240 241 /** 242 * Replace certain XPath patterns that are not used or understood. 243 * 244 * @param expression the input regular expressions string; may not be null 245 * @return the regular expression with all unused XPath patterns removed; never null 246 */ 247 protected String removeUnusedPredicates( String expression ) { 248 assert expression != null; 249 java.util.regex.Matcher matcher = UNUSABLE_PREDICATE_PATTERN.matcher(expression); 250 StringBuffer sb = new StringBuffer(); 251 if (matcher.find()) { 252 do { 253 // Remove those predicates that show up in group 1 ... 254 String predicateStr = matcher.group(0); 255 String unusablePredicateStr = matcher.group(1); 256 if (unusablePredicateStr != null) { 257 predicateStr = ""; 258 } 259 matcher.appendReplacement(sb, predicateStr); 260 } while (matcher.find()); 261 matcher.appendTail(sb); 262 expression = sb.toString(); 263 } 264 return expression; 265 } 266 267 /** 268 * Remove all XPath predicates from the supplied regular expression string. 269 * 270 * @param expression the input regular expressions string; may not be null 271 * @return the regular expression with all XPath predicates removed; never null 272 */ 273 protected String removeAllPredicatesExceptIndexes( String expression ) { 274 assert expression != null; 275 java.util.regex.Matcher matcher = NON_INDEX_PREDICATE_PATTERN.matcher(expression); 276 StringBuffer sb = new StringBuffer(); 277 if (matcher.find()) { 278 do { 279 // Remove those predicates that show up in group 1 ... 280 String predicateStr = matcher.group(0); 281 String unusablePredicateStr = matcher.group(1); 282 if (unusablePredicateStr != null) { 283 predicateStr = ""; 284 } 285 matcher.appendReplacement(sb, predicateStr); 286 } while (matcher.find()); 287 matcher.appendTail(sb); 288 expression = sb.toString(); 289 } 290 return expression; 291 } 292 293 /** 294 * Replace certain XPath patterns, including some predicates, with substrings that are compatible with regular expressions. 295 * 296 * @param expression the input regular expressions string; may not be null 297 * @return the regular expression with XPath patterns replaced with regular expression fragments; never null 298 */ 299 protected String replaceXPathPatterns( String expression ) { 300 assert expression != null; 301 // replace 2 or more sequential '|' characters in an OR expression 302 expression = expression.replaceAll("[\\|]{2,}", "|"); 303 // if there is an empty expression in an OR expression, make the whole segment optional ... 304 // (e.g., "/a/b/(c|)/d" => "a/b(/(c))?/d" 305 expression = expression.replaceAll("/(\\([^|]+)(\\|){2,}([^)]+\\))", "(/$1$2$3)?"); 306 expression = expression.replaceAll("/\\(\\|+([^)]+)\\)", "(?:/($1))?"); 307 expression = expression.replaceAll("/\\((([^|]+)(\\|[^|]+)*)\\|+\\)", "(?:/($1))?"); 308 309 // // Allow any path (that doesn't contain an explicit counter) to contain a counter, 310 // // done by replacing any '/' or '|' that isn't preceded by ']' or '*' or '/' or '(' with '(\[\d+\])?/'... 311 // input = input.replaceAll("(?<=[^\\]\\*/(])([/|])", "(?:\\\\[\\\\d+\\\\])?$1"); 312 313 // Does the path contain any '[]' or '[*]' or '[0]' or '[n]' (where n is any positive integers)... 314 // '[*]/' => '(\[\d+\])?/' 315 expression = expression.replaceAll("\\[\\]", "(?:\\\\[\\\\d+\\\\])?"); // index is optional 316 // '[]/' => '(\[\d+\])?/' 317 expression = expression.replaceAll("\\[[*]\\]", "(?:\\\\[\\\\d+\\\\])?"); // index is optional 318 // '[0]/' => '(\[0\])?/' 319 expression = expression.replaceAll("\\[0\\]", "(?:\\\\[0\\\\])?"); // index is optional 320 // '[n]/' => '\[n\]/' 321 expression = expression.replaceAll("\\[([1-9]\\d*)\\]", "\\\\[$1\\\\]"); // index is required 322 323 // Change any other end predicates to not be wrapped by braces but to begin with a slash ... 324 // ...'[x]' => ...'/x' 325 expression = expression.replaceAll("(?<!\\\\)\\[([^\\]]*)\\]$", "/$1"); 326 327 // Replace all '[n,m,o,p]' type sequences with '[(n|m|o|p)]' 328 java.util.regex.Matcher matcher = SEQUENCE_PATTERN.matcher(expression); 329 StringBuffer sb = new StringBuffer(); 330 boolean result = matcher.find(); 331 if (result) { 332 do { 333 String sequenceStr = matcher.group(1); 334 boolean optional = false; 335 if (sequenceStr.startsWith("0,")) { 336 sequenceStr = sequenceStr.replaceFirst("^0,", ""); 337 optional = true; 338 } 339 if (sequenceStr.endsWith(",0")) { 340 sequenceStr = sequenceStr.replaceFirst(",0$", ""); 341 optional = true; 342 } 343 if (sequenceStr.contains(",0,")) { 344 sequenceStr = sequenceStr.replaceAll(",0,", ","); 345 optional = true; 346 } 347 sequenceStr = sequenceStr.replaceAll(",", "|"); 348 String replacement = "\\\\[(?:" + sequenceStr + ")\\\\]"; 349 if (optional) { 350 replacement = "(?:" + replacement + ")?"; 351 } 352 matcher.appendReplacement(sb, replacement); 353 result = matcher.find(); 354 } while (result); 355 matcher.appendTail(sb); 356 expression = sb.toString(); 357 } 358 359 // Order is important here 360 expression = expression.replaceAll("[*]([^/(\\\\])", "[^/$1]*$1"); // '*' not followed by '/', '\\', or '(' 361 expression = expression.replaceAll("(?<!\\[\\^/\\])[*]", "[^/]*"); 362 expression = expression.replaceAll("[/]{2,}$", "(?:/[^/]*)*"); // ending '//' 363 expression = expression.replaceAll("[/]{2,}", "(?:/[^/]*)*/"); // other '//' 364 return expression; 365 } 366 367 /** 368 * @return the expression 369 */ 370 public String getSelectExpression() { 371 return this.expression; 372 } 373 374 /** 375 * {@inheritDoc} 376 */ 377 @Override 378 public int hashCode() { 379 return this.expression.hashCode(); 380 } 381 382 /** 383 * {@inheritDoc} 384 */ 385 @Override 386 public boolean equals( Object obj ) { 387 if (obj == this) return true; 388 if (obj instanceof PathExpression) { 389 PathExpression that = (PathExpression)obj; 390 if (!this.expression.equalsIgnoreCase(that.expression)) return false; 391 return true; 392 } 393 return false; 394 } 395 396 /** 397 * {@inheritDoc} 398 */ 399 @Override 400 public String toString() { 401 return this.expression; 402 } 403 404 /** 405 * @param absolutePath 406 * @return the matcher 407 */ 408 public Matcher matcher( String absolutePath ) { 409 // Determine if the input path match the select expression ... 410 String originalAbsolutePath = absolutePath; 411 // if (!absolutePath.endsWith("/")) absolutePath = absolutePath + "/"; 412 // Remove all trailing '/' ... 413 absolutePath = absolutePath.replaceAll("/+$", ""); 414 415 // See if the supplied absolute path matches the pattern ... 416 final java.util.regex.Matcher matcher = this.matchPattern.matcher(absolutePath); 417 if (!matcher.matches()) { 418 // No match, so return immediately ... 419 return new Matcher(matcher, originalAbsolutePath, null); 420 } 421 422 // The absolute path does match the pattern, so use the select pattern and try to grab the selected path ... 423 final java.util.regex.Matcher selectMatcher = this.selectPattern.matcher(absolutePath); 424 if (!selectMatcher.matches()) { 425 // Nothing can be selected, so return immediately ... 426 return new Matcher(matcher, null, null); 427 } 428 // Grab the selected path ... 429 String selectedPath = selectMatcher.group(1); 430 431 // Remove the trailing '/@property' ... 432 selectedPath = selectedPath.replaceAll("/@[^/\\[\\]]+$", ""); 433 434 return new Matcher(matcher, originalAbsolutePath, selectedPath); 435 } 436 437 @Immutable 438 public static class Matcher { 439 440 private final String inputPath; 441 private final String selectedPath; 442 private final java.util.regex.Matcher inputMatcher; 443 private final int hc; 444 445 protected Matcher( java.util.regex.Matcher inputMatcher, 446 String inputPath, 447 String selectedPath ) { 448 this.inputMatcher = inputMatcher; 449 this.inputPath = inputPath; 450 this.selectedPath = selectedPath; 451 this.hc = HashCode.compute(this.inputPath, this.selectedPath); 452 } 453 454 public boolean matches() { 455 return this.selectedPath != null; 456 } 457 458 /** 459 * @return inputPath 460 */ 461 public String getInputPath() { 462 return this.inputPath; 463 } 464 465 /** 466 * @return selectPattern 467 */ 468 public String getSelectedNodePath() { 469 return this.selectedPath; 470 } 471 472 public int groupCount() { 473 return this.inputMatcher.groupCount(); 474 } 475 476 public String group( int groupNumber ) { 477 return this.inputMatcher.group(groupNumber); 478 } 479 480 /** 481 * {@inheritDoc} 482 */ 483 @Override 484 public int hashCode() { 485 return this.hc; 486 } 487 488 /** 489 * {@inheritDoc} 490 */ 491 @Override 492 public boolean equals( Object obj ) { 493 if (obj == this) return true; 494 if (obj instanceof PathExpression.Matcher) { 495 PathExpression.Matcher that = (PathExpression.Matcher)obj; 496 if (!this.inputPath.equalsIgnoreCase(that.inputPath)) return false; 497 if (!this.selectedPath.equalsIgnoreCase(that.selectedPath)) return false; 498 return true; 499 } 500 return false; 501 } 502 503 /** 504 * {@inheritDoc} 505 */ 506 @Override 507 public String toString() { 508 return this.selectedPath; 509 } 510 } 511 512 /** 513 * Regular expression used to determine if the expression matches any single-level wildcard. 514 */ 515 // /*(?:[*.](?:\[\*?\])?/*)* 516 private static final String ANYTHING_PATTERN_STRING = "/*(?:[*.](?:\\[\\*?\\])?/*)*"; 517 private static final Pattern ANYTHING_PATTERN = Pattern.compile(ANYTHING_PATTERN_STRING); 518 519 /** 520 * Return whether this expression matches anything and therefore is not restrictive. These include expressions of any nodes (" 521 * <code>/</code>"), any sequence of nodes ("<code>//</code>"), the self reference ("<code>.</code>"), or wildcard (" 522 * <code>*</code>", "<code>*[]</code>" or "<code>*[*]</code>"). Combinations of these individual expressions are also 523 * considered to match anything. 524 * 525 * @return true if the expression matches anything, or false otherwise 526 */ 527 public boolean matchesAnything() { 528 return ANYTHING_PATTERN.matcher(expression).matches(); 529 } 530 531 public static PathExpression all() { 532 return ALL_PATHS_EXPRESSION; 533 } 534 535 private static final PathExpression ALL_PATHS_EXPRESSION = PathExpression.compile("//"); 536 537 }