View Javadoc

1   /*
2    * ModeShape (http://www.modeshape.org)
3    * See the COPYRIGHT.txt file distributed with this work for information
4    * regarding copyright ownership.  Some portions may be licensed
5    * to Red Hat, Inc. under one or more contributor license agreements.
6    * See the AUTHORS.txt file in the distribution for a full listing of 
7    * individual contributors. 
8    *
9    * ModeShape is free software. Unless otherwise indicated, all code in ModeShape
10   * is licensed to you under the terms of the GNU Lesser General Public License as
11   * published by the Free Software Foundation; either version 2.1 of
12   * the License, or (at your option) any later version.
13   *
14   * ModeShape is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17   * Lesser General Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser General Public
20   * License along with this software; if not, write to the Free
21   * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
22   * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
23   */
24  package org.modeshape.graph.property;
25  
26  import java.io.Serializable;
27  import java.util.regex.Pattern;
28  import java.util.regex.PatternSyntaxException;
29  import net.jcip.annotations.Immutable;
30  import org.modeshape.common.util.CheckArg;
31  import org.modeshape.common.util.HashCode;
32  import org.modeshape.graph.GraphI18n;
33  
34  /**
35   * An expression that defines an acceptable path using a regular-expression-like language. Path expressions can be used to
36   * represent node paths or properties.
37   * <p>
38   * Path expressions consist of two parts: a selection criteria (or an input path) and an output path:
39   * </p>
40   * 
41   * <pre>
42   * inputPath =&gt; outputPath
43   * </pre>
44   * <p>
45   * The <i>inputPath</i> part defines an expression for the path of a node that is to be sequenced. Input paths consist of '
46   * <code>/</code>' separated segments, where each segment represents a pattern for a single node's name (including the
47   * same-name-sibling indexes) and '<code>@</code>' signifies a property name.
48   * </p>
49   * <p>
50   * Let's first look at some simple examples:
51   * </p>
52   * <table>
53   * <tr>
54   * <th>Input Path</th>
55   * <th>Description</th>
56   * </tr>
57   * <tr>
58   * <td>/a/b</td>
59   * <td>Match node "<code>b</code>" that is a child of the top level node "<code>a</code>". Neither node may have any
60   * same-name-sibilings.</td>
61   * </tr>
62   * <tr>
63   * <td>/a/*</td>
64   * <td>Match any child node of the top level node "<code>a</code>".</td>
65   * </tr>
66   * <tr>
67   * <td>/a/*.txt</td>
68   * <td>Match any child node of the top level node "<code>a</code>" that also has a name ending in "<code>.txt</code>".</td>
69   * </tr>
70   * <tr>
71   * <td>/a/b@c</td>
72   * <td>Match the property "<code>c</code>" of node "<code>/a/b</code>".</td>
73   * </tr>
74   * <tr>
75   * <td>/a/b[2]</td>
76   * <td>The second child named "<code>b</code>" below the top level node "<code>a</code>".</td>
77   * </tr>
78   * <tr>
79   * <td>/a/b[2,3,4]</td>
80   * <td>The second, third or fourth child named "<code>b</code>" below the top level node "<code>a</code>".</td>
81   * </tr>
82   * <tr>
83   * <td>/a/b[*]</td>
84   * <td>Any (and every) child named "<code>b</code>" below the top level node "<code>a</code>".</td>
85   * </tr>
86   * <tr>
87   * <td>//a/b</td>
88   * <td>Any node named "<code>b</code>" that exists below a node named "<code>a</code>", regardless of where node "<code>a</code>"
89   * occurs. Again, neither node may have any same-name-sibilings.</td>
90   * </tr>
91   * </table>
92   * <p>
93   * With these simple examples, you can probably discern the most important rules. First, the '<code>*</code>' is a wildcard
94   * character that matches any character or sequence of characters in a node's name (or index if appearing in between square
95   * brackets), and can be used in conjunction with other characters (e.g., "<code>*.txt</code>").
96   * </p>
97   * <p>
98   * Second, square brackets (i.e., '<code>[</code>' and '<code>]</code>') are used to match a node's same-name-sibiling index. You
99   * can put a single non-negative number or a comma-separated list of non-negative numbers. Use '0' to match a node that has no
100  * same-name-sibilings, or any positive number to match the specific same-name-sibling.
101  * </p>
102  * <p>
103  * Third, combining two delimiters (e.g., "<code>//</code>") matches any sequence of nodes, regardless of what their names are or
104  * how many nodes. Often used with other patterns to identify nodes at any level matching other patterns. Three or more sequential
105  * slash characters are treated as two.
106  * </p>
107  * <p>
108  * Many input paths can be created using just these simple rules. However, input paths can be more complicated. Here are some more
109  * examples:
110  * </p>
111  * <table>
112  * <tr>
113  * <th>Input Path</th>
114  * <th>Description</th>
115  * </tr>
116  * <tr>
117  * <td>/a/(b|c|d)</td>
118  * <td>Match children of the top level node "<code>a</code>" that are named "<code>a</code>", "<code>b</code>" or "<code>c</code>
119  * ". None of the nodes may have same-name-sibling indexes.</td>
120  * </tr>
121  * <tr>
122  * <td>/a/b[c/d]</td>
123  * <td>Match node "<code>b</code>" child of the top level node "<code>a</code>", when node "<code>b</code>" has a child named "
124  * <code>c</code>", and "<code>c</code>" has a child named "<code>d</code>". Node "<code>b</code>
125  * " is the selected node, while nodes "<code>b</code>" and "<code>b</code>" are used as criteria but are not selected.</td>
126  * </tr>
127  * <tr>
128  * <td>/a(/(b|c|d|)/e)[f/g/@something]</td>
129  * <td>Match node "<code>/a/b/e</code>", "<code>/a/c/e</code>", "<code>/a/d/e</code>", or "<code>/a/e</code>
130  * " when they also have a child "<code>f</code>" that itself has a child "<code>g</code>" with property "<code>something</code>".
131  * None of the nodes may have same-name-sibling indexes.</td>
132  * </tr>
133  * </table>
134  * <p>
135  * These examples show a few more advanced rules. Parentheses (i.e., '<code>(</code>' and '<code>)</code>') can be used to define
136  * a set of options for names, as shown in the first and third rules. Whatever part of the selected node's path appears between
137  * the parentheses is captured for use within the output path. Thus, the first input path in the previous table would match node "
138  * <code>/a/b</code>", and "b" would be captured and could be used within the output path using "<code>$1</code>", where the
139  * number used in the output path identifies the parentheses.
140  * </p>
141  * <p>
142  * Square brackets can also be used to specify criteria on a node's properties or children. Whatever appears in between the square
143  * brackets does not appear in the selected node.
144  * </p>
145  */
146 @Immutable
147 public class PathExpression implements Serializable {
148 
149     /**
150      * Initial version
151      */
152     private static final long serialVersionUID = 1L;
153 
154     /**
155      * Compile the supplied expression and return the resulting path expression instance.
156      * 
157      * @param expression the expression
158      * @return the path expression; never null
159      * @throws IllegalArgumentException if the expression is null
160      * @throws InvalidPathExpressionException if the expression is blank or is not a valid expression
161      */
162     public static final PathExpression compile( String expression ) throws InvalidPathExpressionException {
163         return new PathExpression(expression);
164     }
165 
166     private static final String SEQUENCE_PATTERN_STRING = "\\[(\\d+(?:,\\d+)*)\\]"; // \[(\d+(,\d+)*)\]
167     private static final Pattern SEQUENCE_PATTERN = Pattern.compile(SEQUENCE_PATTERN_STRING);
168 
169     /**
170      * Regular expression used to find unusable XPath predicates within an expression. This pattern results in unusable predicates
171      * in group 1. Note that some predicates may be valid at the end but not valid elsewhere.
172      * <p>
173      * Currently, only index-like predicates (including sequences) are allowed everywhere. Predicates with paths and properties
174      * are allowed only as the last predicate. Predicates with any operators are unused.
175      * </p>
176      * <p>
177      * Nested predicates are not currently allowed.
178      * </p>
179      */
180     // \[(?:(?:\d+(?:,\d+)*)|\*)\]|(?:\[[^\]\+\-\*=\!><'"\s]+\])$|(\[[^\]]+\])
181     private static final String UNUSABLE_PREDICATE_PATTERN_STRING = "\\[(?:(?:\\d+(?:,\\d+)*)|\\*)\\]|(?:\\[[^\\]\\+\\-\\*=\\!><'\"\\s]+\\])$|(\\[[^\\]]+\\])";
182     private static final Pattern UNUSABLE_PREDICATE_PATTERN = Pattern.compile(UNUSABLE_PREDICATE_PATTERN_STRING);
183 
184     /**
185      * Regular expression used to find all XPath predicates except index and sequence patterns. This pattern results in the
186      * predicates to be removed in group 1.
187      */
188     // \[(?:(?:\d+(?:,\d+)*)|\*)\]|(\[[^\]]+\])
189     private static final String NON_INDEX_PREDICATE_PATTERN_STRING = "\\[(?:(?:\\d+(?:,\\d+)*)|\\*)\\]|(\\[[^\\]]+\\])";
190     private static final Pattern NON_INDEX_PREDICATE_PATTERN = Pattern.compile(NON_INDEX_PREDICATE_PATTERN_STRING);
191 
192     private final String expression;
193     private final Pattern matchPattern;
194     private final Pattern selectPattern;
195 
196     /**
197      * Create the supplied expression.
198      * 
199      * @param expression the expression
200      * @throws IllegalArgumentException if the expression is null
201      * @throws InvalidPathExpressionException if the expression is blank or is not a valid expression
202      */
203     public PathExpression( String expression ) throws InvalidPathExpressionException {
204         CheckArg.isNotNull(expression, "path expression");
205         this.expression = expression.trim();
206         if (this.expression.length() == 0) {
207             throw new InvalidPathExpressionException(GraphI18n.pathExpressionMayNotBeBlank.text());
208         }
209         // Build the match pattern, which determines whether a path matches the condition ...
210         String matchString = this.expression;
211         try {
212             matchString = removeUnusedPredicates(matchString);
213             matchString = replaceXPathPatterns(matchString);
214             this.matchPattern = Pattern.compile(matchString, Pattern.CASE_INSENSITIVE);
215         } catch (PatternSyntaxException e) {
216             String msg = GraphI18n.pathExpressionHasInvalidMatch.text(matchString, this.expression);
217             throw new InvalidPathExpressionException(msg, e);
218         }
219         // Build the select pattern, which determines the path that will be selected ...
220         String selectString = this.expression;
221         try {
222             selectString = removeAllPredicatesExceptIndexes(selectString);
223             selectString = replaceXPathPatterns(selectString);
224             selectString = "(" + selectString + ").*"; // group 1 will have selected path ...
225             this.selectPattern = Pattern.compile(selectString, Pattern.CASE_INSENSITIVE);
226         } catch (PatternSyntaxException e) {
227             String msg = GraphI18n.pathExpressionHasInvalidSelect.text(selectString, this.expression);
228             throw new InvalidPathExpressionException(msg, e);
229         }
230     }
231 
232     /**
233      * @return expression
234      */
235     public String getExpression() {
236         return expression;
237     }
238 
239     /**
240      * Replace certain XPath patterns that are not used or understood.
241      * 
242      * @param expression the input regular expressions string; may not be null
243      * @return the regular expression with all unused XPath patterns removed; never null
244      */
245     protected String removeUnusedPredicates( String expression ) {
246         assert expression != null;
247         java.util.regex.Matcher matcher = UNUSABLE_PREDICATE_PATTERN.matcher(expression);
248         StringBuffer sb = new StringBuffer();
249         if (matcher.find()) {
250             do {
251                 // Remove those predicates that show up in group 1 ...
252                 String predicateStr = matcher.group(0);
253                 String unusablePredicateStr = matcher.group(1);
254                 if (unusablePredicateStr != null) {
255                     predicateStr = "";
256                 }
257                 matcher.appendReplacement(sb, predicateStr);
258             } while (matcher.find());
259             matcher.appendTail(sb);
260             expression = sb.toString();
261         }
262         return expression;
263     }
264 
265     /**
266      * Remove all XPath predicates from the supplied regular expression string.
267      * 
268      * @param expression the input regular expressions string; may not be null
269      * @return the regular expression with all XPath predicates removed; never null
270      */
271     protected String removeAllPredicatesExceptIndexes( String expression ) {
272         assert expression != null;
273         java.util.regex.Matcher matcher = NON_INDEX_PREDICATE_PATTERN.matcher(expression);
274         StringBuffer sb = new StringBuffer();
275         if (matcher.find()) {
276             do {
277                 // Remove those predicates that show up in group 1 ...
278                 String predicateStr = matcher.group(0);
279                 String unusablePredicateStr = matcher.group(1);
280                 if (unusablePredicateStr != null) {
281                     predicateStr = "";
282                 }
283                 matcher.appendReplacement(sb, predicateStr);
284             } while (matcher.find());
285             matcher.appendTail(sb);
286             expression = sb.toString();
287         }
288         return expression;
289     }
290 
291     /**
292      * Replace certain XPath patterns, including some predicates, with substrings that are compatible with regular expressions.
293      * 
294      * @param expression the input regular expressions string; may not be null
295      * @return the regular expression with XPath patterns replaced with regular expression fragments; never null
296      */
297     protected String replaceXPathPatterns( String expression ) {
298         assert expression != null;
299         // replace 2 or more sequential '|' characters in an OR expression
300         expression = expression.replaceAll("[\\|]{2,}", "|");
301         // if there is an empty expression in an OR expression, make the whole segment optional ...
302         // (e.g., "/a/b/(c|)/d" => "a/b(/(c))?/d"
303         expression = expression.replaceAll("/(\\([^|]+)(\\|){2,}([^)]+\\))", "(/$1$2$3)?");
304         expression = expression.replaceAll("/\\(\\|+([^)]+)\\)", "(?:/($1))?");
305         expression = expression.replaceAll("/\\((([^|]+)(\\|[^|]+)*)\\|+\\)", "(?:/($1))?");
306 
307         // // Allow any path (that doesn't contain an explicit counter) to contain a counter,
308         // // done by replacing any '/' or '|' that isn't preceded by ']' or '*' or '/' or '(' with '(\[\d+\])?/'...
309         // input = input.replaceAll("(?<=[^\\]\\*/(])([/|])", "(?:\\\\[\\\\d+\\\\])?$1");
310 
311         // Does the path contain any '[]' or '[*]' or '[0]' or '[n]' (where n is any positive integers)...
312         // '[*]/' => '(\[\d+\])?/'
313         expression = expression.replaceAll("\\[\\]", "(?:\\\\[\\\\d+\\\\])?"); // index is optional
314         // '[]/' => '(\[\d+\])?/'
315         expression = expression.replaceAll("\\[[*]\\]", "(?:\\\\[\\\\d+\\\\])?"); // index is optional
316         // '[0]/' => '(\[0\])?/'
317         expression = expression.replaceAll("\\[0\\]", "(?:\\\\[0\\\\])?"); // index is optional
318         // '[n]/' => '\[n\]/'
319         expression = expression.replaceAll("\\[([1-9]\\d*)\\]", "\\\\[$1\\\\]"); // index is required
320 
321         // Change any other end predicates to not be wrapped by braces but to begin with a slash ...
322         // ...'[x]' => ...'/x'
323         expression = expression.replaceAll("(?<!\\\\)\\[([^\\]]*)\\]$", "/$1");
324 
325         // Replace all '[n,m,o,p]' type sequences with '[(n|m|o|p)]'
326         java.util.regex.Matcher matcher = SEQUENCE_PATTERN.matcher(expression);
327         StringBuffer sb = new StringBuffer();
328         boolean result = matcher.find();
329         if (result) {
330             do {
331                 String sequenceStr = matcher.group(1);
332                 boolean optional = false;
333                 if (sequenceStr.startsWith("0,")) {
334                     sequenceStr = sequenceStr.replaceFirst("^0,", "");
335                     optional = true;
336                 }
337                 if (sequenceStr.endsWith(",0")) {
338                     sequenceStr = sequenceStr.replaceFirst(",0$", "");
339                     optional = true;
340                 }
341                 if (sequenceStr.contains(",0,")) {
342                     sequenceStr = sequenceStr.replaceAll(",0,", ",");
343                     optional = true;
344                 }
345                 sequenceStr = sequenceStr.replaceAll(",", "|");
346                 String replacement = "\\\\[(?:" + sequenceStr + ")\\\\]";
347                 if (optional) {
348                     replacement = "(?:" + replacement + ")?";
349                 }
350                 matcher.appendReplacement(sb, replacement);
351                 result = matcher.find();
352             } while (result);
353             matcher.appendTail(sb);
354             expression = sb.toString();
355         }
356 
357         // Order is important here
358         expression = expression.replaceAll("[*]([^/(\\\\])", "[^/$1]*$1"); // '*' not followed by '/', '\\', or '('
359         expression = expression.replaceAll("(?<!\\[\\^/\\])[*]", "[^/]*");
360         expression = expression.replaceAll("[/]{2,}$", "(?:/[^/]*)*"); // ending '//'
361         expression = expression.replaceAll("[/]{2,}", "(?:/[^/]*)*/"); // other '//'
362         return expression;
363     }
364 
365     /**
366      * @return the expression
367      */
368     public String getSelectExpression() {
369         return this.expression;
370     }
371 
372     /**
373      * {@inheritDoc}
374      */
375     @Override
376     public int hashCode() {
377         return this.expression.hashCode();
378     }
379 
380     /**
381      * {@inheritDoc}
382      */
383     @Override
384     public boolean equals( Object obj ) {
385         if (obj == this) return true;
386         if (obj instanceof PathExpression) {
387             PathExpression that = (PathExpression)obj;
388             if (!this.expression.equalsIgnoreCase(that.expression)) return false;
389             return true;
390         }
391         return false;
392     }
393 
394     /**
395      * {@inheritDoc}
396      */
397     @Override
398     public String toString() {
399         return this.expression;
400     }
401 
402     /**
403      * @param absolutePath
404      * @return the matcher
405      */
406     public Matcher matcher( String absolutePath ) {
407         // Determine if the input path match the select expression ...
408         String originalAbsolutePath = absolutePath;
409         // if (!absolutePath.endsWith("/")) absolutePath = absolutePath + "/";
410         // Remove all trailing '/' ...
411         absolutePath = absolutePath.replaceAll("/+$", "");
412 
413         // See if the supplied absolute path matches the pattern ...
414         final java.util.regex.Matcher matcher = this.matchPattern.matcher(absolutePath);
415         if (!matcher.matches()) {
416             // No match, so return immediately ...
417             return new Matcher(matcher, originalAbsolutePath, null);
418         }
419 
420         // The absolute path does match the pattern, so use the select pattern and try to grab the selected path ...
421         final java.util.regex.Matcher selectMatcher = this.selectPattern.matcher(absolutePath);
422         if (!selectMatcher.matches()) {
423             // Nothing can be selected, so return immediately ...
424             return new Matcher(matcher, null, null);
425         }
426         // Grab the selected path ...
427         String selectedPath = selectMatcher.group(1);
428 
429         // Remove the trailing '/@property' ...
430         selectedPath = selectedPath.replaceAll("/@[^/\\[\\]]+$", "");
431 
432         return new Matcher(matcher, originalAbsolutePath, selectedPath);
433     }
434 
435     @Immutable
436     public static class Matcher {
437 
438         private final String inputPath;
439         private final String selectedPath;
440         private final java.util.regex.Matcher inputMatcher;
441         private final int hc;
442 
443         protected Matcher( java.util.regex.Matcher inputMatcher,
444                            String inputPath,
445                            String selectedPath ) {
446             this.inputMatcher = inputMatcher;
447             this.inputPath = inputPath;
448             this.selectedPath = selectedPath;
449             this.hc = HashCode.compute(this.inputPath, this.selectedPath);
450         }
451 
452         public boolean matches() {
453             return this.selectedPath != null;
454         }
455 
456         /**
457          * @return inputPath
458          */
459         public String getInputPath() {
460             return this.inputPath;
461         }
462 
463         /**
464          * @return selectPattern
465          */
466         public String getSelectedNodePath() {
467             return this.selectedPath;
468         }
469 
470         public int groupCount() {
471             return this.inputMatcher.groupCount();
472         }
473 
474         public String group( int groupNumber ) {
475             return this.inputMatcher.group(groupNumber);
476         }
477 
478         /**
479          * {@inheritDoc}
480          */
481         @Override
482         public int hashCode() {
483             return this.hc;
484         }
485 
486         /**
487          * {@inheritDoc}
488          */
489         @Override
490         public boolean equals( Object obj ) {
491             if (obj == this) return true;
492             if (obj instanceof PathExpression.Matcher) {
493                 PathExpression.Matcher that = (PathExpression.Matcher)obj;
494                 if (!this.inputPath.equalsIgnoreCase(that.inputPath)) return false;
495                 if (!this.selectedPath.equalsIgnoreCase(that.selectedPath)) return false;
496                 return true;
497             }
498             return false;
499         }
500 
501         /**
502          * {@inheritDoc}
503          */
504         @Override
505         public String toString() {
506             return this.selectedPath;
507         }
508     }
509 
510     /**
511      * Regular expression used to determine if the expression matches any single-level wildcard.
512      */
513     // /*(?:[*.](?:\[\*?\])?/*)*
514     private static final String ANYTHING_PATTERN_STRING = "/*(?:[*.](?:\\[\\*?\\])?/*)*";
515     private static final Pattern ANYTHING_PATTERN = Pattern.compile(ANYTHING_PATTERN_STRING);
516 
517     /**
518      * Return whether this expression matches anything and therefore is not restrictive. These include expressions of any nodes ("
519      * <code>/</code>"), any sequence of nodes ("<code>//</code>"), the self reference ("<code>.</code>"), or wildcard ("
520      * <code>*</code>", "<code>*[]</code>" or "<code>*[*]</code>"). Combinations of these individual expressions are also
521      * considered to match anything.
522      * 
523      * @return true if the expression matches anything, or false otherwise
524      */
525     public boolean matchesAnything() {
526         return ANYTHING_PATTERN.matcher(expression).matches();
527     }
528 
529     public static PathExpression all() {
530         return ALL_PATHS_EXPRESSION;
531     }
532 
533     private static final PathExpression ALL_PATHS_EXPRESSION = PathExpression.compile("//");
534 
535 }