001    /*
002     * JBoss DNA (http://www.jboss.org/dna)
003     * See the COPYRIGHT.txt file distributed with this work for information
004     * regarding copyright ownership.  Some portions may be licensed
005     * to Red Hat, Inc. under one or more contributor license agreements.
006     * See the AUTHORS.txt file in the distribution for a full listing of 
007     * individual contributors. 
008     *
009     * JBoss DNA is free software. Unless otherwise indicated, all code in JBoss DNA
010     * is licensed to you under the terms of the GNU Lesser General Public License as
011     * published by the Free Software Foundation; either version 2.1 of
012     * the License, or (at your option) any later version.
013     *
014     * JBoss DNA is distributed in the hope that it will be useful,
015     * but WITHOUT ANY WARRANTY; without even the implied warranty of
016     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
017     * Lesser General Public License for more details.
018     *
019     * You should have received a copy of the GNU Lesser General Public
020     * License along with this software; if not, write to the Free
021     * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
022     * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
023     */
024    package org.jboss.dna.graph.property;
025    
026    import java.io.Serializable;
027    import java.util.regex.Pattern;
028    import java.util.regex.PatternSyntaxException;
029    import net.jcip.annotations.Immutable;
030    import org.jboss.dna.common.util.CheckArg;
031    import org.jboss.dna.common.util.HashCode;
032    import org.jboss.dna.graph.GraphI18n;
033    
034    /**
035     * An expression that defines an acceptable path using a regular-expression-like language. Path expressions can be used to
036     * represent node paths or properties.
037     * <p>
038     * Path expressions consist of two parts: a selection criteria (or an input path) and an output path:
039     * </p>
040     * 
041     * <pre>
042     * inputPath =&gt; outputPath
043     * </pre>
044     * <p>
045     * The <i>inputPath</i> part defines an expression for the path of a node that is to be sequenced. Input paths consist of '
046     * <code>/</code>' separated segments, where each segment represents a pattern for a single node's name (including the
047     * same-name-sibling indexes) and '<code>@</code>' signifies a property name.
048     * </p>
049     * <p>
050     * Let's first look at some simple examples:
051     * </p>
052     * <table>
053     * <tr>
054     * <th>Input Path</th>
055     * <th>Description</th>
056     * </tr>
057     * <tr>
058     * <td>/a/b</td>
059     * <td>Match node "<code>b</code>" that is a child of the top level node "<code>a</code>". Neither node may have any
060     * same-name-sibilings.</td>
061     * </tr>
062     * <tr>
063     * <td>/a/*</td>
064     * <td>Match any child node of the top level node "<code>a</code>".</td>
065     * </tr>
066     * <tr>
067     * <td>/a/*.txt</td>
068     * <td>Match any child node of the top level node "<code>a</code>" that also has a name ending in "<code>.txt</code>".</td>
069     * </tr>
070     * <tr>
071     * <td>/a/b@c</td>
072     * <td>Match the property "<code>c</code>" of node "<code>/a/b</code>".</td>
073     * </tr>
074     * <tr>
075     * <td>/a/b[2]</td>
076     * <td>The second child named "<code>b</code>" below the top level node "<code>a</code>".</td>
077     * </tr>
078     * <tr>
079     * <td>/a/b[2,3,4]</td>
080     * <td>The second, third or fourth child named "<code>b</code>" below the top level node "<code>a</code>".</td>
081     * </tr>
082     * <tr>
083     * <td>/a/b[*]</td>
084     * <td>Any (and every) child named "<code>b</code>" below the top level node "<code>a</code>".</td>
085     * </tr>
086     * <tr>
087     * <td>//a/b</td>
088     * <td>Any node named "<code>b</code>" that exists below a node named "<code>a</code>", regardless of where node "<code>a</code>"
089     * occurs. Again, neither node may have any same-name-sibilings.</td>
090     * </tr>
091     * </table>
092     * <p>
093     * With these simple examples, you can probably discern the most important rules. First, the '<code>*</code>' is a wildcard
094     * character that matches any character or sequence of characters in a node's name (or index if appearing in between square
095     * brackets), and can be used in conjunction with other characters (e.g., "<code>*.txt</code>").
096     * </p>
097     * <p>
098     * Second, square brackets (i.e., '<code>[</code>' and '<code>]</code>') are used to match a node's same-name-sibiling index. You
099     * can put a single non-negative number or a comma-separated list of non-negative numbers. Use '0' to match a node that has no
100     * same-name-sibilings, or any positive number to match the specific same-name-sibling.
101     * </p>
102     * <p>
103     * Third, combining two delimiters (e.g., "<code>//</code>") matches any sequence of nodes, regardless of what their names are or
104     * how many nodes. Often used with other patterns to identify nodes at any level matching other patterns. Three or more sequential
105     * slash characters are treated as two.
106     * </p>
107     * <p>
108     * Many input paths can be created using just these simple rules. However, input paths can be more complicated. Here are some more
109     * examples:
110     * </p>
111     * <table>
112     * <tr>
113     * <th>Input Path</th>
114     * <th>Description</th>
115     * </tr>
116     * <tr>
117     * <td>/a/(b|c|d)</td>
118     * <td>Match children of the top level node "<code>a</code>" that are named "<code>a</code>", "<code>b</code>" or "<code>c</code>
119     * ". None of the nodes may have same-name-sibling indexes.</td>
120     * </tr>
121     * <tr>
122     * <td>/a/b[c/d]</td>
123     * <td>Match node "<code>b</code>" child of the top level node "<code>a</code>", when node "<code>b</code>" has a child named "
124     * <code>c</code>", and "<code>c</code>" has a child named "<code>d</code>". Node "<code>b</code>
125     * " is the selected node, while nodes "<code>b</code>" and "<code>b</code>" are used as criteria but are not selected.</td>
126     * </tr>
127     * <tr>
128     * <td>/a(/(b|c|d|)/e)[f/g/@something]</td>
129     * <td>Match node "<code>/a/b/e</code>", "<code>/a/c/e</code>", "<code>/a/d/e</code>", or "<code>/a/e</code>
130     * " when they also have a child "<code>f</code>" that itself has a child "<code>g</code>" with property "<code>something</code>".
131     * None of the nodes may have same-name-sibling indexes.</td>
132     * </tr>
133     * </table>
134     * <p>
135     * These examples show a few more advanced rules. Parentheses (i.e., '<code>(</code>' and '<code>)</code>') can be used to define
136     * a set of options for names, as shown in the first and third rules. Whatever part of the selected node's path appears between
137     * the parentheses is captured for use within the output path. Thus, the first input path in the previous table would match node "
138     * <code>/a/b</code>", and "b" would be captured and could be used within the output path using "<code>$1</code>", where the
139     * number used in the output path identifies the parentheses.
140     * </p>
141     * <p>
142     * Square brackets can also be used to specify criteria on a node's properties or children. Whatever appears in between the square
143     * brackets does not appear in the selected node.
144     * </p>
145     * 
146     * @author Randall Hauch
147     */
148    @Immutable
149    public class PathExpression implements Serializable {
150    
151        /**
152         * Initial version
153         */
154        private static final long serialVersionUID = 1L;
155    
156        /**
157         * Compile the supplied expression and return the resulting path expression instance.
158         * 
159         * @param expression the expression
160         * @return the path expression; never null
161         * @throws IllegalArgumentException if the expression is null
162         * @throws InvalidPathExpressionException if the expression is blank or is not a valid expression
163         */
164        public static final PathExpression compile( String expression ) throws InvalidPathExpressionException {
165            return new PathExpression(expression);
166        }
167    
168        private static final String SEQUENCE_PATTERN_STRING = "\\[(\\d+(?:,\\d+)*)\\]"; // \[(\d+(,\d+)*)\]
169        private static final Pattern SEQUENCE_PATTERN = Pattern.compile(SEQUENCE_PATTERN_STRING);
170    
171        /**
172         * Regular expression used to find unusable XPath predicates within an expression. This pattern results in unusable predicates
173         * in group 1. Note that some predicates may be valid at the end but not valid elsewhere.
174         * <p>
175         * Currently, only index-like predicates (including sequences) are allowed everywhere. Predicates with paths and properties
176         * are allowed only as the last predicate. Predicates with any operators are unused.
177         * </p>
178         * <p>
179         * Nested predicates are not currently allowed.
180         * </p>
181         */
182        // \[(?:(?:\d+(?:,\d+)*)|\*)\]|(?:\[[^\]\+\-\*=\!><'"\s]+\])$|(\[[^\]]+\])
183        private static final String UNUSABLE_PREDICATE_PATTERN_STRING = "\\[(?:(?:\\d+(?:,\\d+)*)|\\*)\\]|(?:\\[[^\\]\\+\\-\\*=\\!><'\"\\s]+\\])$|(\\[[^\\]]+\\])";
184        private static final Pattern UNUSABLE_PREDICATE_PATTERN = Pattern.compile(UNUSABLE_PREDICATE_PATTERN_STRING);
185    
186        /**
187         * Regular expression used to find all XPath predicates except index and sequence patterns. This pattern results in the
188         * predicates to be removed in group 1.
189         */
190        // \[(?:(?:\d+(?:,\d+)*)|\*)\]|(\[[^\]]+\])
191        private static final String NON_INDEX_PREDICATE_PATTERN_STRING = "\\[(?:(?:\\d+(?:,\\d+)*)|\\*)\\]|(\\[[^\\]]+\\])";
192        private static final Pattern NON_INDEX_PREDICATE_PATTERN = Pattern.compile(NON_INDEX_PREDICATE_PATTERN_STRING);
193    
194        private final String expression;
195        private final Pattern matchPattern;
196        private final Pattern selectPattern;
197    
198        /**
199         * Create the supplied expression.
200         * 
201         * @param expression the expression
202         * @throws IllegalArgumentException if the expression is null
203         * @throws InvalidPathExpressionException if the expression is blank or is not a valid expression
204         */
205        public PathExpression( String expression ) throws InvalidPathExpressionException {
206            CheckArg.isNotNull(expression, "path expression");
207            this.expression = expression.trim();
208            if (this.expression.length() == 0) {
209                throw new InvalidPathExpressionException(GraphI18n.pathExpressionMayNotBeBlank.text());
210            }
211            // Build the match pattern, which determines whether a path matches the condition ...
212            String matchString = this.expression;
213            try {
214                matchString = removeUnusedPredicates(matchString);
215                matchString = replaceXPathPatterns(matchString);
216                this.matchPattern = Pattern.compile(matchString, Pattern.CASE_INSENSITIVE);
217            } catch (PatternSyntaxException e) {
218                String msg = GraphI18n.pathExpressionHasInvalidMatch.text(matchString, this.expression);
219                throw new InvalidPathExpressionException(msg, e);
220            }
221            // Build the select pattern, which determines the path that will be selected ...
222            String selectString = this.expression;
223            try {
224                selectString = removeAllPredicatesExceptIndexes(selectString);
225                selectString = replaceXPathPatterns(selectString);
226                selectString = "(" + selectString + ").*"; // group 1 will have selected path ...
227                this.selectPattern = Pattern.compile(selectString, Pattern.CASE_INSENSITIVE);
228            } catch (PatternSyntaxException e) {
229                String msg = GraphI18n.pathExpressionHasInvalidSelect.text(selectString, this.expression);
230                throw new InvalidPathExpressionException(msg, e);
231            }
232        }
233    
234        /**
235         * @return expression
236         */
237        public String getExpression() {
238            return expression;
239        }
240    
241        /**
242         * Replace certain XPath patterns that are not used or understood.
243         * 
244         * @param expression the input regular expressions string; may not be null
245         * @return the regular expression with all unused XPath patterns removed; never null
246         */
247        protected String removeUnusedPredicates( String expression ) {
248            assert expression != null;
249            java.util.regex.Matcher matcher = UNUSABLE_PREDICATE_PATTERN.matcher(expression);
250            StringBuffer sb = new StringBuffer();
251            if (matcher.find()) {
252                do {
253                    // Remove those predicates that show up in group 1 ...
254                    String predicateStr = matcher.group(0);
255                    String unusablePredicateStr = matcher.group(1);
256                    if (unusablePredicateStr != null) {
257                        predicateStr = "";
258                    }
259                    matcher.appendReplacement(sb, predicateStr);
260                } while (matcher.find());
261                matcher.appendTail(sb);
262                expression = sb.toString();
263            }
264            return expression;
265        }
266    
267        /**
268         * Remove all XPath predicates from the supplied regular expression string.
269         * 
270         * @param expression the input regular expressions string; may not be null
271         * @return the regular expression with all XPath predicates removed; never null
272         */
273        protected String removeAllPredicatesExceptIndexes( String expression ) {
274            assert expression != null;
275            java.util.regex.Matcher matcher = NON_INDEX_PREDICATE_PATTERN.matcher(expression);
276            StringBuffer sb = new StringBuffer();
277            if (matcher.find()) {
278                do {
279                    // Remove those predicates that show up in group 1 ...
280                    String predicateStr = matcher.group(0);
281                    String unusablePredicateStr = matcher.group(1);
282                    if (unusablePredicateStr != null) {
283                        predicateStr = "";
284                    }
285                    matcher.appendReplacement(sb, predicateStr);
286                } while (matcher.find());
287                matcher.appendTail(sb);
288                expression = sb.toString();
289            }
290            return expression;
291        }
292    
293        /**
294         * Replace certain XPath patterns, including some predicates, with substrings that are compatible with regular expressions.
295         * 
296         * @param expression the input regular expressions string; may not be null
297         * @return the regular expression with XPath patterns replaced with regular expression fragments; never null
298         */
299        protected String replaceXPathPatterns( String expression ) {
300            assert expression != null;
301            // replace 2 or more sequential '|' characters in an OR expression
302            expression = expression.replaceAll("[\\|]{2,}", "|");
303            // if there is an empty expression in an OR expression, make the whole segment optional ...
304            // (e.g., "/a/b/(c|)/d" => "a/b(/(c))?/d"
305            expression = expression.replaceAll("/(\\([^|]+)(\\|){2,}([^)]+\\))", "(/$1$2$3)?");
306            expression = expression.replaceAll("/\\(\\|+([^)]+)\\)", "(?:/($1))?");
307            expression = expression.replaceAll("/\\((([^|]+)(\\|[^|]+)*)\\|+\\)", "(?:/($1))?");
308    
309            // // Allow any path (that doesn't contain an explicit counter) to contain a counter,
310            // // done by replacing any '/' or '|' that isn't preceded by ']' or '*' or '/' or '(' with '(\[\d+\])?/'...
311            // input = input.replaceAll("(?<=[^\\]\\*/(])([/|])", "(?:\\\\[\\\\d+\\\\])?$1");
312    
313            // Does the path contain any '[]' or '[*]' or '[0]' or '[n]' (where n is any positive integers)...
314            // '[*]/' => '(\[\d+\])?/'
315            expression = expression.replaceAll("\\[\\]", "(?:\\\\[\\\\d+\\\\])?"); // index is optional
316            // '[]/' => '(\[\d+\])?/'
317            expression = expression.replaceAll("\\[[*]\\]", "(?:\\\\[\\\\d+\\\\])?"); // index is optional
318            // '[0]/' => '(\[0\])?/'
319            expression = expression.replaceAll("\\[0\\]", "(?:\\\\[0\\\\])?"); // index is optional
320            // '[n]/' => '\[n\]/'
321            expression = expression.replaceAll("\\[([1-9]\\d*)\\]", "\\\\[$1\\\\]"); // index is required
322    
323            // Change any other end predicates to not be wrapped by braces but to begin with a slash ...
324            // ...'[x]' => ...'/x'
325            expression = expression.replaceAll("(?<!\\\\)\\[([^\\]]*)\\]$", "/$1");
326    
327            // Replace all '[n,m,o,p]' type sequences with '[(n|m|o|p)]'
328            java.util.regex.Matcher matcher = SEQUENCE_PATTERN.matcher(expression);
329            StringBuffer sb = new StringBuffer();
330            boolean result = matcher.find();
331            if (result) {
332                do {
333                    String sequenceStr = matcher.group(1);
334                    boolean optional = false;
335                    if (sequenceStr.startsWith("0,")) {
336                        sequenceStr = sequenceStr.replaceFirst("^0,", "");
337                        optional = true;
338                    }
339                    if (sequenceStr.endsWith(",0")) {
340                        sequenceStr = sequenceStr.replaceFirst(",0$", "");
341                        optional = true;
342                    }
343                    if (sequenceStr.contains(",0,")) {
344                        sequenceStr = sequenceStr.replaceAll(",0,", ",");
345                        optional = true;
346                    }
347                    sequenceStr = sequenceStr.replaceAll(",", "|");
348                    String replacement = "\\\\[(?:" + sequenceStr + ")\\\\]";
349                    if (optional) {
350                        replacement = "(?:" + replacement + ")?";
351                    }
352                    matcher.appendReplacement(sb, replacement);
353                    result = matcher.find();
354                } while (result);
355                matcher.appendTail(sb);
356                expression = sb.toString();
357            }
358    
359            // Order is important here
360            expression = expression.replaceAll("[*]([^/(\\\\])", "[^/$1]*$1"); // '*' not followed by '/', '\\', or '('
361            expression = expression.replaceAll("(?<!\\[\\^/\\])[*]", "[^/]*");
362            expression = expression.replaceAll("[/]{2,}$", "(?:/[^/]*)*"); // ending '//'
363            expression = expression.replaceAll("[/]{2,}", "(?:/[^/]*)*/"); // other '//'
364            return expression;
365        }
366    
367        /**
368         * @return the expression
369         */
370        public String getSelectExpression() {
371            return this.expression;
372        }
373    
374        /**
375         * {@inheritDoc}
376         */
377        @Override
378        public int hashCode() {
379            return this.expression.hashCode();
380        }
381    
382        /**
383         * {@inheritDoc}
384         */
385        @Override
386        public boolean equals( Object obj ) {
387            if (obj == this) return true;
388            if (obj instanceof PathExpression) {
389                PathExpression that = (PathExpression)obj;
390                if (!this.expression.equalsIgnoreCase(that.expression)) return false;
391                return true;
392            }
393            return false;
394        }
395    
396        /**
397         * {@inheritDoc}
398         */
399        @Override
400        public String toString() {
401            return this.expression;
402        }
403    
404        /**
405         * @param absolutePath
406         * @return the matcher
407         */
408        public Matcher matcher( String absolutePath ) {
409            // Determine if the input path match the select expression ...
410            String originalAbsolutePath = absolutePath;
411            // if (!absolutePath.endsWith("/")) absolutePath = absolutePath + "/";
412            // Remove all trailing '/' ...
413            absolutePath = absolutePath.replaceAll("/+$", "");
414    
415            // See if the supplied absolute path matches the pattern ...
416            final java.util.regex.Matcher matcher = this.matchPattern.matcher(absolutePath);
417            if (!matcher.matches()) {
418                // No match, so return immediately ...
419                return new Matcher(matcher, originalAbsolutePath, null);
420            }
421    
422            // The absolute path does match the pattern, so use the select pattern and try to grab the selected path ...
423            final java.util.regex.Matcher selectMatcher = this.selectPattern.matcher(absolutePath);
424            if (!selectMatcher.matches()) {
425                // Nothing can be selected, so return immediately ...
426                return new Matcher(matcher, null, null);
427            }
428            // Grab the selected path ...
429            String selectedPath = selectMatcher.group(1);
430    
431            // Remove the trailing '/@property' ...
432            selectedPath = selectedPath.replaceAll("/@[^/\\[\\]]+$", "");
433    
434            return new Matcher(matcher, originalAbsolutePath, selectedPath);
435        }
436    
437        @Immutable
438        public static class Matcher {
439    
440            private final String inputPath;
441            private final String selectedPath;
442            private final java.util.regex.Matcher inputMatcher;
443            private final int hc;
444    
445            protected Matcher( java.util.regex.Matcher inputMatcher,
446                               String inputPath,
447                               String selectedPath ) {
448                this.inputMatcher = inputMatcher;
449                this.inputPath = inputPath;
450                this.selectedPath = selectedPath;
451                this.hc = HashCode.compute(this.inputPath, this.selectedPath);
452            }
453    
454            public boolean matches() {
455                return this.selectedPath != null;
456            }
457    
458            /**
459             * @return inputPath
460             */
461            public String getInputPath() {
462                return this.inputPath;
463            }
464    
465            /**
466             * @return selectPattern
467             */
468            public String getSelectedNodePath() {
469                return this.selectedPath;
470            }
471    
472            public int groupCount() {
473                return this.inputMatcher.groupCount();
474            }
475    
476            public String group( int groupNumber ) {
477                return this.inputMatcher.group(groupNumber);
478            }
479    
480            /**
481             * {@inheritDoc}
482             */
483            @Override
484            public int hashCode() {
485                return this.hc;
486            }
487    
488            /**
489             * {@inheritDoc}
490             */
491            @Override
492            public boolean equals( Object obj ) {
493                if (obj == this) return true;
494                if (obj instanceof PathExpression.Matcher) {
495                    PathExpression.Matcher that = (PathExpression.Matcher)obj;
496                    if (!this.inputPath.equalsIgnoreCase(that.inputPath)) return false;
497                    if (!this.selectedPath.equalsIgnoreCase(that.selectedPath)) return false;
498                    return true;
499                }
500                return false;
501            }
502    
503            /**
504             * {@inheritDoc}
505             */
506            @Override
507            public String toString() {
508                return this.selectedPath;
509            }
510        }
511    
512        /**
513         * Regular expression used to determine if the expression matches any single-level wildcard.
514         */
515        // /*(?:[*.](?:\[\*?\])?/*)*
516        private static final String ANYTHING_PATTERN_STRING = "/*(?:[*.](?:\\[\\*?\\])?/*)*";
517        private static final Pattern ANYTHING_PATTERN = Pattern.compile(ANYTHING_PATTERN_STRING);
518    
519        /**
520         * Return whether this expression matches anything and therefore is not restrictive. These include expressions of any nodes ("
521         * <code>/</code>"), any sequence of nodes ("<code>//</code>"), the self reference ("<code>.</code>"), or wildcard ("
522         * <code>*</code>", "<code>*[]</code>" or "<code>*[*]</code>"). Combinations of these individual expressions are also
523         * considered to match anything.
524         * 
525         * @return true if the expression matches anything, or false otherwise
526         */
527        public boolean matchesAnything() {
528            return ANYTHING_PATTERN.matcher(expression).matches();
529        }
530    
531        public static PathExpression all() {
532            return ALL_PATHS_EXPRESSION;
533        }
534    
535        private static final PathExpression ALL_PATHS_EXPRESSION = PathExpression.compile("//");
536    
537    }