FullTextSearchParser xref

View Javadoc

1   /*
2    * ModeShape (http://www.modeshape.org)
3    * See the COPYRIGHT.txt file distributed with this work for information
4    * regarding copyright ownership.  Some portions may be licensed
5    * to Red Hat, Inc. under one or more contributor license agreements.
6    * See the AUTHORS.txt file in the distribution for a full listing of 
7    * individual contributors.
8    *
9    * ModeShape is free software. Unless otherwise indicated, all code in ModeShape
10   * is licensed to you under the terms of the GNU Lesser General Public License as
11   * published by the Free Software Foundation; either version 2.1 of
12   * the License, or (at your option) any later version.
13   * 
14   * ModeShape is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17   * Lesser General Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser General Public
20   * License along with this software; if not, write to the Free
21   * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
22   * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
23   */
24  package org.modeshape.graph.query.parse;
25  
26  import java.util.ArrayList;
27  import java.util.List;
28  import org.modeshape.common.CommonI18n;
29  import org.modeshape.common.text.ParsingException;
30  import org.modeshape.common.text.Position;
31  import org.modeshape.common.text.TokenStream;
32  import org.modeshape.common.text.TokenStream.CharacterStream;
33  import org.modeshape.common.text.TokenStream.Token;
34  import org.modeshape.common.text.TokenStream.Tokenizer;
35  import org.modeshape.common.text.TokenStream.Tokens;
36  import org.modeshape.common.util.CheckArg;
37  import org.modeshape.graph.query.model.FullTextSearch.Conjunction;
38  import org.modeshape.graph.query.model.FullTextSearch.Disjunction;
39  import org.modeshape.graph.query.model.FullTextSearch.NegationTerm;
40  import org.modeshape.graph.query.model.FullTextSearch.SimpleTerm;
41  import org.modeshape.graph.query.model.FullTextSearch.Term;
42  
43  /**
44   * A parser for full-text search expressions that produces a tree of {@link Term} objects. The grammar is based on the
45   * full-text search grammar as defined by the JCR 2.0 specification.
46   * <p>
47   * </p>
48   * <h3>Grammar</h3>
49   * <p>
50   * The grammar for the full-text expression is taken from the JCR 2.0 specification, and is as follows:
51   * </p>
52   * 
53   * <pre>
54   * FulltextSearch ::= Disjunct {Space 'OR' Space Disjunct}
55   * Disjunct ::= Term {Space Term}
56   * Term ::= ['-'] SimpleTerm
57   * SimpleTerm ::= Word | '&quot;' Word {Space Word} '&quot;'
58   * Word ::= NonSpaceChar {NonSpaceChar}
59   * Space ::= SpaceChar {SpaceChar}
60   * NonSpaceChar ::= Char - SpaceChar /* Any Char except SpaceChar &#42;/
61   * SpaceChar ::= ' '
62   * Char ::= /* Any character &#42;/
63   * </pre>
64   */
65  public class FullTextSearchParser {
66  
67      /**
68       * Parse the full-text search criteria given in the supplied string.
69       * 
70       * @param fullTextSearchExpression the full-text search expression; may not be null
71       * @return the term representation of the full-text search, or null if there are no terms
72       * @throws ParsingException if there is an error parsing the supplied string
73       * @throws IllegalArgumentException if the expression is null
74       */
75      public Term parse( String fullTextSearchExpression ) {
76          CheckArg.isNotNull(fullTextSearchExpression, "fullTextSearchExpression");
77          Tokenizer tokenizer = new TermTokenizer();
78          TokenStream stream = new TokenStream(fullTextSearchExpression, tokenizer, false);
79          return parse(stream.start());
80      }
81  
82      /**
83       * Parse the full-text search criteria from the supplied token stream. This method is useful when the full-text search
84       * expression is included in other content.
85       * 
86       * @param tokens the token stream containing the full-text search starting on the next token
87       * @return the term representation of the full-text search, or null if there are no terms
88       * @throws ParsingException if there is an error parsing the supplied string
89       * @throws IllegalArgumentException if the token stream is null
90       */
91      public Term parse( TokenStream tokens ) {
92          CheckArg.isNotNull(tokens, "tokens");
93          List<Term> terms = new ArrayList<Term>();
94          do {
95              Term term = parseDisjunctedTerms(tokens);
96              if (term == null) break;
97              terms.add(term);
98          } while (tokens.canConsume("OR"));
99          if (terms.isEmpty()) return null;
100         return terms.size() > 1 ? new Disjunction(terms) : terms.iterator().next();
101     }
102 
103     protected Term parseDisjunctedTerms( TokenStream tokens ) {
104         List<Term> terms = new ArrayList<Term>();
105         do {
106             Term term = parseTerm(tokens);
107             if (term == null) break;
108             terms.add(term);
109         } while (tokens.hasNext() && !tokens.matches("OR"));
110         if (terms.isEmpty()) return null;
111         return terms.size() > 1 ? new Conjunction(terms) : terms.iterator().next();
112     }
113 
114     protected Term parseTerm( TokenStream tokens ) {
115         boolean negated = tokens.canConsume('-');
116         if (!negated) tokens.canConsume('+');
117         Term result = new SimpleTerm(removeQuotes(tokens.consume()));
118         return negated ? new NegationTerm(result) : result;
119     }
120 
121     /**
122      * Remove any leading and trailing single- or double-quotes from the supplied text.
123      * 
124      * @param text the input text; may not be null
125      * @return the text without leading and trailing quotes, or <code>text</code> if there were no quotes
126      */
127     protected String removeQuotes( String text ) {
128         return text.replaceFirst("^['\"]+", "").replaceAll("['\"]+$", "");
129     }
130 
131     /**
132      * A basic {@link Tokenizer} implementation that ignores whitespace but includes tokens for individual symbols, the period
133      * ('.'), single-quoted strings, double-quoted strings, whitespace-delimited words, and optionally comments.
134      * <p>
135      * Note this Tokenizer may not be appropriate in many situations, but is provided merely as a convenience for those situations
136      * that happen to be able to use it.
137      * </p>
138      */
139     public static class TermTokenizer implements Tokenizer {
140         /**
141          * The {@link Token#type() token type} for tokens that represent an unquoted string containing a character sequence made
142          * up of non-whitespace and non-symbol characters.
143          */
144         public static final int WORD = 1;
145         /**
146          * The {@link Token#type() token type} for tokens that consist of an individual '+' or '-' characters. The set of
147          * characters includes: <code>-+</code>
148          */
149         public static final int PLUS_MINUS = 2;
150         /**
151          * The {@link Token#type() token type} for tokens that consist of all the characters within single-quotes. Single quote
152          * characters are included if they are preceded (escaped) by a '\' character.
153          */
154         public static final int SINGLE_QUOTED_STRING = 4;
155         /**
156          * The {@link Token#type() token type} for tokens that consist of all the characters within double-quotes. Double quote
157          * characters are included if they are preceded (escaped) by a '\' character.
158          */
159         public static final int DOUBLE_QUOTED_STRING = 8;
160 
161         protected TermTokenizer() {
162         }
163 
164         /**
165          * {@inheritDoc}
166          * 
167          * @see org.modeshape.common.text.TokenStream.Tokenizer#tokenize(CharacterStream, Tokens)
168          */
169         public void tokenize( CharacterStream input,
170                               Tokens tokens ) throws ParsingException {
171             while (input.hasNext()) {
172                 char c = input.next();
173                 switch (c) {
174                     case ' ':
175                     case '\t':
176                     case '\n':
177                     case '\r':
178                         // Just skip these whitespace characters ...
179                         break;
180                     case '-':
181                     case '+':
182                         tokens.addToken(input.position(input.index()), input.index(), input.index() + 1, PLUS_MINUS);
183                         break;
184                     case '\"':
185                         int startIndex = input.index();
186                         Position startingPosition = input.position(startIndex);
187                         boolean foundClosingQuote = false;
188                         while (input.hasNext()) {
189                             c = input.next();
190                             if (c == '\\' && input.isNext('"')) {
191                                 c = input.next(); // consume the ' character since it is escaped
192                             } else if (c == '"') {
193                                 foundClosingQuote = true;
194                                 break;
195                             }
196                         }
197                         if (!foundClosingQuote) {
198                             String msg = CommonI18n.noMatchingDoubleQuoteFound.text(startingPosition.getLine(),
199                                                                                     startingPosition.getColumn());
200                             throw new ParsingException(startingPosition, msg);
201                         }
202                         int endIndex = input.index() + 1; // beyond last character read
203                         tokens.addToken(startingPosition, startIndex, endIndex, DOUBLE_QUOTED_STRING);
204                         break;
205                     case '\'':
206                         startIndex = input.index();
207                         startingPosition = input.position(startIndex);
208                         foundClosingQuote = false;
209                         while (input.hasNext()) {
210                             c = input.next();
211                             if (c == '\\' && input.isNext('\'')) {
212                                 c = input.next(); // consume the ' character since it is escaped
213                             } else if (c == '\'') {
214                                 foundClosingQuote = true;
215                                 break;
216                             }
217                         }
218                         if (!foundClosingQuote) {
219                             String msg = CommonI18n.noMatchingSingleQuoteFound.text(startingPosition.getLine(),
220                                                                                     startingPosition.getColumn());
221                             throw new ParsingException(startingPosition, msg);
222                         }
223                         endIndex = input.index() + 1; // beyond last character read
224                         tokens.addToken(startingPosition, startIndex, endIndex, SINGLE_QUOTED_STRING);
225                         break;
226                     default:
227                         startIndex = input.index();
228                         startingPosition = input.position(startIndex);
229                         // Read until another whitespace is found
230                         while (input.hasNext() && !(input.isNextWhitespace())) {
231                             c = input.next();
232                         }
233                         endIndex = input.index() + 1; // beyond last character that was included
234                         tokens.addToken(startingPosition, startIndex, endIndex, WORD);
235                 }
236             }
237         }
238     }
239 
240 }