View Javadoc

1   /*
2    * ModeShape (http://www.modeshape.org)
3    * See the COPYRIGHT.txt file distributed with this work for information
4    * regarding copyright ownership.  Some portions may be licensed
5    * to Red Hat, Inc. under one or more contributor license agreements.
6    * See the AUTHORS.txt file in the distribution for a full listing of 
7    * individual contributors.
8    *
9    * Unless otherwise indicated, all code in ModeShape is licensed
10   * to you under the terms of the GNU Lesser General Public License as
11   * published by the Free Software Foundation; either version 2.1 of
12   * the License, or (at your option) any later version.
13   * 
14   * ModeShape is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17   * Lesser General Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser General Public
20   * License along with this software; if not, write to the Free
21   * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
22   * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
23   */
24  package org.modeshape.common.xml;
25  
26  import java.text.CharacterIterator;
27  import java.text.StringCharacterIterator;
28  import net.jcip.annotations.Immutable;
29  
30  /**
31   * A utility class for determining the validity of various XML names, per the <a href="http://www.w3.org/TR/REC-xml/">XML 1.0
32   * Specification</a>.
33   */
34  @Immutable
public class XmlCharacters {

    /** The size of the lookup table: one entry per UTF-16 code unit (the Basic Multilingual Plane). */
    private static final int NUMBER_OF_CHARACTERS = 1 << 16; // 65536 or 0x10000

    /**
     * This implementation uses an array that captures for each character the XML classifications. An array is used because it is
     * a fast way of looking up each character. Code points at or above {@link #NUMBER_OF_CHARACTERS} do not fit in the table and
     * are handled by explicit range checks in the lookup methods.
     */
    private static final char[] MASKS = new char[NUMBER_OF_CHARACTERS];

    private static final int VALID_CHARACTER = 1;
    private static final int CONTENT_CHARACTER = 1 << 1;
    private static final int SPACE_CHARACTER = 1 << 2;
    private static final int NAME_START_CHARACTER = 1 << 3;
    private static final int NAME_CHARACTER = 1 << 4;
    private static final int NCNAME_START_CHARACTER = 1 << 5;
    private static final int NCNAME_CHARACTER = 1 << 6;
    private static final int PUBID_CHARACTER = 1 << 7;

    /** Bounds of the supplementary range of production [2] Char: [#x10000-#x10FFFF]. */
    private static final int FIRST_SUPPLEMENTARY_CHARACTER = 0x10000;
    private static final int LAST_SUPPLEMENTARY_CHARACTER = 0x10FFFF;

    /** Upper bound of the supplementary range of production [4] NameStartChar: [#x10000-#xEFFFF]. */
    private static final int LAST_SUPPLEMENTARY_NAME_CHARACTER = 0xEFFFF;

    static {

        // ----------------
        // Valid Characters
        // ----------------
        // [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
        // See http://www.w3.org/TR/REC-xml/#charsets
        int validMask = VALID_CHARACTER | CONTENT_CHARACTER;
        mark(0x9, validMask);
        mark(0xA, validMask);
        mark(0xD, validMask);
        mark(0x20, 0xD7FF, validMask);
        mark(0xE000, 0xFFFD, validMask);
        // The last range is bigger than our character array, so it is handled in 'isValid' and 'isValidContent' ...

        // Remove the other characters that are not allowed in XML content:
        // '<', '&', '\n', '\r', ']'
        unmark('<', CONTENT_CHARACTER);
        unmark('&', CONTENT_CHARACTER);
        unmark('\n', CONTENT_CHARACTER);
        unmark('\r', CONTENT_CHARACTER);
        unmark(']', CONTENT_CHARACTER);

        // ---------------------
        // Whitespace Characters
        // ---------------------
        // [3] S ::= (#x20 | #x9 | #xD | #xA)+
        // See http://www.w3.org/TR/REC-xml/#sec-common-syn
        mark(0x20, SPACE_CHARACTER);
        mark(0x9, SPACE_CHARACTER);
        mark(0xA, SPACE_CHARACTER);
        mark(0xD, SPACE_CHARACTER);

        // ---------------------
        // Name Start Characters
        // ---------------------
        // [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] |
        // [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] |
        // [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
        // [#x10000-#xEFFFF]
        // See http://www.w3.org/TR/REC-xml/#sec-common-syn
        //
        // Note that all these start characters AND characters are valid for NAME and NCNAME
        int nameStartMask = NAME_START_CHARACTER | NCNAME_START_CHARACTER | NAME_CHARACTER | NCNAME_CHARACTER;
        mark(':', nameStartMask);
        mark('_', nameStartMask);
        mark('A', 'Z', nameStartMask);
        mark('a', 'z', nameStartMask);
        mark(0xC0, 0xD6, nameStartMask);
        mark(0xD8, 0xF6, nameStartMask);
        mark(0xF8, 0x2FF, nameStartMask);
        mark(0x370, 0x37D, nameStartMask);
        mark(0x37F, 0x1FFF, nameStartMask);
        mark(0x200C, 0x200D, nameStartMask);
        mark(0x2070, 0x218F, nameStartMask);
        mark(0x2C00, 0x2FEF, nameStartMask);
        mark(0x3001, 0xD7FF, nameStartMask);
        mark(0xF900, 0xFDCF, nameStartMask);
        mark(0xFDF0, 0xFFFD, nameStartMask);
        // The last range is bigger than our character array, so it is handled in the name lookup methods ...

        // ---------------
        // Name Characters
        // ---------------
        // [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
        // See http://www.w3.org/TR/REC-xml/#sec-common-syn
        //
        // Note that all these characters are valid for NAME and NCNAME, but NOT as the first character
        int nameMask = NAME_CHARACTER | NCNAME_CHARACTER;
        mark('-', nameMask);
        mark('.', nameMask);
        mark(0xB7, nameMask);
        mark('0', '9', nameMask);
        mark(0x0300, 0x036F, nameMask);
        mark(0x203F, 0x2040, nameMask);

        // --------
        // NC Names
        // --------
        // [4] NCName ::= NCNameStartChar NCNameChar*
        // which is just an XML Name, minus the ":"
        // See http://www.w3.org/TR/REC-xml-names/#ns-decl
        // So, remove the NCNAME_CHARACTER and NCNAME_START_CHARACTER masks from ':' ...
        unmark(':', NCNAME_START_CHARACTER | NCNAME_CHARACTER);

        // --------------------
        // Public ID characters
        // --------------------
        // [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]
        mark(0x20, PUBID_CHARACTER);
        mark(0xA, PUBID_CHARACTER);
        mark(0xD, PUBID_CHARACTER);
        mark('A', 'Z', PUBID_CHARACTER);
        mark('a', 'z', PUBID_CHARACTER);
        mark('0', '9', PUBID_CHARACTER);
        String pubidPunctuation = "-'()+,./:=?;!*#@$_%";
        for (int i = 0; i < pubidPunctuation.length(); ++i) {
            mark(pubidPunctuation.charAt(i), PUBID_CHARACTER);
        }
    }

    private XmlCharacters() {
    }

    /**
     * Add the supplied classification bits to a single entry of the lookup table.
     * 
     * @param c the character; must be within the lookup table
     * @param mask the classification bits to set
     */
    private static void mark( int c,
                              int mask ) {
        MASKS[c] |= mask;
    }

    /**
     * Add the supplied classification bits to every entry in an inclusive range of the lookup table.
     * 
     * @param first the first character in the range; must be within the lookup table
     * @param last the last character in the range (inclusive); must be within the lookup table
     * @param mask the classification bits to set
     */
    private static void mark( int first,
                              int last,
                              int mask ) {
        for (int i = first; i <= last; ++i) {
            MASKS[i] |= mask;
        }
    }

    /**
     * Remove the supplied classification bits from a single entry of the lookup table.
     * 
     * @param c the character; must be within the lookup table
     * @param mask the classification bits to clear
     */
    private static void unmark( int c,
                                int mask ) {
        MASKS[c] &= ~mask;
    }

    /**
     * Determine whether the lookup table records any of the supplied classification bits for the character. Values outside the
     * table (including negative values) never match.
     * 
     * @param c the character
     * @param mask the classification bits to test
     * @return true if the character is within the table and has at least one of the bits set
     */
    private static boolean hasMask( int c,
                                    int mask ) {
        return c >= 0 && c < NUMBER_OF_CHARACTERS && (MASKS[c] & mask) != 0;
    }

    /**
     * Determine whether the character is in the supplementary range [#x10000-#xEFFFF], which is valid anywhere in a Name or
     * NCName but is too large for the lookup table.
     * 
     * @param c the character
     * @return true if the character is in the supplementary name range
     */
    private static boolean isSupplementaryNameCharacter( int c ) {
        return FIRST_SUPPLEMENTARY_CHARACTER <= c && c <= LAST_SUPPLEMENTARY_NAME_CHARACTER;
    }

    /**
     * Determine whether the character is in the supplementary range [#x10000-#x10FFFF], which is valid in XML but is too large
     * for the lookup table.
     * 
     * @param c the character
     * @return true if the character is in the supplementary character range
     */
    private static boolean isSupplementaryCharacter( int c ) {
        return FIRST_SUPPLEMENTARY_CHARACTER <= c && c <= LAST_SUPPLEMENTARY_CHARACTER;
    }

    /**
     * Determine whether the supplied character is a valid first character in an XML Name. The first character in an XML name is
     * more restrictive than the {@link #isValidName(int) remaining characters}.
     * 
     * @param c the character (a Unicode code point); values that are not code points simply yield false
     * @return true if the character is valid for an XML Name's first character
     */
    public static boolean isValidNameStart( int c ) {
        return hasMask(c, NAME_START_CHARACTER) || isSupplementaryNameCharacter(c);
    }

    /**
     * Determine whether the supplied character is a valid first character in an XML NCName. The first character in an XML NCName
     * is more restrictive than the {@link #isValidNcName(int) remaining characters}.
     * 
     * @param c the character (a Unicode code point); values that are not code points simply yield false
     * @return true if the character is valid for an XML NCName's first character
     */
    public static boolean isValidNcNameStart( int c ) {
        return hasMask(c, NCNAME_START_CHARACTER) || isSupplementaryNameCharacter(c);
    }

    /**
     * Determine whether the supplied character is a valid non-first character in an XML Name. The {@link #isValidNameStart(int)
     * first character} in an XML name is more restrictive than the remaining characters.
     * 
     * @param c the character (a Unicode code point); values that are not code points simply yield false
     * @return true if the character is valid character in an XML Name
     */
    public static boolean isValidName( int c ) {
        return hasMask(c, NAME_CHARACTER) || isSupplementaryNameCharacter(c);
    }

    /**
     * Determine whether the supplied character is a valid non-first character in an XML NCName. The
     * {@link #isValidNcNameStart(int) first character} in an XML NCName is more restrictive than the remaining characters.
     * 
     * @param c the character (a Unicode code point); values that are not code points simply yield false
     * @return true if the character is valid character in an XML NCName
     */
    public static boolean isValidNcName( int c ) {
        return hasMask(c, NCNAME_CHARACTER) || isSupplementaryNameCharacter(c);
    }

    /**
     * Determine whether the supplied character is a valid character in an XML Pubid.
     * 
     * @param c the character
     * @return true if the character is valid character in an XML Pubid
     */
    public static boolean isValidPubid( int c ) {
        return hasMask(c, PUBID_CHARACTER);
    }

    /**
     * Determine whether the supplied character is a valid character in XML.
     * 
     * @param c the character (a Unicode code point); values that are not code points simply yield false
     * @return true if the character is valid character in XML
     */
    public static boolean isValid( int c ) {
        return hasMask(c, VALID_CHARACTER) || isSupplementaryCharacter(c);
    }

    /**
     * Determine whether the supplied character is a valid character in XML content. This class treats '&lt;', '&amp;', ']',
     * line feed and carriage return as not valid in content.
     * 
     * @param c the character (a Unicode code point); values that are not code points simply yield false
     * @return true if the character is valid character in XML content
     */
    public static boolean isValidContent( int c ) {
        return hasMask(c, CONTENT_CHARACTER) || isSupplementaryCharacter(c);
    }

    /**
     * Determine whether the supplied character is a valid whitespace character in XML
     * 
     * @param c the character
     * @return true if the character is valid whitespace character in XML
     */
    public static boolean isValidSpace( int c ) {
        return hasMask(c, SPACE_CHARACTER);
    }

    /**
     * Determine if the supplied name is a valid XML Name. The name is examined one Unicode code point at a time, so characters
     * outside the Basic Multilingual Plane (encoded as surrogate pairs) are classified as a single character; an unpaired
     * surrogate is never valid.
     * 
     * @param name the string being checked
     * @return true if the supplied name is indeed a valid XML Name, or false otherwise (including when the name is null or empty)
     */
    public static boolean isValidName( String name ) {
        if (name == null || name.length() == 0) return false;
        int first = name.codePointAt(0);
        if (!isValidNameStart(first)) return false;
        int length = name.length();
        // Advance by the width of each code point rather than by one char, so surrogate pairs stay together
        for (int i = Character.charCount(first); i < length;) {
            int c = name.codePointAt(i);
            if (!isValidName(c)) return false;
            i += Character.charCount(c);
        }
        return true;
    }

    /**
     * Determine if the supplied name is a valid XML NCName. The name is examined one Unicode code point at a time, so characters
     * outside the Basic Multilingual Plane (encoded as surrogate pairs) are classified as a single character; an unpaired
     * surrogate is never valid.
     * 
     * @param name the string being checked
     * @return true if the supplied name is indeed a valid XML NCName, or false otherwise (including when the name is null or
     *         empty)
     */
    public static boolean isValidNcName( String name ) {
        if (name == null || name.length() == 0) return false;
        int first = name.codePointAt(0);
        if (!isValidNcNameStart(first)) return false;
        int length = name.length();
        // Advance by the width of each code point rather than by one char, so surrogate pairs stay together
        for (int i = Character.charCount(first); i < length;) {
            int c = name.codePointAt(i);
            if (!isValidNcName(c)) return false;
            i += Character.charCount(c);
        }
        return true;
    }
}