001 /*
002 * JBoss DNA (http://www.jboss.org/dna)
003 * See the COPYRIGHT.txt file distributed with this work for information
004 * regarding copyright ownership. Some portions may be licensed
005 * to Red Hat, Inc. under one or more contributor license agreements.
006 * See the AUTHORS.txt file in the distribution for a full listing of
007 * individual contributors.
008 *
009 * Unless otherwise indicated, all code in JBoss DNA is licensed
010 * to you under the terms of the GNU Lesser General Public License as
011 * published by the Free Software Foundation; either version 2.1 of
012 * the License, or (at your option) any later version.
013 *
014 * JBoss DNA is distributed in the hope that it will be useful,
015 * but WITHOUT ANY WARRANTY; without even the implied warranty of
016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
017 * Lesser General Public License for more details.
018 *
019 * You should have received a copy of the GNU Lesser General Public
020 * License along with this software; if not, write to the Free
021 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
022 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
023 */
024 package org.jboss.dna.common.xml;
025
026 import java.text.CharacterIterator;
027 import java.text.StringCharacterIterator;
028
/**
 * A utility class for determining the validity of various XML names and characters, per the
 * <a href="http://www.w3.org/TR/REC-xml/">XML 1.0 Specification</a> and the
 * <a href="http://www.w3.org/TR/REC-xml-names/">Namespaces in XML</a> recommendation.
 * <p>
 * All single-character methods take a Unicode code point (not merely a UTF-16 code unit) and return
 * <code>false</code> for any value that is not in the requested character class, including negative values.
 * </p>
 */
public class XmlCharacters {

    /** The number of code points in the Basic Multilingual Plane; also the first supplementary code point. */
    private static final int NUMBER_OF_CHARACTERS = 1 << 16; // 65536 or 0x10000

    /** The last code point that is a legal XML character (see production [2]). */
    private static final int LAST_VALID_CHARACTER = 0x10FFFF;

    /** The last supplementary code point that may appear in an XML name (see production [4]). */
    private static final int LAST_SUPPLEMENTARY_NAME_CHARACTER = 0xEFFFF;

    /**
     * This implementation uses an array that captures for each BMP character the XML classifications. An array is used
     * because it is a fast way of looking up each character. Supplementary characters do not fit in the array and are
     * handled with explicit range checks in the lookup methods.
     */
    private static final char[] MASKS = new char[NUMBER_OF_CHARACTERS];

    private static final int VALID_CHARACTER = 1;
    private static final int CONTENT_CHARACTER = 1 << 1;
    private static final int SPACE_CHARACTER = 1 << 2;
    private static final int NAME_START_CHARACTER = 1 << 3;
    private static final int NAME_CHARACTER = 1 << 4;
    private static final int NCNAME_START_CHARACTER = 1 << 5;
    private static final int NCNAME_CHARACTER = 1 << 6;
    private static final int PUBID_CHARACTER = 1 << 7;

    static {
        // ----------------
        // Valid Characters
        // ----------------
        // [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
        // See http://www.w3.org/TR/REC-xml/#charsets
        int validMask = VALID_CHARACTER | CONTENT_CHARACTER;
        mark(0x9, validMask);
        mark(0xA, validMask);
        mark(0xD, validMask);
        mark(0x20, 0xD7FF, validMask);
        mark(0xE000, 0xFFFD, validMask);
        // The last range is bigger than the array, so it is handled in 'isValid' and 'isValidContent' ...

        // Remove the other characters that are not allowed in XML content:
        // '<', '&', '\n', '\r', ']'
        unmark('<', CONTENT_CHARACTER);
        unmark('&', CONTENT_CHARACTER);
        unmark('\n', CONTENT_CHARACTER);
        unmark('\r', CONTENT_CHARACTER);
        unmark(']', CONTENT_CHARACTER);

        // ---------------------
        // Whitespace Characters
        // ---------------------
        // [3] S ::= (#x20 | #x9 | #xD | #xA)+
        // See http://www.w3.org/TR/REC-xml/#sec-common-syn
        mark(0x20, SPACE_CHARACTER);
        mark(0x9, SPACE_CHARACTER);
        mark(0xA, SPACE_CHARACTER);
        mark(0xD, SPACE_CHARACTER);

        // ---------------------
        // Name Start Characters
        // ---------------------
        // [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] |
        // [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] |
        // [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
        // [#x10000-#xEFFFF]
        // See http://www.w3.org/TR/REC-xml/#sec-common-syn
        //
        // Note that all these start characters are also valid (non-start) characters for NAME and NCNAME
        int nameStartMask = NAME_START_CHARACTER | NCNAME_START_CHARACTER | NAME_CHARACTER | NCNAME_CHARACTER;
        mark(':', nameStartMask);
        mark('_', nameStartMask);
        mark('A', 'Z', nameStartMask);
        mark('a', 'z', nameStartMask);
        mark(0xC0, 0xD6, nameStartMask);
        mark(0xD8, 0xF6, nameStartMask);
        mark(0xF8, 0x2FF, nameStartMask);
        mark(0x370, 0x37D, nameStartMask);
        mark(0x37F, 0x1FFF, nameStartMask);
        mark(0x200C, 0x200D, nameStartMask);
        mark(0x2070, 0x218F, nameStartMask);
        mark(0x2C00, 0x2FEF, nameStartMask);
        mark(0x3001, 0xD7FF, nameStartMask);
        mark(0xF900, 0xFDCF, nameStartMask);
        mark(0xFDF0, 0xFFFD, nameStartMask);
        // The last range is bigger than the array, so it is handled in the name lookup methods ...

        // ---------------
        // Name Characters
        // ---------------
        // [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
        // See http://www.w3.org/TR/REC-xml/#sec-common-syn
        //
        // Note that all these characters are valid for NAME and NCNAME, but NOT as the first character
        int nameMask = NAME_CHARACTER | NCNAME_CHARACTER;
        mark('-', nameMask);
        mark('.', nameMask);
        mark(0xB7, nameMask);
        mark('0', '9', nameMask);
        mark(0x0300, 0x036F, nameMask);
        mark(0x203F, 0x2040, nameMask);

        // --------
        // NC Names
        // --------
        // [4] NCName ::= NCNameStartChar NCNameChar*
        // which is just an XML Name, minus the ":"
        // See http://www.w3.org/TR/REC-xml-names/#ns-decl
        // So, remove the NCNAME_CHARACTER and NCNAME_START_CHARACTER masks from ':' ...
        unmark(':', NCNAME_START_CHARACTER | NCNAME_CHARACTER);

        // --------------------
        // Public ID characters
        // --------------------
        // [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]
        mark(0x20, PUBID_CHARACTER);
        mark(0xA, PUBID_CHARACTER);
        mark(0xD, PUBID_CHARACTER);
        mark('A', 'Z', PUBID_CHARACTER);
        mark('a', 'z', PUBID_CHARACTER);
        mark('0', '9', PUBID_CHARACTER);
        String pubidPunctuation = "-'()+,./:=?;!*#@$_%";
        for (int i = 0; i < pubidPunctuation.length(); ++i) {
            mark(pubidPunctuation.charAt(i), PUBID_CHARACTER);
        }
    }

    private XmlCharacters() {
    }

    /**
     * Add the supplied classification bits to a single BMP character.
     *
     * @param c the character; must be within the array
     * @param mask the classification bits to set
     */
    private static void mark( int c,
                              int mask ) {
        MASKS[c] |= mask;
    }

    /**
     * Add the supplied classification bits to every character in an inclusive range of BMP characters.
     *
     * @param first the first character in the range
     * @param last the last character in the range (inclusive)
     * @param mask the classification bits to set
     */
    private static void mark( int first,
                              int last,
                              int mask ) {
        for (int i = first; i <= last; ++i) {
            MASKS[i] |= mask;
        }
    }

    /**
     * Remove the supplied classification bits from a single BMP character.
     *
     * @param c the character; must be within the array
     * @param mask the classification bits to clear
     */
    private static void unmark( int c,
                                int mask ) {
        MASKS[c] &= ~mask;
    }

    /**
     * Determine whether the supplied code point is a BMP character that carries any of the supplied classification bits.
     * Values outside the array (negative or supplementary) are never classified.
     *
     * @param c the code point
     * @param mask the classification bits to test
     * @return true if the code point is within the array and has at least one of the bits set
     */
    private static boolean hasMask( int c,
                                    int mask ) {
        return c >= 0 && c < NUMBER_OF_CHARACTERS && (MASKS[c] & mask) != 0;
    }

    /**
     * Determine whether the supplied code point is in the supplementary range <code>[#x10000-#x10FFFF]</code> that
     * production [2] allows as an XML character.
     *
     * @param c the code point
     * @return true if the code point is a supplementary XML character
     */
    private static boolean isSupplementaryCharacter( int c ) {
        return NUMBER_OF_CHARACTERS <= c && c <= LAST_VALID_CHARACTER;
    }

    /**
     * Determine whether the supplied code point is in the supplementary range <code>[#x10000-#xEFFFF]</code> that
     * production [4] allows in XML names (both as the first and as subsequent characters).
     *
     * @param c the code point
     * @return true if the code point is a supplementary XML name character
     */
    private static boolean isSupplementaryNameCharacter( int c ) {
        return NUMBER_OF_CHARACTERS <= c && c <= LAST_SUPPLEMENTARY_NAME_CHARACTER;
    }

    /**
     * Determine whether the supplied character is a valid first character in an XML Name. The first character in an XML
     * name is more restrictive than the {@link #isValidName(int) remaining characters}.
     *
     * @param c the character (code point)
     * @return true if the character is valid for an XML Name's first character
     */
    public static boolean isValidNameStart( int c ) {
        return hasMask(c, NAME_START_CHARACTER) || isSupplementaryNameCharacter(c);
    }

    /**
     * Determine whether the supplied character is a valid first character in an XML NCName. The first character in an XML
     * NCName is more restrictive than the {@link #isValidNcName(int) remaining characters}.
     *
     * @param c the character (code point)
     * @return true if the character is valid for an XML NCName's first character
     */
    public static boolean isValidNcNameStart( int c ) {
        return hasMask(c, NCNAME_START_CHARACTER) || isSupplementaryNameCharacter(c);
    }

    /**
     * Determine whether the supplied character is a valid non-first character in an XML Name. The
     * {@link #isValidNameStart(int) first character} in an XML name is more restrictive than the remaining characters.
     *
     * @param c the character (code point)
     * @return true if the character is valid character in an XML Name
     */
    public static boolean isValidName( int c ) {
        return hasMask(c, NAME_CHARACTER) || isSupplementaryNameCharacter(c);
    }

    /**
     * Determine whether the supplied character is a valid non-first character in an XML NCName. The
     * {@link #isValidNcNameStart(int) first character} in an XML NCName is more restrictive than the remaining characters.
     *
     * @param c the character (code point)
     * @return true if the character is valid character in an XML NCName
     */
    public static boolean isValidNcName( int c ) {
        return hasMask(c, NCNAME_CHARACTER) || isSupplementaryNameCharacter(c);
    }

    /**
     * Determine whether the supplied character is a valid character in an XML Pubid.
     *
     * @param c the character (code point)
     * @return true if the character is valid character in an XML Pubid
     */
    public static boolean isValidPubid( int c ) {
        return hasMask(c, PUBID_CHARACTER);
    }

    /**
     * Determine whether the supplied character is a valid character in XML.
     *
     * @param c the character (code point)
     * @return true if the character is valid character in XML
     */
    public static boolean isValid( int c ) {
        return hasMask(c, VALID_CHARACTER) || isSupplementaryCharacter(c);
    }

    /**
     * Determine whether the supplied character is a valid character in XML content. This is every valid XML character
     * except <code>'&lt;'</code>, <code>'&amp;'</code>, <code>'\n'</code>, <code>'\r'</code> and <code>']'</code>.
     *
     * @param c the character (code point)
     * @return true if the character is valid character in XML content
     */
    public static boolean isValidContent( int c ) {
        return hasMask(c, CONTENT_CHARACTER) || isSupplementaryCharacter(c);
    }

    /**
     * Determine whether the supplied character is a valid whitespace character in XML
     *
     * @param c the character (code point)
     * @return true if the character is valid whitespace character in XML
     */
    public static boolean isValidSpace( int c ) {
        return hasMask(c, SPACE_CHARACTER);
    }

    /**
     * Determine if the supplied name is a valid XML Name. The string is examined one code point at a time, so
     * supplementary characters encoded as surrogate pairs are recognized and an unpaired surrogate makes the name
     * invalid.
     *
     * @param name the string being checked
     * @return true if the supplied name is indeed a valid XML Name, or false otherwise (including when the name is null
     *         or empty)
     */
    public static boolean isValidName( String name ) {
        if (name == null || name.length() == 0) {
            return false;
        }
        int first = name.codePointAt(0);
        if (!isValidNameStart(first)) {
            return false;
        }
        // Walk by index rather than with a CharacterIterator, whose DONE sentinel (U+FFFF) is indistinguishable
        // from a real U+FFFF in the input and would end the scan before the remaining characters are checked.
        int length = name.length();
        for (int i = Character.charCount(first); i < length;) {
            int c = name.codePointAt(i);
            if (!isValidName(c)) {
                return false;
            }
            i += Character.charCount(c);
        }
        return true;
    }

    /**
     * Determine if the supplied name is a valid XML NCName. The string is examined one code point at a time, so
     * supplementary characters encoded as surrogate pairs are recognized and an unpaired surrogate makes the name
     * invalid.
     *
     * @param name the string being checked
     * @return true if the supplied name is indeed a valid XML NCName, or false otherwise (including when the name is
     *         null or empty)
     */
    public static boolean isValidNcName( String name ) {
        if (name == null || name.length() == 0) {
            return false;
        }
        int first = name.codePointAt(0);
        if (!isValidNcNameStart(first)) {
            return false;
        }
        // See isValidName(String) for why the string is walked by index instead of with a CharacterIterator.
        int length = name.length();
        for (int i = Character.charCount(first); i < length;) {
            int c = name.codePointAt(i);
            if (!isValidNcName(c)) {
                return false;
            }
            i += Character.charCount(c);
        }
        return true;
    }
}