1 /*
2 * ModeShape (http://www.modeshape.org)
3 * See the COPYRIGHT.txt file distributed with this work for information
4 * regarding copyright ownership. Some portions may be licensed
5 * to Red Hat, Inc. under one or more contributor license agreements.
6 * See the AUTHORS.txt file in the distribution for a full listing of
7 * individual contributors.
8 *
9 * Unless otherwise indicated, all code in ModeShape is licensed
10 * to you under the terms of the GNU Lesser General Public License as
11 * published by the Free Software Foundation; either version 2.1 of
12 * the License, or (at your option) any later version.
13 *
14 * ModeShape is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this software; if not, write to the Free
21 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
22 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
23 */
24 package org.modeshape.common.xml;
25
26 import java.text.CharacterIterator;
27 import java.text.StringCharacterIterator;
28 import net.jcip.annotations.Immutable;
29
30 /**
31 * A utility class for determining the validity of various XML names, per the <a href="http://www.w3.org/TR/REC-xml/">XML 1.0
32 * Specification</a>.
33 */
34 @Immutable
public class XmlCharacters {

    /** The number of code points in the Basic Multilingual Plane, which is the size of the lookup table. */
    private static final int NUMBER_OF_CHARACTERS = 1 << 16; // 65536 or 0x10000

    /** The first code point that does not fit in the lookup table. */
    private static final int FIRST_SUPPLEMENTARY_CODE_POINT = 0x10000;

    /** The last supplementary code point that is a valid XML character (production [2]). */
    private static final int LAST_SUPPLEMENTARY_VALID_CODE_POINT = 0x10FFFF;

    /** The last supplementary code point that is a valid XML name character (production [4]). */
    private static final int LAST_SUPPLEMENTARY_NAME_CODE_POINT = 0xEFFFF;

    /**
     * This implementation uses an array that captures for each character the XML classifications. An array is used because it is
     * a fast way of looking up each character. Each entry is a combination of the bit masks defined below.
     */
    private static final char[] MASKS = new char[NUMBER_OF_CHARACTERS];

    private static final int VALID_CHARACTER = 1;
    private static final int CONTENT_CHARACTER = 1 << 1;
    private static final int SPACE_CHARACTER = 1 << 2;
    private static final int NAME_START_CHARACTER = 1 << 3;
    private static final int NAME_CHARACTER = 1 << 4;
    private static final int NCNAME_START_CHARACTER = 1 << 5;
    private static final int NCNAME_CHARACTER = 1 << 6;
    private static final int PUBID_CHARACTER = 1 << 7;

    static {

        // ----------------
        // Valid Characters
        // ----------------
        // [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
        // See http://www.w3.org/TR/REC-xml/#charsets
        final int validMask = VALID_CHARACTER | CONTENT_CHARACTER;
        addMask(0x9, 0x9, validMask);
        addMask(0xA, 0xA, validMask);
        addMask(0xD, 0xD, validMask);
        addMask(0x20, 0xD7FF, validMask);
        addMask(0xE000, 0xFFFD, validMask);
        // The last range [#x10000-#x10FFFF] is bigger than our character array,
        // so it is handled in the 'isValid' and 'isValidContent' methods ...

        // Remove the other characters that are not allowed in XML content:
        // '<', '&', '\n', '\r', ']'
        MASKS['<'] &= ~(CONTENT_CHARACTER);
        MASKS['&'] &= ~(CONTENT_CHARACTER);
        MASKS['\n'] &= ~(CONTENT_CHARACTER);
        MASKS['\r'] &= ~(CONTENT_CHARACTER);
        MASKS[']'] &= ~(CONTENT_CHARACTER);

        // ---------------------
        // Whitespace Characters
        // ---------------------
        // [3] S ::= (#x20 | #x9 | #xD | #xA)+
        // See http://www.w3.org/TR/REC-xml/#sec-common-syn
        addMask(0x20, 0x20, SPACE_CHARACTER);
        addMask(0x9, 0x9, SPACE_CHARACTER);
        addMask(0xA, 0xA, SPACE_CHARACTER);
        addMask(0xD, 0xD, SPACE_CHARACTER);

        // ---------------------
        // Name Start Characters
        // ---------------------
        // [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] |
        // [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] |
        // [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
        // [#x10000-#xEFFFF]
        // See http://www.w3.org/TR/REC-xml/#sec-common-syn
        //
        // Note that all these start characters AND characters are valid for NAME and NCNAME
        final int nameStartMask = NAME_START_CHARACTER | NCNAME_START_CHARACTER | NAME_CHARACTER | NCNAME_CHARACTER;
        addMask(':', ':', nameStartMask);
        addMask('_', '_', nameStartMask);
        addMask('A', 'Z', nameStartMask);
        addMask('a', 'z', nameStartMask);
        addMask(0xC0, 0xD6, nameStartMask);
        addMask(0xD8, 0xF6, nameStartMask);
        addMask(0xF8, 0x2FF, nameStartMask);
        addMask(0x370, 0x37D, nameStartMask);
        addMask(0x37F, 0x1FFF, nameStartMask);
        addMask(0x200C, 0x200D, nameStartMask);
        addMask(0x2070, 0x218F, nameStartMask);
        addMask(0x2C00, 0x2FEF, nameStartMask);
        addMask(0x3001, 0xD7FF, nameStartMask);
        addMask(0xF900, 0xFDCF, nameStartMask);
        addMask(0xFDF0, 0xFFFD, nameStartMask);
        // The last range [#x10000-#xEFFFF] is bigger than our character array,
        // so it is handled in the name-related 'isValid...' methods ...

        // ---------------
        // Name Characters
        // ---------------
        // [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
        // See http://www.w3.org/TR/REC-xml/#sec-common-syn
        //
        // Note that all these characters are valid for NAME and NCNAME, but NOT as the first character
        final int nameMask = NAME_CHARACTER | NCNAME_CHARACTER;
        addMask('-', '-', nameMask);
        addMask('.', '.', nameMask);
        addMask(0xB7, 0xB7, nameMask);
        addMask('0', '9', nameMask);
        addMask(0x0300, 0x036F, nameMask);
        addMask(0x203F, 0x2040, nameMask);

        // --------
        // NC Names
        // --------
        // [4] NCName ::= NCNameStartChar NCNameChar*
        // which is just an XML Name, minus the ":"
        // See http://www.w3.org/TR/REC-xml-names/#ns-decl
        // So, remove the NCNAME_CHARACTER and NCNAME_START_CHARACTER masks from ':' ...
        MASKS[':'] &= ~(NCNAME_START_CHARACTER | NCNAME_CHARACTER);

        // --------------------
        // Public ID characters
        // --------------------
        // [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]
        addMask(0x20, 0x20, PUBID_CHARACTER);
        addMask(0xA, 0xA, PUBID_CHARACTER);
        addMask(0xD, 0xD, PUBID_CHARACTER);
        addMask('A', 'Z', PUBID_CHARACTER);
        addMask('a', 'z', PUBID_CHARACTER);
        addMask('0', '9', PUBID_CHARACTER);
        final String pubidPunctuation = "-'()+,./:=?;!*#@$_%";
        for (int i = 0; i < pubidPunctuation.length(); ++i) {
            MASKS[pubidPunctuation.charAt(i)] |= PUBID_CHARACTER;
        }
    }

    private XmlCharacters() {
    }

    /**
     * Add the supplied classification bits to every character in the inclusive range.
     *
     * @param first the first character in the range
     * @param last the last character in the range (inclusive); must be less than {@link #NUMBER_OF_CHARACTERS}
     * @param mask the classification bits to add
     */
    private static void addMask( int first,
                                 int last,
                                 int mask ) {
        for (int i = first; i <= last; ++i) {
            MASKS[i] |= mask;
        }
    }

    /**
     * Determine whether the supplied character is within the lookup table and has at least one of the supplied classification
     * bits. Values outside of the table (including negative values) never match.
     *
     * @param c the character
     * @param mask the classification bits to look for
     * @return true if the character is in the table and is classified with the mask
     */
    private static boolean hasMask( int c,
                                    int mask ) {
        return c >= 0 && c < NUMBER_OF_CHARACTERS && (MASKS[c] & mask) != 0;
    }

    /**
     * Determine whether the supplied character is in the supplementary range of valid XML characters,
     * <code>[#x10000-#x10FFFF]</code>, which is too large for the lookup table.
     *
     * @param c the character
     * @return true if the character is a supplementary code point that is valid in XML
     */
    private static boolean isSupplementaryValid( int c ) {
        return FIRST_SUPPLEMENTARY_CODE_POINT <= c && c <= LAST_SUPPLEMENTARY_VALID_CODE_POINT;
    }

    /**
     * Determine whether the supplied character is in the supplementary range of XML name (start) characters,
     * <code>[#x10000-#xEFFFF]</code>, which is too large for the lookup table.
     *
     * @param c the character
     * @return true if the character is a supplementary code point that is valid in XML names
     */
    private static boolean isSupplementaryName( int c ) {
        return FIRST_SUPPLEMENTARY_CODE_POINT <= c && c <= LAST_SUPPLEMENTARY_NAME_CODE_POINT;
    }

    /**
     * Determine whether the supplied character is a valid first character in an XML Name. The first character in an XML name is
     * more restrictive than the {@link #isValidName(int) remaining characters}.
     *
     * @param c the character (code point); negative values are never valid
     * @return true if the character is valid for an XML Name's first character
     */
    public static boolean isValidNameStart( int c ) {
        return hasMask(c, NAME_START_CHARACTER) || isSupplementaryName(c);
    }

    /**
     * Determine whether the supplied character is a valid first character in an XML NCName. The first character in an XML NCName
     * is more restrictive than the {@link #isValidNcName(int) remaining characters}.
     *
     * @param c the character (code point); negative values are never valid
     * @return true if the character is valid for an XML NCName's first character
     */
    public static boolean isValidNcNameStart( int c ) {
        return hasMask(c, NCNAME_START_CHARACTER) || isSupplementaryName(c);
    }

    /**
     * Determine whether the supplied character is a valid non-first character in an XML Name. The {@link #isValidNameStart(int)
     * first character} in an XML name is more restrictive than the remaining characters.
     *
     * @param c the character (code point); negative values are never valid
     * @return true if the character is valid character in an XML Name
     */
    public static boolean isValidName( int c ) {
        return hasMask(c, NAME_CHARACTER) || isSupplementaryName(c);
    }

    /**
     * Determine whether the supplied character is a valid non-first character in an XML NCName. The
     * {@link #isValidNcNameStart(int) first character} in an XML NCName is more restrictive than the remaining characters.
     *
     * @param c the character (code point); negative values are never valid
     * @return true if the character is valid character in an XML NCName
     */
    public static boolean isValidNcName( int c ) {
        return hasMask(c, NCNAME_CHARACTER) || isSupplementaryName(c);
    }

    /**
     * Determine whether the supplied character is a valid character in an XML Pubid.
     *
     * @param c the character; negative values are never valid
     * @return true if the character is valid character in an XML Pubid
     */
    public static boolean isValidPubid( int c ) {
        return hasMask(c, PUBID_CHARACTER);
    }

    /**
     * Determine whether the supplied character is a valid character in XML.
     *
     * @param c the character (code point); negative values are never valid
     * @return true if the character is valid character in XML
     */
    public static boolean isValid( int c ) {
        return hasMask(c, VALID_CHARACTER) || isSupplementaryValid(c);
    }

    /**
     * Determine whether the supplied character is a valid character in XML content. This is any valid XML character except for
     * '&lt;', '&amp;', ']', line feed and carriage return.
     *
     * @param c the character (code point); negative values are never valid
     * @return true if the character is valid character in XML content
     */
    public static boolean isValidContent( int c ) {
        return hasMask(c, CONTENT_CHARACTER) || isSupplementaryValid(c);
    }

    /**
     * Determine whether the supplied character is a valid whitespace character in XML
     *
     * @param c the character; negative values are never valid
     * @return true if the character is valid whitespace character in XML
     */
    public static boolean isValidSpace( int c ) {
        return hasMask(c, SPACE_CHARACTER);
    }

    /**
     * Determine if the supplied name is a valid XML Name. The name is examined one code point at a time, so a surrogate pair is
     * treated as a single character while an unpaired surrogate is rejected.
     *
     * @param name the string being checked; may be null
     * @return true if the supplied name is indeed a valid XML Name, or false otherwise (including when the name is null or empty)
     */
    public static boolean isValidName( String name ) {
        if (name == null || name.length() == 0) return false;
        int first = name.codePointAt(0);
        if (!isValidNameStart(first)) return false;
        // Walk by index rather than with a CharacterIterator, whose DONE sentinel ('\uFFFF')
        // would end the loop early for a name that actually contains U+FFFF
        int index = Character.charCount(first);
        while (index < name.length()) {
            int c = name.codePointAt(index);
            if (!isValidName(c)) return false;
            index += Character.charCount(c);
        }
        return true;
    }

    /**
     * Determine if the supplied name is a valid XML NCName. The name is examined one code point at a time, so a surrogate pair is
     * treated as a single character while an unpaired surrogate is rejected.
     *
     * @param name the string being checked; may be null
     * @return true if the supplied name is indeed a valid XML NCName, or false otherwise (including when the name is null or
     *         empty)
     */
    public static boolean isValidNcName( String name ) {
        if (name == null || name.length() == 0) return false;
        int first = name.codePointAt(0);
        if (!isValidNcNameStart(first)) return false;
        // Walk by index rather than with a CharacterIterator, whose DONE sentinel ('\uFFFF')
        // would end the loop early for a name that actually contains U+FFFF
        int index = Character.charCount(first);
        while (index < name.length()) {
            int c = name.codePointAt(index);
            if (!isValidNcName(c)) return false;
            index += Character.charCount(c);
        }
        return true;
    }
}