001    /*
002     * JBoss DNA (http://www.jboss.org/dna)
003     * See the COPYRIGHT.txt file distributed with this work for information
004     * regarding copyright ownership.  Some portions may be licensed
005     * to Red Hat, Inc. under one or more contributor license agreements.
006     * See the AUTHORS.txt file in the distribution for a full listing of 
007     * individual contributors. 
008     *
009     * JBoss DNA is free software. Unless otherwise indicated, all code in JBoss DNA
010     * is licensed to you under the terms of the GNU Lesser General Public License as
011     * published by the Free Software Foundation; either version 2.1 of
012     * the License, or (at your option) any later version.
013     *
014     * JBoss DNA is distributed in the hope that it will be useful,
015     * but WITHOUT ANY WARRANTY; without even the implied warranty of
016     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
017     * Lesser General Public License for more details.
018     *
019     * You should have received a copy of the GNU Lesser General Public
020     * License along with this software; if not, write to the Free
021     * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
022     * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
023     */
024    package org.jboss.dna.common.text;
025    
026    import java.text.CharacterIterator;
027    import java.text.StringCharacterIterator;
028    import java.util.BitSet;
029    
030    /**
031     * An {@link TextEncoder encoder} and {@link TextDecoder decoder} for XML element and attribute names.
032     * <p>
033     * Any UTF-16 unicode character that is not a valid XML name character according to the <a
034     * href="http://www.w3.org/TR/REC-xml/#sec-common-syn">World Wide Web Consortium (W3C) Extensible Markup Language (XML) 1.0
035     * (Fourth Edition) Recommendation</a> is escaped as <code>_xHHHH_</code>, where <code>HHHH</code> stands for the four-digit
036     * hexadecimal UTF-16 unicode value for the character in the most significant bit first order. For example, the name "Customer_ID"
037     * is encoded as "Customer_x0020_ID".
038     * </p>
039     * <p>
040     * Decoding transforms every <code>_xHHHH_</code> encoding sequences back into the UTF-16 character. Note that
041     * {@link #decode(String) decoding} can be safely done on any XML name, even if the name does not contain any encoded sequences.
042     * </p>
043     * 
044     * @author Randall Hauch
045     */
046    public class XmlNameEncoder implements TextDecoder, TextEncoder {
047    
048        private static final BitSet XML_NAME_ALLOWED_CHARACTERS = new BitSet(2 ^ 16);
049    
050        static {
051            // Initialize the unescaped bitset ...
052    
053            // XML Names may contain: Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender
054            XML_NAME_ALLOWED_CHARACTERS.set('.');
055            XML_NAME_ALLOWED_CHARACTERS.set('-');
056            XML_NAME_ALLOWED_CHARACTERS.set('_');
057            XML_NAME_ALLOWED_CHARACTERS.set(':');
058    
059            // XML Base Character Set
060            XML_NAME_ALLOWED_CHARACTERS.set('\u0041', '\u005A' + 1);
061            XML_NAME_ALLOWED_CHARACTERS.set('\u0061', '\u007A' + 1);
062            XML_NAME_ALLOWED_CHARACTERS.set('\u00C0', '\u00D6' + 1);
063            XML_NAME_ALLOWED_CHARACTERS.set('\u00D8', '\u00F6' + 1);
064            XML_NAME_ALLOWED_CHARACTERS.set('\u00F8', '\u00FF' + 1);
065            XML_NAME_ALLOWED_CHARACTERS.set('\u0100', '\u0131' + 1);
066            XML_NAME_ALLOWED_CHARACTERS.set('\u0134', '\u013E' + 1);
067            XML_NAME_ALLOWED_CHARACTERS.set('\u0141', '\u0148' + 1);
068            XML_NAME_ALLOWED_CHARACTERS.set('\u014A', '\u017E' + 1);
069            XML_NAME_ALLOWED_CHARACTERS.set('\u0180', '\u01C3' + 1);
070            XML_NAME_ALLOWED_CHARACTERS.set('\u01CD', '\u01F0' + 1);
071            XML_NAME_ALLOWED_CHARACTERS.set('\u01F4', '\u01F5' + 1);
072            XML_NAME_ALLOWED_CHARACTERS.set('\u01FA', '\u0217' + 1);
073            XML_NAME_ALLOWED_CHARACTERS.set('\u0250', '\u02A8' + 1);
074            XML_NAME_ALLOWED_CHARACTERS.set('\u02BB', '\u02C1' + 1);
075            XML_NAME_ALLOWED_CHARACTERS.set('\u0386');
076            XML_NAME_ALLOWED_CHARACTERS.set('\u0388', '\u038A' + 1);
077            XML_NAME_ALLOWED_CHARACTERS.set('\u038C');
078            XML_NAME_ALLOWED_CHARACTERS.set('\u038E', '\u03A1' + 1);
079            XML_NAME_ALLOWED_CHARACTERS.set('\u03A3', '\u03CE' + 1);
080            XML_NAME_ALLOWED_CHARACTERS.set('\u03D0', '\u03D6' + 1);
081            XML_NAME_ALLOWED_CHARACTERS.set('\u03DA');
082            XML_NAME_ALLOWED_CHARACTERS.set('\u03DC');
083            XML_NAME_ALLOWED_CHARACTERS.set('\u03DE');
084            XML_NAME_ALLOWED_CHARACTERS.set('\u03E0');
085            XML_NAME_ALLOWED_CHARACTERS.set('\u03E2', '\u03F3' + 1);
086            XML_NAME_ALLOWED_CHARACTERS.set('\u0401', '\u040C' + 1);
087            XML_NAME_ALLOWED_CHARACTERS.set('\u040E', '\u044F' + 1);
088            XML_NAME_ALLOWED_CHARACTERS.set('\u0451', '\u045C' + 1);
089            XML_NAME_ALLOWED_CHARACTERS.set('\u045E', '\u0481' + 1);
090            XML_NAME_ALLOWED_CHARACTERS.set('\u0490', '\u04C4' + 1);
091            XML_NAME_ALLOWED_CHARACTERS.set('\u04C7', '\u04C8' + 1);
092            XML_NAME_ALLOWED_CHARACTERS.set('\u04CB', '\u04CC' + 1);
093            XML_NAME_ALLOWED_CHARACTERS.set('\u04D0', '\u04EB' + 1);
094            XML_NAME_ALLOWED_CHARACTERS.set('\u04EE', '\u04F5' + 1);
095            XML_NAME_ALLOWED_CHARACTERS.set('\u04F8', '\u04F9' + 1);
096            XML_NAME_ALLOWED_CHARACTERS.set('\u0531', '\u0556' + 1);
097            XML_NAME_ALLOWED_CHARACTERS.set('\u0559');
098            XML_NAME_ALLOWED_CHARACTERS.set('\u0561', '\u0586' + 1);
099            XML_NAME_ALLOWED_CHARACTERS.set('\u05D0', '\u05EA' + 1);
100            XML_NAME_ALLOWED_CHARACTERS.set('\u05F0', '\u05F2' + 1);
101            XML_NAME_ALLOWED_CHARACTERS.set('\u0621', '\u063A' + 1);
102            XML_NAME_ALLOWED_CHARACTERS.set('\u0641', '\u064A' + 1);
103            XML_NAME_ALLOWED_CHARACTERS.set('\u0671', '\u06B7' + 1);
104            XML_NAME_ALLOWED_CHARACTERS.set('\u06BA', '\u06BE' + 1);
105            XML_NAME_ALLOWED_CHARACTERS.set('\u06C0', '\u06CE' + 1);
106            XML_NAME_ALLOWED_CHARACTERS.set('\u06D0', '\u06D3' + 1);
107            XML_NAME_ALLOWED_CHARACTERS.set('\u06D5');
108            XML_NAME_ALLOWED_CHARACTERS.set('\u06E5', '\u06E6' + 1);
109            XML_NAME_ALLOWED_CHARACTERS.set('\u0905', '\u0939' + 1);
110            XML_NAME_ALLOWED_CHARACTERS.set('\u093D');
111            XML_NAME_ALLOWED_CHARACTERS.set('\u0958', '\u0961' + 1);
112            XML_NAME_ALLOWED_CHARACTERS.set('\u0985', '\u098C' + 1);
113            XML_NAME_ALLOWED_CHARACTERS.set('\u098F', '\u0990' + 1);
114            XML_NAME_ALLOWED_CHARACTERS.set('\u0993', '\u09A8' + 1);
115            XML_NAME_ALLOWED_CHARACTERS.set('\u09AA', '\u09B0' + 1);
116            XML_NAME_ALLOWED_CHARACTERS.set('\u09B2');
117            XML_NAME_ALLOWED_CHARACTERS.set('\u09B6', '\u09B9' + 1);
118            XML_NAME_ALLOWED_CHARACTERS.set('\u09DC', '\u09DD' + 1);
119            XML_NAME_ALLOWED_CHARACTERS.set('\u09DF', '\u09E1' + 1);
120            XML_NAME_ALLOWED_CHARACTERS.set('\u09F0', '\u09F1' + 1);
121            XML_NAME_ALLOWED_CHARACTERS.set('\u0A05', '\u0A0A' + 1);
122            XML_NAME_ALLOWED_CHARACTERS.set('\u0A0F', '\u0A10' + 1);
123            XML_NAME_ALLOWED_CHARACTERS.set('\u0A13', '\u0A28' + 1);
124            XML_NAME_ALLOWED_CHARACTERS.set('\u0A2A', '\u0A30' + 1);
125            XML_NAME_ALLOWED_CHARACTERS.set('\u0A32', '\u0A33' + 1);
126            XML_NAME_ALLOWED_CHARACTERS.set('\u0A35', '\u0A36' + 1);
127            XML_NAME_ALLOWED_CHARACTERS.set('\u0A38', '\u0A39' + 1);
128            XML_NAME_ALLOWED_CHARACTERS.set('\u0A59', '\u0A5C' + 1);
129            XML_NAME_ALLOWED_CHARACTERS.set('\u0A5E');
130            XML_NAME_ALLOWED_CHARACTERS.set('\u0A72', '\u0A74' + 1);
131            XML_NAME_ALLOWED_CHARACTERS.set('\u0A85', '\u0A8B' + 1);
132            XML_NAME_ALLOWED_CHARACTERS.set('\u0A8D');
133            XML_NAME_ALLOWED_CHARACTERS.set('\u0A8F', '\u0A91' + 1);
134            XML_NAME_ALLOWED_CHARACTERS.set('\u0A93', '\u0AA8' + 1);
135            XML_NAME_ALLOWED_CHARACTERS.set('\u0AAA', '\u0AB0' + 1);
136            XML_NAME_ALLOWED_CHARACTERS.set('\u0AB2', '\u0AB3' + 1);
137            XML_NAME_ALLOWED_CHARACTERS.set('\u0AB5', '\u0AB9' + 1);
138            XML_NAME_ALLOWED_CHARACTERS.set('\u0ABD');
139            XML_NAME_ALLOWED_CHARACTERS.set('\u0AE0');
140            XML_NAME_ALLOWED_CHARACTERS.set('\u0B05', '\u0B0C' + 1);
141            XML_NAME_ALLOWED_CHARACTERS.set('\u0B0F', '\u0B10' + 1);
142            XML_NAME_ALLOWED_CHARACTERS.set('\u0B13', '\u0B28' + 1);
143            XML_NAME_ALLOWED_CHARACTERS.set('\u0B2A', '\u0B30' + 1);
144            XML_NAME_ALLOWED_CHARACTERS.set('\u0B32', '\u0B33' + 1);
145            XML_NAME_ALLOWED_CHARACTERS.set('\u0B36', '\u0B39' + 1);
146            XML_NAME_ALLOWED_CHARACTERS.set('\u0B3D');
147            XML_NAME_ALLOWED_CHARACTERS.set('\u0B5C', '\u0B5D' + 1);
148            XML_NAME_ALLOWED_CHARACTERS.set('\u0B5F', '\u0B61' + 1);
149            XML_NAME_ALLOWED_CHARACTERS.set('\u0B85', '\u0B8A' + 1);
150            XML_NAME_ALLOWED_CHARACTERS.set('\u0B8E', '\u0B90' + 1);
151            XML_NAME_ALLOWED_CHARACTERS.set('\u0B92', '\u0B95' + 1);
152            XML_NAME_ALLOWED_CHARACTERS.set('\u0B99', '\u0B9A' + 1);
153            XML_NAME_ALLOWED_CHARACTERS.set('\u0B9C');
154            XML_NAME_ALLOWED_CHARACTERS.set('\u0B9E', '\u0B9F' + 1);
155            XML_NAME_ALLOWED_CHARACTERS.set('\u0BA3', '\u0BA4' + 1);
156            XML_NAME_ALLOWED_CHARACTERS.set('\u0BA8', '\u0BAA' + 1);
157            XML_NAME_ALLOWED_CHARACTERS.set('\u0BAE', '\u0BB5' + 1);
158            XML_NAME_ALLOWED_CHARACTERS.set('\u0BB7', '\u0BB9' + 1);
159            XML_NAME_ALLOWED_CHARACTERS.set('\u0C05', '\u0C0C' + 1);
160            XML_NAME_ALLOWED_CHARACTERS.set('\u0C0E', '\u0C10' + 1);
161            XML_NAME_ALLOWED_CHARACTERS.set('\u0C12', '\u0C28' + 1);
162            XML_NAME_ALLOWED_CHARACTERS.set('\u0C2A', '\u0C33' + 1);
163            XML_NAME_ALLOWED_CHARACTERS.set('\u0C35', '\u0C39' + 1);
164            XML_NAME_ALLOWED_CHARACTERS.set('\u0C60', '\u0C61' + 1);
165            XML_NAME_ALLOWED_CHARACTERS.set('\u0C85', '\u0C8C' + 1);
166            XML_NAME_ALLOWED_CHARACTERS.set('\u0C8E', '\u0C90' + 1);
167            XML_NAME_ALLOWED_CHARACTERS.set('\u0C92', '\u0CA8' + 1);
168            XML_NAME_ALLOWED_CHARACTERS.set('\u0CAA', '\u0CB3' + 1);
169            XML_NAME_ALLOWED_CHARACTERS.set('\u0CB5', '\u0CB9' + 1);
170            XML_NAME_ALLOWED_CHARACTERS.set('\u0CDE');
171            XML_NAME_ALLOWED_CHARACTERS.set('\u0CE0', '\u0CE1' + 1);
172            XML_NAME_ALLOWED_CHARACTERS.set('\u0D05', '\u0D0C' + 1);
173            XML_NAME_ALLOWED_CHARACTERS.set('\u0D0E', '\u0D10' + 1);
174            XML_NAME_ALLOWED_CHARACTERS.set('\u0D12', '\u0D28' + 1);
175            XML_NAME_ALLOWED_CHARACTERS.set('\u0D2A', '\u0D39' + 1);
176            XML_NAME_ALLOWED_CHARACTERS.set('\u0D60', '\u0D61' + 1);
177            XML_NAME_ALLOWED_CHARACTERS.set('\u0E01', '\u0E2E' + 1);
178            XML_NAME_ALLOWED_CHARACTERS.set('\u0E30');
179            XML_NAME_ALLOWED_CHARACTERS.set('\u0E32', '\u0E33' + 1);
180            XML_NAME_ALLOWED_CHARACTERS.set('\u0E40', '\u0E45' + 1);
181            XML_NAME_ALLOWED_CHARACTERS.set('\u0E81', '\u0E82' + 1);
182            XML_NAME_ALLOWED_CHARACTERS.set('\u0E84');
183            XML_NAME_ALLOWED_CHARACTERS.set('\u0E87', '\u0E88' + 1);
184            XML_NAME_ALLOWED_CHARACTERS.set('\u0E8A');
185            XML_NAME_ALLOWED_CHARACTERS.set('\u0E8D');
186            XML_NAME_ALLOWED_CHARACTERS.set('\u0E94', '\u0E97' + 1);
187            XML_NAME_ALLOWED_CHARACTERS.set('\u0E99', '\u0E9F' + 1);
188            XML_NAME_ALLOWED_CHARACTERS.set('\u0EA1', '\u0EA3' + 1);
189            XML_NAME_ALLOWED_CHARACTERS.set('\u0EA5');
190            XML_NAME_ALLOWED_CHARACTERS.set('\u0EA7');
191            XML_NAME_ALLOWED_CHARACTERS.set('\u0EAA', '\u0EAB' + 1);
192            XML_NAME_ALLOWED_CHARACTERS.set('\u0EAD', '\u0EAE' + 1);
193            XML_NAME_ALLOWED_CHARACTERS.set('\u0EB0');
194            XML_NAME_ALLOWED_CHARACTERS.set('\u0EB2', '\u0EB3' + 1);
195            XML_NAME_ALLOWED_CHARACTERS.set('\u0EBD');
196            XML_NAME_ALLOWED_CHARACTERS.set('\u0EC0', '\u0EC4' + 1);
197            XML_NAME_ALLOWED_CHARACTERS.set('\u0F40', '\u0F47' + 1);
198            XML_NAME_ALLOWED_CHARACTERS.set('\u0F49', '\u0F69' + 1);
199            XML_NAME_ALLOWED_CHARACTERS.set('\u10A0', '\u10C5' + 1);
200            XML_NAME_ALLOWED_CHARACTERS.set('\u10D0', '\u10F6' + 1);
201            XML_NAME_ALLOWED_CHARACTERS.set('\u1100');
202            XML_NAME_ALLOWED_CHARACTERS.set('\u1102', '\u1103' + 1);
203            XML_NAME_ALLOWED_CHARACTERS.set('\u1105', '\u1107' + 1);
204            XML_NAME_ALLOWED_CHARACTERS.set('\u1109');
205            XML_NAME_ALLOWED_CHARACTERS.set('\u110B', '\u110C' + 1);
206            XML_NAME_ALLOWED_CHARACTERS.set('\u110E', '\u1112' + 1);
207            XML_NAME_ALLOWED_CHARACTERS.set('\u113C');
208            XML_NAME_ALLOWED_CHARACTERS.set('\u113E');
209            XML_NAME_ALLOWED_CHARACTERS.set('\u1140');
210            XML_NAME_ALLOWED_CHARACTERS.set('\u114C');
211            XML_NAME_ALLOWED_CHARACTERS.set('\u114E');
212            XML_NAME_ALLOWED_CHARACTERS.set('\u1150');
213            XML_NAME_ALLOWED_CHARACTERS.set('\u1154', '\u1155' + 1);
214            XML_NAME_ALLOWED_CHARACTERS.set('\u1159');
215            XML_NAME_ALLOWED_CHARACTERS.set('\u115F', '\u1161' + 1);
216            XML_NAME_ALLOWED_CHARACTERS.set('\u1163');
217            XML_NAME_ALLOWED_CHARACTERS.set('\u1165');
218            XML_NAME_ALLOWED_CHARACTERS.set('\u1167');
219            XML_NAME_ALLOWED_CHARACTERS.set('\u1169');
220            XML_NAME_ALLOWED_CHARACTERS.set('\u116D', '\u116E' + 1);
221            XML_NAME_ALLOWED_CHARACTERS.set('\u1172', '\u1173' + 1);
222            XML_NAME_ALLOWED_CHARACTERS.set('\u1175');
223            XML_NAME_ALLOWED_CHARACTERS.set('\u119E');
224            XML_NAME_ALLOWED_CHARACTERS.set('\u11A8');
225            XML_NAME_ALLOWED_CHARACTERS.set('\u11AB');
226            XML_NAME_ALLOWED_CHARACTERS.set('\u11AE', '\u11AF' + 1);
227            XML_NAME_ALLOWED_CHARACTERS.set('\u11B7', '\u11B8' + 1);
228            XML_NAME_ALLOWED_CHARACTERS.set('\u11BA');
229            XML_NAME_ALLOWED_CHARACTERS.set('\u11BC', '\u11C2' + 1);
230            XML_NAME_ALLOWED_CHARACTERS.set('\u11EB');
231            XML_NAME_ALLOWED_CHARACTERS.set('\u11F0');
232            XML_NAME_ALLOWED_CHARACTERS.set('\u11F9');
233            XML_NAME_ALLOWED_CHARACTERS.set('\u1E00', '\u1E9B' + 1);
234            XML_NAME_ALLOWED_CHARACTERS.set('\u1EA0', '\u1EF9' + 1);
235            XML_NAME_ALLOWED_CHARACTERS.set('\u1F00', '\u1F15' + 1);
236            XML_NAME_ALLOWED_CHARACTERS.set('\u1F18', '\u1F1D' + 1);
237            XML_NAME_ALLOWED_CHARACTERS.set('\u1F20', '\u1F45' + 1);
238            XML_NAME_ALLOWED_CHARACTERS.set('\u1F48', '\u1F4D' + 1);
239            XML_NAME_ALLOWED_CHARACTERS.set('\u1F50', '\u1F57' + 1);
240            XML_NAME_ALLOWED_CHARACTERS.set('\u1F59');
241            XML_NAME_ALLOWED_CHARACTERS.set('\u1F5B');
242            XML_NAME_ALLOWED_CHARACTERS.set('\u1F5D');
243            XML_NAME_ALLOWED_CHARACTERS.set('\u1F5F', '\u1F7D' + 1);
244            XML_NAME_ALLOWED_CHARACTERS.set('\u1F80', '\u1FB4' + 1);
245            XML_NAME_ALLOWED_CHARACTERS.set('\u1FB6', '\u1FBC' + 1);
246            XML_NAME_ALLOWED_CHARACTERS.set('\u1FBE');
247            XML_NAME_ALLOWED_CHARACTERS.set('\u1FC2', '\u1FC4' + 1);
248            XML_NAME_ALLOWED_CHARACTERS.set('\u1FC6', '\u1FCC' + 1);
249            XML_NAME_ALLOWED_CHARACTERS.set('\u1FD0', '\u1FD3' + 1);
250            XML_NAME_ALLOWED_CHARACTERS.set('\u1FD6', '\u1FDB' + 1);
251            XML_NAME_ALLOWED_CHARACTERS.set('\u1FE0', '\u1FEC' + 1);
252            XML_NAME_ALLOWED_CHARACTERS.set('\u1FF2', '\u1FF4' + 1);
253            XML_NAME_ALLOWED_CHARACTERS.set('\u1FF6', '\u1FFC' + 1);
254            XML_NAME_ALLOWED_CHARACTERS.set('\u2126');
255            XML_NAME_ALLOWED_CHARACTERS.set('\u212A', '\u212B' + 1);
256            XML_NAME_ALLOWED_CHARACTERS.set('\u212E');
257            XML_NAME_ALLOWED_CHARACTERS.set('\u2180', '\u2182' + 1);
258            XML_NAME_ALLOWED_CHARACTERS.set('\u3041', '\u3094' + 1);
259            XML_NAME_ALLOWED_CHARACTERS.set('\u30A1', '\u30FA' + 1);
260            XML_NAME_ALLOWED_CHARACTERS.set('\u3105', '\u312C' + 1);
261            XML_NAME_ALLOWED_CHARACTERS.set('\uAC00', '\uD7A3' + 1);
262    
263            // XML Ideograph Character Set
264    
265            XML_NAME_ALLOWED_CHARACTERS.set('\u4E00', '\u9FA5' + 1);
266            XML_NAME_ALLOWED_CHARACTERS.set('\u3007');
267            XML_NAME_ALLOWED_CHARACTERS.set('\u3021', '\u3029' + 1);
268    
269            // XML Combining Character Set
270    
271            XML_NAME_ALLOWED_CHARACTERS.set('\u0300', '\u0345' + 1);
272            XML_NAME_ALLOWED_CHARACTERS.set('\u0360', '\u0361' + 1);
273            XML_NAME_ALLOWED_CHARACTERS.set('\u0483', '\u0486' + 1);
274            XML_NAME_ALLOWED_CHARACTERS.set('\u0591', '\u05A1' + 1);
275            XML_NAME_ALLOWED_CHARACTERS.set('\u05A3', '\u05B9' + 1);
276            XML_NAME_ALLOWED_CHARACTERS.set('\u05BB', '\u05BD' + 1);
277            XML_NAME_ALLOWED_CHARACTERS.set('\u05BF');
278            XML_NAME_ALLOWED_CHARACTERS.set('\u05C1', '\u05C2' + 1);
279            XML_NAME_ALLOWED_CHARACTERS.set('\u05C4');
280            XML_NAME_ALLOWED_CHARACTERS.set('\u064B', '\u0652' + 1);
281            XML_NAME_ALLOWED_CHARACTERS.set('\u0670');
282            XML_NAME_ALLOWED_CHARACTERS.set('\u06D6', '\u06DC' + 1);
283            XML_NAME_ALLOWED_CHARACTERS.set('\u06DD', '\u06DF' + 1);
284            XML_NAME_ALLOWED_CHARACTERS.set('\u06E0', '\u06E4' + 1);
285            XML_NAME_ALLOWED_CHARACTERS.set('\u06E7', '\u06E8' + 1);
286            XML_NAME_ALLOWED_CHARACTERS.set('\u06EA', '\u06ED' + 1);
287            XML_NAME_ALLOWED_CHARACTERS.set('\u0901', '\u0903' + 1);
288            XML_NAME_ALLOWED_CHARACTERS.set('\u093C');
289            XML_NAME_ALLOWED_CHARACTERS.set('\u093E', '\u094C' + 1);
290            XML_NAME_ALLOWED_CHARACTERS.set('\u094D');
291            XML_NAME_ALLOWED_CHARACTERS.set('\u0951', '\u0954' + 1);
292            XML_NAME_ALLOWED_CHARACTERS.set('\u0962', '\u0963' + 1);
293            XML_NAME_ALLOWED_CHARACTERS.set('\u0981', '\u0983' + 1);
294            XML_NAME_ALLOWED_CHARACTERS.set('\u09BC');
295            XML_NAME_ALLOWED_CHARACTERS.set('\u09BE');
296            XML_NAME_ALLOWED_CHARACTERS.set('\u09BF');
297            XML_NAME_ALLOWED_CHARACTERS.set('\u09C0', '\u09C4' + 1);
298            XML_NAME_ALLOWED_CHARACTERS.set('\u09C7', '\u09C8' + 1);
299            XML_NAME_ALLOWED_CHARACTERS.set('\u09CB', '\u09CD' + 1);
300            XML_NAME_ALLOWED_CHARACTERS.set('\u09D7');
301            XML_NAME_ALLOWED_CHARACTERS.set('\u09E2', '\u09E3' + 1);
302            XML_NAME_ALLOWED_CHARACTERS.set('\u0A02');
303            XML_NAME_ALLOWED_CHARACTERS.set('\u0A3C');
304            XML_NAME_ALLOWED_CHARACTERS.set('\u0A3E');
305            XML_NAME_ALLOWED_CHARACTERS.set('\u0A3F');
306            XML_NAME_ALLOWED_CHARACTERS.set('\u0A40', '\u0A42' + 1);
307            XML_NAME_ALLOWED_CHARACTERS.set('\u0A47', '\u0A48' + 1);
308            XML_NAME_ALLOWED_CHARACTERS.set('\u0A4B', '\u0A4D' + 1);
309            XML_NAME_ALLOWED_CHARACTERS.set('\u0A70', '\u0A71' + 1);
310            XML_NAME_ALLOWED_CHARACTERS.set('\u0A81', '\u0A83' + 1);
311            XML_NAME_ALLOWED_CHARACTERS.set('\u0ABC');
312            XML_NAME_ALLOWED_CHARACTERS.set('\u0ABE', '\u0AC5' + 1);
313            XML_NAME_ALLOWED_CHARACTERS.set('\u0AC7', '\u0AC9' + 1);
314            XML_NAME_ALLOWED_CHARACTERS.set('\u0ACB', '\u0ACD' + 1);
315            XML_NAME_ALLOWED_CHARACTERS.set('\u0B01', '\u0B03' + 1);
316            XML_NAME_ALLOWED_CHARACTERS.set('\u0B3C');
317            XML_NAME_ALLOWED_CHARACTERS.set('\u0B3E', '\u0B43' + 1);
318            XML_NAME_ALLOWED_CHARACTERS.set('\u0B47', '\u0B48' + 1);
319            XML_NAME_ALLOWED_CHARACTERS.set('\u0B4B', '\u0B4D' + 1);
320            XML_NAME_ALLOWED_CHARACTERS.set('\u0B56', '\u0B57' + 1);
321            XML_NAME_ALLOWED_CHARACTERS.set('\u0B82', '\u0B83' + 1);
322            XML_NAME_ALLOWED_CHARACTERS.set('\u0BBE', '\u0BC2' + 1);
323            XML_NAME_ALLOWED_CHARACTERS.set('\u0BC6', '\u0BC8' + 1);
324            XML_NAME_ALLOWED_CHARACTERS.set('\u0BCA', '\u0BCD' + 1);
325            XML_NAME_ALLOWED_CHARACTERS.set('\u0BD7');
326            XML_NAME_ALLOWED_CHARACTERS.set('\u0C01', '\u0C03' + 1);
327            XML_NAME_ALLOWED_CHARACTERS.set('\u0C3E', '\u0C44' + 1);
328            XML_NAME_ALLOWED_CHARACTERS.set('\u0C46', '\u0C48' + 1);
329            XML_NAME_ALLOWED_CHARACTERS.set('\u0C4A', '\u0C4D' + 1);
330            XML_NAME_ALLOWED_CHARACTERS.set('\u0C55', '\u0C56' + 1);
331            XML_NAME_ALLOWED_CHARACTERS.set('\u0C82', '\u0C83' + 1);
332            XML_NAME_ALLOWED_CHARACTERS.set('\u0CBE', '\u0CC4' + 1);
333            XML_NAME_ALLOWED_CHARACTERS.set('\u0CC6', '\u0CC8' + 1);
334            XML_NAME_ALLOWED_CHARACTERS.set('\u0CCA', '\u0CCD' + 1);
335            XML_NAME_ALLOWED_CHARACTERS.set('\u0CD5', '\u0CD6' + 1);
336            XML_NAME_ALLOWED_CHARACTERS.set('\u0D02', '\u0D03' + 1);
337            XML_NAME_ALLOWED_CHARACTERS.set('\u0D3E', '\u0D43' + 1);
338            XML_NAME_ALLOWED_CHARACTERS.set('\u0D46', '\u0D48' + 1);
339            XML_NAME_ALLOWED_CHARACTERS.set('\u0D4A', '\u0D4D' + 1);
340            XML_NAME_ALLOWED_CHARACTERS.set('\u0D57');
341            XML_NAME_ALLOWED_CHARACTERS.set('\u0E31');
342            XML_NAME_ALLOWED_CHARACTERS.set('\u0E34', '\u0E3A' + 1);
343            XML_NAME_ALLOWED_CHARACTERS.set('\u0E47', '\u0E4E' + 1);
344            XML_NAME_ALLOWED_CHARACTERS.set('\u0EB1');
345            XML_NAME_ALLOWED_CHARACTERS.set('\u0EB4', '\u0EB9' + 1);
346            XML_NAME_ALLOWED_CHARACTERS.set('\u0EBB', '\u0EBC' + 1);
347            XML_NAME_ALLOWED_CHARACTERS.set('\u0EC8', '\u0ECD' + 1);
348            XML_NAME_ALLOWED_CHARACTERS.set('\u0F18', '\u0F19' + 1);
349            XML_NAME_ALLOWED_CHARACTERS.set('\u0F35');
350            XML_NAME_ALLOWED_CHARACTERS.set('\u0F37');
351            XML_NAME_ALLOWED_CHARACTERS.set('\u0F39');
352            XML_NAME_ALLOWED_CHARACTERS.set('\u0F3E');
353            XML_NAME_ALLOWED_CHARACTERS.set('\u0F3F');
354            XML_NAME_ALLOWED_CHARACTERS.set('\u0F71', '\u0F84' + 1);
355            XML_NAME_ALLOWED_CHARACTERS.set('\u0F86', '\u0F8B' + 1);
356            XML_NAME_ALLOWED_CHARACTERS.set('\u0F90', '\u0F95' + 1);
357            XML_NAME_ALLOWED_CHARACTERS.set('\u0F97');
358            XML_NAME_ALLOWED_CHARACTERS.set('\u0F99', '\u0FAD' + 1);
359            XML_NAME_ALLOWED_CHARACTERS.set('\u0FB1', '\u0FB7' + 1);
360            XML_NAME_ALLOWED_CHARACTERS.set('\u0FB9');
361            XML_NAME_ALLOWED_CHARACTERS.set('\u20D0', '\u20DC' + 1);
362            XML_NAME_ALLOWED_CHARACTERS.set('\u20E1');
363            XML_NAME_ALLOWED_CHARACTERS.set('\u302A', '\u302F' + 1);
364            XML_NAME_ALLOWED_CHARACTERS.set('\u3099');
365            XML_NAME_ALLOWED_CHARACTERS.set('\u309A');
366    
367            // XML Digits
368            XML_NAME_ALLOWED_CHARACTERS.set('\u0030', '\u0039' + 1);
369            XML_NAME_ALLOWED_CHARACTERS.set('\u0660', '\u0669' + 1);
370            XML_NAME_ALLOWED_CHARACTERS.set('\u06F0', '\u06F9' + 1);
371            XML_NAME_ALLOWED_CHARACTERS.set('\u0966', '\u096F' + 1);
372            XML_NAME_ALLOWED_CHARACTERS.set('\u09E6', '\u09EF' + 1);
373            XML_NAME_ALLOWED_CHARACTERS.set('\u0A66', '\u0A6F' + 1);
374            XML_NAME_ALLOWED_CHARACTERS.set('\u0AE6', '\u0AEF' + 1);
375            XML_NAME_ALLOWED_CHARACTERS.set('\u0B66', '\u0B6F' + 1);
376            XML_NAME_ALLOWED_CHARACTERS.set('\u0BE7', '\u0BEF' + 1);
377            XML_NAME_ALLOWED_CHARACTERS.set('\u0C66', '\u0C6F' + 1);
378            XML_NAME_ALLOWED_CHARACTERS.set('\u0CE6', '\u0CEF' + 1);
379            XML_NAME_ALLOWED_CHARACTERS.set('\u0D66', '\u0D6F' + 1);
380            XML_NAME_ALLOWED_CHARACTERS.set('\u0E50', '\u0E59' + 1);
381            XML_NAME_ALLOWED_CHARACTERS.set('\u0ED0', '\u0ED9' + 1);
382            XML_NAME_ALLOWED_CHARACTERS.set('\u0F20', '\u0F29' + 1);
383    
384            // XML Extenders
385            XML_NAME_ALLOWED_CHARACTERS.set('\u00B7');
386            XML_NAME_ALLOWED_CHARACTERS.set('\u02D0');
387            XML_NAME_ALLOWED_CHARACTERS.set('\u02D1');
388            XML_NAME_ALLOWED_CHARACTERS.set('\u0387');
389            XML_NAME_ALLOWED_CHARACTERS.set('\u0640');
390            XML_NAME_ALLOWED_CHARACTERS.set('\u0E46');
391            XML_NAME_ALLOWED_CHARACTERS.set('\u0EC6');
392            XML_NAME_ALLOWED_CHARACTERS.set('\u3005');
393            XML_NAME_ALLOWED_CHARACTERS.set('\u3031', '\u3035' + 1);
394            XML_NAME_ALLOWED_CHARACTERS.set('\u309D', '\u309E' + 1);
395            XML_NAME_ALLOWED_CHARACTERS.set('\u30FC', '\u30FE' + 1);
396        }
397    
398        /**
399         * {@inheritDoc}
400         * 
401         * @see org.jboss.dna.common.text.TextDecoder#decode(java.lang.String)
402         */
403        public String decode( String encodedText ) {
404            if (encodedText == null) return null;
405            if (encodedText.length() < 7) {
406                // Not big enough to have an encoded sequence
407                return encodedText;
408            }
409            StringBuilder sb = new StringBuilder();
410            char[] digits = new char[4];
411            CharacterIterator iter = new StringCharacterIterator(encodedText);
412            for (char c = iter.first(); c != CharacterIterator.DONE; c = iter.next()) {
413                if (c == '_') {
414                    // Read the next character, if there is one ...
415                    char next = iter.next();
416                    if (next == CharacterIterator.DONE) {
417                        sb.append(c);
418                        break;
419                    }
420                    // If the next character is not 'x', then these are just regular characters ...
421                    if (next != 'x') {
422                        sb.append(c).append(next);
423                        continue;
424                    }
425                    // Read the next 4 characters (digits) and another '_' character ...
426                    digits[0] = iter.next();
427                    if (digits[0] == CharacterIterator.DONE) {
428                        sb.append(c).append(next);
429                        break;
430                    }
431                    digits[1] = iter.next();
432                    if (digits[1] == CharacterIterator.DONE) {
433                        sb.append(c).append(next).append(digits, 0, 1);
434                        break;
435                    }
436                    digits[2] = iter.next();
437                    if (digits[2] == CharacterIterator.DONE) {
438                        sb.append(c).append(next).append(digits, 0, 2);
439                        break;
440                    }
441                    digits[3] = iter.next();
442                    if (digits[3] == CharacterIterator.DONE) {
443                        sb.append(c).append(next).append(digits, 0, 3);
444                        break;
445                    }
446                    char underscore = iter.next();
447                    if (underscore != '_') { // includes DONE
448                        sb.append(c).append(next).append(digits, 0, 4);
449                        if (underscore == CharacterIterator.DONE) break;
450                        sb.append(underscore);
451                        continue;
452                    }
453                    // We've read all 4 digits, including the trailing '_'
454                    // Now parse into the resulting character
455                    try {
456                        sb.appendCodePoint(Integer.parseInt(new String(digits), 16));
457                    } catch (NumberFormatException e) {
458                        // code was not hexadecimal, so just write out the characters as is ...
459                        sb.append(c).append(next).append(digits).append(underscore);
460                    }
461                } else {
462                    // Just append other characters ...
463                    sb.append(c);
464                }
465            }
466            return sb.toString();
467        }
468    
469        /**
470         * {@inheritDoc}
471         * 
472         * @see org.jboss.dna.common.text.TextEncoder#encode(java.lang.String)
473         */
474        public String encode( String text ) {
475            if (text == null) return null;
476            if (text.length() == 0) return text;
477            StringBuilder sb = new StringBuilder();
478            String hex = null;
479            CharacterIterator iter = new StringCharacterIterator(text);
480            for (char c = iter.first(); c != CharacterIterator.DONE; c = iter.next()) {
481                if (c == '_') {
482                    // Read the next character (if there is one) ...
483                    char next = iter.next();
484                    if (next == CharacterIterator.DONE) {
485                        sb.append(c);
486                        break;
487                    }
488                    // If the next character is not 'x', then these are just regular characters ...
489                    if (next != 'x') {
490                        sb.append(c).append(next);
491                        continue;
492                    }
493                    // The next character is 'x', so write out the '_' character in encoded form ...
494                    sb.append("_x005f_");
495                    // And then write out the next character ...
496                    sb.append(next);
497                } else if (XML_NAME_ALLOWED_CHARACTERS.get(c)) {
498                    // Legal characters for an XML Name ...
499                    sb.append(c);
500                } else {
501                    // All other characters must be escaped with '_xHHHH_' where 'HHHH' is the hex string for the code point
502                    hex = Integer.toHexString(c);
503                    // The hex string excludes the leading '0's, so check the character values so we know how many to prepend
504                    if (c >= '\u0000' && c <= '\u000f') {
505                        sb.append("_x000").append(hex);
506                    } else if (c >= '\u0010' && c <= '\u00ff') {
507                        sb.append("_x00").append(hex);
508                    } else if (c >= '\u0100' && c <= '\u0fff') {
509                        sb.append("_x0").append(hex);
510                    } else {
511                        sb.append("_x").append(hex);
512                    }
513                    sb.append('_');
514                }
515            }
516            return sb.toString();
517        }
518    
519    }