001 /* 002 * JBoss DNA (http://www.jboss.org/dna) 003 * See the COPYRIGHT.txt file distributed with this work for information 004 * regarding copyright ownership. Some portions may be licensed 005 * to Red Hat, Inc. under one or more contributor license agreements. 006 * See the AUTHORS.txt file in the distribution for a full listing of 007 * individual contributors. 008 * 009 * JBoss DNA is free software. Unless otherwise indicated, all code in JBoss DNA 010 * is licensed to you under the terms of the GNU Lesser General Public License as 011 * published by the Free Software Foundation; either version 2.1 of 012 * the License, or (at your option) any later version. 013 * 014 * JBoss DNA is distributed in the hope that it will be useful, 015 * but WITHOUT ANY WARRANTY; without even the implied warranty of 016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 017 * Lesser General Public License for more details. 018 * 019 * You should have received a copy of the GNU Lesser General Public 020 * License along with this software; if not, write to the Free 021 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 022 * 02110-1301 USA, or see the FSF site: http://www.fsf.org. 023 */ 024 package org.jboss.dna.common.text; 025 026 import java.text.CharacterIterator; 027 import java.text.StringCharacterIterator; 028 import java.util.BitSet; 029 030 /** 031 * An {@link TextEncoder encoder} and {@link TextDecoder decoder} for XML element and attribute names. 032 * <p> 033 * Any UTF-16 unicode character that is not a valid XML name character according to the <a 034 * href="http://www.w3.org/TR/REC-xml/#sec-common-syn">World Wide Web Consortium (W3C) Extensible Markup Language (XML) 1.0 035 * (Fourth Edition) Recommendation</a> is escaped as <code>_xHHHH_</code>, where <code>HHHH</code> stands for the four-digit 036 * hexadecimal UTF-16 unicode value for the character in the most significant bit first order. For example, the name "Customer_ID" 037 * is encoded as "Customer_x0020_ID". 038 * </p> 039 * <p> 040 * Decoding transforms every <code>_xHHHH_</code> encoding sequences back into the UTF-16 character. Note that 041 * {@link #decode(String) decoding} can be safely done on any XML name, even if the name does not contain any encoded sequences. 042 * </p> 043 * 044 * @author Randall Hauch 045 */ 046 public class XmlNameEncoder implements TextDecoder, TextEncoder { 047 048 private static final BitSet XML_NAME_ALLOWED_CHARACTERS = new BitSet(2 ^ 16); 049 050 static { 051 // Initialize the unescaped bitset ... 052 053 // XML Names may contain: Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender 054 XML_NAME_ALLOWED_CHARACTERS.set('.'); 055 XML_NAME_ALLOWED_CHARACTERS.set('-'); 056 XML_NAME_ALLOWED_CHARACTERS.set('_'); 057 XML_NAME_ALLOWED_CHARACTERS.set(':'); 058 059 // XML Base Character Set 060 XML_NAME_ALLOWED_CHARACTERS.set('\u0041', '\u005A' + 1); 061 XML_NAME_ALLOWED_CHARACTERS.set('\u0061', '\u007A' + 1); 062 XML_NAME_ALLOWED_CHARACTERS.set('\u00C0', '\u00D6' + 1); 063 XML_NAME_ALLOWED_CHARACTERS.set('\u00D8', '\u00F6' + 1); 064 XML_NAME_ALLOWED_CHARACTERS.set('\u00F8', '\u00FF' + 1); 065 XML_NAME_ALLOWED_CHARACTERS.set('\u0100', '\u0131' + 1); 066 XML_NAME_ALLOWED_CHARACTERS.set('\u0134', '\u013E' + 1); 067 XML_NAME_ALLOWED_CHARACTERS.set('\u0141', '\u0148' + 1); 068 XML_NAME_ALLOWED_CHARACTERS.set('\u014A', '\u017E' + 1); 069 XML_NAME_ALLOWED_CHARACTERS.set('\u0180', '\u01C3' + 1); 070 XML_NAME_ALLOWED_CHARACTERS.set('\u01CD', '\u01F0' + 1); 071 XML_NAME_ALLOWED_CHARACTERS.set('\u01F4', '\u01F5' + 1); 072 XML_NAME_ALLOWED_CHARACTERS.set('\u01FA', '\u0217' + 1); 073 XML_NAME_ALLOWED_CHARACTERS.set('\u0250', '\u02A8' + 1); 074 XML_NAME_ALLOWED_CHARACTERS.set('\u02BB', '\u02C1' + 1); 075 XML_NAME_ALLOWED_CHARACTERS.set('\u0386'); 076 XML_NAME_ALLOWED_CHARACTERS.set('\u0388', '\u038A' + 1); 077 XML_NAME_ALLOWED_CHARACTERS.set('\u038C'); 078 XML_NAME_ALLOWED_CHARACTERS.set('\u038E', '\u03A1' + 1); 079 XML_NAME_ALLOWED_CHARACTERS.set('\u03A3', '\u03CE' + 1); 080 XML_NAME_ALLOWED_CHARACTERS.set('\u03D0', '\u03D6' + 1); 081 XML_NAME_ALLOWED_CHARACTERS.set('\u03DA'); 082 XML_NAME_ALLOWED_CHARACTERS.set('\u03DC'); 083 XML_NAME_ALLOWED_CHARACTERS.set('\u03DE'); 084 XML_NAME_ALLOWED_CHARACTERS.set('\u03E0'); 085 XML_NAME_ALLOWED_CHARACTERS.set('\u03E2', '\u03F3' + 1); 086 XML_NAME_ALLOWED_CHARACTERS.set('\u0401', '\u040C' + 1); 087 XML_NAME_ALLOWED_CHARACTERS.set('\u040E', '\u044F' + 1); 088 XML_NAME_ALLOWED_CHARACTERS.set('\u0451', '\u045C' + 1); 089 XML_NAME_ALLOWED_CHARACTERS.set('\u045E', '\u0481' + 1); 090 XML_NAME_ALLOWED_CHARACTERS.set('\u0490', '\u04C4' + 1); 091 XML_NAME_ALLOWED_CHARACTERS.set('\u04C7', '\u04C8' + 1); 092 XML_NAME_ALLOWED_CHARACTERS.set('\u04CB', '\u04CC' + 1); 093 XML_NAME_ALLOWED_CHARACTERS.set('\u04D0', '\u04EB' + 1); 094 XML_NAME_ALLOWED_CHARACTERS.set('\u04EE', '\u04F5' + 1); 095 XML_NAME_ALLOWED_CHARACTERS.set('\u04F8', '\u04F9' + 1); 096 XML_NAME_ALLOWED_CHARACTERS.set('\u0531', '\u0556' + 1); 097 XML_NAME_ALLOWED_CHARACTERS.set('\u0559'); 098 XML_NAME_ALLOWED_CHARACTERS.set('\u0561', '\u0586' + 1); 099 XML_NAME_ALLOWED_CHARACTERS.set('\u05D0', '\u05EA' + 1); 100 XML_NAME_ALLOWED_CHARACTERS.set('\u05F0', '\u05F2' + 1); 101 XML_NAME_ALLOWED_CHARACTERS.set('\u0621', '\u063A' + 1); 102 XML_NAME_ALLOWED_CHARACTERS.set('\u0641', '\u064A' + 1); 103 XML_NAME_ALLOWED_CHARACTERS.set('\u0671', '\u06B7' + 1); 104 XML_NAME_ALLOWED_CHARACTERS.set('\u06BA', '\u06BE' + 1); 105 XML_NAME_ALLOWED_CHARACTERS.set('\u06C0', '\u06CE' + 1); 106 XML_NAME_ALLOWED_CHARACTERS.set('\u06D0', '\u06D3' + 1); 107 XML_NAME_ALLOWED_CHARACTERS.set('\u06D5'); 108 XML_NAME_ALLOWED_CHARACTERS.set('\u06E5', '\u06E6' + 1); 109 XML_NAME_ALLOWED_CHARACTERS.set('\u0905', '\u0939' + 1); 110 XML_NAME_ALLOWED_CHARACTERS.set('\u093D'); 111 XML_NAME_ALLOWED_CHARACTERS.set('\u0958', '\u0961' + 1); 112 XML_NAME_ALLOWED_CHARACTERS.set('\u0985', '\u098C' + 1); 113 XML_NAME_ALLOWED_CHARACTERS.set('\u098F', '\u0990' + 1); 114 XML_NAME_ALLOWED_CHARACTERS.set('\u0993', '\u09A8' + 1); 115 XML_NAME_ALLOWED_CHARACTERS.set('\u09AA', '\u09B0' + 1); 116 XML_NAME_ALLOWED_CHARACTERS.set('\u09B2'); 117 XML_NAME_ALLOWED_CHARACTERS.set('\u09B6', '\u09B9' + 1); 118 XML_NAME_ALLOWED_CHARACTERS.set('\u09DC', '\u09DD' + 1); 119 XML_NAME_ALLOWED_CHARACTERS.set('\u09DF', '\u09E1' + 1); 120 XML_NAME_ALLOWED_CHARACTERS.set('\u09F0', '\u09F1' + 1); 121 XML_NAME_ALLOWED_CHARACTERS.set('\u0A05', '\u0A0A' + 1); 122 XML_NAME_ALLOWED_CHARACTERS.set('\u0A0F', '\u0A10' + 1); 123 XML_NAME_ALLOWED_CHARACTERS.set('\u0A13', '\u0A28' + 1); 124 XML_NAME_ALLOWED_CHARACTERS.set('\u0A2A', '\u0A30' + 1); 125 XML_NAME_ALLOWED_CHARACTERS.set('\u0A32', '\u0A33' + 1); 126 XML_NAME_ALLOWED_CHARACTERS.set('\u0A35', '\u0A36' + 1); 127 XML_NAME_ALLOWED_CHARACTERS.set('\u0A38', '\u0A39' + 1); 128 XML_NAME_ALLOWED_CHARACTERS.set('\u0A59', '\u0A5C' + 1); 129 XML_NAME_ALLOWED_CHARACTERS.set('\u0A5E'); 130 XML_NAME_ALLOWED_CHARACTERS.set('\u0A72', '\u0A74' + 1); 131 XML_NAME_ALLOWED_CHARACTERS.set('\u0A85', '\u0A8B' + 1); 132 XML_NAME_ALLOWED_CHARACTERS.set('\u0A8D'); 133 XML_NAME_ALLOWED_CHARACTERS.set('\u0A8F', '\u0A91' + 1); 134 XML_NAME_ALLOWED_CHARACTERS.set('\u0A93', '\u0AA8' + 1); 135 XML_NAME_ALLOWED_CHARACTERS.set('\u0AAA', '\u0AB0' + 1); 136 XML_NAME_ALLOWED_CHARACTERS.set('\u0AB2', '\u0AB3' + 1); 137 XML_NAME_ALLOWED_CHARACTERS.set('\u0AB5', '\u0AB9' + 1); 138 XML_NAME_ALLOWED_CHARACTERS.set('\u0ABD'); 139 XML_NAME_ALLOWED_CHARACTERS.set('\u0AE0'); 140 XML_NAME_ALLOWED_CHARACTERS.set('\u0B05', '\u0B0C' + 1); 141 XML_NAME_ALLOWED_CHARACTERS.set('\u0B0F', '\u0B10' + 1); 142 XML_NAME_ALLOWED_CHARACTERS.set('\u0B13', '\u0B28' + 1); 143 XML_NAME_ALLOWED_CHARACTERS.set('\u0B2A', '\u0B30' + 1); 144 XML_NAME_ALLOWED_CHARACTERS.set('\u0B32', '\u0B33' + 1); 145 XML_NAME_ALLOWED_CHARACTERS.set('\u0B36', '\u0B39' + 1); 146 XML_NAME_ALLOWED_CHARACTERS.set('\u0B3D'); 147 XML_NAME_ALLOWED_CHARACTERS.set('\u0B5C', '\u0B5D' + 1); 148 XML_NAME_ALLOWED_CHARACTERS.set('\u0B5F', '\u0B61' + 1); 149 XML_NAME_ALLOWED_CHARACTERS.set('\u0B85', '\u0B8A' + 1); 150 XML_NAME_ALLOWED_CHARACTERS.set('\u0B8E', '\u0B90' + 1); 151 XML_NAME_ALLOWED_CHARACTERS.set('\u0B92', '\u0B95' + 1); 152 XML_NAME_ALLOWED_CHARACTERS.set('\u0B99', '\u0B9A' + 1); 153 XML_NAME_ALLOWED_CHARACTERS.set('\u0B9C'); 154 XML_NAME_ALLOWED_CHARACTERS.set('\u0B9E', '\u0B9F' + 1); 155 XML_NAME_ALLOWED_CHARACTERS.set('\u0BA3', '\u0BA4' + 1); 156 XML_NAME_ALLOWED_CHARACTERS.set('\u0BA8', '\u0BAA' + 1); 157 XML_NAME_ALLOWED_CHARACTERS.set('\u0BAE', '\u0BB5' + 1); 158 XML_NAME_ALLOWED_CHARACTERS.set('\u0BB7', '\u0BB9' + 1); 159 XML_NAME_ALLOWED_CHARACTERS.set('\u0C05', '\u0C0C' + 1); 160 XML_NAME_ALLOWED_CHARACTERS.set('\u0C0E', '\u0C10' + 1); 161 XML_NAME_ALLOWED_CHARACTERS.set('\u0C12', '\u0C28' + 1); 162 XML_NAME_ALLOWED_CHARACTERS.set('\u0C2A', '\u0C33' + 1); 163 XML_NAME_ALLOWED_CHARACTERS.set('\u0C35', '\u0C39' + 1); 164 XML_NAME_ALLOWED_CHARACTERS.set('\u0C60', '\u0C61' + 1); 165 XML_NAME_ALLOWED_CHARACTERS.set('\u0C85', '\u0C8C' + 1); 166 XML_NAME_ALLOWED_CHARACTERS.set('\u0C8E', '\u0C90' + 1); 167 XML_NAME_ALLOWED_CHARACTERS.set('\u0C92', '\u0CA8' + 1); 168 XML_NAME_ALLOWED_CHARACTERS.set('\u0CAA', '\u0CB3' + 1); 169 XML_NAME_ALLOWED_CHARACTERS.set('\u0CB5', '\u0CB9' + 1); 170 XML_NAME_ALLOWED_CHARACTERS.set('\u0CDE'); 171 XML_NAME_ALLOWED_CHARACTERS.set('\u0CE0', '\u0CE1' + 1); 172 XML_NAME_ALLOWED_CHARACTERS.set('\u0D05', '\u0D0C' + 1); 173 XML_NAME_ALLOWED_CHARACTERS.set('\u0D0E', '\u0D10' + 1); 174 XML_NAME_ALLOWED_CHARACTERS.set('\u0D12', '\u0D28' + 1); 175 XML_NAME_ALLOWED_CHARACTERS.set('\u0D2A', '\u0D39' + 1); 176 XML_NAME_ALLOWED_CHARACTERS.set('\u0D60', '\u0D61' + 1); 177 XML_NAME_ALLOWED_CHARACTERS.set('\u0E01', '\u0E2E' + 1); 178 XML_NAME_ALLOWED_CHARACTERS.set('\u0E30'); 179 XML_NAME_ALLOWED_CHARACTERS.set('\u0E32', '\u0E33' + 1); 180 XML_NAME_ALLOWED_CHARACTERS.set('\u0E40', '\u0E45' + 1); 181 XML_NAME_ALLOWED_CHARACTERS.set('\u0E81', '\u0E82' + 1); 182 XML_NAME_ALLOWED_CHARACTERS.set('\u0E84'); 183 XML_NAME_ALLOWED_CHARACTERS.set('\u0E87', '\u0E88' + 1); 184 XML_NAME_ALLOWED_CHARACTERS.set('\u0E8A'); 185 XML_NAME_ALLOWED_CHARACTERS.set('\u0E8D'); 186 XML_NAME_ALLOWED_CHARACTERS.set('\u0E94', '\u0E97' + 1); 187 XML_NAME_ALLOWED_CHARACTERS.set('\u0E99', '\u0E9F' + 1); 188 XML_NAME_ALLOWED_CHARACTERS.set('\u0EA1', '\u0EA3' + 1); 189 XML_NAME_ALLOWED_CHARACTERS.set('\u0EA5'); 190 XML_NAME_ALLOWED_CHARACTERS.set('\u0EA7'); 191 XML_NAME_ALLOWED_CHARACTERS.set('\u0EAA', '\u0EAB' + 1); 192 XML_NAME_ALLOWED_CHARACTERS.set('\u0EAD', '\u0EAE' + 1); 193 XML_NAME_ALLOWED_CHARACTERS.set('\u0EB0'); 194 XML_NAME_ALLOWED_CHARACTERS.set('\u0EB2', '\u0EB3' + 1); 195 XML_NAME_ALLOWED_CHARACTERS.set('\u0EBD'); 196 XML_NAME_ALLOWED_CHARACTERS.set('\u0EC0', '\u0EC4' + 1); 197 XML_NAME_ALLOWED_CHARACTERS.set('\u0F40', '\u0F47' + 1); 198 XML_NAME_ALLOWED_CHARACTERS.set('\u0F49', '\u0F69' + 1); 199 XML_NAME_ALLOWED_CHARACTERS.set('\u10A0', '\u10C5' + 1); 200 XML_NAME_ALLOWED_CHARACTERS.set('\u10D0', '\u10F6' + 1); 201 XML_NAME_ALLOWED_CHARACTERS.set('\u1100'); 202 XML_NAME_ALLOWED_CHARACTERS.set('\u1102', '\u1103' + 1); 203 XML_NAME_ALLOWED_CHARACTERS.set('\u1105', '\u1107' + 1); 204 XML_NAME_ALLOWED_CHARACTERS.set('\u1109'); 205 XML_NAME_ALLOWED_CHARACTERS.set('\u110B', '\u110C' + 1); 206 XML_NAME_ALLOWED_CHARACTERS.set('\u110E', '\u1112' + 1); 207 XML_NAME_ALLOWED_CHARACTERS.set('\u113C'); 208 XML_NAME_ALLOWED_CHARACTERS.set('\u113E'); 209 XML_NAME_ALLOWED_CHARACTERS.set('\u1140'); 210 XML_NAME_ALLOWED_CHARACTERS.set('\u114C'); 211 XML_NAME_ALLOWED_CHARACTERS.set('\u114E'); 212 XML_NAME_ALLOWED_CHARACTERS.set('\u1150'); 213 XML_NAME_ALLOWED_CHARACTERS.set('\u1154', '\u1155' + 1); 214 XML_NAME_ALLOWED_CHARACTERS.set('\u1159'); 215 XML_NAME_ALLOWED_CHARACTERS.set('\u115F', '\u1161' + 1); 216 XML_NAME_ALLOWED_CHARACTERS.set('\u1163'); 217 XML_NAME_ALLOWED_CHARACTERS.set('\u1165'); 218 XML_NAME_ALLOWED_CHARACTERS.set('\u1167'); 219 XML_NAME_ALLOWED_CHARACTERS.set('\u1169'); 220 XML_NAME_ALLOWED_CHARACTERS.set('\u116D', '\u116E' + 1); 221 XML_NAME_ALLOWED_CHARACTERS.set('\u1172', '\u1173' + 1); 222 XML_NAME_ALLOWED_CHARACTERS.set('\u1175'); 223 XML_NAME_ALLOWED_CHARACTERS.set('\u119E'); 224 XML_NAME_ALLOWED_CHARACTERS.set('\u11A8'); 225 XML_NAME_ALLOWED_CHARACTERS.set('\u11AB'); 226 XML_NAME_ALLOWED_CHARACTERS.set('\u11AE', '\u11AF' + 1); 227 XML_NAME_ALLOWED_CHARACTERS.set('\u11B7', '\u11B8' + 1); 228 XML_NAME_ALLOWED_CHARACTERS.set('\u11BA'); 229 XML_NAME_ALLOWED_CHARACTERS.set('\u11BC', '\u11C2' + 1); 230 XML_NAME_ALLOWED_CHARACTERS.set('\u11EB'); 231 XML_NAME_ALLOWED_CHARACTERS.set('\u11F0'); 232 XML_NAME_ALLOWED_CHARACTERS.set('\u11F9'); 233 XML_NAME_ALLOWED_CHARACTERS.set('\u1E00', '\u1E9B' + 1); 234 XML_NAME_ALLOWED_CHARACTERS.set('\u1EA0', '\u1EF9' + 1); 235 XML_NAME_ALLOWED_CHARACTERS.set('\u1F00', '\u1F15' + 1); 236 XML_NAME_ALLOWED_CHARACTERS.set('\u1F18', '\u1F1D' + 1); 237 XML_NAME_ALLOWED_CHARACTERS.set('\u1F20', '\u1F45' + 1); 238 XML_NAME_ALLOWED_CHARACTERS.set('\u1F48', '\u1F4D' + 1); 239 XML_NAME_ALLOWED_CHARACTERS.set('\u1F50', '\u1F57' + 1); 240 XML_NAME_ALLOWED_CHARACTERS.set('\u1F59'); 241 XML_NAME_ALLOWED_CHARACTERS.set('\u1F5B'); 242 XML_NAME_ALLOWED_CHARACTERS.set('\u1F5D'); 243 XML_NAME_ALLOWED_CHARACTERS.set('\u1F5F', '\u1F7D' + 1); 244 XML_NAME_ALLOWED_CHARACTERS.set('\u1F80', '\u1FB4' + 1); 245 XML_NAME_ALLOWED_CHARACTERS.set('\u1FB6', '\u1FBC' + 1); 246 XML_NAME_ALLOWED_CHARACTERS.set('\u1FBE'); 247 XML_NAME_ALLOWED_CHARACTERS.set('\u1FC2', '\u1FC4' + 1); 248 XML_NAME_ALLOWED_CHARACTERS.set('\u1FC6', '\u1FCC' + 1); 249 XML_NAME_ALLOWED_CHARACTERS.set('\u1FD0', '\u1FD3' + 1); 250 XML_NAME_ALLOWED_CHARACTERS.set('\u1FD6', '\u1FDB' + 1); 251 XML_NAME_ALLOWED_CHARACTERS.set('\u1FE0', '\u1FEC' + 1); 252 XML_NAME_ALLOWED_CHARACTERS.set('\u1FF2', '\u1FF4' + 1); 253 XML_NAME_ALLOWED_CHARACTERS.set('\u1FF6', '\u1FFC' + 1); 254 XML_NAME_ALLOWED_CHARACTERS.set('\u2126'); 255 XML_NAME_ALLOWED_CHARACTERS.set('\u212A', '\u212B' + 1); 256 XML_NAME_ALLOWED_CHARACTERS.set('\u212E'); 257 XML_NAME_ALLOWED_CHARACTERS.set('\u2180', '\u2182' + 1); 258 XML_NAME_ALLOWED_CHARACTERS.set('\u3041', '\u3094' + 1); 259 XML_NAME_ALLOWED_CHARACTERS.set('\u30A1', '\u30FA' + 1); 260 XML_NAME_ALLOWED_CHARACTERS.set('\u3105', '\u312C' + 1); 261 XML_NAME_ALLOWED_CHARACTERS.set('\uAC00', '\uD7A3' + 1); 262 263 // XML Ideograph Character Set 264 265 XML_NAME_ALLOWED_CHARACTERS.set('\u4E00', '\u9FA5' + 1); 266 XML_NAME_ALLOWED_CHARACTERS.set('\u3007'); 267 XML_NAME_ALLOWED_CHARACTERS.set('\u3021', '\u3029' + 1); 268 269 // XML Combining Character Set 270 271 XML_NAME_ALLOWED_CHARACTERS.set('\u0300', '\u0345' + 1); 272 XML_NAME_ALLOWED_CHARACTERS.set('\u0360', '\u0361' + 1); 273 XML_NAME_ALLOWED_CHARACTERS.set('\u0483', '\u0486' + 1); 274 XML_NAME_ALLOWED_CHARACTERS.set('\u0591', '\u05A1' + 1); 275 XML_NAME_ALLOWED_CHARACTERS.set('\u05A3', '\u05B9' + 1); 276 XML_NAME_ALLOWED_CHARACTERS.set('\u05BB', '\u05BD' + 1); 277 XML_NAME_ALLOWED_CHARACTERS.set('\u05BF'); 278 XML_NAME_ALLOWED_CHARACTERS.set('\u05C1', '\u05C2' + 1); 279 XML_NAME_ALLOWED_CHARACTERS.set('\u05C4'); 280 XML_NAME_ALLOWED_CHARACTERS.set('\u064B', '\u0652' + 1); 281 XML_NAME_ALLOWED_CHARACTERS.set('\u0670'); 282 XML_NAME_ALLOWED_CHARACTERS.set('\u06D6', '\u06DC' + 1); 283 XML_NAME_ALLOWED_CHARACTERS.set('\u06DD', '\u06DF' + 1); 284 XML_NAME_ALLOWED_CHARACTERS.set('\u06E0', '\u06E4' + 1); 285 XML_NAME_ALLOWED_CHARACTERS.set('\u06E7', '\u06E8' + 1); 286 XML_NAME_ALLOWED_CHARACTERS.set('\u06EA', '\u06ED' + 1); 287 XML_NAME_ALLOWED_CHARACTERS.set('\u0901', '\u0903' + 1); 288 XML_NAME_ALLOWED_CHARACTERS.set('\u093C'); 289 XML_NAME_ALLOWED_CHARACTERS.set('\u093E', '\u094C' + 1); 290 XML_NAME_ALLOWED_CHARACTERS.set('\u094D'); 291 XML_NAME_ALLOWED_CHARACTERS.set('\u0951', '\u0954' + 1); 292 XML_NAME_ALLOWED_CHARACTERS.set('\u0962', '\u0963' + 1); 293 XML_NAME_ALLOWED_CHARACTERS.set('\u0981', '\u0983' + 1); 294 XML_NAME_ALLOWED_CHARACTERS.set('\u09BC'); 295 XML_NAME_ALLOWED_CHARACTERS.set('\u09BE'); 296 XML_NAME_ALLOWED_CHARACTERS.set('\u09BF'); 297 XML_NAME_ALLOWED_CHARACTERS.set('\u09C0', '\u09C4' + 1); 298 XML_NAME_ALLOWED_CHARACTERS.set('\u09C7', '\u09C8' + 1); 299 XML_NAME_ALLOWED_CHARACTERS.set('\u09CB', '\u09CD' + 1); 300 XML_NAME_ALLOWED_CHARACTERS.set('\u09D7'); 301 XML_NAME_ALLOWED_CHARACTERS.set('\u09E2', '\u09E3' + 1); 302 XML_NAME_ALLOWED_CHARACTERS.set('\u0A02'); 303 XML_NAME_ALLOWED_CHARACTERS.set('\u0A3C'); 304 XML_NAME_ALLOWED_CHARACTERS.set('\u0A3E'); 305 XML_NAME_ALLOWED_CHARACTERS.set('\u0A3F'); 306 XML_NAME_ALLOWED_CHARACTERS.set('\u0A40', '\u0A42' + 1); 307 XML_NAME_ALLOWED_CHARACTERS.set('\u0A47', '\u0A48' + 1); 308 XML_NAME_ALLOWED_CHARACTERS.set('\u0A4B', '\u0A4D' + 1); 309 XML_NAME_ALLOWED_CHARACTERS.set('\u0A70', '\u0A71' + 1); 310 XML_NAME_ALLOWED_CHARACTERS.set('\u0A81', '\u0A83' + 1); 311 XML_NAME_ALLOWED_CHARACTERS.set('\u0ABC'); 312 XML_NAME_ALLOWED_CHARACTERS.set('\u0ABE', '\u0AC5' + 1); 313 XML_NAME_ALLOWED_CHARACTERS.set('\u0AC7', '\u0AC9' + 1); 314 XML_NAME_ALLOWED_CHARACTERS.set('\u0ACB', '\u0ACD' + 1); 315 XML_NAME_ALLOWED_CHARACTERS.set('\u0B01', '\u0B03' + 1); 316 XML_NAME_ALLOWED_CHARACTERS.set('\u0B3C'); 317 XML_NAME_ALLOWED_CHARACTERS.set('\u0B3E', '\u0B43' + 1); 318 XML_NAME_ALLOWED_CHARACTERS.set('\u0B47', '\u0B48' + 1); 319 XML_NAME_ALLOWED_CHARACTERS.set('\u0B4B', '\u0B4D' + 1); 320 XML_NAME_ALLOWED_CHARACTERS.set('\u0B56', '\u0B57' + 1); 321 XML_NAME_ALLOWED_CHARACTERS.set('\u0B82', '\u0B83' + 1); 322 XML_NAME_ALLOWED_CHARACTERS.set('\u0BBE', '\u0BC2' + 1); 323 XML_NAME_ALLOWED_CHARACTERS.set('\u0BC6', '\u0BC8' + 1); 324 XML_NAME_ALLOWED_CHARACTERS.set('\u0BCA', '\u0BCD' + 1); 325 XML_NAME_ALLOWED_CHARACTERS.set('\u0BD7'); 326 XML_NAME_ALLOWED_CHARACTERS.set('\u0C01', '\u0C03' + 1); 327 XML_NAME_ALLOWED_CHARACTERS.set('\u0C3E', '\u0C44' + 1); 328 XML_NAME_ALLOWED_CHARACTERS.set('\u0C46', '\u0C48' + 1); 329 XML_NAME_ALLOWED_CHARACTERS.set('\u0C4A', '\u0C4D' + 1); 330 XML_NAME_ALLOWED_CHARACTERS.set('\u0C55', '\u0C56' + 1); 331 XML_NAME_ALLOWED_CHARACTERS.set('\u0C82', '\u0C83' + 1); 332 XML_NAME_ALLOWED_CHARACTERS.set('\u0CBE', '\u0CC4' + 1); 333 XML_NAME_ALLOWED_CHARACTERS.set('\u0CC6', '\u0CC8' + 1); 334 XML_NAME_ALLOWED_CHARACTERS.set('\u0CCA', '\u0CCD' + 1); 335 XML_NAME_ALLOWED_CHARACTERS.set('\u0CD5', '\u0CD6' + 1); 336 XML_NAME_ALLOWED_CHARACTERS.set('\u0D02', '\u0D03' + 1); 337 XML_NAME_ALLOWED_CHARACTERS.set('\u0D3E', '\u0D43' + 1); 338 XML_NAME_ALLOWED_CHARACTERS.set('\u0D46', '\u0D48' + 1); 339 XML_NAME_ALLOWED_CHARACTERS.set('\u0D4A', '\u0D4D' + 1); 340 XML_NAME_ALLOWED_CHARACTERS.set('\u0D57'); 341 XML_NAME_ALLOWED_CHARACTERS.set('\u0E31'); 342 XML_NAME_ALLOWED_CHARACTERS.set('\u0E34', '\u0E3A' + 1); 343 XML_NAME_ALLOWED_CHARACTERS.set('\u0E47', '\u0E4E' + 1); 344 XML_NAME_ALLOWED_CHARACTERS.set('\u0EB1'); 345 XML_NAME_ALLOWED_CHARACTERS.set('\u0EB4', '\u0EB9' + 1); 346 XML_NAME_ALLOWED_CHARACTERS.set('\u0EBB', '\u0EBC' + 1); 347 XML_NAME_ALLOWED_CHARACTERS.set('\u0EC8', '\u0ECD' + 1); 348 XML_NAME_ALLOWED_CHARACTERS.set('\u0F18', '\u0F19' + 1); 349 XML_NAME_ALLOWED_CHARACTERS.set('\u0F35'); 350 XML_NAME_ALLOWED_CHARACTERS.set('\u0F37'); 351 XML_NAME_ALLOWED_CHARACTERS.set('\u0F39'); 352 XML_NAME_ALLOWED_CHARACTERS.set('\u0F3E'); 353 XML_NAME_ALLOWED_CHARACTERS.set('\u0F3F'); 354 XML_NAME_ALLOWED_CHARACTERS.set('\u0F71', '\u0F84' + 1); 355 XML_NAME_ALLOWED_CHARACTERS.set('\u0F86', '\u0F8B' + 1); 356 XML_NAME_ALLOWED_CHARACTERS.set('\u0F90', '\u0F95' + 1); 357 XML_NAME_ALLOWED_CHARACTERS.set('\u0F97'); 358 XML_NAME_ALLOWED_CHARACTERS.set('\u0F99', '\u0FAD' + 1); 359 XML_NAME_ALLOWED_CHARACTERS.set('\u0FB1', '\u0FB7' + 1); 360 XML_NAME_ALLOWED_CHARACTERS.set('\u0FB9'); 361 XML_NAME_ALLOWED_CHARACTERS.set('\u20D0', '\u20DC' + 1); 362 XML_NAME_ALLOWED_CHARACTERS.set('\u20E1'); 363 XML_NAME_ALLOWED_CHARACTERS.set('\u302A', '\u302F' + 1); 364 XML_NAME_ALLOWED_CHARACTERS.set('\u3099'); 365 XML_NAME_ALLOWED_CHARACTERS.set('\u309A'); 366 367 // XML Digits 368 XML_NAME_ALLOWED_CHARACTERS.set('\u0030', '\u0039' + 1); 369 XML_NAME_ALLOWED_CHARACTERS.set('\u0660', '\u0669' + 1); 370 XML_NAME_ALLOWED_CHARACTERS.set('\u06F0', '\u06F9' + 1); 371 XML_NAME_ALLOWED_CHARACTERS.set('\u0966', '\u096F' + 1); 372 XML_NAME_ALLOWED_CHARACTERS.set('\u09E6', '\u09EF' + 1); 373 XML_NAME_ALLOWED_CHARACTERS.set('\u0A66', '\u0A6F' + 1); 374 XML_NAME_ALLOWED_CHARACTERS.set('\u0AE6', '\u0AEF' + 1); 375 XML_NAME_ALLOWED_CHARACTERS.set('\u0B66', '\u0B6F' + 1); 376 XML_NAME_ALLOWED_CHARACTERS.set('\u0BE7', '\u0BEF' + 1); 377 XML_NAME_ALLOWED_CHARACTERS.set('\u0C66', '\u0C6F' + 1); 378 XML_NAME_ALLOWED_CHARACTERS.set('\u0CE6', '\u0CEF' + 1); 379 XML_NAME_ALLOWED_CHARACTERS.set('\u0D66', '\u0D6F' + 1); 380 XML_NAME_ALLOWED_CHARACTERS.set('\u0E50', '\u0E59' + 1); 381 XML_NAME_ALLOWED_CHARACTERS.set('\u0ED0', '\u0ED9' + 1); 382 XML_NAME_ALLOWED_CHARACTERS.set('\u0F20', '\u0F29' + 1); 383 384 // XML Extenders 385 XML_NAME_ALLOWED_CHARACTERS.set('\u00B7'); 386 XML_NAME_ALLOWED_CHARACTERS.set('\u02D0'); 387 XML_NAME_ALLOWED_CHARACTERS.set('\u02D1'); 388 XML_NAME_ALLOWED_CHARACTERS.set('\u0387'); 389 XML_NAME_ALLOWED_CHARACTERS.set('\u0640'); 390 XML_NAME_ALLOWED_CHARACTERS.set('\u0E46'); 391 XML_NAME_ALLOWED_CHARACTERS.set('\u0EC6'); 392 XML_NAME_ALLOWED_CHARACTERS.set('\u3005'); 393 XML_NAME_ALLOWED_CHARACTERS.set('\u3031', '\u3035' + 1); 394 XML_NAME_ALLOWED_CHARACTERS.set('\u309D', '\u309E' + 1); 395 XML_NAME_ALLOWED_CHARACTERS.set('\u30FC', '\u30FE' + 1); 396 } 397 398 /** 399 * {@inheritDoc} 400 * 401 * @see org.jboss.dna.common.text.TextDecoder#decode(java.lang.String) 402 */ 403 public String decode( String encodedText ) { 404 if (encodedText == null) return null; 405 if (encodedText.length() < 7) { 406 // Not big enough to have an encoded sequence 407 return encodedText; 408 } 409 StringBuilder sb = new StringBuilder(); 410 char[] digits = new char[4]; 411 CharacterIterator iter = new StringCharacterIterator(encodedText); 412 for (char c = iter.first(); c != CharacterIterator.DONE; c = iter.next()) { 413 if (c == '_') { 414 // Read the next character, if there is one ... 415 char next = iter.next(); 416 if (next == CharacterIterator.DONE) { 417 sb.append(c); 418 break; 419 } 420 // If the next character is not 'x', then these are just regular characters ... 421 if (next != 'x') { 422 sb.append(c).append(next); 423 continue; 424 } 425 // Read the next 4 characters (digits) and another '_' character ... 426 digits[0] = iter.next(); 427 if (digits[0] == CharacterIterator.DONE) { 428 sb.append(c).append(next); 429 break; 430 } 431 digits[1] = iter.next(); 432 if (digits[1] == CharacterIterator.DONE) { 433 sb.append(c).append(next).append(digits, 0, 1); 434 break; 435 } 436 digits[2] = iter.next(); 437 if (digits[2] == CharacterIterator.DONE) { 438 sb.append(c).append(next).append(digits, 0, 2); 439 break; 440 } 441 digits[3] = iter.next(); 442 if (digits[3] == CharacterIterator.DONE) { 443 sb.append(c).append(next).append(digits, 0, 3); 444 break; 445 } 446 char underscore = iter.next(); 447 if (underscore != '_') { // includes DONE 448 sb.append(c).append(next).append(digits, 0, 4); 449 if (underscore == CharacterIterator.DONE) break; 450 sb.append(underscore); 451 continue; 452 } 453 // We've read all 4 digits, including the trailing '_' 454 // Now parse into the resulting character 455 try { 456 sb.appendCodePoint(Integer.parseInt(new String(digits), 16)); 457 } catch (NumberFormatException e) { 458 // code was not hexadecimal, so just write out the characters as is ... 459 sb.append(c).append(next).append(digits).append(underscore); 460 } 461 } else { 462 // Just append other characters ... 463 sb.append(c); 464 } 465 } 466 return sb.toString(); 467 } 468 469 /** 470 * {@inheritDoc} 471 * 472 * @see org.jboss.dna.common.text.TextEncoder#encode(java.lang.String) 473 */ 474 public String encode( String text ) { 475 if (text == null) return null; 476 if (text.length() == 0) return text; 477 StringBuilder sb = new StringBuilder(); 478 String hex = null; 479 CharacterIterator iter = new StringCharacterIterator(text); 480 for (char c = iter.first(); c != CharacterIterator.DONE; c = iter.next()) { 481 if (c == '_') { 482 // Read the next character (if there is one) ... 483 char next = iter.next(); 484 if (next == CharacterIterator.DONE) { 485 sb.append(c); 486 break; 487 } 488 // If the next character is not 'x', then these are just regular characters ... 489 if (next != 'x') { 490 sb.append(c).append(next); 491 continue; 492 } 493 // The next character is 'x', so write out the '_' character in encoded form ... 494 sb.append("_x005f_"); 495 // And then write out the next character ... 496 sb.append(next); 497 } else if (XML_NAME_ALLOWED_CHARACTERS.get(c)) { 498 // Legal characters for an XML Name ... 499 sb.append(c); 500 } else { 501 // All other characters must be escaped with '_xHHHH_' where 'HHHH' is the hex string for the code point 502 hex = Integer.toHexString(c); 503 // The hex string excludes the leading '0's, so check the character values so we know how many to prepend 504 if (c >= '\u0000' && c <= '\u000f') { 505 sb.append("_x000").append(hex); 506 } else if (c >= '\u0010' && c <= '\u00ff') { 507 sb.append("_x00").append(hex); 508 } else if (c >= '\u0100' && c <= '\u0fff') { 509 sb.append("_x0").append(hex); 510 } else { 511 sb.append("_x").append(hex); 512 } 513 sb.append('_'); 514 } 515 } 516 return sb.toString(); 517 } 518 519 }