001 /* 002 * JBoss DNA (http://www.jboss.org/dna) 003 * See the COPYRIGHT.txt file distributed with this work for information 004 * regarding copyright ownership. Some portions may be licensed 005 * to Red Hat, Inc. under one or more contributor license agreements. 006 * See the AUTHORS.txt file in the distribution for a full listing of 007 * individual contributors. 008 * 009 * JBoss DNA is free software. Unless otherwise indicated, all code in JBoss DNA 010 * is licensed to you under the terms of the GNU Lesser General Public License as 011 * published by the Free Software Foundation; either version 2.1 of 012 * the License, or (at your option) any later version. 013 * 014 * JBoss DNA is distributed in the hope that it will be useful, 015 * but WITHOUT ANY WARRANTY; without even the implied warranty of 016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 017 * Lesser General Public License for more details. 018 * 019 * You should have received a copy of the GNU Lesser General Public 020 * License along with this software; if not, write to the Free 021 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 022 * 02110-1301 USA, or see the FSF site: http://www.fsf.org. 023 */ 024 package org.jboss.dna.common.text; 025 026 import java.util.HashSet; 027 import java.util.LinkedList; 028 import java.util.Set; 029 import java.util.regex.Matcher; 030 import java.util.regex.Pattern; 031 import org.jboss.dna.common.util.CheckArg; 032 033 /** 034 * Transforms words to singular, plural, humanized (human readable), underscore, camel case, or ordinal form. This is inspired by 035 * the <a href="http://api.rubyonrails.org/classes/Inflector.html">Inflector</a> class in <a 036 * href="http://www.rubyonrails.org">Ruby on Rails</a>, which is distributed under the <a 037 * href="http://wiki.rubyonrails.org/rails/pages/License">Rails license</a>. 038 * 039 * @author Randall Hauch 040 */ 041 public class Inflector { 042 043 protected static final Inflector INSTANCE = new Inflector(); 044 045 public static final Inflector getInstance() { 046 return INSTANCE; 047 } 048 049 protected class Rule { 050 051 protected final String expression; 052 protected final Pattern expressionPattern; 053 protected final String replacement; 054 055 protected Rule( String expression, 056 String replacement ) { 057 this.expression = expression; 058 this.replacement = replacement != null ? replacement : ""; 059 this.expressionPattern = Pattern.compile(this.expression, Pattern.CASE_INSENSITIVE); 060 } 061 062 /** 063 * Apply the rule against the input string, returning the modified string or null if the rule didn't apply (and no 064 * modifications were made) 065 * 066 * @param input the input string 067 * @return the modified string if this rule applied, or null if the input was not modified by this rule 068 */ 069 protected String apply( String input ) { 070 Matcher matcher = this.expressionPattern.matcher(input); 071 if (!matcher.find()) return null; 072 return matcher.replaceAll(this.replacement); 073 } 074 075 @Override 076 public int hashCode() { 077 return expression.hashCode(); 078 } 079 080 @Override 081 public boolean equals( Object obj ) { 082 if (obj == this) return true; 083 if (obj != null && obj.getClass() == this.getClass()) { 084 final Rule that = (Rule)obj; 085 if (this.expression.equalsIgnoreCase(that.expression)) return true; 086 } 087 return false; 088 } 089 090 @Override 091 public String toString() { 092 return expression + ", " + replacement; 093 } 094 } 095 096 private LinkedList<Rule> plurals = new LinkedList<Rule>(); 097 private LinkedList<Rule> singulars = new LinkedList<Rule>(); 098 /** 099 * The lowercase words that are to be excluded and not processed. This map can be modified by the users via 100 * {@link #getUncountables()}. 101 */ 102 private final Set<String> uncountables = new HashSet<String>(); 103 104 public Inflector() { 105 initialize(); 106 } 107 108 protected Inflector( Inflector original ) { 109 this.plurals.addAll(original.plurals); 110 this.singulars.addAll(original.singulars); 111 this.uncountables.addAll(original.uncountables); 112 } 113 114 @Override 115 public Inflector clone() { 116 return new Inflector(this); 117 } 118 119 // ------------------------------------------------------------------------------------------------ 120 // Usage functions 121 // ------------------------------------------------------------------------------------------------ 122 123 /** 124 * Returns the plural form of the word in the string. 125 * <p> 126 * Examples: 127 * 128 * <pre> 129 * inflector.pluralize("post") #=> "posts" 130 * inflector.pluralize("octopus") #=> "octopi" 131 * inflector.pluralize("sheep") #=> "sheep" 132 * inflector.pluralize("words") #=> "words" 133 * inflector.pluralize("the blue mailman") #=> "the blue mailmen" 134 * inflector.pluralize("CamelOctopus") #=> "CamelOctopi" 135 * </pre> 136 * 137 * </p> 138 * <p> 139 * Note that if the {@link Object#toString()} is called on the supplied object, so this method works for non-strings, too. 140 * </p> 141 * 142 * @param word the word that is to be pluralized. 143 * @return the pluralized form of the word, or the word itself if it could not be pluralized 144 * @see #singularize(Object) 145 */ 146 public String pluralize( Object word ) { 147 if (word == null) return null; 148 String wordStr = word.toString().trim(); 149 if (wordStr.length() == 0) return wordStr; 150 if (isUncountable(wordStr)) return wordStr; 151 for (Rule rule : this.plurals) { 152 String result = rule.apply(wordStr); 153 if (result != null) return result; 154 } 155 return wordStr; 156 } 157 158 public String pluralize( Object word, 159 int count ) { 160 if (word == null) return null; 161 if (count == 1 || count == -1) { 162 return word.toString(); 163 } 164 return pluralize(word); 165 } 166 167 /** 168 * Returns the singular form of the word in the string. 169 * <p> 170 * Examples: 171 * 172 * <pre> 173 * inflector.singularize("posts") #=> "post" 174 * inflector.singularize("octopi") #=> "octopus" 175 * inflector.singularize("sheep") #=> "sheep" 176 * inflector.singularize("words") #=> "word" 177 * inflector.singularize("the blue mailmen") #=> "the blue mailman" 178 * inflector.singularize("CamelOctopi") #=> "CamelOctopus" 179 * </pre> 180 * 181 * </p> 182 * <p> 183 * Note that if the {@link Object#toString()} is called on the supplied object, so this method works for non-strings, too. 184 * </p> 185 * 186 * @param word the word that is to be pluralized. 187 * @return the pluralized form of the word, or the word itself if it could not be pluralized 188 * @see #pluralize(Object) 189 */ 190 public String singularize( Object word ) { 191 if (word == null) return null; 192 String wordStr = word.toString().trim(); 193 if (wordStr.length() == 0) return wordStr; 194 if (isUncountable(wordStr)) return wordStr; 195 for (Rule rule : this.singulars) { 196 String result = rule.apply(wordStr); 197 if (result != null) return result; 198 } 199 return wordStr; 200 } 201 202 /** 203 * Converts strings to lowerCamelCase. This method will also use any extra delimiter characters to identify word boundaries. 204 * <p> 205 * Examples: 206 * 207 * <pre> 208 * inflector.lowerCamelCase("active_record") #=> "activeRecord" 209 * inflector.lowerCamelCase("first_name") #=> "firstName" 210 * inflector.lowerCamelCase("name") #=> "name" 211 * inflector.lowerCamelCase("the-first_name",'-') #=> "theFirstName" 212 * </pre> 213 * 214 * </p> 215 * 216 * @param lowerCaseAndUnderscoredWord the word that is to be converted to camel case 217 * @param delimiterChars optional characters that are used to delimit word boundaries 218 * @return the lower camel case version of the word 219 * @see #underscore(String, char[]) 220 * @see #camelCase(String, boolean, char[]) 221 * @see #upperCamelCase(String, char[]) 222 */ 223 public String lowerCamelCase( String lowerCaseAndUnderscoredWord, 224 char... delimiterChars ) { 225 return camelCase(lowerCaseAndUnderscoredWord, false, delimiterChars); 226 } 227 228 /** 229 * Converts strings to UpperCamelCase. This method will also use any extra delimiter characters to identify word boundaries. 230 * <p> 231 * Examples: 232 * 233 * <pre> 234 * inflector.upperCamelCase("active_record") #=> "SctiveRecord" 235 * inflector.upperCamelCase("first_name") #=> "FirstName" 236 * inflector.upperCamelCase("name") #=> "Name" 237 * inflector.lowerCamelCase("the-first_name",'-') #=> "TheFirstName" 238 * </pre> 239 * 240 * </p> 241 * 242 * @param lowerCaseAndUnderscoredWord the word that is to be converted to camel case 243 * @param delimiterChars optional characters that are used to delimit word boundaries 244 * @return the upper camel case version of the word 245 * @see #underscore(String, char[]) 246 * @see #camelCase(String, boolean, char[]) 247 * @see #lowerCamelCase(String, char[]) 248 */ 249 public String upperCamelCase( String lowerCaseAndUnderscoredWord, 250 char... delimiterChars ) { 251 return camelCase(lowerCaseAndUnderscoredWord, true, delimiterChars); 252 } 253 254 /** 255 * By default, this method converts strings to UpperCamelCase. If the <code>uppercaseFirstLetter</code> argument to false, 256 * then this method produces lowerCamelCase. This method will also use any extra delimiter characters to identify word 257 * boundaries. 258 * <p> 259 * Examples: 260 * 261 * <pre> 262 * inflector.camelCase("active_record",false) #=> "activeRecord" 263 * inflector.camelCase("active_record",true) #=> "ActiveRecord" 264 * inflector.camelCase("first_name",false) #=> "firstName" 265 * inflector.camelCase("first_name",true) #=> "FirstName" 266 * inflector.camelCase("name",false) #=> "name" 267 * inflector.camelCase("name",true) #=> "Name" 268 * </pre> 269 * 270 * </p> 271 * 272 * @param lowerCaseAndUnderscoredWord the word that is to be converted to camel case 273 * @param uppercaseFirstLetter true if the first character is to be uppercased, or false if the first character is to be 274 * lowercased 275 * @param delimiterChars optional characters that are used to delimit word boundaries 276 * @return the camel case version of the word 277 * @see #underscore(String, char[]) 278 * @see #upperCamelCase(String, char[]) 279 * @see #lowerCamelCase(String, char[]) 280 */ 281 public String camelCase( String lowerCaseAndUnderscoredWord, 282 boolean uppercaseFirstLetter, 283 char... delimiterChars ) { 284 if (lowerCaseAndUnderscoredWord == null) return null; 285 lowerCaseAndUnderscoredWord = lowerCaseAndUnderscoredWord.trim(); 286 if (lowerCaseAndUnderscoredWord.length() == 0) return ""; 287 if (uppercaseFirstLetter) { 288 String result = lowerCaseAndUnderscoredWord; 289 // Replace any extra delimiters with underscores (before the underscores are converted in the next step)... 290 if (delimiterChars != null) { 291 for (char delimiterChar : delimiterChars) { 292 result = result.replace(delimiterChar, '_'); 293 } 294 } 295 296 // Change the case at the beginning at after each underscore ... 297 return replaceAllWithUppercase(result, "(^|_)(.)", 2); 298 } 299 if (lowerCaseAndUnderscoredWord.length() < 2) return lowerCaseAndUnderscoredWord; 300 return "" + Character.toLowerCase(lowerCaseAndUnderscoredWord.charAt(0)) 301 + camelCase(lowerCaseAndUnderscoredWord, true, delimiterChars).substring(1); 302 } 303 304 /** 305 * Makes an underscored form from the expression in the string (the reverse of the {@link #camelCase(String, boolean, char[]) 306 * camelCase} method. Also changes any characters that match the supplied delimiters into underscore. 307 * <p> 308 * Examples: 309 * 310 * <pre> 311 * inflector.underscore("activeRecord") #=> "active_record" 312 * inflector.underscore("ActiveRecord") #=> "active_record" 313 * inflector.underscore("firstName") #=> "first_name" 314 * inflector.underscore("FirstName") #=> "first_name" 315 * inflector.underscore("name") #=> "name" 316 * inflector.underscore("The.firstName") #=> "the_first_name" 317 * </pre> 318 * 319 * </p> 320 * 321 * @param camelCaseWord the camel-cased word that is to be converted; 322 * @param delimiterChars optional characters that are used to delimit word boundaries (beyond capitalization) 323 * @return a lower-cased version of the input, with separate words delimited by the underscore character. 324 */ 325 public String underscore( String camelCaseWord, 326 char... delimiterChars ) { 327 if (camelCaseWord == null) return null; 328 String result = camelCaseWord.trim(); 329 if (result.length() == 0) return ""; 330 result = result.replaceAll("([A-Z]+)([A-Z][a-z])", "$1_$2"); 331 result = result.replaceAll("([a-z\\d])([A-Z])", "$1_$2"); 332 result = result.replace('-', '_'); 333 if (delimiterChars != null) { 334 for (char delimiterChar : delimiterChars) { 335 result = result.replace(delimiterChar, '_'); 336 } 337 } 338 return result.toLowerCase(); 339 } 340 341 /** 342 * Returns a copy of the input with the first character converted to uppercase and the remainder to lowercase. 343 * 344 * @param words the word to be capitalized 345 * @return the string with the first character capitalized and the remaining characters lowercased 346 */ 347 public String capitalize( String words ) { 348 if (words == null) return null; 349 String result = words.trim(); 350 if (result.length() == 0) return ""; 351 if (result.length() == 1) return result.toUpperCase(); 352 return "" + Character.toUpperCase(result.charAt(0)) + result.substring(1).toLowerCase(); 353 } 354 355 /** 356 * Capitalizes the first word and turns underscores into spaces and strips trailing "_id" and any supplied removable tokens. 357 * Like {@link #titleCase(String, String[])}, this is meant for creating pretty output. 358 * <p> 359 * Examples: 360 * 361 * <pre> 362 * inflector.humanize("employee_salary") #=> "Employee salary" 363 * inflector.humanize("author_id") #=> "Author" 364 * </pre> 365 * 366 * </p> 367 * 368 * @param lowerCaseAndUnderscoredWords the input to be humanized 369 * @param removableTokens optional array of tokens that are to be removed 370 * @return the humanized string 371 * @see #titleCase(String, String[]) 372 */ 373 public String humanize( String lowerCaseAndUnderscoredWords, 374 String... removableTokens ) { 375 if (lowerCaseAndUnderscoredWords == null) return null; 376 String result = lowerCaseAndUnderscoredWords.trim(); 377 if (result.length() == 0) return ""; 378 // Remove a trailing "_id" token 379 result = result.replaceAll("_id$", ""); 380 // Remove all of the tokens that should be removed 381 if (removableTokens != null) { 382 for (String removableToken : removableTokens) { 383 result = result.replaceAll(removableToken, ""); 384 } 385 } 386 result = result.replaceAll("_+", " "); // replace all adjacent underscores with a single space 387 return capitalize(result); 388 } 389 390 /** 391 * Capitalizes all the words and replaces some characters in the string to create a nicer looking title. Underscores are 392 * changed to spaces, a trailing "_id" is removed, and any of the supplied tokens are removed. Like 393 * {@link #humanize(String, String[])}, this is meant for creating pretty output. 394 * <p> 395 * Examples: 396 * 397 * <pre> 398 * inflector.titleCase("man from the boondocks") #=> "Man From The Boondocks" 399 * inflector.titleCase("x-men: the last stand") #=> "X Men: The Last Stand" 400 * </pre> 401 * 402 * </p> 403 * 404 * @param words the input to be turned into title case 405 * @param removableTokens optional array of tokens that are to be removed 406 * @return the title-case version of the supplied words 407 */ 408 public String titleCase( String words, 409 String... removableTokens ) { 410 String result = humanize(words, removableTokens); 411 result = replaceAllWithUppercase(result, "\\b([a-z])", 1); // change first char of each word to uppercase 412 return result; 413 } 414 415 /** 416 * Turns a non-negative number into an ordinal string used to denote the position in an ordered sequence, such as 1st, 2nd, 417 * 3rd, 4th. 418 * 419 * @param number the non-negative number 420 * @return the string with the number and ordinal suffix 421 */ 422 public String ordinalize( int number ) { 423 int remainder = number % 100; 424 String numberStr = Integer.toString(number); 425 if (11 <= number && number <= 13) return numberStr + "th"; 426 remainder = number % 10; 427 if (remainder == 1) return numberStr + "st"; 428 if (remainder == 2) return numberStr + "nd"; 429 if (remainder == 3) return numberStr + "rd"; 430 return numberStr + "th"; 431 } 432 433 // ------------------------------------------------------------------------------------------------ 434 // Management methods 435 // ------------------------------------------------------------------------------------------------ 436 437 /** 438 * Determine whether the supplied word is considered uncountable by the {@link #pluralize(Object) pluralize} and 439 * {@link #singularize(Object) singularize} methods. 440 * 441 * @param word the word 442 * @return true if the plural and singular forms of the word are the same 443 */ 444 public boolean isUncountable( String word ) { 445 if (word == null) return false; 446 String trimmedLower = word.trim().toLowerCase(); 447 return this.uncountables.contains(trimmedLower); 448 } 449 450 /** 451 * Get the set of words that are not processed by the Inflector. The resulting map is directly modifiable. 452 * 453 * @return the set of uncountable words 454 */ 455 public Set<String> getUncountables() { 456 return uncountables; 457 } 458 459 public void addPluralize( String rule, 460 String replacement ) { 461 final Rule pluralizeRule = new Rule(rule, replacement); 462 this.plurals.addFirst(pluralizeRule); 463 } 464 465 public void addSingularize( String rule, 466 String replacement ) { 467 final Rule singularizeRule = new Rule(rule, replacement); 468 this.singulars.addFirst(singularizeRule); 469 } 470 471 public void addIrregular( String singular, 472 String plural ) { 473 CheckArg.isNotEmpty(singular, "singular rule"); 474 CheckArg.isNotEmpty(plural, "plural rule"); 475 String singularRemainder = singular.length() > 1 ? singular.substring(1) : ""; 476 String pluralRemainder = plural.length() > 1 ? plural.substring(1) : ""; 477 addPluralize("(" + singular.charAt(0) + ")" + singularRemainder + "$", "$1" + pluralRemainder); 478 addSingularize("(" + plural.charAt(0) + ")" + pluralRemainder + "$", "$1" + singularRemainder); 479 } 480 481 public void addUncountable( String... words ) { 482 if (words == null || words.length == 0) return; 483 for (String word : words) { 484 if (word != null) uncountables.add(word.trim().toLowerCase()); 485 } 486 } 487 488 /** 489 * Utility method to replace all occurrences given by the specific backreference with its uppercased form, and remove all 490 * other backreferences. 491 * <p> 492 * The Java {@link Pattern regular expression processing} does not use the preprocessing directives <code>\l</code>, 493 * <code>\u</code>, <code>\L</code>, and <code>\U</code>. If so, such directives could be used in the replacement string 494 * to uppercase or lowercase the backreferences. For example, <code>\L1</code> would lowercase the first backreference, and 495 * <code>\u3</code> would uppercase the 3rd backreference. 496 * </p> 497 * 498 * @param input 499 * @param regex 500 * @param groupNumberToUppercase 501 * @return the input string with the appropriate characters converted to upper-case 502 */ 503 protected static String replaceAllWithUppercase( String input, 504 String regex, 505 int groupNumberToUppercase ) { 506 Pattern underscoreAndDotPattern = Pattern.compile(regex); 507 Matcher matcher = underscoreAndDotPattern.matcher(input); 508 StringBuffer sb = new StringBuffer(); 509 while (matcher.find()) { 510 matcher.appendReplacement(sb, matcher.group(groupNumberToUppercase).toUpperCase()); 511 } 512 matcher.appendTail(sb); 513 return sb.toString(); 514 } 515 516 /** 517 * Completely remove all rules within this inflector. 518 */ 519 public void clear() { 520 this.uncountables.clear(); 521 this.plurals.clear(); 522 this.singulars.clear(); 523 } 524 525 protected void initialize() { 526 Inflector inflect = this; 527 inflect.addPluralize("$", "s"); 528 inflect.addPluralize("s$", "s"); 529 inflect.addPluralize("(ax|test)is$", "$1es"); 530 inflect.addPluralize("(octop|vir)us$", "$1i"); 531 inflect.addPluralize("(octop|vir)i$", "$1i"); // already plural 532 inflect.addPluralize("(alias|status)$", "$1es"); 533 inflect.addPluralize("(bu)s$", "$1ses"); 534 inflect.addPluralize("(buffal|tomat)o$", "$1oes"); 535 inflect.addPluralize("([ti])um$", "$1a"); 536 inflect.addPluralize("([ti])a$", "$1a"); // already plural 537 inflect.addPluralize("sis$", "ses"); 538 inflect.addPluralize("(?:([^f])fe|([lr])f)$", "$1$2ves"); 539 inflect.addPluralize("(hive)$", "$1s"); 540 inflect.addPluralize("([^aeiouy]|qu)y$", "$1ies"); 541 inflect.addPluralize("(x|ch|ss|sh)$", "$1es"); 542 inflect.addPluralize("(matr|vert|ind)ix|ex$", "$1ices"); 543 inflect.addPluralize("([m|l])ouse$", "$1ice"); 544 inflect.addPluralize("([m|l])ice$", "$1ice"); 545 inflect.addPluralize("^(ox)$", "$1en"); 546 inflect.addPluralize("(quiz)$", "$1zes"); 547 // Need to check for the following words that are already pluralized: 548 inflect.addPluralize("(people|men|children|sexes|moves|stadiums)$", "$1"); // irregulars 549 inflect.addPluralize("(oxen|octopi|viri|aliases|quizzes)$", "$1"); // special rules 550 551 inflect.addSingularize("s$", ""); 552 inflect.addSingularize("(s|si|u)s$", "$1s"); // '-us' and '-ss' are already singular 553 inflect.addSingularize("(n)ews$", "$1ews"); 554 inflect.addSingularize("([ti])a$", "$1um"); 555 inflect.addSingularize("((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$", "$1$2sis"); 556 inflect.addSingularize("(^analy)ses$", "$1sis"); 557 inflect.addSingularize("(^analy)sis$", "$1sis"); // already singular, but ends in 's' 558 inflect.addSingularize("([^f])ves$", "$1fe"); 559 inflect.addSingularize("(hive)s$", "$1"); 560 inflect.addSingularize("(tive)s$", "$1"); 561 inflect.addSingularize("([lr])ves$", "$1f"); 562 inflect.addSingularize("([^aeiouy]|qu)ies$", "$1y"); 563 inflect.addSingularize("(s)eries$", "$1eries"); 564 inflect.addSingularize("(m)ovies$", "$1ovie"); 565 inflect.addSingularize("(x|ch|ss|sh)es$", "$1"); 566 inflect.addSingularize("([m|l])ice$", "$1ouse"); 567 inflect.addSingularize("(bus)es$", "$1"); 568 inflect.addSingularize("(o)es$", "$1"); 569 inflect.addSingularize("(shoe)s$", "$1"); 570 inflect.addSingularize("(cris|ax|test)is$", "$1is"); // already singular, but ends in 's' 571 inflect.addSingularize("(cris|ax|test)es$", "$1is"); 572 inflect.addSingularize("(octop|vir)i$", "$1us"); 573 inflect.addSingularize("(octop|vir)us$", "$1us"); // already singular, but ends in 's' 574 inflect.addSingularize("(alias|status)es$", "$1"); 575 inflect.addSingularize("(alias|status)$", "$1"); // already singular, but ends in 's' 576 inflect.addSingularize("^(ox)en", "$1"); 577 inflect.addSingularize("(vert|ind)ices$", "$1ex"); 578 inflect.addSingularize("(matr)ices$", "$1ix"); 579 inflect.addSingularize("(quiz)zes$", "$1"); 580 581 inflect.addIrregular("person", "people"); 582 inflect.addIrregular("man", "men"); 583 inflect.addIrregular("child", "children"); 584 inflect.addIrregular("sex", "sexes"); 585 inflect.addIrregular("move", "moves"); 586 inflect.addIrregular("stadium", "stadiums"); 587 588 inflect.addUncountable("equipment", "information", "rice", "money", "species", "series", "fish", "sheep"); 589 } 590 591 }