001 /*
002 * JBoss, Home of Professional Open Source.
003 * Copyright 2008, Red Hat Middleware LLC, and individual contributors
004 * as indicated by the @author tags. See the copyright.txt file in the
005 * distribution for a full listing of individual contributors.
006 *
007 * This is free software; you can redistribute it and/or modify it
008 * under the terms of the GNU Lesser General Public License as
009 * published by the Free Software Foundation; either version 2.1 of
010 * the License, or (at your option) any later version.
011 *
012 * This software is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015 * Lesser General Public License for more details.
016 *
017 * You should have received a copy of the GNU Lesser General Public
018 * License along with this software; if not, write to the Free
019 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
020 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
021 */
022 package org.jboss.dna.common.text;
023
024 import java.util.HashSet;
025 import java.util.LinkedList;
026 import java.util.Set;
027 import java.util.regex.Matcher;
028 import java.util.regex.Pattern;
029 import org.jboss.dna.common.util.CheckArg;
030
031 /**
032 * Transforms words to singular, plural, humanized (human readable), underscore, camel case, or ordinal form. This is inspired by
033 * the <a href="http://api.rubyonrails.org/classes/Inflector.html">Inflector</a> class in <a
034 * href="http://www.rubyonrails.org">Ruby on Rails</a>, which is distributed under the <a
035 * href="http://wiki.rubyonrails.org/rails/pages/License">Rails license</a>.
036 *
037 * @author Randall Hauch
038 */
039 public class Inflector {
040
041 protected static final Inflector INSTANCE = new Inflector();
042
043 public static final Inflector getInstance() {
044 return INSTANCE;
045 }
046
047 protected class Rule {
048
049 protected final String expression;
050 protected final Pattern expressionPattern;
051 protected final String replacement;
052
053 protected Rule( String expression,
054 String replacement ) {
055 this.expression = expression;
056 this.replacement = replacement != null ? replacement : "";
057 this.expressionPattern = Pattern.compile(this.expression, Pattern.CASE_INSENSITIVE);
058 }
059
060 /**
061 * Apply the rule against the input string, returning the modified string or null if the rule didn't apply (and no
062 * modifications were made)
063 *
064 * @param input the input string
065 * @return the modified string if this rule applied, or null if the input was not modified by this rule
066 */
067 protected String apply( String input ) {
068 Matcher matcher = this.expressionPattern.matcher(input);
069 if (!matcher.find()) return null;
070 return matcher.replaceAll(this.replacement);
071 }
072
073 @Override
074 public int hashCode() {
075 return expression.hashCode();
076 }
077
078 @Override
079 public boolean equals( Object obj ) {
080 if (obj == this) return true;
081 if (obj != null && obj.getClass() == this.getClass()) {
082 final Rule that = (Rule)obj;
083 if (this.expression.equalsIgnoreCase(that.expression)) return true;
084 }
085 return false;
086 }
087
088 @Override
089 public String toString() {
090 return expression + ", " + replacement;
091 }
092 }
093
094 private LinkedList<Rule> plurals = new LinkedList<Rule>();
095 private LinkedList<Rule> singulars = new LinkedList<Rule>();
096 /**
097 * The lowercase words that are to be excluded and not processed. This map can be modified by the users via
098 * {@link #getUncountables()}.
099 */
100 private final Set<String> uncountables = new HashSet<String>();
101
102 public Inflector() {
103 initialize();
104 }
105
106 protected Inflector( Inflector original ) {
107 this.plurals.addAll(original.plurals);
108 this.singulars.addAll(original.singulars);
109 this.uncountables.addAll(original.uncountables);
110 }
111
112 @Override
113 public Inflector clone() {
114 return new Inflector(this);
115 }
116
117 // ------------------------------------------------------------------------------------------------
118 // Usage functions
119 // ------------------------------------------------------------------------------------------------
120
121 /**
122 * Returns the plural form of the word in the string.
123 * <p>
124 * Examples:
125 *
126 * <pre>
127 * inflector.pluralize("post") #=> "posts"
128 * inflector.pluralize("octopus") #=> "octopi"
129 * inflector.pluralize("sheep") #=> "sheep"
130 * inflector.pluralize("words") #=> "words"
131 * inflector.pluralize("the blue mailman") #=> "the blue mailmen"
132 * inflector.pluralize("CamelOctopus") #=> "CamelOctopi"
133 * </pre>
134 *
135 * </p>
136 * <p>
137 * Note that if the {@link Object#toString()} is called on the supplied object, so this method works for non-strings, too.
138 * </p>
139 *
140 * @param word the word that is to be pluralized.
141 * @return the pluralized form of the word, or the word itself if it could not be pluralized
142 * @see #singularize(Object)
143 */
144 public String pluralize( Object word ) {
145 if (word == null) return null;
146 String wordStr = word.toString().trim();
147 if (wordStr.length() == 0) return wordStr;
148 if (isUncountable(wordStr)) return wordStr;
149 for (Rule rule : this.plurals) {
150 String result = rule.apply(wordStr);
151 if (result != null) return result;
152 }
153 return wordStr;
154 }
155
156 public String pluralize( Object word,
157 int count ) {
158 if (word == null) return null;
159 if (count == 1 || count == -1) {
160 return word.toString();
161 }
162 return pluralize(word);
163 }
164
165 /**
166 * Returns the singular form of the word in the string.
167 * <p>
168 * Examples:
169 *
170 * <pre>
171 * inflector.singularize("posts") #=> "post"
172 * inflector.singularize("octopi") #=> "octopus"
173 * inflector.singularize("sheep") #=> "sheep"
174 * inflector.singularize("words") #=> "word"
175 * inflector.singularize("the blue mailmen") #=> "the blue mailman"
176 * inflector.singularize("CamelOctopi") #=> "CamelOctopus"
177 * </pre>
178 *
179 * </p>
180 * <p>
181 * Note that if the {@link Object#toString()} is called on the supplied object, so this method works for non-strings, too.
182 * </p>
183 *
184 * @param word the word that is to be pluralized.
185 * @return the pluralized form of the word, or the word itself if it could not be pluralized
186 * @see #pluralize(Object)
187 */
188 public String singularize( Object word ) {
189 if (word == null) return null;
190 String wordStr = word.toString().trim();
191 if (wordStr.length() == 0) return wordStr;
192 if (isUncountable(wordStr)) return wordStr;
193 for (Rule rule : this.singulars) {
194 String result = rule.apply(wordStr);
195 if (result != null) return result;
196 }
197 return wordStr;
198 }
199
200 /**
201 * Converts strings to lowerCamelCase. This method will also use any extra delimiter characters to identify word boundaries.
202 * <p>
203 * Examples:
204 *
205 * <pre>
206 * inflector.lowerCamelCase("active_record") #=> "activeRecord"
207 * inflector.lowerCamelCase("first_name") #=> "firstName"
208 * inflector.lowerCamelCase("name") #=> "name"
209 * inflector.lowerCamelCase("the-first_name",'-') #=> "theFirstName"
210 * </pre>
211 *
212 * </p>
213 *
214 * @param lowerCaseAndUnderscoredWord the word that is to be converted to camel case
215 * @param delimiterChars optional characters that are used to delimit word boundaries
216 * @return the lower camel case version of the word
217 * @see #underscore(String, char[])
218 * @see #camelCase(String, boolean, char[])
219 * @see #upperCamelCase(String, char[])
220 */
221 public String lowerCamelCase( String lowerCaseAndUnderscoredWord,
222 char... delimiterChars ) {
223 return camelCase(lowerCaseAndUnderscoredWord, false, delimiterChars);
224 }
225
226 /**
227 * Converts strings to UpperCamelCase. This method will also use any extra delimiter characters to identify word boundaries.
228 * <p>
229 * Examples:
230 *
231 * <pre>
232 * inflector.upperCamelCase("active_record") #=> "SctiveRecord"
233 * inflector.upperCamelCase("first_name") #=> "FirstName"
234 * inflector.upperCamelCase("name") #=> "Name"
235 * inflector.lowerCamelCase("the-first_name",'-') #=> "TheFirstName"
236 * </pre>
237 *
238 * </p>
239 *
240 * @param lowerCaseAndUnderscoredWord the word that is to be converted to camel case
241 * @param delimiterChars optional characters that are used to delimit word boundaries
242 * @return the upper camel case version of the word
243 * @see #underscore(String, char[])
244 * @see #camelCase(String, boolean, char[])
245 * @see #lowerCamelCase(String, char[])
246 */
247 public String upperCamelCase( String lowerCaseAndUnderscoredWord,
248 char... delimiterChars ) {
249 return camelCase(lowerCaseAndUnderscoredWord, true, delimiterChars);
250 }
251
252 /**
253 * By default, this method converts strings to UpperCamelCase. If the <code>uppercaseFirstLetter</code> argument to false,
254 * then this method produces lowerCamelCase. This method will also use any extra delimiter characters to identify word
255 * boundaries.
256 * <p>
257 * Examples:
258 *
259 * <pre>
260 * inflector.camelCase("active_record",false) #=> "activeRecord"
261 * inflector.camelCase("active_record",true) #=> "ActiveRecord"
262 * inflector.camelCase("first_name",false) #=> "firstName"
263 * inflector.camelCase("first_name",true) #=> "FirstName"
264 * inflector.camelCase("name",false) #=> "name"
265 * inflector.camelCase("name",true) #=> "Name"
266 * </pre>
267 *
268 * </p>
269 *
270 * @param lowerCaseAndUnderscoredWord the word that is to be converted to camel case
271 * @param uppercaseFirstLetter true if the first character is to be uppercased, or false if the first character is to be
272 * lowercased
273 * @param delimiterChars optional characters that are used to delimit word boundaries
274 * @return the camel case version of the word
275 * @see #underscore(String, char[])
276 * @see #upperCamelCase(String, char[])
277 * @see #lowerCamelCase(String, char[])
278 */
279 public String camelCase( String lowerCaseAndUnderscoredWord,
280 boolean uppercaseFirstLetter,
281 char... delimiterChars ) {
282 if (lowerCaseAndUnderscoredWord == null) return null;
283 lowerCaseAndUnderscoredWord = lowerCaseAndUnderscoredWord.trim();
284 if (lowerCaseAndUnderscoredWord.length() == 0) return "";
285 if (uppercaseFirstLetter) {
286 String result = lowerCaseAndUnderscoredWord;
287 // Replace any extra delimiters with underscores (before the underscores are converted in the next step)...
288 if (delimiterChars != null) {
289 for (char delimiterChar : delimiterChars) {
290 result = result.replace(delimiterChar, '_');
291 }
292 }
293
294 // Change the case at the beginning at after each underscore ...
295 return replaceAllWithUppercase(result, "(^|_)(.)", 2);
296 }
297 if (lowerCaseAndUnderscoredWord.length() < 2) return lowerCaseAndUnderscoredWord;
298 return "" + lowerCaseAndUnderscoredWord.charAt(0)
299 + camelCase(lowerCaseAndUnderscoredWord, true, delimiterChars).substring(1);
300 }
301
302 /**
303 * Makes an underscored form from the expression in the string (the reverse of the
304 * {@link #camelCase(String, boolean, char[]) camelCase} method. Also changes any characters that match the supplied
305 * delimiters into underscore.
306 * <p>
307 * Examples:
308 *
309 * <pre>
310 * inflector.underscore("activeRecord") #=> "active_record"
311 * inflector.underscore("ActiveRecord") #=> "active_record"
312 * inflector.underscore("firstName") #=> "first_name"
313 * inflector.underscore("FirstName") #=> "first_name"
314 * inflector.underscore("name") #=> "name"
315 * inflector.underscore("The.firstName") #=> "the_first_name"
316 * </pre>
317 *
318 * </p>
319 *
320 * @param camelCaseWord the camel-cased word that is to be converted;
321 * @param delimiterChars optional characters that are used to delimit word boundaries (beyond capitalization)
322 * @return a lower-cased version of the input, with separate words delimited by the underscore character.
323 */
324 public String underscore( String camelCaseWord,
325 char... delimiterChars ) {
326 if (camelCaseWord == null) return null;
327 String result = camelCaseWord.trim();
328 if (result.length() == 0) return "";
329 result = result.replaceAll("([A-Z]+)([A-Z][a-z])", "$1_$2");
330 result = result.replaceAll("([a-z\\d])([A-Z])", "$1_$2");
331 result = result.replace('-', '_');
332 if (delimiterChars != null) {
333 for (char delimiterChar : delimiterChars) {
334 result = result.replace(delimiterChar, '_');
335 }
336 }
337 return result.toLowerCase();
338 }
339
340 /**
341 * Returns a copy of the input with the first character converted to uppercase and the remainder to lowercase.
342 *
343 * @param words the word to be capitalized
344 * @return the string with the first character capitalized and the remaining characters lowercased
345 */
346 public String capitalize( String words ) {
347 if (words == null) return null;
348 String result = words.trim();
349 if (result.length() == 0) return "";
350 if (result.length() == 1) return result.toUpperCase();
351 return "" + Character.toUpperCase(result.charAt(0)) + result.substring(1).toLowerCase();
352 }
353
354 /**
355 * Capitalizes the first word and turns underscores into spaces and strips trailing "_id" and any supplied removable tokens.
356 * Like {@link #titleCase(String, String[])}, this is meant for creating pretty output.
357 * <p>
358 * Examples:
359 *
360 * <pre>
361 * inflector.humanize("employee_salary") #=> "Employee salary"
362 * inflector.humanize("author_id") #=> "Author"
363 * </pre>
364 *
365 * </p>
366 *
367 * @param lowerCaseAndUnderscoredWords the input to be humanized
368 * @param removableTokens optional array of tokens that are to be removed
369 * @return the humanized string
370 * @see #titleCase(String, String[])
371 */
372 public String humanize( String lowerCaseAndUnderscoredWords,
373 String... removableTokens ) {
374 if (lowerCaseAndUnderscoredWords == null) return null;
375 String result = lowerCaseAndUnderscoredWords.trim();
376 if (result.length() == 0) return "";
377 // Remove a trailing "_id" token
378 result = result.replaceAll("_id$", "");
379 // Remove all of the tokens that should be removed
380 if (removableTokens != null) {
381 for (String removableToken : removableTokens) {
382 result = result.replaceAll(removableToken, "");
383 }
384 }
385 result = result.replaceAll("_+", " "); // replace all adjacent underscores with a single space
386 return capitalize(result);
387 }
388
389 /**
390 * Capitalizes all the words and replaces some characters in the string to create a nicer looking title. Underscores are
391 * changed to spaces, a trailing "_id" is removed, and any of the supplied tokens are removed. Like
392 * {@link #humanize(String, String[])}, this is meant for creating pretty output.
393 * <p>
394 * Examples:
395 *
396 * <pre>
397 * inflector.titleCase("man from the boondocks") #=> "Man From The Boondocks"
398 * inflector.titleCase("x-men: the last stand") #=> "X Men: The Last Stand"
399 * </pre>
400 *
401 * </p>
402 *
403 * @param words the input to be turned into title case
404 * @param removableTokens optional array of tokens that are to be removed
405 * @return the title-case version of the supplied words
406 */
407 public String titleCase( String words,
408 String... removableTokens ) {
409 String result = humanize(words, removableTokens);
410 result = replaceAllWithUppercase(result, "\\b([a-z])", 1); // change first char of each word to uppercase
411 return result;
412 }
413
414 /**
415 * Turns a non-negative number into an ordinal string used to denote the position in an ordered sequence, such as 1st, 2nd,
416 * 3rd, 4th.
417 *
418 * @param number the non-negative number
419 * @return the string with the number and ordinal suffix
420 */
421 public String ordinalize( int number ) {
422 int remainder = number % 100;
423 String numberStr = Integer.toString(number);
424 if (11 <= number && number <= 13) return numberStr + "th";
425 remainder = number % 10;
426 if (remainder == 1) return numberStr + "st";
427 if (remainder == 2) return numberStr + "nd";
428 if (remainder == 3) return numberStr + "rd";
429 return numberStr + "th";
430 }
431
432 // ------------------------------------------------------------------------------------------------
433 // Management methods
434 // ------------------------------------------------------------------------------------------------
435
436 /**
437 * Determine whether the supplied word is considered uncountable by the {@link #pluralize(Object) pluralize} and
438 * {@link #singularize(Object) singularize} methods.
439 *
440 * @param word the word
441 * @return true if the plural and singular forms of the word are the same
442 */
443 public boolean isUncountable( String word ) {
444 if (word == null) return false;
445 String trimmedLower = word.trim().toLowerCase();
446 return this.uncountables.contains(trimmedLower);
447 }
448
449 /**
450 * Get the set of words that are not processed by the Inflector. The resulting map is directly modifiable.
451 *
452 * @return the set of uncountable words
453 */
454 public Set<String> getUncountables() {
455 return uncountables;
456 }
457
458 public void addPluralize( String rule,
459 String replacement ) {
460 final Rule pluralizeRule = new Rule(rule, replacement);
461 this.plurals.addFirst(pluralizeRule);
462 }
463
464 public void addSingularize( String rule,
465 String replacement ) {
466 final Rule singularizeRule = new Rule(rule, replacement);
467 this.singulars.addFirst(singularizeRule);
468 }
469
470 public void addIrregular( String singular,
471 String plural ) {
472 CheckArg.isNotEmpty(singular, "singular rule");
473 CheckArg.isNotEmpty(plural, "plural rule");
474 String singularRemainder = singular.length() > 1 ? singular.substring(1) : "";
475 String pluralRemainder = plural.length() > 1 ? plural.substring(1) : "";
476 addPluralize("(" + singular.charAt(0) + ")" + singularRemainder + "$", "$1" + pluralRemainder);
477 addSingularize("(" + plural.charAt(0) + ")" + pluralRemainder + "$", "$1" + singularRemainder);
478 }
479
480 public void addUncountable( String... words ) {
481 if (words == null || words.length == 0) return;
482 for (String word : words) {
483 if (word != null) uncountables.add(word.trim().toLowerCase());
484 }
485 }
486
487 /**
488 * Utility method to replace all occurrences given by the specific backreference with its uppercased form, and remove all
489 * other backreferences.
490 * <p>
491 * The Java {@link Pattern regular expression processing} does not use the preprocessing directives <code>\l</code>,
492 * <code>\u</code>, <code>\L</code>, and <code>\U</code>. If so, such directives could be used in the replacement
493 * string to uppercase or lowercase the backreferences. For example, <code>\L1</code> would lowercase the first
494 * backreference, and <code>\u3</code> would uppercase the 3rd backreference.
495 * </p>
496 *
497 * @param input
498 * @param regex
499 * @param groupNumberToUppercase
500 * @return the input string with the appropriate characters converted to upper-case
501 */
502 protected static String replaceAllWithUppercase( String input,
503 String regex,
504 int groupNumberToUppercase ) {
505 Pattern underscoreAndDotPattern = Pattern.compile(regex);
506 Matcher matcher = underscoreAndDotPattern.matcher(input);
507 StringBuffer sb = new StringBuffer();
508 while (matcher.find()) {
509 matcher.appendReplacement(sb, matcher.group(groupNumberToUppercase).toUpperCase());
510 }
511 matcher.appendTail(sb);
512 return sb.toString();
513 }
514
515 /**
516 * Completely remove all rules within this inflector.
517 */
518 public void clear() {
519 this.uncountables.clear();
520 this.plurals.clear();
521 this.singulars.clear();
522 }
523
524 protected void initialize() {
525 Inflector inflect = this;
526 inflect.addPluralize("$", "s");
527 inflect.addPluralize("s$", "s");
528 inflect.addPluralize("(ax|test)is$", "$1es");
529 inflect.addPluralize("(octop|vir)us$", "$1i");
530 inflect.addPluralize("(octop|vir)i$", "$1i"); // already plural
531 inflect.addPluralize("(alias|status)$", "$1es");
532 inflect.addPluralize("(bu)s$", "$1ses");
533 inflect.addPluralize("(buffal|tomat)o$", "$1oes");
534 inflect.addPluralize("([ti])um$", "$1a");
535 inflect.addPluralize("([ti])a$", "$1a"); // already plural
536 inflect.addPluralize("sis$", "ses");
537 inflect.addPluralize("(?:([^f])fe|([lr])f)$", "$1$2ves");
538 inflect.addPluralize("(hive)$", "$1s");
539 inflect.addPluralize("([^aeiouy]|qu)y$", "$1ies");
540 inflect.addPluralize("(x|ch|ss|sh)$", "$1es");
541 inflect.addPluralize("(matr|vert|ind)ix|ex$", "$1ices");
542 inflect.addPluralize("([m|l])ouse$", "$1ice");
543 inflect.addPluralize("([m|l])ice$", "$1ice");
544 inflect.addPluralize("^(ox)$", "$1en");
545 inflect.addPluralize("(quiz)$", "$1zes");
546 // Need to check for the following words that are already pluralized:
547 inflect.addPluralize("(people|men|children|sexes|moves|stadiums)$", "$1"); // irregulars
548 inflect.addPluralize("(oxen|octopi|viri|aliases|quizzes)$", "$1"); // special rules
549
550 inflect.addSingularize("s$", "");
551 inflect.addSingularize("(s|si|u)s$", "$1s"); // '-us' and '-ss' are already singular
552 inflect.addSingularize("(n)ews$", "$1ews");
553 inflect.addSingularize("([ti])a$", "$1um");
554 inflect.addSingularize("((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$", "$1$2sis");
555 inflect.addSingularize("(^analy)ses$", "$1sis");
556 inflect.addSingularize("(^analy)sis$", "$1sis"); // already singular, but ends in 's'
557 inflect.addSingularize("([^f])ves$", "$1fe");
558 inflect.addSingularize("(hive)s$", "$1");
559 inflect.addSingularize("(tive)s$", "$1");
560 inflect.addSingularize("([lr])ves$", "$1f");
561 inflect.addSingularize("([^aeiouy]|qu)ies$", "$1y");
562 inflect.addSingularize("(s)eries$", "$1eries");
563 inflect.addSingularize("(m)ovies$", "$1ovie");
564 inflect.addSingularize("(x|ch|ss|sh)es$", "$1");
565 inflect.addSingularize("([m|l])ice$", "$1ouse");
566 inflect.addSingularize("(bus)es$", "$1");
567 inflect.addSingularize("(o)es$", "$1");
568 inflect.addSingularize("(shoe)s$", "$1");
569 inflect.addSingularize("(cris|ax|test)is$", "$1is"); // already singular, but ends in 's'
570 inflect.addSingularize("(cris|ax|test)es$", "$1is");
571 inflect.addSingularize("(octop|vir)i$", "$1us");
572 inflect.addSingularize("(octop|vir)us$", "$1us"); // already singular, but ends in 's'
573 inflect.addSingularize("(alias|status)es$", "$1");
574 inflect.addSingularize("(alias|status)$", "$1"); // already singular, but ends in 's'
575 inflect.addSingularize("^(ox)en", "$1");
576 inflect.addSingularize("(vert|ind)ices$", "$1ex");
577 inflect.addSingularize("(matr)ices$", "$1ix");
578 inflect.addSingularize("(quiz)zes$", "$1");
579
580 inflect.addIrregular("person", "people");
581 inflect.addIrregular("man", "men");
582 inflect.addIrregular("child", "children");
583 inflect.addIrregular("sex", "sexes");
584 inflect.addIrregular("move", "moves");
585 inflect.addIrregular("stadium", "stadiums");
586
587 inflect.addUncountable("equipment", "information", "rice", "money", "species", "series", "fish", "sheep");
588 }
589
590 }