001 /*
002 * JBoss DNA (http://www.jboss.org/dna)
003 * See the COPYRIGHT.txt file distributed with this work for information
004 * regarding copyright ownership. Some portions may be licensed
005 * to Red Hat, Inc. under one or more contributor license agreements.
006 * See the AUTHORS.txt file in the distribution for a full listing of
007 * individual contributors.
008 *
009 * JBoss DNA is free software. Unless otherwise indicated, all code in JBoss DNA
010 * is licensed to you under the terms of the GNU Lesser General Public License as
011 * published by the Free Software Foundation; either version 2.1 of
012 * the License, or (at your option) any later version.
013 *
014 * JBoss DNA is distributed in the hope that it will be useful,
015 * but WITHOUT ANY WARRANTY; without even the implied warranty of
016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
017 * Lesser General Public License for more details.
018 *
019 * You should have received a copy of the GNU Lesser General Public
020 * License along with this software; if not, write to the Free
021 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
022 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
023 */
024 package org.jboss.dna.common.text;
025
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jboss.dna.common.util.CheckArg;
032
033 /**
034 * Transforms words to singular, plural, humanized (human readable), underscore, camel case, or ordinal form. This is inspired by
035 * the <a href="http://api.rubyonrails.org/classes/Inflector.html">Inflector</a> class in <a
036 * href="http://www.rubyonrails.org">Ruby on Rails</a>, which is distributed under the <a
037 * href="http://wiki.rubyonrails.org/rails/pages/License">Rails license</a>.
038 *
039 * @author Randall Hauch
040 */
041 public class Inflector {
042
043 protected static final Inflector INSTANCE = new Inflector();
044
045 public static final Inflector getInstance() {
046 return INSTANCE;
047 }
048
049 protected class Rule {
050
051 protected final String expression;
052 protected final Pattern expressionPattern;
053 protected final String replacement;
054
055 protected Rule( String expression,
056 String replacement ) {
057 this.expression = expression;
058 this.replacement = replacement != null ? replacement : "";
059 this.expressionPattern = Pattern.compile(this.expression, Pattern.CASE_INSENSITIVE);
060 }
061
062 /**
063 * Apply the rule against the input string, returning the modified string or null if the rule didn't apply (and no
064 * modifications were made)
065 *
066 * @param input the input string
067 * @return the modified string if this rule applied, or null if the input was not modified by this rule
068 */
069 protected String apply( String input ) {
070 Matcher matcher = this.expressionPattern.matcher(input);
071 if (!matcher.find()) return null;
072 return matcher.replaceAll(this.replacement);
073 }
074
075 @Override
076 public int hashCode() {
077 return expression.hashCode();
078 }
079
080 @Override
081 public boolean equals( Object obj ) {
082 if (obj == this) return true;
083 if (obj != null && obj.getClass() == this.getClass()) {
084 final Rule that = (Rule)obj;
085 if (this.expression.equalsIgnoreCase(that.expression)) return true;
086 }
087 return false;
088 }
089
090 @Override
091 public String toString() {
092 return expression + ", " + replacement;
093 }
094 }
095
096 private LinkedList<Rule> plurals = new LinkedList<Rule>();
097 private LinkedList<Rule> singulars = new LinkedList<Rule>();
098 /**
099 * The lowercase words that are to be excluded and not processed. This map can be modified by the users via
100 * {@link #getUncountables()}.
101 */
102 private final Set<String> uncountables = new HashSet<String>();
103
104 public Inflector() {
105 initialize();
106 }
107
108 protected Inflector( Inflector original ) {
109 this.plurals.addAll(original.plurals);
110 this.singulars.addAll(original.singulars);
111 this.uncountables.addAll(original.uncountables);
112 }
113
114 @Override
115 public Inflector clone() {
116 return new Inflector(this);
117 }
118
119 // ------------------------------------------------------------------------------------------------
120 // Usage functions
121 // ------------------------------------------------------------------------------------------------
122
123 /**
124 * Returns the plural form of the word in the string.
125 * <p>
126 * Examples:
127 *
128 * <pre>
129 * inflector.pluralize("post") #=> "posts"
130 * inflector.pluralize("octopus") #=> "octopi"
131 * inflector.pluralize("sheep") #=> "sheep"
132 * inflector.pluralize("words") #=> "words"
133 * inflector.pluralize("the blue mailman") #=> "the blue mailmen"
134 * inflector.pluralize("CamelOctopus") #=> "CamelOctopi"
135 * </pre>
136 *
137 * </p>
138 * <p>
139 * Note that if the {@link Object#toString()} is called on the supplied object, so this method works for non-strings, too.
140 * </p>
141 *
142 * @param word the word that is to be pluralized.
143 * @return the pluralized form of the word, or the word itself if it could not be pluralized
144 * @see #singularize(Object)
145 */
146 public String pluralize( Object word ) {
147 if (word == null) return null;
148 String wordStr = word.toString().trim();
149 if (wordStr.length() == 0) return wordStr;
150 if (isUncountable(wordStr)) return wordStr;
151 for (Rule rule : this.plurals) {
152 String result = rule.apply(wordStr);
153 if (result != null) return result;
154 }
155 return wordStr;
156 }
157
158 public String pluralize( Object word,
159 int count ) {
160 if (word == null) return null;
161 if (count == 1 || count == -1) {
162 return word.toString();
163 }
164 return pluralize(word);
165 }
166
167 /**
168 * Returns the singular form of the word in the string.
169 * <p>
170 * Examples:
171 *
172 * <pre>
173 * inflector.singularize("posts") #=> "post"
174 * inflector.singularize("octopi") #=> "octopus"
175 * inflector.singularize("sheep") #=> "sheep"
176 * inflector.singularize("words") #=> "word"
177 * inflector.singularize("the blue mailmen") #=> "the blue mailman"
178 * inflector.singularize("CamelOctopi") #=> "CamelOctopus"
179 * </pre>
180 *
181 * </p>
182 * <p>
183 * Note that if the {@link Object#toString()} is called on the supplied object, so this method works for non-strings, too.
184 * </p>
185 *
186 * @param word the word that is to be pluralized.
187 * @return the pluralized form of the word, or the word itself if it could not be pluralized
188 * @see #pluralize(Object)
189 */
190 public String singularize( Object word ) {
191 if (word == null) return null;
192 String wordStr = word.toString().trim();
193 if (wordStr.length() == 0) return wordStr;
194 if (isUncountable(wordStr)) return wordStr;
195 for (Rule rule : this.singulars) {
196 String result = rule.apply(wordStr);
197 if (result != null) return result;
198 }
199 return wordStr;
200 }
201
202 /**
203 * Converts strings to lowerCamelCase. This method will also use any extra delimiter characters to identify word boundaries.
204 * <p>
205 * Examples:
206 *
207 * <pre>
208 * inflector.lowerCamelCase("active_record") #=> "activeRecord"
209 * inflector.lowerCamelCase("first_name") #=> "firstName"
210 * inflector.lowerCamelCase("name") #=> "name"
211 * inflector.lowerCamelCase("the-first_name",'-') #=> "theFirstName"
212 * </pre>
213 *
214 * </p>
215 *
216 * @param lowerCaseAndUnderscoredWord the word that is to be converted to camel case
217 * @param delimiterChars optional characters that are used to delimit word boundaries
218 * @return the lower camel case version of the word
219 * @see #underscore(String, char[])
220 * @see #camelCase(String, boolean, char[])
221 * @see #upperCamelCase(String, char[])
222 */
223 public String lowerCamelCase( String lowerCaseAndUnderscoredWord,
224 char... delimiterChars ) {
225 return camelCase(lowerCaseAndUnderscoredWord, false, delimiterChars);
226 }
227
228 /**
229 * Converts strings to UpperCamelCase. This method will also use any extra delimiter characters to identify word boundaries.
230 * <p>
231 * Examples:
232 *
233 * <pre>
234 * inflector.upperCamelCase("active_record") #=> "SctiveRecord"
235 * inflector.upperCamelCase("first_name") #=> "FirstName"
236 * inflector.upperCamelCase("name") #=> "Name"
237 * inflector.lowerCamelCase("the-first_name",'-') #=> "TheFirstName"
238 * </pre>
239 *
240 * </p>
241 *
242 * @param lowerCaseAndUnderscoredWord the word that is to be converted to camel case
243 * @param delimiterChars optional characters that are used to delimit word boundaries
244 * @return the upper camel case version of the word
245 * @see #underscore(String, char[])
246 * @see #camelCase(String, boolean, char[])
247 * @see #lowerCamelCase(String, char[])
248 */
249 public String upperCamelCase( String lowerCaseAndUnderscoredWord,
250 char... delimiterChars ) {
251 return camelCase(lowerCaseAndUnderscoredWord, true, delimiterChars);
252 }
253
254 /**
255 * By default, this method converts strings to UpperCamelCase. If the <code>uppercaseFirstLetter</code> argument to false,
256 * then this method produces lowerCamelCase. This method will also use any extra delimiter characters to identify word
257 * boundaries.
258 * <p>
259 * Examples:
260 *
261 * <pre>
262 * inflector.camelCase("active_record",false) #=> "activeRecord"
263 * inflector.camelCase("active_record",true) #=> "ActiveRecord"
264 * inflector.camelCase("first_name",false) #=> "firstName"
265 * inflector.camelCase("first_name",true) #=> "FirstName"
266 * inflector.camelCase("name",false) #=> "name"
267 * inflector.camelCase("name",true) #=> "Name"
268 * </pre>
269 *
270 * </p>
271 *
272 * @param lowerCaseAndUnderscoredWord the word that is to be converted to camel case
273 * @param uppercaseFirstLetter true if the first character is to be uppercased, or false if the first character is to be
274 * lowercased
275 * @param delimiterChars optional characters that are used to delimit word boundaries
276 * @return the camel case version of the word
277 * @see #underscore(String, char[])
278 * @see #upperCamelCase(String, char[])
279 * @see #lowerCamelCase(String, char[])
280 */
281 public String camelCase( String lowerCaseAndUnderscoredWord,
282 boolean uppercaseFirstLetter,
283 char... delimiterChars ) {
284 if (lowerCaseAndUnderscoredWord == null) return null;
285 lowerCaseAndUnderscoredWord = lowerCaseAndUnderscoredWord.trim();
286 if (lowerCaseAndUnderscoredWord.length() == 0) return "";
287 if (uppercaseFirstLetter) {
288 String result = lowerCaseAndUnderscoredWord;
289 // Replace any extra delimiters with underscores (before the underscores are converted in the next step)...
290 if (delimiterChars != null) {
291 for (char delimiterChar : delimiterChars) {
292 result = result.replace(delimiterChar, '_');
293 }
294 }
295
296 // Change the case at the beginning at after each underscore ...
297 return replaceAllWithUppercase(result, "(^|_)(.)", 2);
298 }
299 if (lowerCaseAndUnderscoredWord.length() < 2) return lowerCaseAndUnderscoredWord;
300 return "" + Character.toLowerCase(lowerCaseAndUnderscoredWord.charAt(0))
301 + camelCase(lowerCaseAndUnderscoredWord, true, delimiterChars).substring(1);
302 }
303
304 /**
305 * Makes an underscored form from the expression in the string (the reverse of the {@link #camelCase(String, boolean, char[])
306 * camelCase} method. Also changes any characters that match the supplied delimiters into underscore.
307 * <p>
308 * Examples:
309 *
310 * <pre>
311 * inflector.underscore("activeRecord") #=> "active_record"
312 * inflector.underscore("ActiveRecord") #=> "active_record"
313 * inflector.underscore("firstName") #=> "first_name"
314 * inflector.underscore("FirstName") #=> "first_name"
315 * inflector.underscore("name") #=> "name"
316 * inflector.underscore("The.firstName") #=> "the_first_name"
317 * </pre>
318 *
319 * </p>
320 *
321 * @param camelCaseWord the camel-cased word that is to be converted;
322 * @param delimiterChars optional characters that are used to delimit word boundaries (beyond capitalization)
323 * @return a lower-cased version of the input, with separate words delimited by the underscore character.
324 */
325 public String underscore( String camelCaseWord,
326 char... delimiterChars ) {
327 if (camelCaseWord == null) return null;
328 String result = camelCaseWord.trim();
329 if (result.length() == 0) return "";
330 result = result.replaceAll("([A-Z]+)([A-Z][a-z])", "$1_$2");
331 result = result.replaceAll("([a-z\\d])([A-Z])", "$1_$2");
332 result = result.replace('-', '_');
333 if (delimiterChars != null) {
334 for (char delimiterChar : delimiterChars) {
335 result = result.replace(delimiterChar, '_');
336 }
337 }
338 return result.toLowerCase();
339 }
340
341 /**
342 * Returns a copy of the input with the first character converted to uppercase and the remainder to lowercase.
343 *
344 * @param words the word to be capitalized
345 * @return the string with the first character capitalized and the remaining characters lowercased
346 */
347 public String capitalize( String words ) {
348 if (words == null) return null;
349 String result = words.trim();
350 if (result.length() == 0) return "";
351 if (result.length() == 1) return result.toUpperCase();
352 return "" + Character.toUpperCase(result.charAt(0)) + result.substring(1).toLowerCase();
353 }
354
355 /**
356 * Capitalizes the first word and turns underscores into spaces and strips trailing "_id" and any supplied removable tokens.
357 * Like {@link #titleCase(String, String[])}, this is meant for creating pretty output.
358 * <p>
359 * Examples:
360 *
361 * <pre>
362 * inflector.humanize("employee_salary") #=> "Employee salary"
363 * inflector.humanize("author_id") #=> "Author"
364 * </pre>
365 *
366 * </p>
367 *
368 * @param lowerCaseAndUnderscoredWords the input to be humanized
369 * @param removableTokens optional array of tokens that are to be removed
370 * @return the humanized string
371 * @see #titleCase(String, String[])
372 */
373 public String humanize( String lowerCaseAndUnderscoredWords,
374 String... removableTokens ) {
375 if (lowerCaseAndUnderscoredWords == null) return null;
376 String result = lowerCaseAndUnderscoredWords.trim();
377 if (result.length() == 0) return "";
378 // Remove a trailing "_id" token
379 result = result.replaceAll("_id$", "");
380 // Remove all of the tokens that should be removed
381 if (removableTokens != null) {
382 for (String removableToken : removableTokens) {
383 result = result.replaceAll(removableToken, "");
384 }
385 }
386 result = result.replaceAll("_+", " "); // replace all adjacent underscores with a single space
387 return capitalize(result);
388 }
389
390 /**
391 * Capitalizes all the words and replaces some characters in the string to create a nicer looking title. Underscores are
392 * changed to spaces, a trailing "_id" is removed, and any of the supplied tokens are removed. Like
393 * {@link #humanize(String, String[])}, this is meant for creating pretty output.
394 * <p>
395 * Examples:
396 *
397 * <pre>
398 * inflector.titleCase("man from the boondocks") #=> "Man From The Boondocks"
399 * inflector.titleCase("x-men: the last stand") #=> "X Men: The Last Stand"
400 * </pre>
401 *
402 * </p>
403 *
404 * @param words the input to be turned into title case
405 * @param removableTokens optional array of tokens that are to be removed
406 * @return the title-case version of the supplied words
407 */
408 public String titleCase( String words,
409 String... removableTokens ) {
410 String result = humanize(words, removableTokens);
411 result = replaceAllWithUppercase(result, "\\b([a-z])", 1); // change first char of each word to uppercase
412 return result;
413 }
414
415 /**
416 * Turns a non-negative number into an ordinal string used to denote the position in an ordered sequence, such as 1st, 2nd,
417 * 3rd, 4th.
418 *
419 * @param number the non-negative number
420 * @return the string with the number and ordinal suffix
421 */
422 public String ordinalize( int number ) {
423 int remainder = number % 100;
424 String numberStr = Integer.toString(number);
425 if (11 <= number && number <= 13) return numberStr + "th";
426 remainder = number % 10;
427 if (remainder == 1) return numberStr + "st";
428 if (remainder == 2) return numberStr + "nd";
429 if (remainder == 3) return numberStr + "rd";
430 return numberStr + "th";
431 }
432
433 // ------------------------------------------------------------------------------------------------
434 // Management methods
435 // ------------------------------------------------------------------------------------------------
436
437 /**
438 * Determine whether the supplied word is considered uncountable by the {@link #pluralize(Object) pluralize} and
439 * {@link #singularize(Object) singularize} methods.
440 *
441 * @param word the word
442 * @return true if the plural and singular forms of the word are the same
443 */
444 public boolean isUncountable( String word ) {
445 if (word == null) return false;
446 String trimmedLower = word.trim().toLowerCase();
447 return this.uncountables.contains(trimmedLower);
448 }
449
450 /**
451 * Get the set of words that are not processed by the Inflector. The resulting map is directly modifiable.
452 *
453 * @return the set of uncountable words
454 */
455 public Set<String> getUncountables() {
456 return uncountables;
457 }
458
459 public void addPluralize( String rule,
460 String replacement ) {
461 final Rule pluralizeRule = new Rule(rule, replacement);
462 this.plurals.addFirst(pluralizeRule);
463 }
464
465 public void addSingularize( String rule,
466 String replacement ) {
467 final Rule singularizeRule = new Rule(rule, replacement);
468 this.singulars.addFirst(singularizeRule);
469 }
470
471 public void addIrregular( String singular,
472 String plural ) {
473 CheckArg.isNotEmpty(singular, "singular rule");
474 CheckArg.isNotEmpty(plural, "plural rule");
475 String singularRemainder = singular.length() > 1 ? singular.substring(1) : "";
476 String pluralRemainder = plural.length() > 1 ? plural.substring(1) : "";
477 addPluralize("(" + singular.charAt(0) + ")" + singularRemainder + "$", "$1" + pluralRemainder);
478 addSingularize("(" + plural.charAt(0) + ")" + pluralRemainder + "$", "$1" + singularRemainder);
479 }
480
481 public void addUncountable( String... words ) {
482 if (words == null || words.length == 0) return;
483 for (String word : words) {
484 if (word != null) uncountables.add(word.trim().toLowerCase());
485 }
486 }
487
488 /**
489 * Utility method to replace all occurrences given by the specific backreference with its uppercased form, and remove all
490 * other backreferences.
491 * <p>
492 * The Java {@link Pattern regular expression processing} does not use the preprocessing directives <code>\l</code>,
493 * <code>\u</code>, <code>\L</code>, and <code>\U</code>. If so, such directives could be used in the replacement string
494 * to uppercase or lowercase the backreferences. For example, <code>\L1</code> would lowercase the first backreference, and
495 * <code>\u3</code> would uppercase the 3rd backreference.
496 * </p>
497 *
498 * @param input
499 * @param regex
500 * @param groupNumberToUppercase
501 * @return the input string with the appropriate characters converted to upper-case
502 */
503 protected static String replaceAllWithUppercase( String input,
504 String regex,
505 int groupNumberToUppercase ) {
506 Pattern underscoreAndDotPattern = Pattern.compile(regex);
507 Matcher matcher = underscoreAndDotPattern.matcher(input);
508 StringBuffer sb = new StringBuffer();
509 while (matcher.find()) {
510 matcher.appendReplacement(sb, matcher.group(groupNumberToUppercase).toUpperCase());
511 }
512 matcher.appendTail(sb);
513 return sb.toString();
514 }
515
516 /**
517 * Completely remove all rules within this inflector.
518 */
519 public void clear() {
520 this.uncountables.clear();
521 this.plurals.clear();
522 this.singulars.clear();
523 }
524
525 protected void initialize() {
526 Inflector inflect = this;
527 inflect.addPluralize("$", "s");
528 inflect.addPluralize("s$", "s");
529 inflect.addPluralize("(ax|test)is$", "$1es");
530 inflect.addPluralize("(octop|vir)us$", "$1i");
531 inflect.addPluralize("(octop|vir)i$", "$1i"); // already plural
532 inflect.addPluralize("(alias|status)$", "$1es");
533 inflect.addPluralize("(bu)s$", "$1ses");
534 inflect.addPluralize("(buffal|tomat)o$", "$1oes");
535 inflect.addPluralize("([ti])um$", "$1a");
536 inflect.addPluralize("([ti])a$", "$1a"); // already plural
537 inflect.addPluralize("sis$", "ses");
538 inflect.addPluralize("(?:([^f])fe|([lr])f)$", "$1$2ves");
539 inflect.addPluralize("(hive)$", "$1s");
540 inflect.addPluralize("([^aeiouy]|qu)y$", "$1ies");
541 inflect.addPluralize("(x|ch|ss|sh)$", "$1es");
542 inflect.addPluralize("(matr|vert|ind)ix|ex$", "$1ices");
543 inflect.addPluralize("([m|l])ouse$", "$1ice");
544 inflect.addPluralize("([m|l])ice$", "$1ice");
545 inflect.addPluralize("^(ox)$", "$1en");
546 inflect.addPluralize("(quiz)$", "$1zes");
547 // Need to check for the following words that are already pluralized:
548 inflect.addPluralize("(people|men|children|sexes|moves|stadiums)$", "$1"); // irregulars
549 inflect.addPluralize("(oxen|octopi|viri|aliases|quizzes)$", "$1"); // special rules
550
551 inflect.addSingularize("s$", "");
552 inflect.addSingularize("(s|si|u)s$", "$1s"); // '-us' and '-ss' are already singular
553 inflect.addSingularize("(n)ews$", "$1ews");
554 inflect.addSingularize("([ti])a$", "$1um");
555 inflect.addSingularize("((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$", "$1$2sis");
556 inflect.addSingularize("(^analy)ses$", "$1sis");
557 inflect.addSingularize("(^analy)sis$", "$1sis"); // already singular, but ends in 's'
558 inflect.addSingularize("([^f])ves$", "$1fe");
559 inflect.addSingularize("(hive)s$", "$1");
560 inflect.addSingularize("(tive)s$", "$1");
561 inflect.addSingularize("([lr])ves$", "$1f");
562 inflect.addSingularize("([^aeiouy]|qu)ies$", "$1y");
563 inflect.addSingularize("(s)eries$", "$1eries");
564 inflect.addSingularize("(m)ovies$", "$1ovie");
565 inflect.addSingularize("(x|ch|ss|sh)es$", "$1");
566 inflect.addSingularize("([m|l])ice$", "$1ouse");
567 inflect.addSingularize("(bus)es$", "$1");
568 inflect.addSingularize("(o)es$", "$1");
569 inflect.addSingularize("(shoe)s$", "$1");
570 inflect.addSingularize("(cris|ax|test)is$", "$1is"); // already singular, but ends in 's'
571 inflect.addSingularize("(cris|ax|test)es$", "$1is");
572 inflect.addSingularize("(octop|vir)i$", "$1us");
573 inflect.addSingularize("(octop|vir)us$", "$1us"); // already singular, but ends in 's'
574 inflect.addSingularize("(alias|status)es$", "$1");
575 inflect.addSingularize("(alias|status)$", "$1"); // already singular, but ends in 's'
576 inflect.addSingularize("^(ox)en", "$1");
577 inflect.addSingularize("(vert|ind)ices$", "$1ex");
578 inflect.addSingularize("(matr)ices$", "$1ix");
579 inflect.addSingularize("(quiz)zes$", "$1");
580
581 inflect.addIrregular("person", "people");
582 inflect.addIrregular("man", "men");
583 inflect.addIrregular("child", "children");
584 inflect.addIrregular("sex", "sexes");
585 inflect.addIrregular("move", "moves");
586 inflect.addIrregular("stadium", "stadiums");
587
588 inflect.addUncountable("equipment", "information", "rice", "money", "species", "series", "fish", "sheep");
589 }
590
591 }