001 /*
002 * JBoss DNA (http://www.jboss.org/dna)
003 * See the COPYRIGHT.txt file distributed with this work for information
004 * regarding copyright ownership. Some portions may be licensed
005 * to Red Hat, Inc. under one or more contributor license agreements.
006 * See the AUTHORS.txt file in the distribution for a full listing of
007 * individual contributors.
008 *
009 * JBoss DNA is free software. Unless otherwise indicated, all code in JBoss DNA
010 * is licensed to you under the terms of the GNU Lesser General Public License as
011 * published by the Free Software Foundation; either version 2.1 of
012 * the License, or (at your option) any later version.
013 *
014 * JBoss DNA is distributed in the hope that it will be useful,
015 * but WITHOUT ANY WARRANTY; without even the implied warranty of
016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
017 * Lesser General Public License for more details.
018 *
019 * You should have received a copy of the GNU Lesser General Public
020 * License along with this software; if not, write to the Free
021 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
022 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
023 */
024
025 package org.jboss.dna.sequencer.msoffice.word;
026
027 import java.io.IOException;
028 import java.io.InputStream;
029 import java.util.ArrayList;
030 import java.util.List;
031
032 import org.apache.poi.hwpf.HWPFDocument;
033 import org.apache.poi.hwpf.model.StyleSheet;
034 import org.apache.poi.hwpf.usermodel.Paragraph;
035 import org.apache.poi.hwpf.usermodel.Range;
036 import org.jboss.dna.common.util.Logger;
037
038 /**
039 * Infers table of contents from Word document by reading all paragraphs
040 * with style <code>Heading*</code>. This is analogous to the default
041 * behavior of Word when generating a table of contents.
042 *
043 * @author Michael Trezzi
044 */
045 public class WordMetadataReader {
046
047 private static final Logger log = Logger.getLogger(WordMetadataReader.class);
048
049 /** Prefix for styles that will be extracted and treated as outline information for the document */
050 private static final String HEADER_PREFIX = "Heading";
051
052 public static WordMetadata instance( InputStream stream ) throws IOException {
053 WordMetadata metadata = new WordMetadata();
054 List<WordMetadata.WordHeading> headings = new ArrayList<WordMetadata.WordHeading>();
055
056 HWPFDocument document = new HWPFDocument(stream);
057 Range range = document.getRange();
058
059 StyleSheet stylesheet = document.getStyleSheet();
060
061 for (int i = 0; i < range.numParagraphs(); i++) {
062 Paragraph paragraph = range.getParagraph(i);
063
064 String styleName = stylesheet.getStyleDescription(paragraph.getStyleIndex()).getName();
065
066 if (styleName.startsWith(HEADER_PREFIX)) {
067 String rawLevelNum = styleName.substring(HEADER_PREFIX.length() + 1).trim();
068 int levelNum = 0;
069
070 try {
071 levelNum = Integer.parseInt(rawLevelNum);
072 }
073 catch (NumberFormatException nfe) {
074 log.debug("Could not parse heading level from: " + styleName);
075 }
076
077 String text = Paragraph.stripFields(paragraph.text());
078
079 if ('\r' == text.charAt(text.length() - 1)) {
080 text = text.substring(0, text.length() - 1);
081 }
082
083 headings.add(new WordMetadata.WordHeading(text, levelNum));
084 }
085 }
086
087 metadata.setHeadings(headings);
088 return metadata;
089 }
090 }