View Javadoc

1   /*
2    * ModeShape (http://www.modeshape.org)
3    * See the COPYRIGHT.txt file distributed with this work for information
4    * regarding copyright ownership.  Some portions may be licensed
5    * to Red Hat, Inc. under one or more contributor license agreements.
6   * See the AUTHORS.txt file in the distribution for a full listing of 
7   * individual contributors.
8    *
9    * ModeShape is free software. Unless otherwise indicated, all code in ModeShape
10   * is licensed to you under the terms of the GNU Lesser General Public License as
11   * published by the Free Software Foundation; either version 2.1 of
12   * the License, or (at your option) any later version.
13   *
14   * ModeShape is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17   * Lesser General Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser General Public
20   * License along with this software; if not, write to the Free
21   * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
22   * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
23   */
24  
25  package org.modeshape.sequencer.msoffice.word;
26  
27  import java.io.IOException;
28  import java.io.InputStream;
29  import java.util.ArrayList;
30  import java.util.List;
31  import org.apache.poi.hwpf.HWPFDocument;
32  import org.apache.poi.hwpf.model.StyleSheet;
33  import org.apache.poi.hwpf.usermodel.Paragraph;
34  import org.apache.poi.hwpf.usermodel.Range;
35  import org.modeshape.common.util.Logger;
36  
37  /**
38   * Infers table of contents from Word document by reading all paragraphs with style <code>Heading*</code>. This is analogous to
39   * the default behavior of Word when generating a table of contents.
40   */
41  public class WordMetadataReader {
42  
43      private static final Logger log = Logger.getLogger(WordMetadataReader.class);
44  
45      /** Prefix for styles that will be extracted and treated as outline information for the document */
46      private static final String HEADER_PREFIX = "Heading";
47  
48      public static WordMetadata instance( InputStream stream ) throws IOException {
49          WordMetadata metadata = new WordMetadata();
50          List<WordMetadata.WordHeading> headings = new ArrayList<WordMetadata.WordHeading>();
51  
52          HWPFDocument document = new HWPFDocument(stream);
53          Range range = document.getRange();
54  
55          StyleSheet stylesheet = document.getStyleSheet();
56  
57          for (int i = 0; i < range.numParagraphs(); i++) {
58              Paragraph paragraph = range.getParagraph(i);
59  
60              String styleName = stylesheet.getStyleDescription(paragraph.getStyleIndex()).getName();
61  
62              if (styleName.startsWith(HEADER_PREFIX)) {
63                  String rawLevelNum = styleName.substring(HEADER_PREFIX.length() + 1).trim();
64                  int levelNum = 0;
65  
66                  try {
67                      levelNum = Integer.parseInt(rawLevelNum);
68                  } catch (NumberFormatException nfe) {
69                      log.debug("Could not parse heading level from: " + styleName);
70                  }
71  
72                  String text = Paragraph.stripFields(paragraph.text());
73  
74                  if ('\r' == text.charAt(text.length() - 1)) {
75                      text = text.substring(0, text.length() - 1);
76                  }
77  
78                  headings.add(new WordMetadata.WordHeading(text, levelNum));
79              }
80          }
81  
82          metadata.setHeadings(headings);
83          return metadata;
84      }
85  }