1 /*
2 * ModeShape (http://www.modeshape.org)
3 * See the COPYRIGHT.txt file distributed with this work for information
4 * regarding copyright ownership. Some portions may be licensed
5 * to Red Hat, Inc. under one or more contributor license agreements.
6 * See the AUTHORS.txt file in the distribution for a full listing of
7 * individual contributors.
8 *
9 * ModeShape is free software. Unless otherwise indicated, all code in ModeShape
10 * is licensed to you under the terms of the GNU Lesser General Public License as
11 * published by the Free Software Foundation; either version 2.1 of
12 * the License, or (at your option) any later version.
13 *
14 * ModeShape is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this software; if not, write to the Free
21 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
22 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
23 */
24
25 package org.modeshape.sequencer.msoffice.word;
26
27 import java.io.IOException;
28 import java.io.InputStream;
29 import java.util.ArrayList;
30 import java.util.List;
31 import org.apache.poi.hwpf.HWPFDocument;
32 import org.apache.poi.hwpf.model.StyleSheet;
33 import org.apache.poi.hwpf.usermodel.Paragraph;
34 import org.apache.poi.hwpf.usermodel.Range;
35 import org.modeshape.common.util.Logger;
36
37 /**
38 * Infers table of contents from Word document by reading all paragraphs with style <code>Heading*</code>. This is analogous to
39 * the default behavior of Word when generating a table of contents.
40 */
41 public class WordMetadataReader {
42
43 private static final Logger log = Logger.getLogger(WordMetadataReader.class);
44
45 /** Prefix for styles that will be extracted and treated as outline information for the document */
46 private static final String HEADER_PREFIX = "Heading";
47
48 public static WordMetadata instance( InputStream stream ) throws IOException {
49 WordMetadata metadata = new WordMetadata();
50 List<WordMetadata.WordHeading> headings = new ArrayList<WordMetadata.WordHeading>();
51
52 HWPFDocument document = new HWPFDocument(stream);
53 Range range = document.getRange();
54
55 StyleSheet stylesheet = document.getStyleSheet();
56
57 for (int i = 0; i < range.numParagraphs(); i++) {
58 Paragraph paragraph = range.getParagraph(i);
59
60 String styleName = stylesheet.getStyleDescription(paragraph.getStyleIndex()).getName();
61
62 if (styleName.startsWith(HEADER_PREFIX)) {
63 String rawLevelNum = styleName.substring(HEADER_PREFIX.length() + 1).trim();
64 int levelNum = 0;
65
66 try {
67 levelNum = Integer.parseInt(rawLevelNum);
68 } catch (NumberFormatException nfe) {
69 log.debug("Could not parse heading level from: " + styleName);
70 }
71
72 String text = Paragraph.stripFields(paragraph.text());
73
74 if ('\r' == text.charAt(text.length() - 1)) {
75 text = text.substring(0, text.length() - 1);
76 }
77
78 headings.add(new WordMetadata.WordHeading(text, levelNum));
79 }
80 }
81
82 metadata.setHeadings(headings);
83 return metadata;
84 }
85 }