1 /*
2 * ModeShape (http://www.modeshape.org)
3 * See the COPYRIGHT.txt file distributed with this work for information
4 * regarding copyright ownership. Some portions may be licensed
5 * to Red Hat, Inc. under one or more contributor license agreements.
6 * See the AUTHORS.txt file in the distribution for a full listing of
7 * individual contributors.
8 *
9 * ModeShape is free software. Unless otherwise indicated, all code in ModeShape
10 * is licensed to you under the terms of the GNU Lesser General Public License as
11 * published by the Free Software Foundation; either version 2.1 of
12 * the License, or (at your option) any later version.
13 *
14 * ModeShape is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this software; if not, write to the Free
21 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
22 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
23 */
24 package org.modeshape.sequencer.msoffice.excel;
25
26 import java.io.IOException;
27 import java.io.InputStream;
28 import java.util.ArrayList;
29 import java.util.List;
30 import org.apache.poi.hssf.usermodel.HSSFCell;
31 import org.apache.poi.hssf.usermodel.HSSFComment;
32 import org.apache.poi.hssf.usermodel.HSSFRow;
33 import org.apache.poi.hssf.usermodel.HSSFSheet;
34 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
35 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
36
37 /**
38 * Extracts data and some metadata from Excel files
39 */
40 public class ExcelMetadataReader {
41
42 /** The character to output after each row. */
43 private static final char ROW_DELIMITER_CHAR = '\n';
44 /** The character to output after each cell (column). */
45 private static final char CELL_DELIMITER_CHAR = '\t';
46
47 public static ExcelMetadata instance( InputStream stream ) throws IOException {
48 ExcelMetadata metadata = new ExcelMetadata();
49 HSSFWorkbook wb = new HSSFWorkbook(new POIFSFileSystem(stream));
50
51 StringBuffer buff = new StringBuffer();
52 List<String> sheets = new ArrayList<String>();
53
54 for (int sheetInd = 0; sheetInd < wb.getNumberOfSheets(); sheetInd++) {
55 sheets.add(wb.getSheetName(sheetInd));
56
57 HSSFSheet worksheet = wb.getSheetAt(sheetInd);
58 int lastRowNum = worksheet.getLastRowNum();
59
60 for (int rowNum = worksheet.getFirstRowNum(); rowNum <= lastRowNum; rowNum++) {
61 HSSFRow row = worksheet.getRow(rowNum);
62
63 // Empty rows are returned as null
64 if (row == null) {
65 continue;
66 }
67
68 int lastCellNum = row.getLastCellNum();
69 for (int cellNum = row.getFirstCellNum(); cellNum < lastCellNum; cellNum++) {
70 HSSFCell cell = row.getCell(cellNum);
71
72 /*
73 * Builds a string of body content from all string, numeric,
74 * and formula values in the body of each worksheet.
75 *
76 * This code currently duplicates the POI 3.1 ExcelExtractor behavior of
77 * combining the body text from all worksheets into a single string.
78 */
79 switch (cell.getCellType()) {
80 case HSSFCell.CELL_TYPE_STRING:
81 buff.append(cell.getRichStringCellValue().getString());
82 break;
83 case HSSFCell.CELL_TYPE_NUMERIC:
84 buff.append(cell.getNumericCellValue());
85 break;
86 case HSSFCell.CELL_TYPE_FORMULA:
87 buff.append(cell.getCellFormula());
88 break;
89 }
90
91 HSSFComment comment = cell.getCellComment();
92 if (comment != null) {
93 // Filter out row delimiter characters from comment
94 String commentText = comment.getString().getString().replace(ROW_DELIMITER_CHAR, ' ');
95
96 buff.append(" [");
97 buff.append(commentText);
98 buff.append(" by ");
99 buff.append(comment.getAuthor());
100 buff.append(']');
101 }
102
103 if (cellNum < lastCellNum - 1) {
104 buff.append(CELL_DELIMITER_CHAR);
105 } else {
106 buff.append(ROW_DELIMITER_CHAR);
107 }
108 }
109 }
110 }
111
112 metadata.setText(buff.toString());
113 metadata.setSheets(sheets);
114 return metadata;
115 }
116 }