001    /*
002     * JBoss DNA (http://www.jboss.org/dna)
003     * See the COPYRIGHT.txt file distributed with this work for information
004     * regarding copyright ownership.  Some portions may be licensed
005     * to Red Hat, Inc. under one or more contributor license agreements.
006    * See the AUTHORS.txt file in the distribution for a full listing of 
007    * individual contributors.
008     *
009     * JBoss DNA is free software. Unless otherwise indicated, all code in JBoss DNA
010     * is licensed to you under the terms of the GNU Lesser General Public License as
011     * published by the Free Software Foundation; either version 2.1 of
012     * the License, or (at your option) any later version.
013     *
014     * JBoss DNA is distributed in the hope that it will be useful,
015     * but WITHOUT ANY WARRANTY; without even the implied warranty of
016     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
017     * Lesser General Public License for more details.
018     *
019     * You should have received a copy of the GNU Lesser General Public
020     * License along with this software; if not, write to the Free
021     * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
022     * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
023     */
024    package org.jboss.dna.sequencer.msoffice.excel;
025    
026    import java.io.IOException;
027    import java.io.InputStream;
028    import java.util.ArrayList;
029    import java.util.List;
030    
031    import org.apache.poi.hssf.usermodel.HSSFCell;
032    import org.apache.poi.hssf.usermodel.HSSFComment;
033    import org.apache.poi.hssf.usermodel.HSSFRow;
034    import org.apache.poi.hssf.usermodel.HSSFSheet;
035    import org.apache.poi.hssf.usermodel.HSSFWorkbook;
036    import org.apache.poi.poifs.filesystem.POIFSFileSystem;
037    
038    /**
039     * @author Michael Trezzi
040     * @description Extracts data and some metadata from excel files
041     */
042    public class ExcelMetadataReader {
043    
044        /** The character to output after each row. */
045        private static final char ROW_DELIMITER_CHAR = '\n';
046        /** The character to output after each cell (column). */
047        private static final char CELL_DELIMITER_CHAR = '\t';
048        
049        public static ExcelMetadata instance( InputStream stream ) throws IOException {
050            ExcelMetadata metadata = new ExcelMetadata();
051            HSSFWorkbook wb = new HSSFWorkbook(new POIFSFileSystem(stream));
052    
053            StringBuffer buff = new StringBuffer();
054            List<String> sheets = new ArrayList<String>();
055            
056            for (int sheetInd = 0; sheetInd < wb.getNumberOfSheets(); sheetInd++) {
057                sheets.add(wb.getSheetName(sheetInd));
058    
059                HSSFSheet worksheet = wb.getSheetAt(sheetInd);
060                int lastRowNum = worksheet.getLastRowNum();
061                
062                for (int rowNum = worksheet.getFirstRowNum(); rowNum <= lastRowNum; rowNum++) {
063                    HSSFRow row = worksheet.getRow(rowNum);
064                    
065                    // Empty rows are returned as null
066                    if (row == null) {
067                        continue;
068                    }
069                    
070                    int lastCellNum = row.getLastCellNum();
071                    for (int cellNum = row.getFirstCellNum(); cellNum < lastCellNum; cellNum++) {
072                        HSSFCell cell = row.getCell(cellNum);
073                        
074                        /*
075                         * Builds a string of body content from all string, numeric,
076                         * and formula values in the body of each worksheet.
077                         * 
078                         *  This code currently duplicates the POI 3.1 ExcelExtractor behavior of
079                         *  combining the body text from all worksheets into a single string.
080                         */
081                        switch (cell.getCellType()) {
082                            case HSSFCell.CELL_TYPE_STRING:
083                                buff.append(cell.getRichStringCellValue().getString());
084                                break;
085                            case HSSFCell.CELL_TYPE_NUMERIC:
086                                buff.append(cell.getNumericCellValue());
087                                break;
088                            case HSSFCell.CELL_TYPE_FORMULA:
089                                buff.append(cell.getCellFormula());
090                                break;
091                        }
092    
093                        HSSFComment comment = cell.getCellComment();
094                        if (comment != null) {
095                            // Filter out row delimiter characters from comment
096                            String commentText = comment.getString().getString().replace(ROW_DELIMITER_CHAR, ' ');
097                            
098                            buff.append(" [");
099                            buff.append(commentText);
100                            buff.append(" by ");
101                            buff.append(comment.getAuthor());
102                            buff.append(']');
103                        }
104                        
105                        if (cellNum < lastCellNum - 1) {
106                            buff.append(CELL_DELIMITER_CHAR);
107                        }
108                        else {
109                            buff.append(ROW_DELIMITER_CHAR);
110                        }
111                    }
112                }
113            }
114            
115            metadata.setText(buff.toString());
116            metadata.setSheets(sheets);
117            return metadata;
118        }
119    }