001    /*
002     * JBoss DNA (http://www.jboss.org/dna)
003     * See the COPYRIGHT.txt file distributed with this work for information
004     * regarding copyright ownership.  Some portions may be licensed
005     * to Red Hat, Inc. under one or more contributor license agreements.
006    * See the AUTHORS.txt file in the distribution for a full listing of 
007    * individual contributors.
008     *
009     * JBoss DNA is free software. Unless otherwise indicated, all code in JBoss DNA
010     * is licensed to you under the terms of the GNU Lesser General Public License as
011     * published by the Free Software Foundation; either version 2.1 of
012     * the License, or (at your option) any later version.
013     *
014     * JBoss DNA is distributed in the hope that it will be useful,
015     * but WITHOUT ANY WARRANTY; without even the implied warranty of
016     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
017     * Lesser General Public License for more details.
018     *
019     * You should have received a copy of the GNU Lesser General Public
020     * License along with this software; if not, write to the Free
021     * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
022     * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
023     */
024    package org.jboss.dna.sequencer.msoffice;
025    
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.List;
import org.jboss.dna.graph.sequencer.SequencerOutput;
import org.jboss.dna.graph.sequencer.StreamSequencer;
import org.jboss.dna.graph.sequencer.StreamSequencerContext;
import org.jboss.dna.sequencer.msoffice.excel.ExcelMetadata;
import org.jboss.dna.sequencer.msoffice.excel.ExcelMetadataReader;
import org.jboss.dna.sequencer.msoffice.powerpoint.PowerPointMetadataReader;
import org.jboss.dna.sequencer.msoffice.powerpoint.SlideMetadata;
import org.jboss.dna.sequencer.msoffice.word.WordMetadata;
import org.jboss.dna.sequencer.msoffice.word.WordMetadataReader;
039    
040    /**
041     * A sequencer that processes the content of an MS Office document, extracts the metadata for the file, and then writes that
042     * metadata to the repository.
043     * <p>
044     * This sequencer produces data that corresponds to the following structure:
045     * <ul>
046     * <li><strong>msoffice:metadata</strong> node of type <code>msoffice:metadata</code>
047     * <ul>
048     * <li><strong>msoffice:title</strong> optional string property for the title of the documnt</li>
049     * <li><strong>msoffice:subject</strong> optional string property for the subject of the document</li>
050     * <li><strong>msoffice:author</strong> optional string property for the author of the document</li>
051     * <li><strong>msoffice:keywords</strong> optional string property for the document keywords</li>
052     * <li><strong>msoffice:comment</strong> optional string property for the document comment</li>
053     * <li><strong>msoffice:template</strong> optional string property for the template from which this document originates</li>
054     * <li><strong>msoffice:last_saved_by</strong> optional string property for the person that last saved this document</li>
055     * <li><strong>msoffice:revision</strong> optional string property for this document revision</li>
056     * <li><strong>msoffice:total_editing_time</strong> optional long property for the length this document has been edited</li>
057     * <li><strong>msoffice:last_printed</strong> optional date property for the date of last printing this document</li>
058     * <li><strong>msoffice:created</strong> date property for the date of creation of the document</li>
059     * <li><strong>msoffice:saved</strong> date property for the date of last save of this document</li>
060     * <li><strong>msoffice:pages</strong> long property for the number of pages of this document</li>
061     * <li><strong>msoffice:words</strong> long property for the number of words in this document</li>
062     * <li><strong>msoffice:characters</strong> long property for the number of characters in this document</li>
063     * <li><strong>msoffice:creating_application</strong> string property for the application used to create this document</li>
064     * <li><strong>msoffice:thumbnail</strong> optional binary property for the thumbanail of this document</li>
065     * <li><strong>msoffice:full_contents</strong> optional String property holding the text contents of an excel file</li>
066     * <li><strong>msoffice:sheet_name</strong> optional String property for the name of a sheet in excel (multiple)</li>
067     * </ul>
068     * </li>
069     * <li><strong>msoffice:slide</strong> node of type <code>msoffice:pptslide</code>
070     * <ul>
071     * <li><strong>msoffice:title</strong> optional String property for the title of a slide</li>
072     * <li><strong>msoffice:notes</strong> optional String property for the notes of a slide</li>
073     * <li><strong>msoffice:text</strong> optional String property for the text of a slide</li>
074     * <li><strong>msoffice:thumbnail</strong> optional binary property for the thumbnail of a slide (PNG image)</li>
075     * </ul>
076     * </li>
077     * </ul>
078     * </p>
079     * 
080     * @author Michael Trezzi
081     * @author John Verhaeg
082     */
083    public class MSOfficeMetadataSequencer implements StreamSequencer {
084    
085        public static final String METADATA_NODE = "msoffice:metadata";
086        public static final String MSOFFICE_PRIMARY_TYPE = "jcr:primaryType";
087        public static final String MSOFFICE_TITLE = "msoffice:title";
088        public static final String MSOFFICE_SUBJECT = "msoffice:subject";
089        public static final String MSOFFICE_AUTHOR = "msoffice:author";
090        public static final String MSOFFICE_KEYWORDS = "msoffice:keywords";
091        public static final String MSOFFICE_COMMENT = "msoffice:comment";
092        public static final String MSOFFICE_TEMPLATE = "msoffice:template";
093        public static final String MSOFFICE_LAST_SAVED_BY = "msoffice:last_saved_by";
094        public static final String MSOFFICE_REVISION = "msoffice:revision";
095        public static final String MSOFFICE_TOTAL_EDITING_TIME = "msoffice:total_editing_time";
096        public static final String MSOFFICE_LAST_PRINTED = "msoffice:last_printed";
097        public static final String MSOFFICE_CREATED = "msoffice:created";
098        public static final String MSOFFICE_SAVED = "msoffice:saved";
099        public static final String MSOFFICE_PAGES = "msoffice:pages";
100        public static final String MSOFFICE_WORDS = "msoffice:words";
101        public static final String MSOFFICE_CHARACTERS = "msoffice:characters";
102        public static final String MSOFFICE_CREATING_APPLICATION = "msoffice:creating_application";
103        public static final String MSOFFICE_THUMBNAIL = "msoffice:thumbnail";
104    
105        // PowerPoint specific
106        public static final String POWERPOINT_SLIDE_NODE = "msoffice:slide";
107        public static final String SLIDE_TITLE = "msoffice:title";
108        public static final String SLIDE_TEXT = "msoffice:text";
109        public static final String SLIDE_NOTES = "msoffice:notes";
110        public static final String SLIDE_THUMBNAIL = "msoffice:thumbnail";
111    
112        // Excel specific
113        public static final String EXCEL_FULL_CONTENT = "msoffice:full_contents";
114        public static final String EXCEL_SHEET_NAME = "msoffice:sheet_name";
115        
116        // Word specific
117        public static final String WORD_HEADING_NODE = "msoffice:heading";
118        public static final String WORD_HEADING_NAME = "msoffice:heading_name";
119        public static final String WORD_HEADING_LEVEL = "msoffice:heading_level";
120    
121        /**
122         * {@inheritDoc}
123         */
124        public void sequence( InputStream stream,
125                              SequencerOutput output,
126                              StreamSequencerContext context ) {
127    
128            MSOfficeMetadata metadata = MSOfficeMetadataReader.instance(stream);
129    
130            String mimeType = context.getMimeType();
131    
132            if (metadata != null) {
133                output.setProperty(METADATA_NODE, MSOFFICE_PRIMARY_TYPE, "msoffice:metadata");
134                output.setProperty(METADATA_NODE, MSOFFICE_TITLE, metadata.getTitle());
135                output.setProperty(METADATA_NODE, MSOFFICE_SUBJECT, metadata.getSubject());
136                output.setProperty(METADATA_NODE, MSOFFICE_AUTHOR, metadata.getAuthor());
137                output.setProperty(METADATA_NODE, MSOFFICE_KEYWORDS, metadata.getKeywords());
138                output.setProperty(METADATA_NODE, MSOFFICE_COMMENT, metadata.getComment());
139                output.setProperty(METADATA_NODE, MSOFFICE_TEMPLATE, metadata.getTemplate());
140                output.setProperty(METADATA_NODE, MSOFFICE_LAST_SAVED_BY, metadata.getLastSavedBy());
141                output.setProperty(METADATA_NODE, MSOFFICE_REVISION, metadata.getRevision());
142                output.setProperty(METADATA_NODE, MSOFFICE_TOTAL_EDITING_TIME, metadata.getTotalEditingTime());
143                output.setProperty(METADATA_NODE, MSOFFICE_LAST_PRINTED, metadata.getLastPrinted());
144                output.setProperty(METADATA_NODE, MSOFFICE_CREATED, metadata.getCreated());
145                output.setProperty(METADATA_NODE, MSOFFICE_SAVED, metadata.getSaved());
146                output.setProperty(METADATA_NODE, MSOFFICE_PAGES, metadata.getPages());
147                output.setProperty(METADATA_NODE, MSOFFICE_WORDS, metadata.getWords());
148                output.setProperty(METADATA_NODE, MSOFFICE_CHARACTERS, metadata.getCharacters());
149                output.setProperty(METADATA_NODE, MSOFFICE_CREATING_APPLICATION, metadata.getCreatingApplication());
150                output.setProperty(METADATA_NODE, MSOFFICE_THUMBNAIL, metadata.getThumbnail());
151    
152            } else {
153                return;
154            }
155    
156            // process PowerPoint specific metadata
157            if (mimeType.equals("application/vnd.ms-powerpoint")) { // replace true with check if it's ppt file being sequenced
158                try {
159                    List<SlideMetadata> ppt_metadata = PowerPointMetadataReader.instance(stream);
160                    if (ppt_metadata != null) {
161                        for (SlideMetadata sm : ppt_metadata) {
162                            output.setProperty(METADATA_NODE + "/" + POWERPOINT_SLIDE_NODE, SLIDE_TITLE, sm.getTitle());
163                            output.setProperty(METADATA_NODE + "/" + POWERPOINT_SLIDE_NODE, SLIDE_TEXT, sm.getText());
164                            output.setProperty(METADATA_NODE + "/" + POWERPOINT_SLIDE_NODE, SLIDE_NOTES, sm.getNotes());
165                            output.setProperty(METADATA_NODE + "/" + POWERPOINT_SLIDE_NODE, SLIDE_THUMBNAIL, sm.getThumbnail());
166                        }
167                    }
168                } catch (IOException e) {
169                    // There was an error reading, so log and continue ...
170                    context.getLogger(this.getClass()).debug(e, "Error while extracting the PowerPoint metadata");
171                }
172            }
173    
174            if (mimeType.equals("application/vnd.ms-word")) {
175                // Sometime in the future this will sequence WORD Table of contents.
176                try {
177                    WordMetadata wordMetadata = WordMetadataReader.instance(stream);
178    
179                    for (Iterator<WordMetadata.WordHeading> iter = wordMetadata.getHeadings().iterator(); iter.hasNext(); ) {
180                        WordMetadata.WordHeading heading = iter.next();
181                        
182                        output.setProperty(METADATA_NODE + "/" + WORD_HEADING_NODE, WORD_HEADING_NAME, heading.getText());
183                        output.setProperty(METADATA_NODE + "/" + WORD_HEADING_NODE, WORD_HEADING_LEVEL, heading.getHeaderLevel());
184                        
185                    }
186                    
187                } catch (IOException e) {
188                    // There was an error reading, so log and continue ...
189                    context.getLogger(this.getClass()).debug(e, "Error while extracting the Word document metadata");
190                }
191    
192            }
193    
194            if (mimeType.equals("application/vnd.ms-excel")) {
195                try {
196                    ExcelMetadata excel_metadata = ExcelMetadataReader.instance(stream);
197                    if (excel_metadata != null) {
198                        output.setProperty(METADATA_NODE, EXCEL_FULL_CONTENT, excel_metadata.getText());
199                        for (String sheet : excel_metadata.getSheets()) {
200                            output.setProperty(METADATA_NODE, EXCEL_SHEET_NAME, sheet);
201                        }
202                    }
203                } catch (IOException e) {
204                    // There was an error reading, so log and continue ...
205                    context.getLogger(this.getClass()).debug(e, "Error while extracting the Excel metadata");
206                }
207            }
208        }
209    }