001 /*
002 * JBoss DNA (http://www.jboss.org/dna)
003 * See the COPYRIGHT.txt file distributed with this work for information
004 * regarding copyright ownership. Some portions may be licensed
005 * to Red Hat, Inc. under one or more contributor license agreements.
006 * See the AUTHORS.txt file in the distribution for a full listing of
007 * individual contributors.
008 *
009 * JBoss DNA is free software. Unless otherwise indicated, all code in JBoss DNA
010 * is licensed to you under the terms of the GNU Lesser General Public License as
011 * published by the Free Software Foundation; either version 2.1 of
012 * the License, or (at your option) any later version.
013 *
014 * JBoss DNA is distributed in the hope that it will be useful,
015 * but WITHOUT ANY WARRANTY; without even the implied warranty of
016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
017 * Lesser General Public License for more details.
018 *
019 * You should have received a copy of the GNU Lesser General Public
020 * License along with this software; if not, write to the Free
021 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
022 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
023 */
024 package org.jboss.dna.sequencer.msoffice;
025
026 import java.io.IOException;
027 import java.io.InputStream;
028 import java.util.Iterator;
029 import java.util.List;
030 import org.jboss.dna.graph.sequencer.StreamSequencerContext;
031 import org.jboss.dna.graph.sequencer.SequencerOutput;
032 import org.jboss.dna.graph.sequencer.StreamSequencer;
033 import org.jboss.dna.sequencer.msoffice.excel.ExcelMetadata;
034 import org.jboss.dna.sequencer.msoffice.excel.ExcelMetadataReader;
035 import org.jboss.dna.sequencer.msoffice.powerpoint.PowerPointMetadataReader;
036 import org.jboss.dna.sequencer.msoffice.powerpoint.SlideMetadata;
037 import org.jboss.dna.sequencer.msoffice.word.WordMetadata;
038 import org.jboss.dna.sequencer.msoffice.word.WordMetadataReader;
039
040 /**
041 * A sequencer that processes the content of an MS Office document, extracts the metadata for the file, and then writes that
042 * metadata to the repository.
043 * <p>
044 * This sequencer produces data that corresponds to the following structure:
045 * <ul>
046 * <li><strong>msoffice:metadata</strong> node of type <code>msoffice:metadata</code>
047 * <ul>
048 * <li><strong>msoffice:title</strong> optional string property for the title of the documnt</li>
049 * <li><strong>msoffice:subject</strong> optional string property for the subject of the document</li>
050 * <li><strong>msoffice:author</strong> optional string property for the author of the document</li>
051 * <li><strong>msoffice:keywords</strong> optional string property for the document keywords</li>
052 * <li><strong>msoffice:comment</strong> optional string property for the document comment</li>
053 * <li><strong>msoffice:template</strong> optional string property for the template from which this document originates</li>
054 * <li><strong>msoffice:last_saved_by</strong> optional string property for the person that last saved this document</li>
055 * <li><strong>msoffice:revision</strong> optional string property for this document revision</li>
056 * <li><strong>msoffice:total_editing_time</strong> optional long property for the length this document has been edited</li>
057 * <li><strong>msoffice:last_printed</strong> optional date property for the date of last printing this document</li>
058 * <li><strong>msoffice:created</strong> date property for the date of creation of the document</li>
059 * <li><strong>msoffice:saved</strong> date property for the date of last save of this document</li>
060 * <li><strong>msoffice:pages</strong> long property for the number of pages of this document</li>
061 * <li><strong>msoffice:words</strong> long property for the number of words in this document</li>
062 * <li><strong>msoffice:characters</strong> long property for the number of characters in this document</li>
063 * <li><strong>msoffice:creating_application</strong> string property for the application used to create this document</li>
064 * <li><strong>msoffice:thumbnail</strong> optional binary property for the thumbanail of this document</li>
065 * <li><strong>msoffice:full_contents</strong> optional String property holding the text contents of an excel file</li>
066 * <li><strong>msoffice:sheet_name</strong> optional String property for the name of a sheet in excel (multiple)</li>
067 * </ul>
068 * </li>
069 * <li><strong>msoffice:slide</strong> node of type <code>msoffice:pptslide</code>
070 * <ul>
071 * <li><strong>msoffice:title</strong> optional String property for the title of a slide</li>
072 * <li><strong>msoffice:notes</strong> optional String property for the notes of a slide</li>
073 * <li><strong>msoffice:text</strong> optional String property for the text of a slide</li>
074 * <li><strong>msoffice:thumbnail</strong> optional binary property for the thumbnail of a slide (PNG image)</li>
075 * </ul>
076 * </li>
077 * </ul>
078 * </p>
079 *
080 * @author Michael Trezzi
081 * @author John Verhaeg
082 */
083 public class MSOfficeMetadataSequencer implements StreamSequencer {
084
085 public static final String METADATA_NODE = "msoffice:metadata";
086 public static final String MSOFFICE_PRIMARY_TYPE = "jcr:primaryType";
087 public static final String MSOFFICE_TITLE = "msoffice:title";
088 public static final String MSOFFICE_SUBJECT = "msoffice:subject";
089 public static final String MSOFFICE_AUTHOR = "msoffice:author";
090 public static final String MSOFFICE_KEYWORDS = "msoffice:keywords";
091 public static final String MSOFFICE_COMMENT = "msoffice:comment";
092 public static final String MSOFFICE_TEMPLATE = "msoffice:template";
093 public static final String MSOFFICE_LAST_SAVED_BY = "msoffice:last_saved_by";
094 public static final String MSOFFICE_REVISION = "msoffice:revision";
095 public static final String MSOFFICE_TOTAL_EDITING_TIME = "msoffice:total_editing_time";
096 public static final String MSOFFICE_LAST_PRINTED = "msoffice:last_printed";
097 public static final String MSOFFICE_CREATED = "msoffice:created";
098 public static final String MSOFFICE_SAVED = "msoffice:saved";
099 public static final String MSOFFICE_PAGES = "msoffice:pages";
100 public static final String MSOFFICE_WORDS = "msoffice:words";
101 public static final String MSOFFICE_CHARACTERS = "msoffice:characters";
102 public static final String MSOFFICE_CREATING_APPLICATION = "msoffice:creating_application";
103 public static final String MSOFFICE_THUMBNAIL = "msoffice:thumbnail";
104
105 // PowerPoint specific
106 public static final String POWERPOINT_SLIDE_NODE = "msoffice:slide";
107 public static final String SLIDE_TITLE = "msoffice:title";
108 public static final String SLIDE_TEXT = "msoffice:text";
109 public static final String SLIDE_NOTES = "msoffice:notes";
110 public static final String SLIDE_THUMBNAIL = "msoffice:thumbnail";
111
112 // Excel specific
113 public static final String EXCEL_FULL_CONTENT = "msoffice:full_contents";
114 public static final String EXCEL_SHEET_NAME = "msoffice:sheet_name";
115
116 // Word specific
117 public static final String WORD_HEADING_NODE = "msoffice:heading";
118 public static final String WORD_HEADING_NAME = "msoffice:heading_name";
119 public static final String WORD_HEADING_LEVEL = "msoffice:heading_level";
120
121 /**
122 * {@inheritDoc}
123 */
124 public void sequence( InputStream stream,
125 SequencerOutput output,
126 StreamSequencerContext context ) {
127
128 MSOfficeMetadata metadata = MSOfficeMetadataReader.instance(stream);
129
130 String mimeType = context.getMimeType();
131
132 if (metadata != null) {
133 output.setProperty(METADATA_NODE, MSOFFICE_PRIMARY_TYPE, "msoffice:metadata");
134 output.setProperty(METADATA_NODE, MSOFFICE_TITLE, metadata.getTitle());
135 output.setProperty(METADATA_NODE, MSOFFICE_SUBJECT, metadata.getSubject());
136 output.setProperty(METADATA_NODE, MSOFFICE_AUTHOR, metadata.getAuthor());
137 output.setProperty(METADATA_NODE, MSOFFICE_KEYWORDS, metadata.getKeywords());
138 output.setProperty(METADATA_NODE, MSOFFICE_COMMENT, metadata.getComment());
139 output.setProperty(METADATA_NODE, MSOFFICE_TEMPLATE, metadata.getTemplate());
140 output.setProperty(METADATA_NODE, MSOFFICE_LAST_SAVED_BY, metadata.getLastSavedBy());
141 output.setProperty(METADATA_NODE, MSOFFICE_REVISION, metadata.getRevision());
142 output.setProperty(METADATA_NODE, MSOFFICE_TOTAL_EDITING_TIME, metadata.getTotalEditingTime());
143 output.setProperty(METADATA_NODE, MSOFFICE_LAST_PRINTED, metadata.getLastPrinted());
144 output.setProperty(METADATA_NODE, MSOFFICE_CREATED, metadata.getCreated());
145 output.setProperty(METADATA_NODE, MSOFFICE_SAVED, metadata.getSaved());
146 output.setProperty(METADATA_NODE, MSOFFICE_PAGES, metadata.getPages());
147 output.setProperty(METADATA_NODE, MSOFFICE_WORDS, metadata.getWords());
148 output.setProperty(METADATA_NODE, MSOFFICE_CHARACTERS, metadata.getCharacters());
149 output.setProperty(METADATA_NODE, MSOFFICE_CREATING_APPLICATION, metadata.getCreatingApplication());
150 output.setProperty(METADATA_NODE, MSOFFICE_THUMBNAIL, metadata.getThumbnail());
151
152 } else {
153 return;
154 }
155
156 // process PowerPoint specific metadata
157 if (mimeType.equals("application/vnd.ms-powerpoint")) { // replace true with check if it's ppt file being sequenced
158 try {
159 List<SlideMetadata> ppt_metadata = PowerPointMetadataReader.instance(stream);
160 if (ppt_metadata != null) {
161 for (SlideMetadata sm : ppt_metadata) {
162 output.setProperty(METADATA_NODE + "/" + POWERPOINT_SLIDE_NODE, SLIDE_TITLE, sm.getTitle());
163 output.setProperty(METADATA_NODE + "/" + POWERPOINT_SLIDE_NODE, SLIDE_TEXT, sm.getText());
164 output.setProperty(METADATA_NODE + "/" + POWERPOINT_SLIDE_NODE, SLIDE_NOTES, sm.getNotes());
165 output.setProperty(METADATA_NODE + "/" + POWERPOINT_SLIDE_NODE, SLIDE_THUMBNAIL, sm.getThumbnail());
166 }
167 }
168 } catch (IOException e) {
169 // There was an error reading, so log and continue ...
170 context.getLogger(this.getClass()).debug(e, "Error while extracting the PowerPoint metadata");
171 }
172 }
173
174 if (mimeType.equals("application/vnd.ms-word")) {
175 // Sometime in the future this will sequence WORD Table of contents.
176 try {
177 WordMetadata wordMetadata = WordMetadataReader.instance(stream);
178
179 for (Iterator<WordMetadata.WordHeading> iter = wordMetadata.getHeadings().iterator(); iter.hasNext(); ) {
180 WordMetadata.WordHeading heading = iter.next();
181
182 output.setProperty(METADATA_NODE + "/" + WORD_HEADING_NODE, WORD_HEADING_NAME, heading.getText());
183 output.setProperty(METADATA_NODE + "/" + WORD_HEADING_NODE, WORD_HEADING_LEVEL, heading.getHeaderLevel());
184
185 }
186
187 } catch (IOException e) {
188 // There was an error reading, so log and continue ...
189 context.getLogger(this.getClass()).debug(e, "Error while extracting the Word document metadata");
190 }
191
192 }
193
194 if (mimeType.equals("application/vnd.ms-excel")) {
195 try {
196 ExcelMetadata excel_metadata = ExcelMetadataReader.instance(stream);
197 if (excel_metadata != null) {
198 output.setProperty(METADATA_NODE, EXCEL_FULL_CONTENT, excel_metadata.getText());
199 for (String sheet : excel_metadata.getSheets()) {
200 output.setProperty(METADATA_NODE, EXCEL_SHEET_NAME, sheet);
201 }
202 }
203 } catch (IOException e) {
204 // There was an error reading, so log and continue ...
205 context.getLogger(this.getClass()).debug(e, "Error while extracting the Excel metadata");
206 }
207 }
208 }
209 }