1 /*
2 * ModeShape (http://www.modeshape.org)
3 * See the COPYRIGHT.txt file distributed with this work for information
4 * regarding copyright ownership. Some portions may be licensed
5 * to Red Hat, Inc. under one or more contributor license agreements.
6 * See the AUTHORS.txt file in the distribution for a full listing of
7 * individual contributors.
8 *
9 * ModeShape is free software. Unless otherwise indicated, all code in ModeShape
10 * is licensed to you under the terms of the GNU Lesser General Public License as
11 * published by the Free Software Foundation; either version 2.1 of
12 * the License, or (at your option) any later version.
13 *
14 * ModeShape is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this software; if not, write to the Free
21 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
22 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
23 */
24 package org.modeshape.sequencer.msoffice;
25
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.List;
import org.modeshape.graph.JcrLexicon;
import org.modeshape.graph.property.Path;
import org.modeshape.graph.property.PathFactory;
import org.modeshape.graph.sequencer.SequencerOutput;
import org.modeshape.graph.sequencer.StreamSequencer;
import org.modeshape.graph.sequencer.StreamSequencerContext;
import org.modeshape.sequencer.msoffice.excel.ExcelMetadata;
import org.modeshape.sequencer.msoffice.excel.ExcelMetadataReader;
import org.modeshape.sequencer.msoffice.powerpoint.PowerPointMetadataReader;
import org.modeshape.sequencer.msoffice.powerpoint.SlideMetadata;
import org.modeshape.sequencer.msoffice.word.WordMetadata;
import org.modeshape.sequencer.msoffice.word.WordMetadataReader;
42
43 /**
44 * A sequencer that processes the content of an MS Office document, extracts the metadata for the file, and then writes that
45 * metadata to the repository.
46 * <p>
47 * This sequencer produces data that corresponds to the following structure:
48 * <ul>
49 * <li><strong>msoffice:metadata</strong> node of type <code>msoffice:metadata</code>
50 * <ul>
51 * <li><strong>msoffice:title</strong> optional string property for the title of the documnt</li>
52 * <li><strong>msoffice:subject</strong> optional string property for the subject of the document</li>
53 * <li><strong>msoffice:author</strong> optional string property for the author of the document</li>
54 * <li><strong>msoffice:keywords</strong> optional string property for the document keywords</li>
55 * <li><strong>msoffice:comment</strong> optional string property for the document comment</li>
56 * <li><strong>msoffice:template</strong> optional string property for the template from which this document originates</li>
57 * <li><strong>msoffice:last_saved_by</strong> optional string property for the person that last saved this document</li>
58 * <li><strong>msoffice:revision</strong> optional string property for this document revision</li>
59 * <li><strong>msoffice:total_editing_time</strong> optional long property for the length this document has been edited</li>
60 * <li><strong>msoffice:last_printed</strong> optional date property for the date of last printing this document</li>
61 * <li><strong>msoffice:created</strong> date property for the date of creation of the document</li>
62 * <li><strong>msoffice:saved</strong> date property for the date of last save of this document</li>
63 * <li><strong>msoffice:pages</strong> long property for the number of pages of this document</li>
64 * <li><strong>msoffice:words</strong> long property for the number of words in this document</li>
65 * <li><strong>msoffice:characters</strong> long property for the number of characters in this document</li>
66 * <li><strong>msoffice:creating_application</strong> string property for the application used to create this document</li>
67 * <li><strong>msoffice:thumbnail</strong> optional binary property for the thumbanail of this document</li>
68 * <li><strong>msoffice:full_contents</strong> optional String property holding the text contents of an excel file</li>
69 * <li><strong>msoffice:sheet_name</strong> optional String property for the name of a sheet in excel (multiple)</li>
70 * </ul>
71 * </li>
72 * <li><strong>msoffice:slide</strong> node of type <code>msoffice:pptslide</code>
73 * <ul>
74 * <li><strong>msoffice:title</strong> optional String property for the title of a slide</li>
75 * <li><strong>msoffice:notes</strong> optional String property for the notes of a slide</li>
76 * <li><strong>msoffice:text</strong> optional String property for the text of a slide</li>
77 * <li><strong>msoffice:thumbnail</strong> optional binary property for the thumbnail of a slide (PNG image)</li>
78 * </ul>
79 * </li>
80 * </ul>
81 * </p>
82 */
83 public class MSOfficeMetadataSequencer implements StreamSequencer {
84
85 /**
86 * {@inheritDoc}
87 */
88 public void sequence( InputStream stream,
89 SequencerOutput output,
90 StreamSequencerContext context ) {
91
92 MSOfficeMetadata metadata = MSOfficeMetadataReader.instance(stream);
93
94 String mimeType = context.getMimeType();
95 PathFactory pathFactory = context.getValueFactories().getPathFactory();
96 Path metadataNode = pathFactory.create(MSOfficeMetadataLexicon.METADATA_NODE);
97
98 if (metadata != null) {
99 output.setProperty(metadataNode, JcrLexicon.PRIMARY_TYPE, MSOfficeMetadataLexicon.METADATA_NODE);
100 output.setProperty(metadataNode, MSOfficeMetadataLexicon.TITLE, metadata.getTitle());
101 output.setProperty(metadataNode, MSOfficeMetadataLexicon.SUBJECT, metadata.getSubject());
102 output.setProperty(metadataNode, MSOfficeMetadataLexicon.AUTHOR, metadata.getAuthor());
103 output.setProperty(metadataNode, MSOfficeMetadataLexicon.KEYWORDS, metadata.getKeywords());
104 output.setProperty(metadataNode, MSOfficeMetadataLexicon.COMMENT, metadata.getComment());
105 output.setProperty(metadataNode, MSOfficeMetadataLexicon.TEMPLATE, metadata.getTemplate());
106 output.setProperty(metadataNode, MSOfficeMetadataLexicon.LAST_SAVED_BY, metadata.getLastSavedBy());
107 output.setProperty(metadataNode, MSOfficeMetadataLexicon.REVISION, metadata.getRevision());
108 output.setProperty(metadataNode, MSOfficeMetadataLexicon.TOTAL_EDITING_TIME, metadata.getTotalEditingTime());
109 output.setProperty(metadataNode, MSOfficeMetadataLexicon.LAST_PRINTED, metadata.getLastPrinted());
110 output.setProperty(metadataNode, MSOfficeMetadataLexicon.CREATED, metadata.getCreated());
111 output.setProperty(metadataNode, MSOfficeMetadataLexicon.SAVED, metadata.getSaved());
112 output.setProperty(metadataNode, MSOfficeMetadataLexicon.PAGES, metadata.getPages());
113 output.setProperty(metadataNode, MSOfficeMetadataLexicon.WORDS, metadata.getWords());
114 output.setProperty(metadataNode, MSOfficeMetadataLexicon.CHARACTERS, metadata.getCharacters());
115 output.setProperty(metadataNode, MSOfficeMetadataLexicon.CREATING_APPLICATION, metadata.getCreatingApplication());
116 output.setProperty(metadataNode, MSOfficeMetadataLexicon.THUMBNAIL, metadata.getThumbnail());
117
118 } else {
119 return;
120 }
121
122 // process PowerPoint specific metadata
123 if (mimeType.equals("application/vnd.ms-powerpoint")) { // replace true with check if it's ppt file being sequenced
124 try {
125 List<SlideMetadata> ppt_metadata = PowerPointMetadataReader.instance(stream);
126 if (ppt_metadata != null) {
127 Path pptPath = pathFactory.create(metadataNode, MSOfficeMetadataLexicon.SLIDE);
128 for (SlideMetadata sm : ppt_metadata) {
129 output.setProperty(pptPath, MSOfficeMetadataLexicon.TITLE, sm.getTitle());
130 output.setProperty(pptPath, MSOfficeMetadataLexicon.TEXT, sm.getText());
131 output.setProperty(pptPath, MSOfficeMetadataLexicon.NOTES, sm.getNotes());
132 output.setProperty(pptPath, MSOfficeMetadataLexicon.THUMBNAIL, sm.getThumbnail());
133 }
134 }
135 } catch (IOException e) {
136 // There was an error reading, so log and continue ...
137 context.getLogger(this.getClass()).debug(e, "Error while extracting the PowerPoint metadata");
138 }
139 }
140
141 if (mimeType.equals("application/vnd.ms-word")) {
142 // Sometime in the future this will sequence WORD Table of contents.
143 try {
144 WordMetadata wordMetadata = WordMetadataReader.instance(stream);
145 Path wordPath = pathFactory.create(metadataNode, MSOfficeMetadataLexicon.HEADING_NODE);
146
147 for (Iterator<WordMetadata.WordHeading> iter = wordMetadata.getHeadings().iterator(); iter.hasNext();) {
148 WordMetadata.WordHeading heading = iter.next();
149
150 output.setProperty(wordPath, MSOfficeMetadataLexicon.HEADING_NAME, heading.getText());
151 output.setProperty(wordPath, MSOfficeMetadataLexicon.HEADING_LEVEL, heading.getHeaderLevel());
152
153 }
154
155 } catch (IOException e) {
156 // There was an error reading, so log and continue ...
157 context.getLogger(this.getClass()).debug(e, "Error while extracting the Word document metadata");
158 }
159
160 }
161
162 if (mimeType.equals("application/vnd.ms-excel")) {
163 try {
164 ExcelMetadata excel_metadata = ExcelMetadataReader.instance(stream);
165 if (excel_metadata != null) {
166 output.setProperty(metadataNode, MSOfficeMetadataLexicon.FULL_CONTENT, excel_metadata.getText());
167 for (String sheet : excel_metadata.getSheets()) {
168 output.setProperty(metadataNode, MSOfficeMetadataLexicon.SHEET_NAME, sheet);
169 }
170 }
171 } catch (IOException e) {
172 // There was an error reading, so log and continue ...
173 context.getLogger(this.getClass()).debug(e, "Error while extracting the Excel metadata");
174 }
175 }
176 }
177 }