View Javadoc

1   /*
2    * ModeShape (http://www.modeshape.org)
3    * See the COPYRIGHT.txt file distributed with this work for information
4    * regarding copyright ownership.  Some portions may be licensed
5    * to Red Hat, Inc. under one or more contributor license agreements.
6   * See the AUTHORS.txt file in the distribution for a full listing of 
7   * individual contributors.
8    *
9    * ModeShape is free software. Unless otherwise indicated, all code in ModeShape
10   * is licensed to you under the terms of the GNU Lesser General Public License as
11   * published by the Free Software Foundation; either version 2.1 of
12   * the License, or (at your option) any later version.
13   *
14   * ModeShape is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17   * Lesser General Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser General Public
20   * License along with this software; if not, write to the Free
21   * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
22   * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
23   */
24  package org.modeshape.sequencer.msoffice;
25  
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.modeshape.graph.JcrLexicon;
import org.modeshape.graph.property.Path;
import org.modeshape.graph.property.PathFactory;
import org.modeshape.graph.sequencer.SequencerOutput;
import org.modeshape.graph.sequencer.StreamSequencer;
import org.modeshape.graph.sequencer.StreamSequencerContext;
import org.modeshape.sequencer.msoffice.excel.ExcelMetadata;
import org.modeshape.sequencer.msoffice.excel.ExcelMetadataReader;
import org.modeshape.sequencer.msoffice.powerpoint.PowerPointMetadataReader;
import org.modeshape.sequencer.msoffice.powerpoint.SlideMetadata;
import org.modeshape.sequencer.msoffice.word.WordMetadata;
import org.modeshape.sequencer.msoffice.word.WordMetadataReader;
42  
43  /**
44   * A sequencer that processes the content of an MS Office document, extracts the metadata for the file, and then writes that
45   * metadata to the repository.
46   * <p>
47   * This sequencer produces data that corresponds to the following structure:
48   * <ul>
49   * <li><strong>msoffice:metadata</strong> node of type <code>msoffice:metadata</code>
50   * <ul>
51   * <li><strong>msoffice:title</strong> optional string property for the title of the documnt</li>
52   * <li><strong>msoffice:subject</strong> optional string property for the subject of the document</li>
53   * <li><strong>msoffice:author</strong> optional string property for the author of the document</li>
54   * <li><strong>msoffice:keywords</strong> optional string property for the document keywords</li>
55   * <li><strong>msoffice:comment</strong> optional string property for the document comment</li>
56   * <li><strong>msoffice:template</strong> optional string property for the template from which this document originates</li>
57   * <li><strong>msoffice:last_saved_by</strong> optional string property for the person that last saved this document</li>
58   * <li><strong>msoffice:revision</strong> optional string property for this document revision</li>
59   * <li><strong>msoffice:total_editing_time</strong> optional long property for the length this document has been edited</li>
60   * <li><strong>msoffice:last_printed</strong> optional date property for the date of last printing this document</li>
61   * <li><strong>msoffice:created</strong> date property for the date of creation of the document</li>
62   * <li><strong>msoffice:saved</strong> date property for the date of last save of this document</li>
63   * <li><strong>msoffice:pages</strong> long property for the number of pages of this document</li>
64   * <li><strong>msoffice:words</strong> long property for the number of words in this document</li>
65   * <li><strong>msoffice:characters</strong> long property for the number of characters in this document</li>
66   * <li><strong>msoffice:creating_application</strong> string property for the application used to create this document</li>
67   * <li><strong>msoffice:thumbnail</strong> optional binary property for the thumbanail of this document</li>
68   * <li><strong>msoffice:full_contents</strong> optional String property holding the text contents of an excel file</li>
69   * <li><strong>msoffice:sheet_name</strong> optional String property for the name of a sheet in excel (multiple)</li>
70   * </ul>
71   * </li>
72   * <li><strong>msoffice:slide</strong> node of type <code>msoffice:pptslide</code>
73   * <ul>
74   * <li><strong>msoffice:title</strong> optional String property for the title of a slide</li>
75   * <li><strong>msoffice:notes</strong> optional String property for the notes of a slide</li>
76   * <li><strong>msoffice:text</strong> optional String property for the text of a slide</li>
77   * <li><strong>msoffice:thumbnail</strong> optional binary property for the thumbnail of a slide (PNG image)</li>
78   * </ul>
79   * </li>
80   * </ul>
81   * </p>
82   */
83  public class MSOfficeMetadataSequencer implements StreamSequencer {
84  
85      /**
86       * {@inheritDoc}
87       */
88      public void sequence( InputStream stream,
89                            SequencerOutput output,
90                            StreamSequencerContext context ) {
91  
92          MSOfficeMetadata metadata = MSOfficeMetadataReader.instance(stream);
93  
94          String mimeType = context.getMimeType();
95          PathFactory pathFactory = context.getValueFactories().getPathFactory();
96          Path metadataNode = pathFactory.create(MSOfficeMetadataLexicon.METADATA_NODE);
97  
98          if (metadata != null) {
99              output.setProperty(metadataNode, JcrLexicon.PRIMARY_TYPE, MSOfficeMetadataLexicon.METADATA_NODE);
100             output.setProperty(metadataNode, MSOfficeMetadataLexicon.TITLE, metadata.getTitle());
101             output.setProperty(metadataNode, MSOfficeMetadataLexicon.SUBJECT, metadata.getSubject());
102             output.setProperty(metadataNode, MSOfficeMetadataLexicon.AUTHOR, metadata.getAuthor());
103             output.setProperty(metadataNode, MSOfficeMetadataLexicon.KEYWORDS, metadata.getKeywords());
104             output.setProperty(metadataNode, MSOfficeMetadataLexicon.COMMENT, metadata.getComment());
105             output.setProperty(metadataNode, MSOfficeMetadataLexicon.TEMPLATE, metadata.getTemplate());
106             output.setProperty(metadataNode, MSOfficeMetadataLexicon.LAST_SAVED_BY, metadata.getLastSavedBy());
107             output.setProperty(metadataNode, MSOfficeMetadataLexicon.REVISION, metadata.getRevision());
108             output.setProperty(metadataNode, MSOfficeMetadataLexicon.TOTAL_EDITING_TIME, metadata.getTotalEditingTime());
109             output.setProperty(metadataNode, MSOfficeMetadataLexicon.LAST_PRINTED, metadata.getLastPrinted());
110             output.setProperty(metadataNode, MSOfficeMetadataLexicon.CREATED, metadata.getCreated());
111             output.setProperty(metadataNode, MSOfficeMetadataLexicon.SAVED, metadata.getSaved());
112             output.setProperty(metadataNode, MSOfficeMetadataLexicon.PAGES, metadata.getPages());
113             output.setProperty(metadataNode, MSOfficeMetadataLexicon.WORDS, metadata.getWords());
114             output.setProperty(metadataNode, MSOfficeMetadataLexicon.CHARACTERS, metadata.getCharacters());
115             output.setProperty(metadataNode, MSOfficeMetadataLexicon.CREATING_APPLICATION, metadata.getCreatingApplication());
116             output.setProperty(metadataNode, MSOfficeMetadataLexicon.THUMBNAIL, metadata.getThumbnail());
117 
118         } else {
119             return;
120         }
121 
122         // process PowerPoint specific metadata
123         if (mimeType.equals("application/vnd.ms-powerpoint")) { // replace true with check if it's ppt file being sequenced
124             try {
125                 List<SlideMetadata> ppt_metadata = PowerPointMetadataReader.instance(stream);
126                 if (ppt_metadata != null) {
127                     Path pptPath = pathFactory.create(metadataNode, MSOfficeMetadataLexicon.SLIDE);
128                     for (SlideMetadata sm : ppt_metadata) {
129                         output.setProperty(pptPath, MSOfficeMetadataLexicon.TITLE, sm.getTitle());
130                         output.setProperty(pptPath, MSOfficeMetadataLexicon.TEXT, sm.getText());
131                         output.setProperty(pptPath, MSOfficeMetadataLexicon.NOTES, sm.getNotes());
132                         output.setProperty(pptPath, MSOfficeMetadataLexicon.THUMBNAIL, sm.getThumbnail());
133                     }
134                 }
135             } catch (IOException e) {
136                 // There was an error reading, so log and continue ...
137                 context.getLogger(this.getClass()).debug(e, "Error while extracting the PowerPoint metadata");
138             }
139         }
140 
141         if (mimeType.equals("application/vnd.ms-word")) {
142             // Sometime in the future this will sequence WORD Table of contents.
143             try {
144                 WordMetadata wordMetadata = WordMetadataReader.instance(stream);
145                 Path wordPath = pathFactory.create(metadataNode, MSOfficeMetadataLexicon.HEADING_NODE);
146 
147                 for (Iterator<WordMetadata.WordHeading> iter = wordMetadata.getHeadings().iterator(); iter.hasNext();) {
148                     WordMetadata.WordHeading heading = iter.next();
149 
150                     output.setProperty(wordPath, MSOfficeMetadataLexicon.HEADING_NAME, heading.getText());
151                     output.setProperty(wordPath, MSOfficeMetadataLexicon.HEADING_LEVEL, heading.getHeaderLevel());
152 
153                 }
154 
155             } catch (IOException e) {
156                 // There was an error reading, so log and continue ...
157                 context.getLogger(this.getClass()).debug(e, "Error while extracting the Word document metadata");
158             }
159 
160         }
161 
162         if (mimeType.equals("application/vnd.ms-excel")) {
163             try {
164                 ExcelMetadata excel_metadata = ExcelMetadataReader.instance(stream);
165                 if (excel_metadata != null) {
166                     output.setProperty(metadataNode, MSOfficeMetadataLexicon.FULL_CONTENT, excel_metadata.getText());
167                     for (String sheet : excel_metadata.getSheets()) {
168                         output.setProperty(metadataNode, MSOfficeMetadataLexicon.SHEET_NAME, sheet);
169                     }
170                 }
171             } catch (IOException e) {
172                 // There was an error reading, so log and continue ...
173                 context.getLogger(this.getClass()).debug(e, "Error while extracting the Excel metadata");
174             }
175         }
176     }
177 }