View Javadoc
1   /*
2    * MMBase Lucene module
3    *
4    * The contents of this file are subject to the Mozilla Public License
5    * Version 1.0 (the "License"); you may not use this file except in
6    * compliance with the License. You may obtain a copy of the License at
7    * http://www.mozilla.org/MPL/
8    *
9    * Software distributed under the License is distributed on an "AS IS"
10   * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
11   * License for the specific language governing rights and limitations
12   * under the License.
13   */
14  package net.sf.mmapps.modules.lucenesearch;
15  
16  import java.io.ByteArrayInputStream;
17  import java.io.ByteArrayOutputStream;
18  import java.io.IOException;
19  import java.io.OutputStreamWriter;
20  import java.util.HashSet;
21  import java.util.Set;
22  
23  import org.apache.commons.logging.Log;
24  import org.apache.commons.logging.LogFactory;
25  import org.apache.lucene.document.DateField;
26  import org.apache.lucene.document.Field;
27  import org.mmbase.bridge.Node;
28  import org.mmbase.bridge.NodeManager;
29  import org.pdfbox.encryption.DocumentEncryption;
30  import org.pdfbox.exceptions.CryptographyException;
31  import org.pdfbox.exceptions.InvalidPasswordException;
32  import org.pdfbox.pdfparser.PDFParser;
33  import org.pdfbox.pdmodel.PDDocument;
34  import org.pdfbox.util.PDFTextStripper;
35  import org.textmining.text.extraction.WordExtractor;
36  
37  /***
38   * Field object that holds the name and type of the field to be indexed
39   * 
40   * @author Wouter Heijke
41   * @author R.W. van 't Veer
42   * @version $Revision: 1.1 $
43   */
44  public class DataField {
45      private static Log log = LogFactory.getLog(DataField.class);
46  
47      private String name = null;
48  
49      private String rename = null;
50  
51      private String type = null;
52  
53      private static final int FULLTEXT_DEFAULT = 0;
54  
55      private static final int FULLTEXT_TRUE = 1;
56  
57      private static final int FULLTEXT_FALSE = 2;
58  
59      private int fulltext = FULLTEXT_DEFAULT;
60  
61      private static final Set FULLTEXT_EXCLUDE_TYPES = new HashSet();
62      static {
63          FULLTEXT_EXCLUDE_TYPES.add("boolean");
64          FULLTEXT_EXCLUDE_TYPES.add("number");
65          FULLTEXT_EXCLUDE_TYPES.add("date");
66      }
67  
68      private static final String DATE_TYPE = "date";
69  
70      private static final String NUMBER_TYPE = "number";
71  
72      private static final String BINARY_TYPE = "binary";
73  
74      private static final String BOOLEAN_TYPE = "boolean";
75  
76      private static final String MMBASE_MIMETYPE_FIELD = "mimetype";
77  
78      private static final String MIMETYPE_PDF = "application/pdf";
79  
80      private static final String MIMETYPE_MSWORD = "application/msword";
81  
82      private static final String BOOLEAN_TYPE_TRUE = "1";
83  
84      private static final String BOOLEAN_TYPE_FALSE = "0";
85  
86      /***
87       * Collect the content from the field this object represents in a Lucene Field object
88       * 
89       * @param node Node to collect fields from
90       * @return Field filled with content and ready to be indexed
91       * @throws IOException
92       */
93      protected Field collectField(Node node) throws IOException {
94          String value = null;
95  
96          if (node != null) {
97              if (type != null) {
98                  if (type.equalsIgnoreCase(DATE_TYPE)) {
99                      int date = node.getIntValue(name);
100                     if (date > 0) {
101                         String dateField = DateField.timeToString(((date * 1000L)));
102                         return Field.Keyword(getFieldName(), dateField);
103                     }
104                 } else if (type.equalsIgnoreCase(NUMBER_TYPE)) {
105                     return Field.Keyword(getFieldName(), node.getStringValue(name));
106                 } else if (type.equalsIgnoreCase(BOOLEAN_TYPE)) {
107                     String bType = BOOLEAN_TYPE_FALSE;
108                     NodeManager nm = node.getNodeManager();
109                     org.mmbase.bridge.Field fld = nm.getField(name);
110                     if (fld.getType() == org.mmbase.bridge.Field.TYPE_STRING) {
111                         String s = node.getStringValue(name);
112                         if (s != null && s.length() > 0) {
113                             bType = BOOLEAN_TYPE_TRUE;
114                         }
115                     } else {
116                         if (node.getBooleanValue(name)) {
117                             bType = BOOLEAN_TYPE_TRUE;
118                         }
119                     }
120                     return Field.Keyword(getFieldName(), bType);
121                 } else if (type.equalsIgnoreCase(BINARY_TYPE)) {
122                     String mimetypeCheck = node.getStringValue(MMBASE_MIMETYPE_FIELD);
123 
124                     if (mimetypeCheck.equalsIgnoreCase(MIMETYPE_PDF)) {
125                         byte[] rawPdf = node.getByteValue(name);
126                         ByteArrayOutputStream out = null;
127                         PDDocument pdfDocument = null;
128                         try {
129                             ByteArrayInputStream input = new ByteArrayInputStream(rawPdf);
130 
131                             PDFParser parser = new PDFParser(input);
132                             parser.parse();
133 
134                             pdfDocument = parser.getPDDocument();
135 
136                             if (pdfDocument.isEncrypted()) {
137                                 DocumentEncryption decryptor = new DocumentEncryption(pdfDocument);
138                                 // Just try using the default password and move
139                                 // on
140                                 decryptor.decryptDocument("");
141                             }
142                             // create a tmp output stream with the size of the
143                             // content
144                             out = new ByteArrayOutputStream();
145                             OutputStreamWriter writer = new OutputStreamWriter(out);
146                             PDFTextStripper stripper = new PDFTextStripper();
147                             stripper.writeText(pdfDocument, writer);
148                             writer.close();
149                             out.flush();
150 
151                             // get the return value
152                             value = out.toString();
153                         } catch (CryptographyException e) {
154                             throw new IOException("Error decrypting document(" + name + "): " + e);
155                         } catch (InvalidPasswordException e) {
156                             // they didn't suppply a password and the default of
157                             // "" was wrong.
158                             throw new IOException("Error: The document(" + name
159                                     + ") is encrypted and will not be indexed.");
160                         } finally {
161                             // cleanup to return clean
162                             if (pdfDocument != null) {
163                                 out.close();
164                                 pdfDocument.close();
165                             }
166                         }
167                     } else if (mimetypeCheck.equalsIgnoreCase(MIMETYPE_MSWORD)) {
168 
169                         byte[] rawDoc = node.getByteValue(name);
170 
171                         ByteArrayInputStream input = new ByteArrayInputStream(rawDoc);
172 
173                         try {
174                             WordExtractor extractor = new WordExtractor();
175                             value = extractor.extractText(input);
176                         } catch (Exception e) {
177                             log.error("Word extract Problem: '" + e.getMessage() + "' on: '" + name + "'");
178                         }
179                     } else {
180                         log.warn("Unknown mimetype: '" + mimetypeCheck + "' on: '" + name + "'");
181                     }
182                 } else {
183                     log.warn("Unknown type: '" + type + "'");
184                 }
185 
186             } else {
187                 // log.info("Field name: '"+name+"'");
188                 value = node.getStringValue(name);
189             }
190 
191             if (value != null && value.length() > 0) {
192                 return Field.Text(getFieldName(), value);
193             }
194         } else {
195             log.warn("Node missing");
196         }
197 
198         return null;
199     }
200 
201     /***
202      * @return The name of this Field
203      */
204     public String getName() {
205         return name;
206     }
207 
208     /***
209      * @param name
210      */
211     public void setName(String name) {
212         this.name = name;
213     }
214 
215     /***
216      * @return The type of this Field
217      */
218     public String getType() {
219         return type;
220     }
221 
222     /***
223      * @param type
224      */
225     public void setType(String type) {
226         this.type = type;
227     }
228 
229     /***
230      * @return Name of the renamed name of the current field
231      */
232     public String getRename() {
233         return rename;
234     }
235 
236     /***
237      * @param name
238      */
239     public void setRename(String name) {
240         this.rename = name != null && name.trim().length() > 0 ? name : null;
241     }
242 
243     /***
244      * @return Field name (renamed or plain) of this field
245      */
246     public String getFieldName() {
247         return rename == null ? name : rename;
248     }
249 
250     /***
251      * @return true if this field is a FULLTEXT field
252      */
253     public boolean isFulltext() {
254         switch (fulltext) {
255         case FULLTEXT_TRUE:
256             return true;
257         case FULLTEXT_FALSE:
258             return false;
259         default:
260             return !FULLTEXT_EXCLUDE_TYPES.contains(type);
261         }
262     }
263 
264     /***
265      * @param v
266      */
267     public void setFulltext(boolean v) {
268         fulltext = v ? FULLTEXT_TRUE : FULLTEXT_FALSE;
269     }
270 }