1
2
3
4
5
6
7
8
9
10
11
12
13
14 package net.sf.mmapps.modules.lucenesearch;
15
16 import java.io.ByteArrayInputStream;
17 import java.io.ByteArrayOutputStream;
18 import java.io.IOException;
19 import java.io.OutputStreamWriter;
20 import java.util.HashSet;
21 import java.util.Set;
22
23 import org.apache.commons.logging.Log;
24 import org.apache.commons.logging.LogFactory;
25 import org.apache.lucene.document.DateField;
26 import org.apache.lucene.document.Field;
27 import org.mmbase.bridge.Node;
28 import org.mmbase.bridge.NodeManager;
29 import org.pdfbox.encryption.DocumentEncryption;
30 import org.pdfbox.exceptions.CryptographyException;
31 import org.pdfbox.exceptions.InvalidPasswordException;
32 import org.pdfbox.pdfparser.PDFParser;
33 import org.pdfbox.pdmodel.PDDocument;
34 import org.pdfbox.util.PDFTextStripper;
35 import org.textmining.text.extraction.WordExtractor;
36
37 /***
38 * Field object that holds the name and type of the field to be indexed
39 *
40 * @author Wouter Heijke
41 * @author R.W. van 't Veer
42 * @version $Revision: 1.1 $
43 */
44 public class DataField {
45 private static Log log = LogFactory.getLog(DataField.class);
46
47 private String name = null;
48
49 private String rename = null;
50
51 private String type = null;
52
53 private static final int FULLTEXT_DEFAULT = 0;
54
55 private static final int FULLTEXT_TRUE = 1;
56
57 private static final int FULLTEXT_FALSE = 2;
58
59 private int fulltext = FULLTEXT_DEFAULT;
60
61 private static final Set FULLTEXT_EXCLUDE_TYPES = new HashSet();
62 static {
63 FULLTEXT_EXCLUDE_TYPES.add("boolean");
64 FULLTEXT_EXCLUDE_TYPES.add("number");
65 FULLTEXT_EXCLUDE_TYPES.add("date");
66 }
67
68 private static final String DATE_TYPE = "date";
69
70 private static final String NUMBER_TYPE = "number";
71
72 private static final String BINARY_TYPE = "binary";
73
74 private static final String BOOLEAN_TYPE = "boolean";
75
76 private static final String MMBASE_MIMETYPE_FIELD = "mimetype";
77
78 private static final String MIMETYPE_PDF = "application/pdf";
79
80 private static final String MIMETYPE_MSWORD = "application/msword";
81
82 private static final String BOOLEAN_TYPE_TRUE = "1";
83
84 private static final String BOOLEAN_TYPE_FALSE = "0";
85
86 /***
87 * Collect the content from the field this object represents in a Lucene Field object
88 *
89 * @param node Node to collect fields from
90 * @return Field filled with content and ready to be indexed
91 * @throws IOException
92 */
93 protected Field collectField(Node node) throws IOException {
94 String value = null;
95
96 if (node != null) {
97 if (type != null) {
98 if (type.equalsIgnoreCase(DATE_TYPE)) {
99 int date = node.getIntValue(name);
100 if (date > 0) {
101 String dateField = DateField.timeToString(((date * 1000L)));
102 return Field.Keyword(getFieldName(), dateField);
103 }
104 } else if (type.equalsIgnoreCase(NUMBER_TYPE)) {
105 return Field.Keyword(getFieldName(), node.getStringValue(name));
106 } else if (type.equalsIgnoreCase(BOOLEAN_TYPE)) {
107 String bType = BOOLEAN_TYPE_FALSE;
108 NodeManager nm = node.getNodeManager();
109 org.mmbase.bridge.Field fld = nm.getField(name);
110 if (fld.getType() == org.mmbase.bridge.Field.TYPE_STRING) {
111 String s = node.getStringValue(name);
112 if (s != null && s.length() > 0) {
113 bType = BOOLEAN_TYPE_TRUE;
114 }
115 } else {
116 if (node.getBooleanValue(name)) {
117 bType = BOOLEAN_TYPE_TRUE;
118 }
119 }
120 return Field.Keyword(getFieldName(), bType);
121 } else if (type.equalsIgnoreCase(BINARY_TYPE)) {
122 String mimetypeCheck = node.getStringValue(MMBASE_MIMETYPE_FIELD);
123
124 if (mimetypeCheck.equalsIgnoreCase(MIMETYPE_PDF)) {
125 byte[] rawPdf = node.getByteValue(name);
126 ByteArrayOutputStream out = null;
127 PDDocument pdfDocument = null;
128 try {
129 ByteArrayInputStream input = new ByteArrayInputStream(rawPdf);
130
131 PDFParser parser = new PDFParser(input);
132 parser.parse();
133
134 pdfDocument = parser.getPDDocument();
135
136 if (pdfDocument.isEncrypted()) {
137 DocumentEncryption decryptor = new DocumentEncryption(pdfDocument);
138
139
140 decryptor.decryptDocument("");
141 }
142
143
144 out = new ByteArrayOutputStream();
145 OutputStreamWriter writer = new OutputStreamWriter(out);
146 PDFTextStripper stripper = new PDFTextStripper();
147 stripper.writeText(pdfDocument, writer);
148 writer.close();
149 out.flush();
150
151
152 value = out.toString();
153 } catch (CryptographyException e) {
154 throw new IOException("Error decrypting document(" + name + "): " + e);
155 } catch (InvalidPasswordException e) {
156
157
158 throw new IOException("Error: The document(" + name
159 + ") is encrypted and will not be indexed.");
160 } finally {
161
162 if (pdfDocument != null) {
163 out.close();
164 pdfDocument.close();
165 }
166 }
167 } else if (mimetypeCheck.equalsIgnoreCase(MIMETYPE_MSWORD)) {
168
169 byte[] rawDoc = node.getByteValue(name);
170
171 ByteArrayInputStream input = new ByteArrayInputStream(rawDoc);
172
173 try {
174 WordExtractor extractor = new WordExtractor();
175 value = extractor.extractText(input);
176 } catch (Exception e) {
177 log.error("Word extract Problem: '" + e.getMessage() + "' on: '" + name + "'");
178 }
179 } else {
180 log.warn("Unknown mimetype: '" + mimetypeCheck + "' on: '" + name + "'");
181 }
182 } else {
183 log.warn("Unknown type: '" + type + "'");
184 }
185
186 } else {
187
188 value = node.getStringValue(name);
189 }
190
191 if (value != null && value.length() > 0) {
192 return Field.Text(getFieldName(), value);
193 }
194 } else {
195 log.warn("Node missing");
196 }
197
198 return null;
199 }
200
201 /***
202 * @return The name of this Field
203 */
204 public String getName() {
205 return name;
206 }
207
208 /***
209 * @param name
210 */
211 public void setName(String name) {
212 this.name = name;
213 }
214
215 /***
216 * @return The type of this Field
217 */
218 public String getType() {
219 return type;
220 }
221
222 /***
223 * @param type
224 */
225 public void setType(String type) {
226 this.type = type;
227 }
228
229 /***
230 * @return Name of the renamed name of the current field
231 */
232 public String getRename() {
233 return rename;
234 }
235
236 /***
237 * @param name
238 */
239 public void setRename(String name) {
240 this.rename = name != null && name.trim().length() > 0 ? name : null;
241 }
242
243 /***
244 * @return Field name (renamed or plain) of this field
245 */
246 public String getFieldName() {
247 return rename == null ? name : rename;
248 }
249
250 /***
251 * @return true if this field is a FULLTEXT field
252 */
253 public boolean isFulltext() {
254 switch (fulltext) {
255 case FULLTEXT_TRUE:
256 return true;
257 case FULLTEXT_FALSE:
258 return false;
259 default:
260 return !FULLTEXT_EXCLUDE_TYPES.contains(type);
261 }
262 }
263
264 /***
265 * @param v
266 */
267 public void setFulltext(boolean v) {
268 fulltext = v ? FULLTEXT_TRUE : FULLTEXT_FALSE;
269 }
270 }