1
2
3
4
5
6
7
8
9
10 package net.sf.mmapps.commons.util;
11
12 import java.util.*;
13
/**
 * Utility methods for extracting the most frequently used words (keywords)
 * from free text.
 *
 * <p>Text is split on whitespace and a small set of punctuation characters,
 * every token is lower-cased, tokens of a single character are dropped and
 * tokens found in a list of common words ("stop words") are ignored. The
 * remaining words are returned ordered by descending number of occurrences;
 * words with the same number of occurrences are ordered alphabetically.</p>
 *
 * <p>Built-in stop word lists exist for English ("en"), Dutch ("nl") and
 * German ("de").</p>
 */
public class KeywordUtil {

    /** Delimiter characters handed to {@link StringTokenizer}. */
    private static final String TOKENIZER_PATTERN = " \t\n\r\f.,!?;&-|";

    private static final String GERMAN = "de";
    private static final String ENGLISH = "en";
    private static final String DUTCH = "nl";

    /** Shared empty stop word list used when no language applies. */
    private static final String[] NO_WORDS = new String[0];

    private static final String[] COMMON_ENGLISH = new String[] { "a", "about", "after", "again", "all",
            "also", "always", "am", "an", "and", "any", "anyone", "are", "around", "as", "at",
            "back", "be", "because", "been", "before", "being", "both", "brother", "but", "by",
            "can", "click", "do", "does", "down", "during", "early", "ensure", "except", "few",
            "following", "for", "from", "go", "had", "has", "have", "he", "her", "here", "him",
            "his", "hour", "hours", "how", "i", "if", "in", "into", "is", "it", "its",
            "just", "km", "know", "later", "like", "long", "look", "lot", "many", "may", "me",
            "months", "more", "most", "must", "my", "needed", "no", "not", "of", "often", "on",
            "one", "only", "or", "other", "others", "our", "out", "over", "per", "pm", "quite",
            "see", "she", "should", "since", "so", "some", "something", "still", "such",
            "sure", "than", "that", "the", "their", "them", "then", "there", "these",
            "they", "this", "through", "time", "to", "two", "up", "us", "very", "want", "was",
            "we", "well", "were", "what", "when", "which", "while", "who", "whom", "will", "with",
            "within", "without", "would", "you", "your" };

    private static final String[] COMMON_DUTCH = new String[] { "aan", "aangaande", "aangezien",
            "achter", "achterna", "afgelopen", "al", "aldaar", "aldus", "alhoewel", "alias",
            "alle", "allebei", "alleen", "als", "alsnog", "altijd", "altoos", "ander", "andere",
            "anders", "anderszins", "behalve", "behoudens", "beide", "beiden", "ben", "beneden",
            "bent", "bepaald", "betreffende", "bij", "binnen", "binnenin", "boven", "bovenal",
            "bovendien", "bovengenoemd", "bovenstaand", "bovenvermeld", "buiten", "co", "corp",
            "could", "daar", "daardoor", "daarheen", "daarin", "daarmee", "daarna", "daarnaast",
            "daarnet", "daarom", "daarop", "daarvanlangs", "dan", "dankzij", "dat", "de", "den",
            "der", "des", "deze", "die", "dikwijls", "dit", "dl", "door", "doorgaand", "dr", "dus",
            "echter", "ed", "een", "eer", "eerdat", "eerder", "eerlang", "eerst", "elk", "elke",
            "en", "enig", "enige", "enigszins", "enkel", "enkele", "enz", "er", "erdoor", "ervoor",
            "etc", "even", "eveneens", "evenwel", "gauw", "gedurende", "geen", "gehad", "gekund",
            "geleden", "gelijk", "gemoeten", "gemogen", "geweest", "gewoon", "gewoonweg", "haar",
            "hadden", "hare", "heb", "hebben", "hebt", "heeft", "hem", "hen", "het", "hierbeneden",
            "hierboven", "hierin", "hij", "hoe", "hoewel", "hun", "hunne", "ik", "ikzelf", "in",
            "inc", "inmiddels", "inzake", "is", "je", "jezelf", "jij", "jijzelf", "jou", "jouw",
            "jouwe", "juist", "jullie", "kan", "klaar", "kon", "konden", "krachtens", "kunnen",
            "kunt", "last", "liever", "maar", "mag", "meer", "met", "mezelf", "mij", "mijn",
            "mijnent", "mijner", "mijzelf", "misschien", "mocht", "mochten", "moest", "moesten",
            "moet", "moeten", "mogen", "mr", "mrs", "ms", "mz", "na", "naar", "nabij", "nadat",
            "net", "niet", "noch", "nog", "nogal", "nu", "of", "ofschoon", "om", "omdat", "omhoog",
            "omlaag", "omstreeks", "omtrent", "omver", "onder", "ondertussen", "ongeveer", "ons",
            "onszelf", "onze", "ook", "op", "opnieuw", "opzij", "over", "overeind", "overigens",
            "pas", "precies", "prof", "publ", "reeds", "rond", "rondom", "s", "says", "sedert",
            "sinds", "sindsdien", "sl", "slechts", "sommige", "spoedig", "st", "steeds",
            "tamelijk", "te", "tegen", "ten", "tenzij", "ter", "terwijl", "thans", "tijdens",
            "toch", "toe", "toen", "toenmaals", "toenmalig", "tot", "totdat", "tussen", "uit",
            "uitg", "uitgezonderd", "vaak", "vakgr", "van", "vanaf", "vandaan", "vanuit",
            "vanwege", "veeleer", "verder", "vert", "vervolgens", "vol", "volgens", "voor",
            "vooraf", "vooral", "vooralsnog", "voorbij", "voordat", "voordezen", "voordien",
            "voorheen", "voorop", "vooruit", "vrij", "vroeg", "waar", "waarbij", "waarom",
            "wanneer", "waren", "wat", "weer", "weg", "wegens", "wel", "weldra", "welk", "welke",
            "wie", "wiens", "wier", "wij", "wijzelf", "word", "worden", "wordt", "zal", "ze",
            "zelfs", "zich", "zichzelf", "zij", "zijn", "zijne", "zo", "zodra", "zonder", "zou",
            "zouden", "zowat", "zulke", "zullen", "zult" };

    private static final String[] COMMON_GERMAN = new String[] { "als", "am", "an", "auf", "aufl", "aus",
            "bei", "beim", "bis", "das", "dem", "den", "der", "des", "die", "dr", "du", "durch",
            "ein", "eine", "einem", "einen", "einer", "eines", "einige", "fuer", "ihr", "ihre",
            "ihrer", "im", "in", "mich", "mit", "nach", "of", "ohne", "prof", "seine", "sowie",
            "tl", "ueber", "um", "und", "unter", "vert", "vom", "von", "zu", "zum", "zur" };

    static {
        // The lists are looked up with Arrays.binarySearch, which only gives
        // defined results on sorted input. Sorting here guarantees that even
        // when somebody adds a word at the wrong position in a literal above.
        Arrays.sort(COMMON_ENGLISH);
        Arrays.sort(COMMON_DUTCH);
        Arrays.sort(COMMON_GERMAN);
    }


    /**
     * Convert list of keywords to a string in which the keywords are
     * separated by a comma and a space.
     * @param keywords - list of keywords, may be <code>null</code>
     * @return string of keywords, or an empty string when there are none
     */
    public static String keywordsToString(List<String> keywords) {
        if (keywords == null || keywords.isEmpty()) {
            return "";
        }
        StringBuilder keywordsStr = new StringBuilder();
        Iterator<String> iter = keywords.iterator();
        keywordsStr.append(iter.next());
        while (iter.hasNext()) {
            keywordsStr.append(", ").append(iter.next());
        }
        return keywordsStr.toString();
    }


    /**
     * Get keywords from a text. This method tries to detect which language it is in.
     * When the language is not detected then all words are eligible to be a keyword.
     * @param text - full text from which the keywords should be extracted
     * @return keywords found in text
     */
    public static List<String> getKeywords(String text) {
        return getKeywords(text, Integer.MAX_VALUE);
    }

    /**
     * Get keywords from a text
     * @param text - full text from which the keywords should be extracted
     * @param language - "en", "nl" or "de" (case-insensitive). When the language
     *      is not recognized then all words are eligible to be a keyword.
     * @return keywords found in text
     */
    public static List<String> getKeywords(String text, String language) {
        return getKeywords(text, language, Integer.MAX_VALUE);
    }

    /**
     * Get keywords from a text
     * @param text - full text from which the keywords should be extracted
     * @param language - "en", "nl" or "de" (case-insensitive). When the language
     *      is not recognized then all words are eligible to be a keyword.
     * @param max - maximum number of keywords returned
     * @return keywords found in text
     */
    public static List<String> getKeywords(String text, String language, int max) {
        return getKeywordsInternal(text, commonWordsForLanguage(language), max);
    }

    /**
     * Get keywords from a text. This method tries to detect which language it is in.
     * When the language is not detected then all words are eligible to be a keyword.
     * @param text - full text from which the keywords should be extracted
     * @param max - maximum number of keywords returned
     * @return keywords found in text
     */
    public static List<String> getKeywords(String text, int max) {
        return getKeywordsInternal(text, detectCommonWords(text), max);
    }

    /**
     * Get keywords from a text
     * @param text - full text from which the keywords should be extracted
     * @param ignoreWords - ignore the words in this array. The array itself is
     *      not modified. Entries are compared with the lower-cased words of the
     *      text, so they should be lower case. <code>null</code> means that no
     *      words are ignored.
     * @param max - maximum number of keywords returned
     * @return keywords found in text
     */
    public static List<String> getKeywords(String text, String[] ignoreWords, int max) {
        return getKeywordsInternal(text, sortedCopy(ignoreWords), max);
    }

    /**
     * Get keywords from the strings. This method tries to detect which language it is in.
     * When the language is not detected then all words are eligible to be a keyword.
     * @param textStrings - strings from which the keywords should be extracted
     * @return keywords found in strings
     */
    public static List<String> getKeywords(List<String> textStrings) {
        return getKeywords(textStrings, Integer.MAX_VALUE);
    }

    /**
     * Get keywords from the strings
     * @param textStrings - strings from which the keywords should be extracted
     * @param language - "en", "nl" or "de" (case-insensitive). When the language
     *      is not recognized then all words are eligible to be a keyword.
     * @return keywords found in strings
     */
    public static List<String> getKeywords(List<String> textStrings, String language) {
        return getKeywords(textStrings, language, Integer.MAX_VALUE);
    }

    /**
     * Get keywords from the strings
     * @param textStrings - strings from which the keywords should be extracted
     * @param language - "en", "nl" or "de" (case-insensitive). When the language
     *      is not recognized then all words are eligible to be a keyword.
     * @param max - maximum number of keywords returned
     * @return keywords found in strings
     */
    public static List<String> getKeywords(List<String> textStrings, String language, int max) {
        return getKeywordsInternal(textStrings, commonWordsForLanguage(language), max);
    }

    /**
     * Get keywords from the strings. This method tries to detect which language it is in.
     * The stop word list of the first string in which a language is detected
     * is used for all strings.
     * When the language is not detected then all words are eligible to be a keyword.
     * @param textStrings - strings from which the keywords should be extracted
     * @param max - maximum number of keywords returned
     * @return keywords found in strings
     */
    public static List<String> getKeywords(List<String> textStrings, int max) {
        String[] ignoreWords = NO_WORDS;
        if (textStrings != null) {
            for (String text : textStrings) {
                ignoreWords = detectCommonWords(text);
                if (ignoreWords.length > 0) {
                    break;
                }
            }
        }
        return getKeywordsInternal(textStrings, ignoreWords, max);
    }

    /**
     * Get keywords from the strings
     * @param textStrings - strings from which the keywords should be extracted
     * @param ignoreWords - ignore the words in this array. The array itself is
     *      not modified. Entries are compared with the lower-cased words of the
     *      text, so they should be lower case. <code>null</code> means that no
     *      words are ignored.
     * @param max - maximum number of keywords returned
     * @return keywords found in strings
     */
    public static List<String> getKeywords(List<String> textStrings, String[] ignoreWords, int max) {
        return getKeywordsInternal(textStrings, sortedCopy(ignoreWords), max);
    }

    /**
     * Get the built-in stop word list for a language code.
     * @param language - language code, compared case-insensitively
     * @return the sorted internal list, or an empty array for an unknown language
     */
    private static String[] commonWordsForLanguage(String language) {
        if (ENGLISH.equalsIgnoreCase(language)) {
            return COMMON_ENGLISH;
        }
        if (DUTCH.equalsIgnoreCase(language)) {
            return COMMON_DUTCH;
        }
        if (GERMAN.equalsIgnoreCase(language)) {
            return COMMON_GERMAN;
        }
        return NO_WORDS;
    }

    /**
     * Create a sorted copy of the words so they can be used in a binary search
     * without reordering the array owned by the caller.
     * @param words - words to copy, may be <code>null</code>
     * @return sorted copy, an empty array when words is <code>null</code>
     */
    private static String[] sortedCopy(String[] words) {
        if (words == null) {
            return NO_WORDS;
        }
        String[] copy = words.clone();
        Arrays.sort(copy);
        return copy;
    }

    /**
     * Get keywords from a text
     * @param text - full text from which the keywords should be extracted
     * @param ignoreWords - ignore the words in this array, must be sorted
     * @param max - maximum number of keywords returned
     * @return keywords found in text
     */
    private static List<String> getKeywordsInternal(String text, String[] ignoreWords, int max) {
        List<String> textStrings = new ArrayList<String>();
        textStrings.add(text);
        return getKeywordsInternal(textStrings, ignoreWords, max);
    }

    /**
     * Get keywords from the strings
     * @param textStrings - strings from which the keywords should be extracted;
     *      a <code>null</code> list or <code>null</code> entries are skipped
     * @param ignoreWords - ignore the words in this array, must be sorted
     * @param max - maximum number of keywords returned
     * @return keywords found in text, most frequent first
     */
    private static List<String> getKeywordsInternal(List<String> textStrings, String[] ignoreWords, int max) {
        Map<String,Keyword> keywords = new HashMap<String,Keyword>();

        if (textStrings != null) {
            for (String text : textStrings) {
                if (text == null) {
                    continue;
                }
                StringTokenizer tokenizer = new StringTokenizer(text, TOKENIZER_PATTERN);
                while (tokenizer.hasMoreTokens()) {
                    // Fixed locale: with the default locale "I" would become a
                    // dotless i on a Turkish system and no longer match.
                    String word = tokenizer.nextToken().toLowerCase(Locale.ENGLISH);
                    if (word.length() > 1 && Arrays.binarySearch(ignoreWords, word) < 0) {
                        Keyword keyword = keywords.get(word);
                        if (keyword == null) {
                            keyword = new Keyword(word);
                            keywords.put(word, keyword);
                        }
                        keyword.inc();
                    }
                }
            }
        }

        List<Keyword> sortList = new ArrayList<Keyword>(keywords.values());
        Collections.sort(sortList);

        // A negative max results in an empty list.
        int end = Math.min(max, sortList.size());
        List<String> words = new ArrayList<String>(Math.max(end, 0));
        for (int i = 0; i < end; i++) {
            words.add(sortList.get(i).key);
        }
        return words;
    }

    /**
     * Get the stop words of the language the text seems to be written in.
     * @param text - text to inspect, may be <code>null</code>
     * @return a copy of the stop word list of the detected language, or an
     *      empty array when no language is detected
     */
    public static String[] getCommonWords(String text) {
        // Hand out a copy so callers can not change the shared internal list.
        return detectCommonWords(text).clone();
    }

    /**
     * Detect the language of a text
     * @param text - text to inspect, may be <code>null</code>
     * @return "nl", "en" or "de", or <code>null</code> when no language is detected
     */
    public static String detectLanguage(String text) {
        String[] commonWords = detectCommonWords(text);
        // Identity comparison is intended: detectCommonWords returns the
        // internal array instances themselves.
        if (commonWords == COMMON_DUTCH) {
            return DUTCH;
        }
        if (commonWords == COMMON_ENGLISH) {
            return ENGLISH;
        }
        if (commonWords == COMMON_GERMAN) {
            return GERMAN;
        }
        return null;
    }

    /**
     * Count for every language how many words of the text are in its stop
     * word list and pick the language with strictly the most hits. When there
     * is no single winner Dutch is chosen if at least one Dutch stop word was
     * found.
     * @param text - text to inspect, may be <code>null</code>
     * @return the internal stop word array of the detected language (not a
     *      copy), or an empty array when no language is detected
     */
    private static String[] detectCommonWords(String text) {
        if (text == null) {
            return NO_WORDS;
        }
        int dutch = 0;
        int english = 0;
        int german = 0;

        StringTokenizer tokenizer = new StringTokenizer(text, TOKENIZER_PATTERN);
        while (tokenizer.hasMoreTokens()) {
            String word = tokenizer.nextToken().toLowerCase(Locale.ENGLISH);
            if (Arrays.binarySearch(COMMON_DUTCH, word) >= 0) {
                dutch++;
            }
            if (Arrays.binarySearch(COMMON_ENGLISH, word) >= 0) {
                english++;
            }
            if (Arrays.binarySearch(COMMON_GERMAN, word) >= 0) {
                german++;
            }
        }
        if (dutch > english && dutch > german) {
            return COMMON_DUTCH;
        }
        if (english > dutch && english > german) {
            return COMMON_ENGLISH;
        }
        if (german > dutch && german > english) {
            return COMMON_GERMAN;
        }
        if (dutch > 0) {
            // No single winner: fall back to Dutch when it scored at all.
            return COMMON_DUTCH;
        }
        return NO_WORDS;
    }

    /**
     * A word together with the number of times it was found. The natural
     * ordering puts the most frequent word first; words with the same count
     * are ordered by key so the ordering is deterministic.
     */
    private static class Keyword implements Comparable<Keyword> {
        String key;
        int count;

        Keyword(String key) {
            if (key == null) {
                throw new IllegalArgumentException("key is null");
            }
            this.key = key;
            this.count = 0;
        }

        public void inc() {
            this.count++;
        }

        @Override
        public int hashCode() {
            return key.hashCode();
        }

        @Override
        public boolean equals(Object obj) {
            if (obj == null) return false;
            if (obj == this) return true;
            if (obj instanceof Keyword) {
                return key.equals(((Keyword)obj).key);
            }
            return false;
        }

        public int compareTo(Keyword o) {
            if (count != o.count) {
                // Higher count sorts first.
                return (count > o.count) ? -1 : 1;
            }
            return key.compareTo(o.key);
        }
    }

}