KeywordUtil xref

View Javadoc

1   /*
2   
3   This software is OSI Certified Open Source Software.
4   OSI Certified is a certification mark of the Open Source Initiative.
5   
6   The license (Mozilla version 1.0) can be read at the MMBase site.
7   See http://www.MMBase.org/license
8   
9   */
10  package net.sf.mmapps.commons.util;
11  
12  import java.util.*;
13  
/**
 * Utility methods for extracting the most frequently used words ("keywords")
 * from free text while skipping common words (stop words) of a language.
 * Built-in stop word lists exist for English, Dutch and German.
 */
public class KeywordUtil {

    /** Delimiter characters handed to {@link StringTokenizer}; each character separates words. */
    private static final String TOKENIZER_PATTERN = " \t\n\r\f.,!?;&-|";

    private static final String GERMAN = "de";
    private static final String ENGLISH = "en";
    private static final String DUTCH = "nl";

    /** Shared empty stop word list, used when no language applies. */
    private static final String[] NO_COMMON_WORDS = new String[0];

    private static final String[] COMMON_ENGLISH = new String[] { "a", "about", "after", "again", "all",
            "also", "always", "am", "an", "and", "any", "anyone", "are", "around", "as", "at",
            "back", "be", "because", "been", "before", "being", "both", "brother", "but", "by",
            "can", "click", "do", "does", "down", "during", "early", "ensure", "except", "few",
            "following", "for", "from", "go", "had", "has", "have", "he", "her", "here", "him",
            "his", "hour", "hours", "how", "i", "if", "if", "in", "into", "is", "it", "its",
            "just", "km", "know", "later", "like", "long", "look", "lot", "many", "may", "me",
            "months", "more", "most", "must", "my", "needed", "no", "not", "of", "often", "on",
            "one", "only", "or", "other", "others", "our", "out", "over", "per", "pm", "quite",
            "see", "she", "she", "should", "since", "so", "some", "something", "still", "such",
            "sure", "than", "that", "the", "their", "them", "then", "there", "these",
            "they", "this", "through", "time", "to", "two", "up", "us", "very", "want", "was",
            "we", "well", "were", "what", "when", "which", "while", "who", "whom", "will", "with",
            "within", "without", "would", "you", "your" };

    private static final String[] COMMON_DUTCH = new String[] { "aan", "aangaande", "aangezien",
            "achter", "achterna", "afgelopen", "al", "aldaar", "aldus", "alhoewel", "alias",
            "alle", "allebei", "alleen", "als", "alsnog", "altijd", "altoos", "ander", "andere",
            "anders", "anderszins", "behalve", "behoudens", "beide", "beiden", "ben", "beneden",
            "bent", "bepaald", "betreffende", "bij", "binnen", "binnenin", "boven", "bovenal",
            "bovendien", "bovengenoemd", "bovenstaand", "bovenvermeld", "buiten", "co", "corp",
            "could", "daar", "daardoor", "daarheen", "daarin", "daarna", "daarnaast", "daarnet",
            "daarmee", "daarom", "daarop", "daarvanlangs", "dan", "dankzij", "dat", "de", "den",
            "der", "des", "deze", "die", "dikwijls", "dit", "dl", "door", "doorgaand", "dr", "dus",
            "echter", "ed", "een", "eer", "eerdat", "eerder", "eerlang", "eerst", "elk", "elke",
            "en", "enig", "enige", "enigszins", "enkel", "enkele", "enz", "er", "erdoor", "ervoor",
            "etc", "even", "eveneens", "evenwel", "gauw", "gedurende", "geen", "gehad", "gekund",
            "geleden", "gelijk", "gemoeten", "gemogen", "geweest", "gewoon", "gewoonweg", "haar",
            "hadden", "hare", "heb", "hebben", "hebt", "heeft", "hem", "hen", "het", "hierbeneden",
            "hierboven", "hierin", "hij", "hoe", "hoewel", "hun", "hunne", "ik", "ikzelf", "in",
            "inc", "inmiddels", "inzake", "is", "je", "jezelf", "jij", "jijzelf", "jou", "jouw",
            "jouwe", "juist", "jullie", "kan", "klaar", "kon", "konden", "krachtens", "kunnen",
            "kunt", "last", "liever", "maar", "mag", "meer", "met", "mezelf", "mij", "mijn",
            "mijnent", "mijner", "mijzelf", "misschien", "mocht", "mochten", "moest", "moesten",
            "moet", "moeten", "mogen", "mr", "mrs", "ms", "mz", "na", "naar", "nabij", "nadat",
            "net", "niet", "noch", "nog", "nogal", "nu", "of", "ofschoon", "om", "omdat", "omhoog",
            "omlaag", "omstreeks", "omtrent", "omver", "onder", "ondertussen", "ongeveer", "ons",
            "onszelf", "onze", "ook", "op", "opnieuw", "opzij", "over", "overeind", "overigens",
            "pas", "precies", "prof", "publ", "reeds", "rond", "rondom", "s", "says", "sedert",
            "sinds", "sindsdien", "sl", "slechts", "sommige", "spoedig", "st", "steeds",
            "tamelijk", "te", "tegen", "ten", "tenzij", "ter", "terwijl", "thans", "tijdens",
            "toch", "toe", "toen", "toenmaals", "toenmalig", "tot", "totdat", "tussen", "uit",
            "uitg", "uitgezonderd", "vaak", "vakgr", "van", "vanaf", "vandaan", "vanuit",
            "vanwege", "veeleer", "verder", "vert", "vervolgens", "vol", "volgens", "voor",
            "vooraf", "vooral", "vooralsnog", "voorbij", "voordat", "voordezen", "voordien",
            "voorheen", "voorop", "vooruit", "vrij", "vroeg", "waar", "waarbij", "waarom",
            "wanneer", "waren", "wat", "weer", "weg", "wegens", "wel", "weldra", "welk", "welke",
            "wie", "wiens", "wier", "wij", "wijzelf", "word", "worden", "wordt", "zal", "ze",
            "zelfs", "zich", "zichzelf", "zij", "zijn", "zijne", "zo", "zodra", "zonder", "zou",
            "zouden", "zowat", "zulke", "zullen", "zult" };

    private static final String[] COMMON_GERMAN = new String[] { "am", "als", "an", "auf", "aufl", "aus",
            "bei", "beim", "bis", "das", "dem", "den", "der", "des", "die", "dr", "du", "durch",
            "ein", "eine", "einem", "einen", "einer", "eines", "einige", "fuer", "ihr", "ihre",
            "ihrer", "im", "in", "mich", "mit", "nach", "of", "ohne", "prof", "seine", "sowie",
            "tl", "ueber", "um", "und", "unter", "vert", "vom", "von", "zu", "zum", "zur" };

    static {
        // The lists are looked up with Arrays.binarySearch, which requires sorted input.
        // The literals above are not fully sorted (e.g. "am" before "als"), so sort them once here.
        Arrays.sort(COMMON_ENGLISH);
        Arrays.sort(COMMON_DUTCH);
        Arrays.sort(COMMON_GERMAN);
    }

    /**
     * Convert list of keywords to comma-separated string.
     * @param keywords - list of keywords
     * @return the keywords joined with ", "; an empty string when the list is null or empty
     */
    public static String keywordsToString(List<String> keywords) {
        if (keywords == null || keywords.isEmpty()) {
            return "";
        }
        Iterator<String> remaining = keywords.iterator();
        StringBuilder joined = new StringBuilder();
        joined.append(remaining.next());
        while (remaining.hasNext()) {
            joined.append(", ").append(remaining.next());
        }
        return joined.toString();
    }

    /**
     * Get keywords from a text. This method tries to detect which language it is in.
     * When the language is not detected then all words are eligible to be a keyword.
     * @param text - full text from which the keywords should be extracted
     * @return keywords found in text
     */
    public static List<String> getKeywords(String text) {
        return getKeywords(text, Integer.MAX_VALUE);
    }

    /**
     * Get keywords from a text.
     * @param text - full text from which the keywords should be extracted
     * @param language - "en", "nl" or "de" (case-insensitive). When the language is
     *                  not recognized then all words are eligible to be a keyword.
     * @return keywords found in text
     */
    public static List<String> getKeywords(String text, String language) {
        return getKeywords(text, language, Integer.MAX_VALUE);
    }

    /**
     * Get keywords from a text.
     * @param text - full text from which the keywords should be extracted
     * @param language - "en", "nl" or "de" (case-insensitive). When the language is
     *                  not recognized then all words are eligible to be a keyword.
     * @param max - maximum number of keywords returned
     * @return keywords found in text
     */
    public static List<String> getKeywords(String text, String language, int max) {
        return getKeywordsInternal(text, commonWordsFor(language), max);
    }

    /**
     * Get keywords from a text. This method tries to detect which language it is in.
     * When the language is not detected then all words are eligible to be a keyword.
     * @param text - full text from which the keywords should be extracted
     * @param max - maximum number of keywords returned
     * @return keywords found in text
     */
    public static List<String> getKeywords(String text, int max) {
        return getKeywordsInternal(text, getCommonWords(text), max);
    }

    /**
     * Get keywords from a text.
     * @param text - full text from which the keywords should be extracted
     * @param ignoreWords - ignore the words in this array. The array itself is not
     *                  modified. Words from the text are lower-cased before the lookup.
     * @param max - maximum number of keywords returned
     * @return keywords found in text
     */
    public static List<String> getKeywords(String text, String[] ignoreWords, int max) {
        return getKeywordsInternal(text, sortedCopy(ignoreWords), max);
    }

    /**
     * Get keywords from the strings. This method tries to detect which language it is in.
     * When the language is not detected then all words are eligible to be a keyword.
     * @param textStrings - strings from which the keywords should be extracted
     * @return keywords found in strings
     */
    public static List<String> getKeywords(List<String> textStrings) {
        return getKeywords(textStrings, Integer.MAX_VALUE);
    }

    /**
     * Get keywords from the strings.
     * @param textStrings - strings from which the keywords should be extracted
     * @param language - "en", "nl" or "de" (case-insensitive). When the language is
     *                  not recognized then all words are eligible to be a keyword.
     * @return keywords found in strings
     */
    public static List<String> getKeywords(List<String> textStrings, String language) {
        return getKeywords(textStrings, language, Integer.MAX_VALUE);
    }

    /**
     * Get keywords from the strings.
     * @param textStrings - strings from which the keywords should be extracted
     * @param language - "en", "nl" or "de" (case-insensitive). When the language is
     *                  not recognized then all words are eligible to be a keyword.
     * @param max - maximum number of keywords returned
     * @return keywords found in strings
     */
    public static List<String> getKeywords(List<String> textStrings, String language, int max) {
        return getKeywordsInternal(textStrings, commonWordsFor(language), max);
    }

    /**
     * Get keywords from the strings. This method tries to detect which language it is in.
     * The first string for which a language is detected decides the stop word list
     * for all strings. When no language is detected then all words are eligible to be a keyword.
     * @param textStrings - strings from which the keywords should be extracted
     * @param max - maximum number of keywords returned
     * @return keywords found in strings
     */
    public static List<String> getKeywords(List<String> textStrings, int max) {
        String[] ignoreWords = NO_COMMON_WORDS;
        for (String text : textStrings) {
            ignoreWords = getCommonWords(text);
            if (ignoreWords.length > 0) {
                break;
            }
        }
        return getKeywordsInternal(textStrings, ignoreWords, max);
    }

    /**
     * Get keywords from the strings.
     * @param textStrings - strings from which the keywords should be extracted
     * @param ignoreWords - ignore the words in this array. The array itself is not
     *                  modified. Words from the text are lower-cased before the lookup.
     * @param max - maximum number of keywords returned
     * @return keywords found in strings
     */
    public static List<String> getKeywords(List<String> textStrings, String[] ignoreWords, int max) {
        return getKeywordsInternal(textStrings, sortedCopy(ignoreWords), max);
    }

    /**
     * Select the built-in stop word list for a language code.
     * @param language - language code, compared case-insensitively
     * @return the matching sorted stop word list, or an empty array for any other value
     */
    private static String[] commonWordsFor(String language) {
        if (ENGLISH.equalsIgnoreCase(language)) {
            return COMMON_ENGLISH;
        }
        if (DUTCH.equalsIgnoreCase(language)) {
            return COMMON_DUTCH;
        }
        if (GERMAN.equalsIgnoreCase(language)) {
            return COMMON_GERMAN;
        }
        return NO_COMMON_WORDS;
    }

    /**
     * Create a sorted copy so the binary search works without reordering the caller's array.
     * @param words - words supplied by the caller
     * @return new array with the same words in natural order
     */
    private static String[] sortedCopy(String[] words) {
        String[] copy = words.clone();
        Arrays.sort(copy);
        return copy;
    }

    /**
     * Get keywords from a text.
     * @param text - full text from which the keywords should be extracted
     * @param ignoreWords - sorted array of words to ignore
     * @param max - maximum number of keywords returned
     * @return keywords found in text
     */
    private static List<String> getKeywordsInternal(String text, String[] ignoreWords, int max) {
        List<String> textStrings = new ArrayList<String>();
        textStrings.add(text);
        return getKeywordsInternal(textStrings, ignoreWords, max);
    }

    /**
     * Get keywords from the strings. Words are lower-cased, words of a single character
     * are skipped, and the result is ordered by descending number of occurrences.
     * Words with the same number of occurrences keep the order of their first occurrence.
     * @param textStrings - strings from which the keywords should be extracted; null entries are skipped
     * @param ignoreWords - sorted array of words to ignore
     * @param max - maximum number of keywords returned
     * @return keywords found in text
     */
    private static List<String> getKeywordsInternal(List<String> textStrings, String[] ignoreWords, int max) {
        // LinkedHashMap keeps first-occurrence order, which the stable sort below preserves for ties.
        Map<String,Keyword> keywords = new LinkedHashMap<String,Keyword>();

        for (String text : textStrings) {
            if (text == null) {
                continue;
            }
            StringTokenizer tokenizer = new StringTokenizer(text, TOKENIZER_PATTERN);
            while (tokenizer.hasMoreTokens()) {
                // Fixed locale: the default locale could map e.g. "I" to a dotless i (Turkish).
                String word = tokenizer.nextToken().toLowerCase(Locale.ENGLISH);
                if (word.length() <= 1 || Arrays.binarySearch(ignoreWords, word) >= 0) {
                    continue;
                }
                Keyword keyword = keywords.get(word);
                if (keyword == null) {
                    keyword = new Keyword(word);
                    keywords.put(word, keyword);
                }
                keyword.inc();
            }
        }

        List<Keyword> sortList = new ArrayList<Keyword>(keywords.values());
        Collections.sort(sortList);

        int end = Math.min(max, sortList.size());
        List<String> words = new ArrayList<String>(Math.max(end, 0));
        for (int i = 0; i < end; i++) {
            words.add(sortList.get(i).key);
        }
        return words;
    }

    /**
     * Detect the language of a text and return the matching stop word list.
     * Every word of the text is looked up in the Dutch, English and German lists;
     * the language with strictly the most hits wins. Without a strict winner the
     * Dutch list is returned when at least one Dutch stop word was seen.
     * @param text - text to inspect; null is treated as a text without words
     * @return the internal stop word array of the detected language (callers must not
     *         modify it), or an empty array when no language was detected
     */
    public static String[] getCommonWords(String text) {
        if (text == null) {
            return NO_COMMON_WORDS;
        }
        int dutch = 0;
        int english = 0;
        int german = 0;

        StringTokenizer tokenizer = new StringTokenizer(text, TOKENIZER_PATTERN);
        while (tokenizer.hasMoreTokens()) {
            String word = tokenizer.nextToken().toLowerCase(Locale.ENGLISH);
            if (Arrays.binarySearch(COMMON_DUTCH, word) >= 0) {
                dutch++;
            }
            if (Arrays.binarySearch(COMMON_ENGLISH, word) >= 0) {
                english++;
            }
            if (Arrays.binarySearch(COMMON_GERMAN, word) >= 0) {
                german++;
            }
        }
        if (dutch > english && dutch > german) {
            return COMMON_DUTCH;
        }
        if (english > dutch && english > german) {
            return COMMON_ENGLISH;
        }
        if (german > dutch && german > english) {
            return COMMON_GERMAN;
        }
        if (dutch > 0) {
            // We are mainly a dutch community.
            return COMMON_DUTCH;
        }
        return NO_COMMON_WORDS;
    }

    /**
     * Detect the language of a text, using the same rules as {@link #getCommonWords(String)}.
     * @param text - text to inspect
     * @return "nl", "en" or "de", or null when no language was detected
     */
    public static String detectLanguage(String text) {
        String[] commonWords = getCommonWords(text);
        // Identity comparison is intended: getCommonWords hands out the internal arrays.
        if (commonWords == COMMON_DUTCH) {
            return DUTCH;
        }
        if (commonWords == COMMON_ENGLISH) {
            return ENGLISH;
        }
        if (commonWords == COMMON_GERMAN) {
            return GERMAN;
        }
        return null;
    }

    /**
     * A word together with the number of times it was counted.
     * The natural order puts the highest count first.
     */
    private static class Keyword implements Comparable<Keyword> {
        final String key;
        int count;

        Keyword(String key) {
            if (key == null) {
                throw new IllegalArgumentException("key is empty");
            }
            this.key = key;
            this.count = 0;
        }

        public void inc() {
            this.count++;
        }

        @Override
        public int hashCode() {
            return key.hashCode();
        }

        @Override
        public boolean equals(Object obj) {
            if (obj == null) return false;
            if (obj == this) return true;
            if (obj instanceof Keyword) {
                return key.equals(((Keyword)obj).key);
            }
            return false;
        }

        /** Orders by descending count. */
        public int compareTo(Keyword o) {
            if (count == o.count) {
                return 0;
            }
            return count > o.count ? -1 : 1;
        }
    }

}