View Javadoc

1   /*
2   
3   This software is OSI Certified Open Source Software.
4   OSI Certified is a certification mark of the Open Source Initiative.
5   
6   The license (Mozilla version 1.0) can be read at the MMBase site.
7   See http://www.MMBase.org/license
8   
9   */
10  package net.sf.mmapps.commons.util;
11  
12  import java.text.Collator;
13  import java.util.*;
14  
15  
/**
 * Static helpers for cleaning up and transcoding text: stripping accents,
 * replacing Windows-1252 punctuation by plain ASCII, repairing strings that
 * were decoded with the wrong charset, and accent-aware string comparison.
 *
 * <p>All methods are static and stateless; the class cannot be instantiated.
 */
public final class EncodingUtil {

    private EncodingUtil() {
        // utility class
    }

    /**
     * ASCII replacements, index-aligned with {@link #UNICODE}: the character at
     * position {@code i} here replaces the character at position {@code i} there.
     * Both constants must therefore always have the same length.
     */
    private static final String PLAIN_ASCII =
        "AaEeIiOoUu"      // grave
      + "AaEeIiOoUuYy"    // acute
      + "AaEeIiOoUuYy"    // circumflex
      + "AaEeIiOoUuYyNn"  // tilde
      + "AaEeIiOoUuYy"    // umlaut
      + "Aa"              // ring
      + "Cc"              // cedilla
      ;

    /** Accented characters that {@link #convertNonAscii(String)} knows how to replace. */
    private static final String UNICODE =
        "\u00C0\u00E0\u00C8\u00E8\u00CC\u00EC\u00D2\u00F2\u00D9\u00F9"                         // grave
      + "\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00DD\u00FD"             // acute
      + "\u00C2\u00E2\u00CA\u00EA\u00CE\u00EE\u00D4\u00F4\u00DB\u00FB\u0176\u0177"             // circumflex
      + "\u00C3\u00E3\u1EBC\u1EBD\u0128\u0129\u00D5\u00F5\u0168\u0169\u1EF8\u1EF9\u00D1\u00F1" // tilde
      + "\u00C4\u00E4\u00CB\u00EB\u00CF\u00EF\u00D6\u00F6\u00DC\u00FC\u0178\u00FF"             // umlaut
      + "\u00C5\u00E5"                                                                         // ring
      + "\u00C7\u00E7"                                                                         // cedilla
      ;

    /**
     * Replaces accented characters in a string by their unaccented ASCII
     * equivalent (for example {@code "\u00E9"} becomes {@code "e"}). Characters
     * that are not in the internal table are copied unchanged.
     *
     * @param s string to convert; must not be {@code null}
     * @return a copy of {@code s} with the known accented characters replaced
     * @throws NullPointerException if {@code s} is {@code null}
     */
    public static String convertNonAscii(String s) {
        int n = s.length();
        // every replacement is exactly one character, so the length is known up front
        StringBuilder sb = new StringBuilder(n);
        for (int i = 0; i < n; i++) {
            char c = s.charAt(i);
            int pos = UNICODE.indexOf(c);
            if (pos > -1) {
                sb.append(PLAIN_ASCII.charAt(pos));
            }
            else {
                sb.append(c);
            }
        }
        return sb.toString();
    }

    /**
     * ASCII replacements for characters in the range 128..159. In ISO-8859-1
     * this range only holds non-printable control codes, but the Windows-1252
     * (Western European) code page uses it for printable punctuation and
     * symbols, so their presence can be detected and some of them transcoded.
     */
    static final Map<Character, String> w1252ToISO = new HashMap<Character, String>();
    static {
        w1252ToISO.put(Character.valueOf('\u0080'), "Euro");
        w1252ToISO.put(Character.valueOf('\u0082'), ",");
        w1252ToISO.put(Character.valueOf('\u0083'), "f");
        w1252ToISO.put(Character.valueOf('\u0085'), "...");
        w1252ToISO.put(Character.valueOf('\u0088'), "^");
        w1252ToISO.put(Character.valueOf('\u008B'), "<");
        w1252ToISO.put(Character.valueOf('\u008C'), "OE");
        w1252ToISO.put(Character.valueOf('\u0091'), "'");
        w1252ToISO.put(Character.valueOf('\u0092'), "'");
        w1252ToISO.put(Character.valueOf('\u0093'), "\"");
        w1252ToISO.put(Character.valueOf('\u0094'), "\"");
        w1252ToISO.put(Character.valueOf('\u0095'), ".");
        w1252ToISO.put(Character.valueOf('\u0096'), "-");
        w1252ToISO.put(Character.valueOf('\u0097'), "-");
        w1252ToISO.put(Character.valueOf('\u0098'), "~");
        w1252ToISO.put(Character.valueOf('\u009B'), ">");
        w1252ToISO.put(Character.valueOf('\u009C'), "oe");
    }

    /**
     * Replaces Windows-1252 specific characters (char values 128..159) by an
     * ASCII approximation, e.g. curly quotes by straight quotes and the
     * ellipsis by three dots.
     *
     * <p>Characters in that range for which no replacement is defined are
     * removed from the result. All other characters are copied unchanged.
     *
     * @param windows1252encoded string that may contain Windows-1252 characters;
     *        must not be {@code null}
     * @return the transcoded string
     * @throws NullPointerException if {@code windows1252encoded} is {@code null}
     */
    public static String windows1252ToISO(String windows1252encoded) {
        int length = windows1252encoded.length();
        StringBuilder transcodedValue = new StringBuilder(length);
        for (int i = 0; i < length; i++) {
            char curChar = windows1252encoded.charAt(i);
            if ((curChar >= 128) && (curChar <= 159)) {
                // Only Windows-1252 assigns printable characters to this range.
                String replacementStr = w1252ToISO.get(Character.valueOf(curChar));
                if (replacementStr != null) {
                    transcodedValue.append(replacementStr);
                }
                // else: no ASCII approximation known, the character is dropped
            }
            else {
                transcodedValue.append(curChar);
            }
        }
        return transcodedValue.toString();
    }

    /**
     * Repairs a string whose UTF-8 bytes were wrongly decoded as ISO-8859-1
     * (typical for UTF-8 encoded HTML pages read with the wrong charset): the
     * characters are turned back into bytes and decoded as UTF-8.
     *
     * @param s string to convert; must not be {@code null}
     * @return converted string, or {@code null} if a required charset is
     *         unavailable (cannot happen on a compliant JVM)
     * @throws NullPointerException if {@code s} is {@code null}
     */
    public static String convertFromUTF8(String s) {
        try {
            return new String(s.getBytes("ISO-8859-1"), "UTF-8");
        }
        catch (java.io.UnsupportedEncodingException e) {
            // Both charsets are mandatory on every Java platform.
            return null;
        }
    }

    /**
     * Inverse of {@link #convertFromUTF8(String)}: encodes the string as UTF-8
     * and returns a string holding one character per resulting byte
     * (ISO-8859-1 view of the UTF-8 bytes), suitable for output through an
     * ISO-8859-1 channel that should deliver UTF-8 encoded HTML/JSP pages.
     *
     * <p>The bytes are decoded explicitly as ISO-8859-1 rather than with the
     * platform default charset, so the result is the same on every machine.
     *
     * @param s string to convert; must not be {@code null}
     * @return converted string, or {@code null} if a required charset is
     *         unavailable (cannot happen on a compliant JVM)
     * @throws NullPointerException if {@code s} is {@code null}
     */
    public static String convertToUTF8(String s) {
        try {
            return new String(s.getBytes("UTF-8"), "ISO-8859-1");
        }
        catch (java.io.UnsupportedEncodingException e) {
            // Both charsets are mandatory on every Java platform.
            return null;
        }
    }

    /**
     * Compares two strings according to the collation rules of a locale,
     * taking accents into account but ignoring case differences.
     *
     * <p>Unlike {@link String#compareTo(String)}, which compares char values,
     * this uses a {@link Collator}: the result is less than, equal to or
     * greater than zero depending on whether {@code s1} sorts before, equal to
     * or after {@code s2} in the given locale.
     *
     * @param s1 first string
     * @param s2 second string
     * @param locale locale whose collation rules are used
     * @return negative, zero or positive integer as described above
     */
    public static int compareAccentuatedCharacters(String s1, String s2, Locale locale) {
        return accentCollator(locale).compare(s1, s2);
    }

    /**
     * Tells whether two strings are equal according to the collation rules of
     * a locale, taking accents into account but ignoring case differences.
     *
     * @param s1 first string
     * @param s2 second string
     * @param locale locale whose collation rules are used
     * @return {@code true} if the strings only differ in case (or not at all)
     */
    public static boolean equalAccentuatedCharacters(String s1, String s2, Locale locale) {
        return accentCollator(locale).compare(s1, s2) == 0;
    }

    /** Creates an accent-sensitive, case-insensitive collator for the locale. */
    private static Collator accentCollator(Locale locale) {
        Collator collator = Collator.getInstance(locale);
        // Treat precomposed and combining-mark forms of an accented letter alike.
        collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
        // SECONDARY: accents are significant, case is not.
        collator.setStrength(Collator.SECONDARY);
        return collator;
    }

}