001 /*
002 * $Id: CharsetToolkit.java 4112 2006-10-13 13:21:25Z blackdrag $
003 *
004 * Copyright 2003 (C) Guillaume Laforge. All Rights Reserved.
005 *
006 * Redistribution and use of this software and associated documentation
007 * ("Software"), with or without modification, are permitted provided that the
008 * following conditions are met:
009 * 1. Redistributions of source code must retain copyright statements and
010 * notices. Redistributions must also contain a copy of this document.
011 * 2. Redistributions in binary form must reproduce the above copyright
012 * notice, this list of conditions and the following disclaimer in the
013 * documentation and/or other materials provided with the distribution.
014 * 3. The name "groovy" must not be used to endorse or promote products
015 * derived from this Software without prior written permission of The Codehaus.
016 * For written permission, please contact info@codehaus.org.
017 * 4. Products derived from this Software may not be called "groovy" nor may
018 * "groovy" appear in their names without prior written permission of The
019 * Codehaus. "groovy" is a registered trademark of The Codehaus.
020 * 5. Due credit should be given to The Codehaus - http://groovy.codehaus.org/
021 *
022 * THIS SOFTWARE IS PROVIDED BY THE CODEHAUS AND CONTRIBUTORS ``AS IS'' AND ANY
023 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
024 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
025 * DISCLAIMED. IN NO EVENT SHALL THE CODEHAUS OR ITS CONTRIBUTORS BE LIABLE FOR
026 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
027 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
028 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
029 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
030 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
031 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
032 * DAMAGE.
033 *
034 */
035
036 package groovy.util;
037
038 import java.io.*;
039 import java.nio.charset.Charset;
040 import java.util.*;
041
/**
 * <p>Utility class to guess the encoding of a given text file.</p>
 *
 * <p>Unicode files encoded in UTF-16 (little or big endian) or UTF-8 files
 * with a Byte Order Marker are correctly discovered. For UTF-8 files with no BOM, if the buffer
 * is wide enough, the charset should also be discovered.</p>
 *
 * <p>Only the first 4KB of the file are sampled, which is usually sufficient
 * to be able to guess the encoding.</p>
 *
 * <p>Usage:</p>
 * <pre>
 * // sample the beginning of the file
 * CharsetToolkit toolkit = new CharsetToolkit(file);
 *
 * // guess the encoding
 * Charset guessedCharset = toolkit.getCharset();
 *
 * // create a reader with the correct charset
 * BufferedReader reader = toolkit.getReader();
 *
 * // read the file content
 * String line;
 * while ((line = reader.readLine()) != null)
 * {
 *     System.out.println(line);
 * }
 * </pre>
 *
 * @author Guillaume Laforge
 */
public class CharsetToolkit {
    /** Maximum number of bytes sampled from the beginning of the file. */
    private static final int BUFFER_SIZE = 4096;

    /** The sampled leading bytes of the file (at most {@link #BUFFER_SIZE} bytes). */
    private final byte[] buffer;
    /** Charset returned when the content is not recognized as UTF-8/UTF-16. */
    private Charset defaultCharset;
    /** Lazily computed result of {@link #guessEncoding()}; <code>null</code> until first requested. */
    private Charset charset;
    private boolean enforce8Bit = true;
    private final File file;

    /**
     * Constructor of the <code>CharsetToolkit</code> utility class.
     * Reads up to 4KB from the beginning of the file; the guess itself is deferred
     * until {@link #getCharset()} is first called.
     *
     * @param file of which we want to know the encoding.
     * @throws IOException if the file cannot be opened or read.
     */
    public CharsetToolkit(File file) throws IOException {
        this.file = file;
        this.defaultCharset = getDefaultSystemCharset();
        this.charset = null;
        InputStream input = new FileInputStream(file);
        try {
            byte[] bytes = new byte[BUFFER_SIZE];
            int total = 0;
            // a single read() may legally return fewer bytes than are available,
            // so keep reading until the sample is full or the end of file is reached
            while (total < BUFFER_SIZE) {
                int read = input.read(bytes, total, BUFFER_SIZE - total);
                if (read == -1) {
                    break;
                }
                total += read;
            }
            if (total < BUFFER_SIZE) {
                byte[] bytesToGuess = new byte[total];
                System.arraycopy(bytes, 0, bytesToGuess, 0, total);
                this.buffer = bytesToGuess;
            }
            else {
                this.buffer = bytes;
            }
        } finally {
            try {
                input.close();
            } catch (IOException ignored) {
                // the stream was only read from: a failure to close it
                // cannot affect the bytes already sampled
            }
        }
    }

    /**
     * Defines the default <code>Charset</code> used in case the buffer represents
     * an 8-bit <code>Charset</code>.
     *
     * @param defaultCharset the default <code>Charset</code> to be returned by <code>getCharset()</code>
     * if an 8-bit <code>Charset</code> is encountered; <code>null</code> resets it to the
     * default system charset.
     */
    public void setDefaultCharset(Charset defaultCharset) {
        if (defaultCharset != null) {
            this.defaultCharset = defaultCharset;
        }
        else {
            this.defaultCharset = getDefaultSystemCharset();
        }
    }

    /**
     * Returns the guessed charset of the file. The guess is computed on the first call
     * and cached, so later changes to the default charset or to the enforce8Bit flag
     * do not alter an already returned result.
     *
     * @return the guessed <code>Charset</code>.
     */
    public Charset getCharset() {
        if (this.charset == null) {
            this.charset = guessEncoding();
        }
        return charset;
    }

    /**
     * If US-ASCII is recognized, enforce to return the default encoding, rather than US-ASCII.
     * It might be a file without any special character in the range 128-255, but that may be or become
     * a file encoded with the default <code>charset</code> rather than US-ASCII.
     *
     * @param enforce a boolean specifying the use or not of US-ASCII.
     */
    public void setEnforce8Bit(boolean enforce) {
        this.enforce8Bit = enforce;
    }

    /**
     * Gets the enforce8Bit flag, in case we do not want to ever get a US-ASCII encoding.
     *
     * @return a boolean representing the flag of use of US-ASCII.
     */
    public boolean getEnforce8Bit() {
        return this.enforce8Bit;
    }

    /**
     * Retrieves the default Charset
     *
     * @return the <code>Charset</code> returned when the content is not recognized as Unicode.
     */
    public Charset getDefaultCharset() {
        return defaultCharset;
    }

    /**
     * <p>Guess the encoding of the sampled buffer.</p>
     *
     * <p>If Byte Order Markers are encountered at the beginning of the buffer, we immediately
     * return the charset implied by this BOM. Otherwise, the file would not be a human
     * readable text file.</p>
     *
     * <p>If there is no BOM, this method tries to discern whether the file is UTF-8 or not.
     * If it is not UTF-8, we assume the encoding is the default system encoding
     * (of course, it might be any 8-bit charset, but usually, an 8-bit charset is the default one).</p>
     *
     * <p>It is possible to discern UTF-8 thanks to the pattern of characters with a multi-byte sequence.</p>
     * <pre>
     * UCS-4 range (hex.)    UTF-8 octet sequence (binary)
     * 0000 0000-0000 007F   0xxxxxxx
     * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
     * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
     * 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 0020 0000-03FF FFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 0400 0000-7FFF FFFF   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * </pre>
     * <p>With UTF-8, 0xFE and 0xFF never appear.</p>
     *
     * @return the Charset recognized.
     */
    private Charset guessEncoding() {
        // if the file has a Byte Order Marker, we can assume the file is in UTF-xx
        // otherwise, the file would not be human readable
        if (hasUTF8Bom()) {
            return Charset.forName("UTF-8");
        }
        if (hasUTF16LEBom()) {
            return Charset.forName("UTF-16LE");
        }
        if (hasUTF16BEBom()) {
            return Charset.forName("UTF-16BE");
        }

        // if no byte has its high order bit set, the encoding is US-ASCII
        // (it might have been UTF-7, but this encoding is usually internally used only by mail systems)
        if (!containsHighOrderByte()) {
            // returns the default charset rather than US-ASCII if the enforce8Bit flag is set.
            if (this.enforce8Bit) {
                return this.defaultCharset;
            }
            return Charset.forName("US-ASCII");
        }

        // if no invalid UTF-8 sequence was encountered, we can assume the encoding is UTF-8,
        // otherwise the file would not be human readable
        if (isValidUTF8()) {
            return Charset.forName("UTF-8");
        }

        // finally, if it's not UTF-8 nor US-ASCII, let's assume the encoding is the default encoding
        return this.defaultCharset;
    }

    /**
     * Tells whether at least one byte of the buffer has its most significant bit set,
     * i.e. whether the content cannot be plain US-ASCII.
     *
     * @return true if a byte outside the 7-bit range is present in the buffer.
     */
    private boolean containsHighOrderByte() {
        for (int i = 0; i < buffer.length; i++) {
            if (buffer[i] < 0) {
                return true;
            }
        }
        return false;
    }

    /**
     * Checks that every byte with its high order bit set belongs to a well-formed
     * multi-byte sequence (a lead byte followed by the right number of continuation bytes).
     * The whole buffer is examined, up to its very last byte. A sequence cut off by the end
     * of the buffer is accepted only when the buffer is full, because the file may then
     * continue beyond the sampled bytes; in a shorter buffer the whole file was read and
     * an incomplete sequence is an error.
     *
     * @return true if the buffer only holds well-formed multi-byte sequences.
     */
    private boolean isValidUTF8() {
        final int length = buffer.length;
        int i = 0;
        while (i < length) {
            byte lead = buffer[i];
            if (lead >= 0) {
                // plain 7-bit character
                i++;
                continue;
            }
            int expected = continuationBytesExpected(lead);
            if (expected == 0) {
                // stray continuation byte, or 0xFE / 0xFF which never appear in UTF-8
                return false;
            }
            int available = Math.min(expected, length - i - 1);
            for (int k = 1; k <= available; k++) {
                if (!isContinuationChar(buffer[i + k])) {
                    return false;
                }
            }
            if (available < expected) {
                // the sequence runs past the end of the sample
                return length >= BUFFER_SIZE;
            }
            i += expected + 1;
        }
        return true;
    }

    /**
     * Gives the number of continuation bytes that must follow the given lead byte.
     *
     * @param b a byte with its high order bit set.
     * @return the number of continuation bytes (1 to 5), or 0 if the byte cannot start a sequence.
     */
    private static int continuationBytesExpected(byte b) {
        if (isTwoBytesSequence(b)) {
            return 1;
        }
        if (isThreeBytesSequence(b)) {
            return 2;
        }
        if (isFourBytesSequence(b)) {
            return 3;
        }
        if (isFiveBytesSequence(b)) {
            return 4;
        }
        if (isSixBytesSequence(b)) {
            return 5;
        }
        return 0;
    }

    /**
     * If the byte has the form 10xxxxxx, then it's a continuation byte of a multiple byte character;
     *
     * @param b a byte.
     * @return true if it's a continuation char.
     */
    private static boolean isContinuationChar(byte b) {
        return -128 <= b && b <= -65;
    }

    /**
     * If the byte has the form 110xxxxx, then it's the first byte of a two-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a two-bytes sequence.
     */
    private static boolean isTwoBytesSequence(byte b) {
        return -64 <= b && b <= -33;
    }

    /**
     * If the byte has the form 1110xxxx, then it's the first byte of a three-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a three-bytes sequence.
     */
    private static boolean isThreeBytesSequence(byte b) {
        return -32 <= b && b <= -17;
    }

    /**
     * If the byte has the form 11110xxx, then it's the first byte of a four-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a four-bytes sequence.
     */
    private static boolean isFourBytesSequence(byte b) {
        return -16 <= b && b <= -9;
    }

    /**
     * If the byte has the form 111110xx, then it's the first byte of a five-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a five-bytes sequence.
     */
    private static boolean isFiveBytesSequence(byte b) {
        return -8 <= b && b <= -5;
    }

    /**
     * If the byte has the form 1111110x, then it's the first byte of a six-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a six-bytes sequence.
     */
    private static boolean isSixBytesSequence(byte b) {
        return -4 <= b && b <= -3;
    }

    /**
     * Retrieve the default charset of the system, as named by the
     * <code>file.encoding</code> system property.
     *
     * @return the default <code>Charset</code>.
     */
    public static Charset getDefaultSystemCharset() {
        return Charset.forName(System.getProperty("file.encoding"));
    }

    /**
     * Has a Byte Order Marker for UTF-8 (Used by Microsoft's Notepad and other editors).
     *
     * @return true if the buffer has a BOM for UTF8.
     */
    public boolean hasUTF8Bom() {
        // EF BB BF
        return buffer.length >= 3
                && buffer[0] == -17 && buffer[1] == -69 && buffer[2] == -65;
    }

    /**
     * Has a Byte Order Marker for UTF-16 Little Endian
     * (ucs-2le, ucs-4le, and ucs-16le).
     *
     * @return true if the buffer has a BOM for UTF-16 Little Endian.
     */
    public boolean hasUTF16LEBom() {
        // FF FE
        return buffer.length >= 2
                && buffer[0] == -1 && buffer[1] == -2;
    }

    /**
     * Has a Byte Order Marker for UTF-16 Big Endian
     * (utf-16 and ucs-2).
     *
     * @return true if the buffer has a BOM for UTF-16 Big Endian.
     */
    public boolean hasUTF16BEBom() {
        // FE FF
        return buffer.length >= 2
                && buffer[0] == -2 && buffer[1] == -1;
    }

    /**
     * Gets a <code>BufferedReader</code> (indeed a <code>LineNumberReader</code>) from the <code>File</code>
     * specified in the constructor of <code>CharsetToolkit</code> using the charset returned by
     * <code>getCharset()</code>. When the file starts with a Byte Order Marker, the marker
     * is skipped so that the first character read is the first character of the text.
     *
     * @return a <code>BufferedReader</code>
     * @throws FileNotFoundException if the file is not found.
     */
    public BufferedReader getReader() throws FileNotFoundException {
        LineNumberReader reader = new LineNumberReader(new InputStreamReader(new FileInputStream(file), getCharset()));
        if (hasUTF8Bom() || hasUTF16LEBom() || hasUTF16BEBom()) {
            try {
                // the BOM is decoded as one single character: consume it
                reader.read();
            }
            catch (IOException ignored) {
                // should never happen, as a file with no content
                // but with a BOM has at least one char; if it does,
                // the caller gets the same error on its first read
            }
        }
        return reader;
    }

    /**
     * Retrieves all the available <code>Charset</code>s on the platform,
     * among which the default <code>charset</code>.
     *
     * @return an array of <code>Charset</code>s.
     */
    public static Charset[] getAvailableCharsets() {
        Collection collection = Charset.availableCharsets().values();
        return (Charset[]) collection.toArray(new Charset[collection.size()]);
    }
}