  • 浏览: 11790351 次
  • 性别: Icon_minigender_1
  • 来自: 深圳






  1. import java.io.BufferedReader;
  2. import java.io.File;
  3. import java.io.FileInputStream;
  4. import java.io.InputStreamReader;
  /**
   * Minimal example that prints the contents of a text file, decoded as UTF-8,
   * to standard output one line at a time.
   */
  public class ReadTxtFile {
    /**
     * Reads {@code D:/to_delete/test.txt} as UTF-8 and echoes each line to
     * standard output. Nothing is printed when the path is not a regular file.
     * Any failure is reported on standard output together with a stack trace.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
      String charsetName = "UTF-8";
      String path = "D:/to_delete/test.txt";
      File file = new File(path);
      // isFile() is only true for an existing regular file, so a separate
      // exists() check is not needed.
      if (!file.isFile()) {
        return;
      }
      FileInputStream rawIn = null;
      try {
        rawIn = new FileInputStream(file);
        BufferedReader bufReader =
            new BufferedReader(new InputStreamReader(rawIn, charsetName));
        String line;
        while ((line = bufReader.readLine()) != null) {
          System.out.println(line);
        }
      } catch (Exception e) {
        System.out.println("读取文件内容操作出错");
        e.printStackTrace();
      } finally {
        // Release the file handle even when reading failed part-way through.
        // Closing the file stream is enough: the readers wrapped around it
        // hold no other resource.
        if (rawIn != null) {
          try {
            rawIn.close();
          } catch (Exception e) {
            System.out.println("读取文件内容操作出错");
            e.printStackTrace();
          }
        }
      }
    }
  }









关键字: java 读utf-8, java写utf-8, 编码, utf-8 乱码




Java代码 复制代码
  1. import java.io.BufferedReader;
  2. import java.io.File;
  3. import java.io.FileInputStream;
  4. import java.io.IOException;
  5. import java.io.InputStreamReader;
  /**
   * Demonstrates reading a file through a plain UTF-8 {@link InputStreamReader}.
   * No byte order mark handling is done here, so whatever the decoder produces
   * for the first bytes of the file is printed as part of the first line.
   */
  public class UTF8Test {
    /**
     * Prints every line of {@code ./utf.txt}, decoded as UTF-8, to standard output.
     *
     * @param args ignored
     * @throws IOException if the file cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
      File f = new File("./utf.txt");
      FileInputStream in = new FileInputStream(f);
      try {
        // Decode the file explicitly as UTF-8 instead of the platform default.
        BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8"));
        for (String line = br.readLine(); line != null; line = br.readLine()) {
          System.out.println(line);
        }
      } finally {
        // Always release the file handle, including when a read fails.
        in.close();
      }
    }
  }
  1. import java.io.BufferedReader;
  2. import java.io.File;
  3. import java.io.FileInputStream;
  4. import java.io.IOException;
  5. import java.io.InputStreamReader;
  /**
   * Demonstrates reading a file through a plain UTF-8 {@link InputStreamReader}.
   * No byte order mark handling is done here, so whatever the decoder produces
   * for the first bytes of the file is printed as part of the first line.
   */
  public class UTF8Test {
    /**
     * Prints every line of {@code ./utf.txt}, decoded as UTF-8, to standard output.
     *
     * @param args ignored
     * @throws IOException if the file cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
      File f = new File("./utf.txt");
      FileInputStream in = new FileInputStream(f);
      try {
        // Decode the file explicitly as UTF-8 instead of the platform default.
        BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8"));
        for (String line = br.readLine(); line != null; line = br.readLine()) {
          System.out.println(line);
        }
      } finally {
        // Always release the file handle, including when a read fails.
        in.close();
      }
    }
  }


This is the first line.
This is second line.


?This is the first line.
This is second line.

通过上面的几篇文章应该可以想到是Java读取BOM(Byte Order Mark)的问题,在使用UTF-8时,可以在文件的开始使用3个字节的"EF BB BF"来标识文件使用了UTF-8的编码,当然也可以不用这3个字节。
Bug ID:4508058
不过在我关掉的一些页面中记得有篇文章说这个bug只在jdk1.5及之前的版本才有,说是1.6已经解决了,从目前来看1.6只是解决了读取带有BOM文件失败的问题,还是不能区别处理有BOM和无BOM的UTF-8编码的文件,从Bug ID:4508058里的描述可以看出,这个问题将作为一个不会修改的问题关闭,对于BOM编码的识别将由应用程序自己来处理,原因可从另外一个bug处查看到,因为Unicode对于BOM的编码的规定可能发生变化。也就是说对于一个UTF-8的文件,应用程序需要知道这个文件有没有写BOM,然后自己决定处理BOM的方式。

Java代码 复制代码
  1. byte[] allbytes = line.getBytes("UTF-8");
  2. for (int i=0; i < allbytes.length; i++)
  3. {
  4. int tmp = allbytes[i];
  5. String hexString = Integer.toHexString(tmp);
  6. // 1个byte变成16进制的,只需要2位就可以表示了,取后面两位,去掉前面的符号填充
  7. hexString = hexString.substring(hexString.length() -2);
  8. System.out.print(hexString.toUpperCase());
  9. System.out.print(" ");
  10. }
  1. byte[] allbytes = line.getBytes("UTF-8");
  2. for (int i=0; i < allbytes.length; i++)
  3. {
  4. int tmp = allbytes[i];
  5. String hexString = Integer.toHexString(tmp);
  6. // 1个byte变成16进制的,只需要2位就可以表示了,取后面两位,去掉前面的符号填充
  7. hexString = hexString.substring(hexString.length() -2);
  8. System.out.print(hexString.toUpperCase());
  9. System.out.print(" ");
  10. }


EF BB BF 54 68 69 73 20 69 73 20 74 68 65 20 66 69 72 73 74 20 6C 69 6E 65 2E
?This is the first line.
54 68 69 73 20 69 73 20 73 65 63 6F 6E 64 20 6C 69 6E 65 2E
This is second line.

第一行输出开头的"EF BB BF"(原文中以红色标出)刚好是UTF-8文件的BOM编码,可以看出Java在读文件时没能正确处理UTF-8文件的BOM编码,将前3个字节当作文本内容来处理了。


Java代码 复制代码
  // Read through UnicodeReader instead of a plain InputStreamReader: it inspects
  // the first bytes for a BOM, skips it and decodes with the matching encoding;
  // without a BOM it falls back to the platform default charset passed here.
  1. BufferedReader br = new BufferedReader(new UnicodeReader(in, Charset.defaultCharset().name()));
  // Read through UnicodeReader instead of a plain InputStreamReader: it inspects
  // the first bytes for a BOM, skips it and decodes with the matching encoding;
  // without a BOM it falls back to the platform default charset passed here.
  1. BufferedReader br = new BufferedReader(new UnicodeReader(in, Charset.defaultCharset().name()));





  1. /**
  2. version: 1.1 / 2007-01-25
  3. - changed BOM recognition ordering (longer boms first)
  4. Original pseudocode : Thomas Weidenfeller
  5. Implementation tweaked: Aki Nieminen
  6. http://www.unicode.org/unicode/faq/utf_bom.html
  7. BOMs in byte length ordering:
  8. 00 00 FE FF = UTF-32, big-endian
  9. FF FE 00 00 = UTF-32, little-endian
  10. EF BB BF = UTF-8,
  11. FE FF = UTF-16, big-endian
  12. FF FE = UTF-16, little-endian
  13. Win2k Notepad:
  14. Unicode format = UTF-16LE
  15. ***/
  16. import java.io.*;
  17. /**
  18. * This inputstream will recognize unicode BOM marks and will skip bytes if
  19. * getEncoding() method is called before any of the read(...) methods.
  20. *
  21. * Usage pattern: String enc = "ISO-8859-1"; // or NULL to use systemdefault
  22. * FileInputStream fis = new FileInputStream(file); UnicodeInputStream uin = new
  23. * UnicodeInputStream(fis, enc); enc = uin.getEncoding(); // check and skip
  24. * possible BOM bytes InputStreamReader in; if (enc == null) in = new
  25. * InputStreamReader(uin); else in = new InputStreamReader(uin, enc);
  26. */
  27. public class UnicodeInputStream extends InputStream {
  28. PushbackInputStream internalIn;
  29. boolean isInited = false;
  30. String defaultEnc;
  31. String encoding;
  32. private static final int BOM_SIZE = 4;
  33. UnicodeInputStream(InputStream in, String defaultEnc) {
  34. internalIn = new PushbackInputStream(in, BOM_SIZE);
  35. this.defaultEnc = defaultEnc;
  36. }
  37. public String getDefaultEncoding() {
  38. return defaultEnc;
  39. }
  40. public String getEncoding() {
  41. if (!isInited) {
  42. try {
  43. init();
  44. } catch (IOException ex) {
  45. IllegalStateException ise = new IllegalStateException(
  46. "Init method failed.");
  47. ise.initCause(ise);
  48. throw ise;
  49. }
  50. }
  51. return encoding;
  52. }
  53. /**
  54. * Read-ahead four bytes and check for BOM marks. Extra bytes are unread
  55. * back to the stream, only BOM bytes are skipped.
  56. */
  57. protected void init() throws IOException {
  58. if (isInited)
  59. return;
  60. byte bom[] = new byte[BOM_SIZE];
  61. int n, unread;
  62. n = internalIn.read(bom, 0, bom.length);
  63. if ((bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00)
  64. && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)) {
  65. encoding = "UTF-32BE";
  66. unread = n - 4;
  67. } else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)
  68. && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00)) {
  69. encoding = "UTF-32LE";
  70. unread = n - 4;
  71. } else if ((bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB)
  72. && (bom[2] == (byte) 0xBF)) {
  73. encoding = "UTF-8";
  74. unread = n - 3;
  75. } else if ((bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) {
  76. encoding = "UTF-16BE";
  77. unread = n - 2;
  78. } else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) {
  79. encoding = "UTF-16LE";
  80. unread = n - 2;
  81. } else {
  82. // Unicode BOM mark not found, unread all bytes
  83. encoding = defaultEnc;
  84. unread = n;
  85. }
  86. // System.out.println("read=" + n + ", unread=" + unread);
  87. if (unread > 0)
  88. internalIn.unread(bom, (n - unread), unread);
  89. isInited = true;
  90. }
  91. public void close() throws IOException {
  92. // init();
  93. isInited = true;
  94. internalIn.close();
  95. }
  96. public int read() throws IOException {
  97. // init();
  98. isInited = true;
  99. return internalIn.read();
  100. }
  101. }


  1. /**
  2. version: 1.1 / 2007-01-25
  3. - changed BOM recognition ordering (longer boms first)
  4. Original pseudocode : Thomas Weidenfeller
  5. Implementation tweaked: Aki Nieminen
  6. http://www.unicode.org/unicode/faq/utf_bom.html
  7. BOMs:
  8. 00 00 FE FF = UTF-32, big-endian
  9. FF FE 00 00 = UTF-32, little-endian
  10. EF BB BF = UTF-8,
  11. FE FF = UTF-16, big-endian
  12. FF FE = UTF-16, little-endian
  13. Win2k Notepad:
  14. Unicode format = UTF-16LE
  15. ***/
  16. import java.io.*;
  /**
   * Generic Unicode text reader which uses a byte order mark (BOM) at the start
   * of the stream to choose the decoding charset. If no BOM is found, the given
   * default encoding is used, or the system default when that is null. BOM
   * bytes are consumed and never returned as characters.
   */
  public class UnicodeReader extends Reader {
    PushbackInputStream internalIn;
    InputStreamReader internalIn2 = null;
    String defaultEnc;

    /** Length of the longest BOM, i.e. the number of bytes read ahead. */
    private static final int BOM_SIZE = 4;

    // Known BOM signatures. They are checked longest first because the
    // UTF-16LE mark is a prefix of the UTF-32LE mark.
    private static final byte[] UTF32_BE_BOM = {0x00, 0x00, (byte) 0xFE, (byte) 0xFF};
    private static final byte[] UTF32_LE_BOM = {(byte) 0xFF, (byte) 0xFE, 0x00, 0x00};
    private static final byte[] UTF8_BOM = {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF};
    private static final byte[] UTF16_BE_BOM = {(byte) 0xFE, (byte) 0xFF};
    private static final byte[] UTF16_LE_BOM = {(byte) 0xFF, (byte) 0xFE};

    /**
     * @param in
     *          input stream to be read
     * @param defaultEnc
     *          default encoding if the stream does not have a BOM marker. Give
     *          null to use the system-level default.
     */
    UnicodeReader(InputStream in, String defaultEnc) {
      internalIn = new PushbackInputStream(in, BOM_SIZE);
      this.defaultEnc = defaultEnc;
    }

    /** Returns the default encoding given to the constructor (may be null). */
    public String getDefaultEncoding() {
      return defaultEnc;
    }

    /**
     * Returns the encoding name reported by the underlying
     * {@link InputStreamReader}, or null if the stream is uninitialized. Call
     * init() or read() to initialize it.
     */
    public String getEncoding() {
      if (internalIn2 == null) {
        return null;
      }
      return internalIn2.getEncoding();
    }

    /**
     * Reads ahead up to four bytes and checks them for a BOM. Bytes that are
     * not part of a BOM are pushed back onto the stream; only BOM bytes are
     * skipped. Then creates the decoding reader for the chosen encoding.
     *
     * @throws IOException if reading fails or the encoding is not supported
     */
    protected void init() throws IOException {
      if (internalIn2 != null) {
        return;
      }
      byte[] bom = new byte[BOM_SIZE];
      // A single read may return fewer bytes than requested, so keep reading
      // until the look-ahead buffer is full or the stream ends.
      int n = 0;
      while (n < bom.length) {
        int count = internalIn.read(bom, n, bom.length - n);
        if (count < 0) {
          break;
        }
        n += count;
      }

      // Only the n bytes actually read take part in the comparison, so the
      // zero padding of a short stream cannot be mistaken for BOM bytes
      // (e.g. the two bytes FF FE are UTF-16LE, not UTF-32LE).
      String encoding;
      int bomLength;
      if (hasPrefix(bom, n, UTF32_BE_BOM)) {
        encoding = "UTF-32BE";
        bomLength = UTF32_BE_BOM.length;
      } else if (hasPrefix(bom, n, UTF32_LE_BOM)) {
        encoding = "UTF-32LE";
        bomLength = UTF32_LE_BOM.length;
      } else if (hasPrefix(bom, n, UTF8_BOM)) {
        encoding = "UTF-8";
        bomLength = UTF8_BOM.length;
      } else if (hasPrefix(bom, n, UTF16_BE_BOM)) {
        encoding = "UTF-16BE";
        bomLength = UTF16_BE_BOM.length;
      } else if (hasPrefix(bom, n, UTF16_LE_BOM)) {
        encoding = "UTF-16LE";
        bomLength = UTF16_LE_BOM.length;
      } else {
        // Unicode BOM mark not found, unread all bytes
        encoding = defaultEnc;
        bomLength = 0;
      }

      if (n > bomLength) {
        internalIn.unread(bom, bomLength, n - bomLength);
      }

      // Use given encoding
      if (encoding == null) {
        internalIn2 = new InputStreamReader(internalIn);
      } else {
        internalIn2 = new InputStreamReader(internalIn, encoding);
      }
    }

    /** Returns true when the first {@code available} bytes of {@code data} start with {@code prefix}. */
    private static boolean hasPrefix(byte[] data, int available, byte[] prefix) {
      if (available < prefix.length) {
        return false;
      }
      for (int i = 0; i < prefix.length; i++) {
        if (data[i] != prefix[i]) {
          return false;
        }
      }
      return true;
    }

    /**
     * Closes the reader and the wrapped stream. Closing never triggers BOM
     * detection: if nothing has been read yet, the wrapped stream is closed
     * directly, so a failing read cannot prevent the stream from being closed.
     */
    public void close() throws IOException {
      if (internalIn2 != null) {
        internalIn2.close();
      } else {
        internalIn.close();
      }
    }

    /** Reads decoded characters, running BOM detection on the first call. */
    public int read(char[] cbuf, int off, int len) throws IOException {
      init();
      return internalIn2.read(cbuf, off, len);
    }
  }


  1. import java.io.BufferedReader;
  2. import java.io.File;
  3. import java.io.FileInputStream;
  4. import java.io.IOException;
  5. import java.io.InputStreamReader;
  6. import java.nio.charset.Charset;
  7. public class UTF8Test {
  8. public static void main(String[] args) throws IOException {
  9. File f = new File("./utf.txt");
  10. FileInputStream in = new FileInputStream(f);
  11. // 指定读取文件时以UTF-8的格式读取
  12. // BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8"));
  13. BufferedReader br = new BufferedReader(new UnicodeReader(in, Charset.defaultCharset().name()));
  14. String line = br.readLine();
  15. while(line != null)
  16. {
  17. System.out.println(line);
  18. line = br.readLine();
  19. }
  20. }
  21. }



Global site tag (gtag.js) - Google Analytics