Skip to content

Commit 6bc10a7

Browse files
committed
feat: FileUtils.getFileCharsetSimple()区分UTF-8无BOM和GBK编码
1 parent 3513c74 commit 6bc10a7

File tree

2 files changed

+96
-80
lines changed

2 files changed

+96
-80
lines changed

lib/utilcode/src/main/java/com/blankj/utilcode/util/FileUtils.java

Lines changed: 94 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import java.security.MessageDigest;
1919
import java.security.NoSuchAlgorithmException;
2020
import java.util.ArrayList;
21+
import java.util.BitSet;
2122
import java.util.Collections;
2223
import java.util.Comparator;
2324
import java.util.List;
@@ -37,6 +38,8 @@ public final class FileUtils {
3738

3839
private static final String LINE_SEP = System.getProperty("line.separator");
3940

41+
private static final int BYTE_SIZE = 8;
42+
4043
private FileUtils() {
4144
throw new UnsupportedOperationException("u can't instantiate me...");
4245
}
@@ -903,109 +906,121 @@ public static String getFileCharsetSimple(final File file) {
903906
case 0xfeff:
904907
return "UTF-16BE";
905908
default:
906-
return "GBK";
909+
try {
910+
if (isUtf8(file)) {
911+
return "UTF-8";
912+
} else {
913+
return "GBK";
914+
}
915+
} catch (Exception e) {
916+
e.printStackTrace();
917+
return "GBK";
918+
}
907919
}
908920
}
909921

910922
/**
911923
* Return whether the charset of file is utf8.
912924
*
913-
* @param filePath The path of file.
925+
* @param file The file.
914926
* @return {@code true}: yes<br>{@code false}: no
915927
*/
916-
public static boolean isUtf8(final String filePath) {
917-
return isUtf8(getFileByPath(filePath));
928+
private static boolean isUtf8(File file) throws Exception {
929+
BufferedInputStream bis = new BufferedInputStream(new FileInputStream(file));
930+
// 读取第一个字节
931+
int code = bis.read();
932+
do {
933+
BitSet bitSet = convert2BitSet(code);
934+
if (bitSet.get(0)) {
935+
// 多字节时,再读取N个字节
936+
if (!checkMultiByte(bis, bitSet)) {
937+
bis.close();
938+
return false;
939+
}
940+
}
941+
// 单字节时什么都不用做,再次读取字节
942+
code = bis.read();
943+
} while (code != -1);
944+
bis.close();
945+
return true;
918946
}
919947

948+
920949
/**
921-
* Return whether the charset of file is utf8.
922-
*
923-
* @param file The file.
924-
* @return {@code true}: yes<br>{@code false}: no
950+
* Checks a multi-byte sequence and determines whether it conforms to UTF-8 encoding.
925951
*/
926-
public static boolean isUtf8(final File file) {
927-
if (file == null) return false;
928-
InputStream is = null;
929-
try {
930-
byte[] bytes = new byte[24];
931-
is = new BufferedInputStream(new FileInputStream(file));
932-
int read = is.read(bytes);
933-
if (read != -1) {
934-
byte[] readArr = new byte[read];
935-
System.arraycopy(bytes, 0, readArr, 0, read);
936-
return isUtf8(readArr) == 100;
937-
} else {
952+
private static boolean checkMultiByte(BufferedInputStream bis, BitSet bitSet) throws Exception {
// bitSet holds the bits of the lead byte that was already read from bis;
// the number of leading set bits is taken as the length of the whole sequence.
953+
int count = getCountOfSequential(bitSet);
954+
// The lead byte has already been consumed, so only count - 1 further bytes are read.
// NOTE(review): count is not range-checked; when it is 1 the array below is empty
// and the loop accepts the sequence without reading anything.
955+
byte[] bytes = new byte[count - 1];
956+
// NOTE(review): the number of bytes actually read is ignored; slots that were not
// filled stay 0, which checkUtf8Byte rejects.
bis.read(bytes);
957+
// Every following byte must pass checkUtf8Byte, otherwise the sequence is rejected.
for (byte b : bytes) {
958+
if (!checkUtf8Byte(b)) {
938959
return false;
939960
}
940-
} catch (IOException e) {
941-
e.printStackTrace();
942-
} finally {
943-
try {
944-
if (is != null) {
945-
is.close();
946-
}
947-
} catch (IOException e) {
948-
e.printStackTrace();
949-
}
950961
}
951-
return false;
962+
return true;
952963
}
953964

954-
private static int isUtf8(byte[] raw) {
955-
int i, len;
956-
int utf8 = 0, ascii = 0;
957-
if (raw.length > 3) {
958-
if ((raw[0] == (byte) 0xEF) && (raw[1] == (byte) 0xBB) && (raw[2] == (byte) 0xBF)) {
959-
return 100;
965+
966+
/**
967+
* Counts how many consecutive bits are set in the BitSet, starting at index 0
* and looking at no more than {@code BYTE_SIZE} bits.
*
* @param bitSet The bits to inspect.
* @return the length of the leading run of set bits, from 0 to {@code BYTE_SIZE}
968+
*/
969+
private static int getCountOfSequential(BitSet bitSet) {
970+
int count = 0;
971+
for (int i = 0; i < BYTE_SIZE; i++) {
972+
if (bitSet.get(i)) {
973+
count++;
974+
} else {
975+
// The first clear bit ends the run.
break;
960976
}
961977
}
962-
len = raw.length;
963-
int child = 0;
964-
for (i = 0; i < len; ) {
965-
if ((raw[i] & (byte) 0xFF) == (byte) 0xFF || (raw[i] & (byte) 0xFE) == (byte) 0xFE) {
966-
return 0;
967-
}
968-
if (child == 0) {
969-
if ((raw[i] & (byte) 0x7F) == raw[i] && raw[i] != 0) {
970-
ascii++;
971-
} else if ((raw[i] & (byte) 0xC0) == (byte) 0xC0) {
972-
for (int bit = 0; bit < 8; bit++) {
973-
if ((((byte) (0x80 >> bit)) & raw[i]) == ((byte) (0x80 >> bit))) {
974-
child = bit;
975-
} else {
976-
break;
977-
}
978-
}
979-
utf8++;
980-
}
981-
i++;
982-
} else {
983-
child = (raw.length - i > child) ? child : (raw.length - i);
984-
boolean currentNotUtf8 = false;
985-
for (int children = 0; children < child; children++) {
986-
if ((raw[i + children] & ((byte) 0x80)) != ((byte) 0x80)) {
987-
if ((raw[i + children] & (byte) 0x7F) == raw[i + children] && raw[i] != 0) {
988-
ascii++;
989-
}
990-
currentNotUtf8 = true;
991-
}
992-
}
993-
if (currentNotUtf8) {
994-
utf8--;
995-
i++;
996-
} else {
997-
utf8 += child;
998-
i += child;
999-
}
1000-
child = 0;
978+
return count;
979+
}
980+
981+
982+
/**
983+
* 检测单字节,判断是否为utf8
984+
*/
985+
private static boolean checkUtf8Byte(byte b) throws Exception {
986+
BitSet bitSet = convert2BitSet(b);
987+
return bitSet.get(0) && !bitSet.get(1);
988+
}
989+
990+
991+
/**
992+
* Converts the low {@code BYTE_SIZE} bits of an int into a BitSet, most
* significant bit first: index 0 holds bit {@code BYTE_SIZE - 1} of the value
* and index {@code BYTE_SIZE - 1} holds bit 0. Higher bits of the int are ignored.
*
* @param code The value whose low bits are converted.
* @return a BitSet with one entry per converted bit
993+
*/
994+
private static BitSet convert2BitSet(int code) {
995+
BitSet bitSet = new BitSet(BYTE_SIZE);
996+
997+
for (int i = 0; i < BYTE_SIZE; i++) {
998+
// Shift the bit for position i down to the lowest position...
int tmp3 = code >> (BYTE_SIZE - i - 1);
999+
// ...and isolate it.
int tmp2 = 0x1 & tmp3;
1000+
if (tmp2 == 1) {
1001+
bitSet.set(i);
10011002
}
10021003
}
1003-
if (ascii == len) {
1004-
return 100;
1004+
return bitSet;
1005+
}
1006+
1007+
/**
1008+
* Return whether the charset of file is utf8.
1009+
*
* <p>Delegates to the {@code File} overload of {@code isUtf8} with the result of
* {@code getFileByPath(filePath)}. Any exception raised while checking is printed
* with {@code printStackTrace()} and reported as {@code false}.
*
1010+
* @param filePath The path of file.
1011+
* @return {@code true}: yes<br>{@code false}: no
1012+
*/
1013+
public static boolean isUtf8(final String filePath) {
1014+
try {
1015+
return isUtf8(getFileByPath(filePath));
1016+
} catch (Exception e) {
1017+
// Best effort: a failure while checking is treated as "not UTF-8".
e.printStackTrace();
1018+
return false;
10051019
}
1006-
return (int) (100 * ((float) (utf8 + ascii) / (float) len));
10071020
}
10081021

1022+
1023+
10091024
/**
10101025
* Return the number of lines of file.
10111026
*
Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
1-
GBK
1+
GBK
2+
我爱中国

0 commit comments

Comments
 (0)