18
18
import java .security .MessageDigest ;
19
19
import java .security .NoSuchAlgorithmException ;
20
20
import java .util .ArrayList ;
21
+ import java .util .BitSet ;
21
22
import java .util .Collections ;
22
23
import java .util .Comparator ;
23
24
import java .util .List ;
@@ -37,6 +38,8 @@ public final class FileUtils {
37
38
38
39
private static final String LINE_SEP = System .getProperty ("line.separator" );
39
40
41
+ private static final int BYTE_SIZE = 8 ;
42
+
40
43
/**
 * Private constructor that always throws {@link UnsupportedOperationException},
 * so this class cannot be instantiated.
 */
private FileUtils () {
41
44
throw new UnsupportedOperationException ("u can't instantiate me..." );
42
45
}
@@ -903,109 +906,121 @@ public static String getFileCharsetSimple(final File file) {
903
906
case 0xfeff :
904
907
return "UTF-16BE" ;
905
908
default :
906
- return "GBK" ;
909
+ try {
910
+ if (isUtf8 (file )) {
911
+ return "UTF-8" ;
912
+ } else {
913
+ return "GBK" ;
914
+ }
915
+ } catch (Exception e ) {
916
+ e .printStackTrace ();
917
+ return "GBK" ;
918
+ }
907
919
}
908
920
}
909
921
910
922
/**
911
923
* Return whether the charset of file is utf8.
912
924
*
913
- * @param filePath The path of file.
925
+ * @param file The file.
914
926
* @return {@code true}: yes<br>{@code false}: no
915
927
*/
916
- public static boolean isUtf8 (final String filePath ) {
917
- return isUtf8 (getFileByPath (filePath ));
928
+ private static boolean isUtf8 (File file ) throws Exception {
929
+ BufferedInputStream bis = new BufferedInputStream (new FileInputStream (file ));
930
+ // 读取第一个字节
931
+ int code = bis .read ();
932
+ do {
933
+ BitSet bitSet = convert2BitSet (code );
934
+ if (bitSet .get (0 )) {
935
+ // 多字节时,再读取N个字节
936
+ if (!checkMultiByte (bis , bitSet )) {
937
+ bis .close ();
938
+ return false ;
939
+ }
940
+ }
941
+ // 单字节时什么都不用做,再次读取字节
942
+ code = bis .read ();
943
+ } while (code != -1 );
944
+ bis .close ();
945
+ return true ;
918
946
}
919
947
948
+
920
949
/**
921
- * Return whether the charset of file is utf8.
922
- *
923
- * @param file The file.
924
- * @return {@code true}: yes<br>{@code false}: no
950
+ * 检测多字节,判断是否符合utf8编码
925
951
*/
926
- public static boolean isUtf8 (final File file ) {
927
- if (file == null ) return false ;
928
- InputStream is = null ;
929
- try {
930
- byte [] bytes = new byte [24 ];
931
- is = new BufferedInputStream (new FileInputStream (file ));
932
- int read = is .read (bytes );
933
- if (read != -1 ) {
934
- byte [] readArr = new byte [read ];
935
- System .arraycopy (bytes , 0 , readArr , 0 , read );
936
- return isUtf8 (readArr ) == 100 ;
937
- } else {
952
+ private static boolean checkMultiByte (BufferedInputStream bis , BitSet bitSet ) throws Exception {
953
+ int count = getCountOfSequential (bitSet );
954
+ // 已经读取了一个字节,不能再读取
955
+ byte [] bytes = new byte [count - 1 ];
956
+ bis .read (bytes );
957
+ for (byte b : bytes ) {
958
+ if (!checkUtf8Byte (b )) {
938
959
return false ;
939
960
}
940
- } catch (IOException e ) {
941
- e .printStackTrace ();
942
- } finally {
943
- try {
944
- if (is != null ) {
945
- is .close ();
946
- }
947
- } catch (IOException e ) {
948
- e .printStackTrace ();
949
- }
950
961
}
951
- return false ;
962
+ return true ;
952
963
}
953
964
954
- private static int isUtf8 (byte [] raw ) {
955
- int i , len ;
956
- int utf8 = 0 , ascii = 0 ;
957
- if (raw .length > 3 ) {
958
- if ((raw [0 ] == (byte ) 0xEF ) && (raw [1 ] == (byte ) 0xBB ) && (raw [2 ] == (byte ) 0xBF )) {
959
- return 100 ;
965
+
966
+ /**
967
+ * 检测bitSet中从开始有多少个连续的1
968
+ */
969
+ private static int getCountOfSequential (BitSet bitSet ) {
970
+ int count = 0 ;
971
+ for (int i = 0 ; i < BYTE_SIZE ; i ++) {
972
+ if (bitSet .get (i )) {
973
+ count ++;
974
+ } else {
975
+ break ;
960
976
}
961
977
}
962
- len = raw .length ;
963
- int child = 0 ;
964
- for (i = 0 ; i < len ; ) {
965
- if ((raw [i ] & (byte ) 0xFF ) == (byte ) 0xFF || (raw [i ] & (byte ) 0xFE ) == (byte ) 0xFE ) {
966
- return 0 ;
967
- }
968
- if (child == 0 ) {
969
- if ((raw [i ] & (byte ) 0x7F ) == raw [i ] && raw [i ] != 0 ) {
970
- ascii ++;
971
- } else if ((raw [i ] & (byte ) 0xC0 ) == (byte ) 0xC0 ) {
972
- for (int bit = 0 ; bit < 8 ; bit ++) {
973
- if ((((byte ) (0x80 >> bit )) & raw [i ]) == ((byte ) (0x80 >> bit ))) {
974
- child = bit ;
975
- } else {
976
- break ;
977
- }
978
- }
979
- utf8 ++;
980
- }
981
- i ++;
982
- } else {
983
- child = (raw .length - i > child ) ? child : (raw .length - i );
984
- boolean currentNotUtf8 = false ;
985
- for (int children = 0 ; children < child ; children ++) {
986
- if ((raw [i + children ] & ((byte ) 0x80 )) != ((byte ) 0x80 )) {
987
- if ((raw [i + children ] & (byte ) 0x7F ) == raw [i + children ] && raw [i ] != 0 ) {
988
- ascii ++;
989
- }
990
- currentNotUtf8 = true ;
991
- }
992
- }
993
- if (currentNotUtf8 ) {
994
- utf8 --;
995
- i ++;
996
- } else {
997
- utf8 += child ;
998
- i += child ;
999
- }
1000
- child = 0 ;
978
+ return count ;
979
+ }
980
+
981
+
982
+ /**
983
+ * 检测单字节,判断是否为utf8
984
+ */
985
+ private static boolean checkUtf8Byte (byte b ) throws Exception {
986
+ BitSet bitSet = convert2BitSet (b );
987
+ return bitSet .get (0 ) && !bitSet .get (1 );
988
+ }
989
+
990
+
991
+ /**
992
+ * 将整形转为BitSet
993
+ */
994
+ private static BitSet convert2BitSet (int code ) {
995
+ BitSet bitSet = new BitSet (BYTE_SIZE );
996
+
997
+ for (int i = 0 ; i < BYTE_SIZE ; i ++) {
998
+ int tmp3 = code >> (BYTE_SIZE - i - 1 );
999
+ int tmp2 = 0x1 & tmp3 ;
1000
+ if (tmp2 == 1 ) {
1001
+ bitSet .set (i );
1001
1002
}
1002
1003
}
1003
- if (ascii == len ) {
1004
- return 100 ;
1004
+ return bitSet ;
1005
+ }
1006
+
1007
+ /**
1008
+ * Return whether the charset of file is utf8.
1009
+ *
1010
+ * @param filePath The path of file.
1011
+ * @return {@code true}: yes<br>{@code false}: no
1012
+ */
1013
+ public static boolean isUtf8 (final String filePath ) {
1014
+ try {
1015
+ return isUtf8 (getFileByPath (filePath ));
1016
+ } catch (Exception e ) {
1017
+ e .printStackTrace ();
1018
+ return false ;
1005
1019
}
1006
- return (int ) (100 * ((float ) (utf8 + ascii ) / (float ) len ));
1007
1020
}
1008
1021
1022
+
1023
+
1009
1024
/**
1010
1025
* Return the number of lines of file.
1011
1026
*
0 commit comments