* @return char - 变换后的Unicode字符
*/
public static char UTFC2UniC(byte[] utf, int sptr, int cntBits)
{
/*
Unicode <-> UTF-8
U-00000000 - U-0000007F: 0xxxxxxx
U-00000080 - U-000007FF: 110xxxxx 10xxxxxx
U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
*/
int uniC = 0; // represent the unicode char
byte firstByte = utf[sptr];
int ptr = 0; // pointer 0 ~ 15
// resolve single byte UTF-8 encoding char
if(cntBits == 0)
return (char) firstByte;
// resolve the first byte
firstByte &= (1 << (7 - cntBits)) - 1;
// resolve multiple bytes UTF-8 encoding char(except the first byte)
for(int i = sptr + cntBits - 1; i > sptr; --i)
{
byte utfb = utf[i];
uniC |= (utfb & 0x3f) << ptr;
ptr += 6;
}
uniC |= firstByte << ptr;
return (char)uniC;
}
// 根据给定字节计算UTF-8编码的一个字符所占字节数
// UTF-8规则定义,字节标记只能为0或2~6
private static int getCntBits(byte b)
{
int cnt = 0;
if (b == 0) return -1;
for(int i = 7; i >= 0; --i)
{
if (((b >> i) & 0x1) == 1)
++cnt;
else
break;
}
return (cnt > 6 || cnt == 1) ? -1 : cnt;
}
参考资料:
《UTF-8 and Unicode FAQ》—— http://www.linuxforum.net/books/UTF-8-Unicode.html