1.
2.
3.// 多字节编码转为UTF8编码
4.bool MBToUTF8(vector 5.{ 6. // convert an MBCS string to widechar 7. int32 nLen = MultiByteToWideChar(CP_ACP, 0, pmb, mLen, NULL, 0); 8. 9. WCHAR* lpszW = NULL; 10. try 11. { 12. lpszW = new WCHAR[nLen]; 13. } 14. catch(bad_alloc &memExp) 15. { 16. return false; 17. } 18. 19. int32 nRtn = MultiByteToWideChar(CP_ACP, 0, pmb, mLen, lpszW, nLen); 20. 21. if(nRtn != nLen) 22. { 23. delete[] lpszW; 24. return false; 25. } 26. // convert an widechar string to utf8 27. int32 utf8Len = WideCharToMultiByte(CP_UTF8, 0, lpszW, nLen, NULL, 0, NULL, NULL); 28. if (utf8Len <= 0) 29. { 30. return false; 31. } 32. pu8.resize(utf8Len); 33. nRtn = WideCharToMultiByte(CP_UTF8, 0, lpszW, nLen, &*pu8.begin(), utf8Len, NULL, NULL); 34. delete[] lpszW; 35. 36. if (nRtn != utf8Len) 37. { 38. pu8.clear(); 39. return false; 40. } 41. return true; 42.} 43. 44.// UTF8编码转为多字节编码 45.bool UTF8ToMB(vector 46.{ 47. // convert an UTF8 string to widechar 48. int32 nLen = MultiByteToWideChar(CP_UTF8, 0, pu8, utf8Len, NULL, 0); 49. 50. WCHAR* lpszW = NULL; 51. try 52. { 53. lpszW = new WCHAR[nLen]; 54. } 55. catch(bad_alloc &memExp) 56. { 57. return false; 58. } 59. 60. int32 nRtn = MultiByteToWideChar(CP_UTF8, 0, pu8, utf8Len, lpszW, nLen); 61. 62. if(nRtn != nLen) 63. { . delete[] lpszW; 65. return false; 66. } 67. 68. // convert an widechar string to Multibyte 69. int32 MBLen = WideCharToMultiByte(CP_ACP, 0, lpszW, nLen, NULL, 0, NULL, NULL); 70. if (MBLen <=0) 71. { 72. return false; 73. } 74. pmb.resize(MBLen); 75. nRtn = WideCharToMultiByte(CP_ACP, 0, lpszW, nLen, &*pmb.begin(), MBLen, NULL, NULL); 76. delete[] lpszW; 77. 78. if(nRtn != MBLen) 79. { 80. pmb.clear(); 81. return false; 82. } 83. return true; 84.} 85. 86.// 多字节编码转为Unicode编码 87.bool MBToUnicode(vector 88.{ . // convert an MBCS string to widechar 90. int32 uLen = MultiByteToWideChar(CP_ACP, 0, pmb, mLen, NULL, 0); 91. 92. if (uLen<=0) 93. { 94. return false; 95. } 96. pun.resize(uLen); 97. 98. int32 nRtn = MultiByteToWideChar(CP_ACP, 0, pmb, mLen, &*pun.begin(), uLen); 99. 100. if (nRtn != uLen) 101. { 102. pun.clear(); 103. return false; 104. } 105. return true; 106.} 107. 108.//Unicode编码转为多字节编码 109.bool UnicodeToMB(vector 110.{ 111. // convert an widechar string to Multibyte 112. int32 MBLen = WideCharToMultiByte(CP_ACP, 0, pun, uLen, NULL, 0, NULL, NULL); 113. if (MBLen <=0) 114. { 115. return false; 116. } 117. pmb.resize(MBLen); 118. int nRtn = WideCharToMultiByte(CP_ACP, 0, pun, uLen, &*pmb.begin(), MBLen, NULL, NULL); 119. 120. if(nRtn != MBLen) 121. { 122. pmb.clear(); 123. return false; 124. } 125. return true; 126.} 127. 128.// UTF8编码转为Unicode 129.bool UTF8ToUnicode(vector 130.{ 131. // convert an UTF8 string to widechar 132. int32 nLen = MultiByteToWideChar(CP_UTF8, 0, pu8, utf8Len, NULL, 0); 133. if (nLen <=0) 134. { 135. return false; 136. } 137. pun.resize(nLen); 138. int32 nRtn = MultiByteToWideChar(CP_UTF8, 0, pu8, utf8Len, &*pun.begin(), nLen); 139. 140. if(nRtn != nLen) 141. { 142. pun.clear(); 143. return false; 144. } 145. 146. return true; 147.} 148. 149.// Unicode编码转为UTF8 150.bool UnicodeToUTF8(vector 151.{ 152. // convert an widechar string to utf8 153. int32 utf8Len = WideCharToMultiByte(CP_UTF8, 0, pun, uLen, NULL, 0, NULL, NULL); 154. if (utf8Len<=0) 155. { 156. return false; 157. } 158. pu8.resize(utf8Len); 159. int32 nRtn = WideCharToMultiByte(CP_UTF8, 0, pun, uLen, &*pu8.begin(), utf8Len, NULL, NULL); 160. 161. if (nRtn != utf8Len) 162. { 163. pu8.clear(); 1. return false; 165. } 166. return true; 167.} VC中Ansi、Unicode、UTF8字符串之间的转换和写入文本 Ansi字符串我们最熟悉,英文占一个字节,汉字2个字节,以一个\\0结尾,常用于txt文本文件 Unicode字符串,每个字符(汉字、英文字母)都占2个字节,以2个连续的\\0结尾,NT操作系统内核用的是这种字符串,常被定义为typedef unsigned short wchar_t;所以我们有时常会见到什么char*无法转换为unsigned short*之类的错误,其实就是unicode UTF8是Unicode一种压缩形式,英文A在unicode中表示为0x0041,老外觉得这种存储方式太浪费,因为浪费了50%的空间,于是就把英文压缩成1个字节,成了utf8编码,但是汉字在utf8中占3个字节,显然用做中文不如ansi合算,这就是中国的网页用作ansi编码而老外的网页常用utf8的原因。 UTF8在还游戏里运用的很广泛,比如WOW的lua脚本等 下面来说一下转换,主要用代码来说明吧 写文件我用了CFile类,其实用FILE*之类的也是一样,写文件和字符串什么类别没有关系,硬件只关心数据和长度 Ansi转Unicode 介绍2种方法 void CConvertDlg::OnBnClickedButtonAnsiToUnicode() { // ansi to unicode char* szAnsi = "abcd1234你我他"; //预转换,得到所需空间的大小 int wcsLen = ::MultiByteToWideChar(CP_ACP, NULL, szAnsi, strlen(szAnsi), NULL, 0); //分配空间要给'\\0'留个空间,MultiByteToWideChar不会给'\\0'空间 wchar_t* wszString = new wchar_t[wcsLen + 1]; //转换 ::MultiByteToWideChar(CP_ACP, NULL, szAnsi, strlen(szAnsi), wszString, wcsLen); //最后加上'\\0' wszString[wcsLen] = '\\0'; //unicode版的MessageBox API ::MessageBoxW(GetSafeHwnd(), wszString, wszString, MB_OK); //接下来写入文本 //写文本文件,头2个字节0xfeff,低位0xff写在前 CFile cFile; cFile.Open(_T("1.txt"), CFile::modeWrite | CFile::modeCreate); //文件开头 cFile.SeekToBegin(); cFile.Write("\\xff\\xfe", 2); //写入内容 cFile.Write(wszString, wcsLen * sizeof(wchar_t)); cFile.Flush(); cFile.Close(); delete[] wszString; wszString =NULL; //方法2 //设置当前地域信息,不设置的话,使用这种方法,中文不会正确显示 //需要#include setlocale(LC_CTYPE, "chs"); wchar_t wcsStr[100]; //注意下面是大写S,在unicode中,代表后面是ansi字符串 //swprintf是sprintf的unicode版本 //格式的前面要加大写L,代表是unicode swprintf(wcsStr, L"%S", szAnsi); ::MessageBoxW(GetSafeHwnd(), wcsStr, wcsStr, MB_OK); } Unicode转Ansi 也是2种方法 void CConvertDlg::OnBnClickedButtonUnicodeToAnsi() { // unicode to ansi wchar_t* wszString = L"abcd1234你我他"; //预转换,得到所需空间的大小,这次用的函数和上面名字相反 int ansiLen = ::WideCharToMultiByte(CP_ACP, NULL, wszString, wcslen(wszString), NULL, 0, NULL, NULL); //同上,分配空间要给'\\0'留个空间 char* szAnsi = new char[ansiLen + 1]; //转换 //unicode版对应的strlen是wcslen ::WideCharToMultiByte(CP_ACP, NULL, wszString, wcslen(wszString), szAnsi, ansiLen, NULL, NULL); //最后加上'\\0' szAnsi[ansiLen] = '\\0'; //Ansi版的MessageBox API ::MessageBoxA(GetSafeHwnd(), szAnsi, szAnsi, MB_OK); //接下来写入文本 //写文本文件,ANSI文件没有BOM CFile cFile; cFile.Open(_T("1.txt"), CFile::modeWrite | CFile::modeCreate); //文件开头 cFile.SeekToBegin(); //写入内容 cFile.Write(szAnsi, ansiLen * sizeof(char)); cFile.Flush(); cFile.Close(); delete[] szAnsi; szAnsi =NULL; //方法2 //和上面一样有另一种方法 setlocale(LC_CTYPE, "chs"); char szStr[100]; //注意下面是大写,在ansi中,代表后面是unicode字符串 //sprintf sprintf(szStr, "%S", wszString); ::MessageBoxA(GetSafeHwnd(), szStr, szStr, MB_OK); } Unicode转UTF8 void CConvertDlg::OnBnClickedButtonUnicodeToU8() { // unicode to UTF8 wchar_t* wszString = L"abcd1234你我他"; //预转换,得到所需空间的大小,这次用的函数和上面名字相反 int u8Len = ::WideCharToMultiByte(CP_UTF8, NULL, wszString, wcslen(wszString), NULL, 0, NULL, NULL); //同上,分配空间要给'\\0'留个空间 //UTF8虽然是Unicode的压缩形式,但也是多字节字符串,所以可以以char的形式保存 char* szU8 = new char[u8Len + 1]; //转换 //unicode版对应的strlen是wcslen ::WideCharToMultiByte(CP_UTF8, NULL, wszString, wcslen(wszString), szU8, u8Len, NULL, NULL); //最后加上'\\0' szU8[u8Len] = '\\0'; //MessageBox不支持UTF8,所以只能写文件 //接下来写入文本 //写文本文件,UTF8的BOM是0xbfbbef CFile cFile; cFile.Open(_T("1.txt"), CFile::modeWrite | CFile::modeCreate); //文件开头 cFile.SeekToBegin(); //写BOM,同样低位写在前 cFile.Write("\\xef\\xbb\\xbf", 3); //写入内容 cFile.Write(szU8, u8Len * sizeof(char)); cFile.Flush(); cFile.Close(); delete[] szU8; szU8 =NULL; } UTF8转UNICODE void CConvertDlg::OnBnClickedButtonU8ToUnicode() { //UTF8 to Unicode //由于中文直接复制过来会成乱码,编译器有时会报错,故采用16进制形式 char* szU8 = "abcd1234\\xe4\\xbd\\xa0\\xe6\\x88\\x91\\xe4\\xbb\\x96\\x00"; //预转换,得到所需空间的大小 int wcsLen = ::MultiByteToWideChar(CP_UTF8, NULL, szU8, strlen(szU8), NULL, 0); //分配空间要给'\\0'留个空间,MultiByteToWideChar不会给'\\0'空间 wchar_t* wszString = new wchar_t[wcsLen + 1]; //转换 ::MultiByteToWideChar(CP_UTF8, NULL, szU8, strlen(szU8), wszString, wcsLen); //最后加上'\\0' wszString[wcsLen] = '\\0'; //unicode版的MessageBox API ::MessageBoxW(GetSafeHwnd(), wszString, wszString, MB_OK); //写文本同ansi to unicode } Ansi转换utf8和utf8转换Ansi就是上面2个的结合,把unicode作为中间量,进行2次转换即可 VC++ utf-8 Unicode GB2312 编码转换 #include #include #include using namespace std; void unicodeToUTF8(const wstring &src, string& result) { int n = WideCharToMultiByte( CP_UTF8, 0, src.c_str(), -1, 0, 0, 0, 0 ); result.resize(n); ::WideCharToMultiByte( CP_UTF8, 0, src.c_str(), -1, (char*)result.c_str(), result.length(), 0, 0 ); } void unicodeToGB2312(const wstring& wstr , string& result) { int n = WideCharToMultiByte( CP_ACP, 0, wstr.c_str(), -1, 0, 0, 0, 0 ); result.resize(n); ::WideCharToMultiByte(CP_ACP, 0, wstr.c_str(), -1, (char*)result.c_str(), n, 0, 0 ); } void utf8ToUnicode(const string& src, wstring& result) { int n = MultiByteToWideChar( CP_UTF8, 0, src.c_str(), -1, NULL, 0 ); result.resize(n); ::MultiByteToWideChar( CP_UTF8, 0, src.c_str(), -1, (LPWSTR)result.c_str(), result.length()); } void gb2312ToUnicode(const string& src, wstring& result) { int n = MultiByteToWideChar( CP_ACP, 0, src.c_str(), -1, NULL, 0 ); result.resize(n); ::MultiByteToWideChar( CP_ACP, 0, src.c_str(), -1, (LPWSTR)result.c_str(), result.length()); } void printByte(string str) { int i=0; for (i=0; i printf("%02X ",(unsigned char)str.at(i)); } printf("/n"); } void wprintByte(wstring str) { int i=0; for (i=0; i printf("%02X ",*((unsigned char*)str.c_str()+i)); } printf("/n"); } int main() { string strText = "AB汉字"; string strUTF8; wstring wstrUnicode; string strGB2312; printf("ANSI =%s/n",strText.c_str()); gb2312ToUnicode(strText, wstrUnicode); printf("Unicode="); wprintByte(wstrUnicode); unicodeToUTF8(wstrUnicode, strUTF8); printf("UTF-8 ="); printByte(strUTF8); utf8ToUnicode(strUTF8,wstrUnicode); printf("Unicode="); wprintByte(wstrUnicode); unicodeToGB2312(wstrUnicode,strGB2312); printf("GB2312 ="); printByte(strGB2312); return 0; } Vc下unicode和UTF8相互转换 在vc下使用SQLite数据库时,由于SQL语句使用utf8 编码,而CString 是unicode编码。 一, utf8 转 Unicode CString UTF8ToUnicode(char* UTF8) { DWORD dwUnicodeLen; //转换后Unicode的长度 TCHAR *pwText; //保存Unicode的指针 CString strUnicode; //返回值 //获得转换后的长度,并分配内存 dwUnicodeLen = MultiByteToWideChar(CP_UTF8,0,UTF8,-1,NULL,0); pwText = new TCHAR[dwUnicodeLen]; if (!pwText) { return strUnicode; } //转为Unicode MultiByteToWideChar(CP_UTF8,0,UTF8,-1,pwText,dwUnicodeLen); //转为CString strUnicode.Format(_T("%s"),pwText); //清除内存 delete []pwText; //返回转换好的Unicode字串 return strUnicode; } 二, Unicode转utf8 size_t CDGQDialog::g_f_wctou8(char * dest_str, const wchar_t src_wchar) { int count_bytes = 0; wchar_t byte_one = 0, byte_other = 0x3f; // 用于位与运算以提取位值0x3f--->00111111 unsigned char utf_one = 0, utf_other = 0x80; // 用于"位或"置标UTF-8编码0x80--->1000000 wchar_t tmp_wchar =L'0'; // 用于宽字符位置析取和位移(右移位) unsigned char tmp_char =L'0'; if (!src_wchar)// return (size_t)-1; for (;;) // 检测字节序列长度 { if (src_wchar <= 0x7f){ // <=01111111 count_bytes = 1; // ASCII字符: 0xxxxxxx( ~ 01111111) byte_one = 0x7f; // 用于位与运算, 提取有效位值, 下同 utf_one = 0x0; break; } if ( (src_wchar > 0x7f) && (src_wchar <= 0x7ff) ){ // <=0111,11111111 count_bytes = 2; // 110xxxxx 10xxxxxx[1](最多个位, 简写为*1) byte_one = 0x1f; // 00011111, 下类推(1位的数量递减) utf_one = 0xc0; // 11000000 break; } if ( (src_wchar > 0x7ff) && (src_wchar <= 0xffff) ){ //0111,11111111<=11111111,11111111 count_bytes = 3; // 1110xxxx 10xxxxxx[2](MaxBits: 16*1) byte_one = 0xf; // 00001111 utf_one = 0xe0; // 11100000 break; } if ( (src_wchar > 0xffff) && (src_wchar <= 0x1fffff) ){ //对UCS-4的支持.. count_bytes = 4; // 11110xxx 10xxxxxx[3](MaxBits: 21*1) byte_one = 0x7; // 00000111 utf_one = 0xf0; // 11110000 break; } if ( (src_wchar > 0x1fffff) && (src_wchar <= 0x3ffffff) ){ count_bytes = 5; // 111110xx 10xxxxxx[4](MaxBits: 26*1) byte_one = 0x3; // 00000011 utf_one = 0xf8; // 11111000 break; } if ( (src_wchar > 0x3ffffff) && (src_wchar <= 0x7fffffff) ){ count_bytes = 6; // 1111110x 10xxxxxx[5](MaxBits: 31*1) byte_one = 0x1; // 00000001 utf_one = 0xfc; // 11111100 break; } return (size_t)-1; // 以上皆不满足则为非法序列 } // 以下几行析取宽字节中的相应位, 并分组为UTF-8编码的各个字节 tmp_wchar = src_wchar; for (int i = count_bytes; i > 1; i--) { // 一个宽字符的多字节降序赋值 tmp_char = (unsigned char)(tmp_wchar & byte_other);///后位与byte_other 00111111 dest_str[i - 1] = (tmp_char | utf_other);/// 在前面加----跟或 tmp_wchar >>= 6;//右移位 } //这个时候i=1 //对UTF-8第一个字节位处理, //第一个字节的开头"1"的数目就是整个串中字节的数目 tmp_char = (unsigned char)(tmp_wchar & byte_one);//根据上面附值得来,有效位个数 dest_str[0] = (tmp_char | utf_one);//根据上面附值得来1的个数 // 位值析取分组__End! return count_bytes; } int CDGQDialog::g_f_wcs_to_pchar(CString& wstr,char * p) { wchar_t wc=L'1'; char c[10]="1";//申请一个缓存 size_t r=0; //size_t unsigned integer Result of sizeof operator int i=0; int j=0; for(i=0;i wc=wstr.GetAt(i);//得到一个宽字符 r=g_f_wctou8(c,wc);//将一个宽字符按UTF-8格式转换到p地址 if(r==-1)//出错判断 AfxMessageBox(_T("wcs_to_pchar error")); p[j]=c[0];//第一个值附给p j++; if(r>1) { for(size_t x=1;x p[j]=c[x]; j++; } } } //p[j]='0'; return 1; } 三.转换实例 void CMytestDlg::OnBnClickedButton2() { // TODO: 在此添加控件通知处理程序代码 CString ccId=L"2007071王"; CString sql; char mySql[100]; memset(mySql,0,sizeof(mySql)); sql.Format(L"select cxrq,cxdw,dxrq,dxdw,fxrq,fxdw,cx,flx from j_clxx where trainnum_info_id ='%s'",ccId); //wchar_t sql=L'你'; g_f_wcs_to_pchar(sql,mySql); CString sql1 =UTF8ToUnicode(mySql); MessageBox(sql); //g_f_wctou8(mySql,sql); // CString str_temp; // for (int i=90;i // str_temp.Format(L"%c",mySql[i]); // MessageBox(str_temp); // } UTF8和UNICODE之间的转换(VC)(转载) CString CXXXDlg::UTF8Convert(CString &str, int sourceCodepage, int targetCodepage) { int len=str.GetLength(); int unicodeLen=MultiByteToWideChar(sourceCodepage,0,str,-1,NULL,0); wchar_t * pUnicode; pUnicode=new wchar_t[unicodeLen+1]; memset(pUnicode,0,(unicodeLen+1)*sizeof(wchar_t)); MultiByteToWideChar(sourceCodepage,0,str,-1,(LPWSTR)pUnicode,unicodeLen); BYTE * pTargetData=NULL; int targetLen=WideCharToMultiByte(targetCodepage,0,(LPWSTR)pUnicode,-1,(char *)pTargetData,0,NULL,NULL); pTargetData=new BYTE[targetLen+1]; memset(pTargetData,0,targetLen+1); WideCharToMultiByte(targetCodepage,0,(LPWSTR)pUnicode,-1,(char *)pTargetData,targetLen,NULL,NULL); CString rt; rt.Format("%s",pTargetData); pUnicode; pTargetData; return rt; } 例如: UTF8转UNICODE m_strUnicode = UTF8Convert(m_strUTF8,CP_UTF8,CP_ACP); UNICODE转UTF8 m_strUTF8 = UTF8Convert(m_strUnicode,CP_ACP,CP_UTF8);