| 09-14-2004, 02:00 PM | #1 |
For its text files that must be localized (such as the map script, the trigger strings file and the like) Warcraft uses a Unicode format called UTF-8. This format uses one byte for the most common characters, and that byte is equal to the character's ASCII value. For example, A = 65 or 0x41. For the more unusual characters it can take from 2 to 6 bytes per character. For example, the German letter ä would be represented by 195 and 164, or 0xC3A4. The higher the first byte is, the more bytes are required to represent the character. Simple modulo calculations are enough to convert UTF-8 to common Unicode and back. Here are my converting functions: Code:
' Decodes UTF-8 data into a regular (UTF-16) VB string.
'
' CurStr  - the UTF-8 data, one byte per character: Asc() of every character
'           is taken as the byte value.
'           NOTE(review): this presumes the bytes were loaded through the
'           ANSI code page (e.g. a plain text read) - confirm against callers.
' Returns - the decoded text. Code points above U+FFFF are returned as a
'           UTF-16 surrogate pair.
'
' Handling of input that cannot be decoded:
'   * bytes 128-191 outside a sequence are passed through unchanged
'   * bytes 254/255 are dropped
'   * a sequence cut off by the end of the string is dropped
'   * a lead byte whose following bytes are not all in 128-191 is dropped and
'     decoding resumes at the very next byte
'   * values above U+10FFFF (not representable in a VB string) are dropped
' Overlong encodings are not rejected.
Public Function UTF2UCS(ByVal CurStr As String) As String
    Dim Length As Long, Index As Long, CurAsc As Long, Ret As String
    Dim Extra As Long, CodePoint As Long, Trail As Long, Offset As Long
    Dim Valid As Boolean

    Ret = ""
    Length = Len(CurStr)
    Index = 1
    Do Until Index > Length
        CurAsc = Asc(Mid(CurStr, Index, 1))

        ' The lead byte tells how many continuation bytes follow and
        ' contributes the top bits of the code point.
        Select Case CurAsc
        Case 0 To 191
            Extra = 0
        Case 192 To 223
            Extra = 1
            CodePoint = CurAsc - 192
        Case 224 To 239
            Extra = 2
            CodePoint = CurAsc - 224
        Case 240 To 247
            Extra = 3
            CodePoint = CurAsc - 240
        Case 248 To 251
            Extra = 4
            CodePoint = CurAsc - 248
        Case 252 To 253
            Extra = 5
            CodePoint = CurAsc - 252
        Case Else
            Extra = -1
        End Select

        If Extra = 0 Then
            ' Single byte: copy as-is.
            Ret = Ret & Chr(CurAsc)
            Index = Index + 1
        ElseIf Extra < 0 Then
            ' Not a legal lead byte: skip it.
            Index = Index + 1
        ElseIf Index + Extra > Length Then
            ' Sequence runs past the end of the data: skip it (ends the loop).
            Index = Index + Extra
        Else
            ' Accumulate 6 bits per continuation byte. CodePoint is a Long, so
            ' the arithmetic cannot overflow the 16-bit Integer range the way
            ' "(Asc(...) - 128) * 4096" does.
            Valid = True
            For Offset = 1 To Extra
                Trail = Asc(Mid(CurStr, Index + Offset, 1))
                If Trail < 128 Or Trail > 191 Then
                    Valid = False
                    Exit For
                End If
                CodePoint = CodePoint * 64 + (Trail - 128)
            Next Offset

            If Not Valid Then
                ' Drop only the lead byte so the following bytes get decoded
                ' on their own.
                Index = Index + 1
            Else
                If CodePoint <= 65535 Then
                    ' ChrW takes a Unicode code unit; Chr only accepts ANSI
                    ' codes and fails for values above 255.
                    Ret = Ret & ChrW(CodePoint)
                ElseIf CodePoint <= 1114111 Then
                    ' U+10000..U+10FFFF: split into high/low surrogate.
                    CodePoint = CodePoint - 65536
                    Ret = Ret & ChrW(55296 + (CodePoint \ 1024)) & ChrW(56320 + (CodePoint Mod 1024))
                End If
                Index = Index + Extra + 1
            End If
        End If
    Loop
    UTF2UCS = Ret
End Function
' Encodes a regular (UTF-16) VB string as UTF-8.
'
' CurStr  - the text to encode.
' Returns - the UTF-8 data, one byte per character, each byte produced with
'           Chr(byte value).
'           NOTE(review): this presumes the result is written out through the
'           ANSI code page so every character becomes one byte - confirm
'           against callers.
'
' A high surrogate followed by a low surrogate is combined into one code
' point and written as a 4-byte sequence. An unpaired surrogate is written as
' a 3-byte sequence like any other 16-bit value.
Public Function UCS2UTF(ByVal CurStr As String) As String
    Dim Length As Long, Ret As String, Index As Long, CurAsc As Long, LowAsc As Long

    Length = Len(CurStr)
    Ret = ""
    Index = 1
    Do Until Index > Length
        ' AscW yields the Unicode code unit (Asc would yield the ANSI code and
        ' lose everything outside the code page). It returns a signed 16-bit
        ' Integer, so mask it into the range 0..65535.
        CurAsc = AscW(Mid(CurStr, Index, 1)) And &HFFFF&

        ' High surrogate (55296-56319) followed by a low surrogate
        ' (56320-57343): merge both into a single code point.
        If CurAsc >= 55296 And CurAsc <= 56319 And Index < Length Then
            LowAsc = AscW(Mid(CurStr, Index + 1, 1)) And &HFFFF&
            If LowAsc >= 56320 And LowAsc <= 57343 Then
                CurAsc = 65536 + (CurAsc - 55296) * 1024 + (LowAsc - 56320)
                Index = Index + 1
            End If
        End If

        Select Case CurAsc
        Case 0 To 127
            Ret = Ret & Chr(CurAsc)
        Case 128 To 2047
            Ret = Ret & Chr(192 + (CurAsc \ 64)) & Chr(128 + (CurAsc Mod 64))
        Case 2048 To 65535
            Ret = Ret & Chr(224 + (CurAsc \ 4096)) & Chr(128 + ((CurAsc \ 64) Mod 64)) & Chr(128 + (CurAsc Mod 64))
        Case Else
            ' 65536..1114111 - the largest value a surrogate pair can hold, so
            ' 5- and 6-byte sequences can never be needed here.
            Ret = Ret & Chr(240 + (CurAsc \ 262144)) & Chr(128 + ((CurAsc \ 4096) Mod 64)) & Chr(128 + ((CurAsc \ 64) Mod 64)) & Chr(128 + (CurAsc Mod 64))
        End Select
        Index = Index + 1
    Loop
    UCS2UTF = Ret
End Function
This is VB code, but I'm sure everybody will be able to translate this to his language of choice. I also think the conversion could be done more quickly with bitwise operations on the ASCII value. If anyone is going to implement a bitwise solution, feel free to post it here. |
| 09-15-2004, 12:43 AM | #2 |
Very useful to know that Warcraft uses UTF-8 encoding. Now I finally get those ä, ü and similar properly displayed. |
| 09-15-2004, 05:54 PM | #3 |
nice work pitza, i really need it ;) |
| 09-26-2004, 12:23 PM | #4 |
A funny thing is that SubString and similar natives work on ASCII level, meaning that UTF-8 characters are not always 1 character long for that native. The decoding of the UTF-8 is only done after passing the string to the display natives and similar. |
| 09-28-2004, 05:42 PM | #5 | |
Quote:
Indeed, and that's exactly how we can also speed up our tools. Internally we can work with the UTF characters as ASCII because it won't make a difference. Only for displaying text in our applications or for processing user input we need to do the conversion. |
| 10-30-2004, 07:53 PM | #6 |
Hmm, I think I'll make a 'C' version of this. |
| 11-01-2004, 06:57 AM | #7 |
Use the Windows API functions WideCharToMultiByte() and MultiByteToWideChar() to convert UTF-8/Unicode <-> ASCII 7bit. The problem is that they cannot be used outside the Windows platform. |
