Quantcast
Channel: VBForums - CodeBank - Visual Basic 6 and earlier
Viewing all articles
Browse latest Browse all 1532

Reading and Writing UTF-16 and UTF-8 Files

$
0
0
Ok, here's my procrastination for the day. I've long been able to read Unicode (UTF-16) files, but I decided I also wanted to read and write UTF-8 files, so I did it. The attached "test" project is the best way to get it, but here's the essential code for the file IO. Focus specifically on the ReadAsciiOrUnicodeNotepadFile and WriteAsciiOrUnicodeNotepadFile procedures. I thought about making them Get/Let properties, but I think they're better this way. Again, don't forget that the attached ZIP has a nice demo.

UTF8 and UTF16.zip

Code:

Option Explicit
'
' Win32 conversion from UTF-16 to a multi-byte code page.
' Pointers are passed as raw Long addresses (StrPtr / VarPtr at the call sites).
Private Declare Function WideCharToMultiByte Lib "kernel32" ( _
    ByVal codepage As Long, _
    ByVal dwFlags As Long, _
    ByVal lpWideCharStr As Long, _
    ByVal cchWideChar As Long, _
    ByVal lpMultiByteStr As Long, _
    ByVal cchMultiByte As Long, _
    ByVal lpDefaultChar As Long, _
    ByVal lpUsedDefaultChar As Long) As Long
' Win32 conversion from a multi-byte code page to UTF-16.
Private Declare Function MultiByteToWideChar Lib "kernel32" ( _
    ByVal codepage As Long, _
    ByVal dwFlags As Long, _
    ByVal lpMultiByteStr As Long, _
    ByVal cbMultiByte As Long, _
    ByVal lpWideCharStr As Long, _
    ByVal cchWideChar As Long) As Long
'
' Code page identifier handed to both API functions for UTF-8 conversions.
Private Const Utf8CodePage As Long = 65001
'
' Selects the on-disk format used by WriteAsciiOrUnicodeNotepadFile.
Public Enum AsciiUnicodeEncoding
    AsciiEncode = 0     ' Written with a plain string Put, no header.
    Utf8Encode = 1      ' Written with an EF BB BF header followed by UTF-8 bytes.
    Utf16Encode = 2     ' Written with an FF FE header followed by the raw string bytes.
End Enum
'

Public Function ReadAsciiOrUnicodeNotepadFile(sFileSpec As String) As String
    ' Reads an entire text file (typically a .TXT file that Notepad can open)
    ' and returns its contents as a VB string.
    '
    ' The encoding is chosen from the leading bytes of the file:
    '   FF FE     -> UTF-16 file header; the rest is copied into the string as-is.
    '   EF BB BF  -> UTF-8 file header; the rest is converted with Utf8toUtf16.
    '   otherwise -> assumed ASCII/ANSI; read straight into a string.
    '
    ' Parameters:
    '   sFileSpec - path of the file to read.
    ' Returns:
    '   The file's text, or an empty string when the file is empty or
    '   contains nothing but a header.
    ' Errors:
    '   Any run-time error raised while reading is re-raised to the caller
    '   after the file handle has been closed.
    '
    ' NOTE: the file is opened "For Binary" without an Access clause, exactly
    ' as before; the Open statement creates the file if it does not exist,
    ' in which case an empty string is returned.
    Dim iFle As Long
    Dim bb() As Byte
    Dim i As Integer
    Dim b3 As Byte
    Dim s As String
    Dim nLen As Long
    Dim lErrNum As Long
    Dim sErrSrc As String
    Dim sErrDesc As String
    '
    iFle = FreeFile
    Open sFileSpec For Binary As iFle
    On Error GoTo ReadFailed    ' From here on the handle must be released on any failure.
    '
    nLen = LOF(iFle)
    If nLen > 0 Then
        Get iFle, 1, i                      ' First two bytes, low byte first.
        If nLen >= 3 Then Get iFle, 3, b3   ' Third byte is needed to confirm a UTF-8 header.
        '
        If i = &HFEFF Then ' UTF16 file header.  First byte = FF, second byte = FE.
            ' Only dimension the buffer when there is data after the header;
            ' ReDim (1 To 0) would raise "Subscript out of range".
            If nLen > 2 Then
                ReDim bb(1 To nLen - 2&)
                Get iFle, 3, bb
                ReadAsciiOrUnicodeNotepadFile = bb ' This directly copies the byte array to the Unicode string (no conversion).
            End If
        ElseIf i = &HBBEF And b3 = &HBF Then ' UTF8 file header.  Bytes EF, BB, BF.
            ' b3 stays 0 for files shorter than three bytes, so this branch
            ' is only taken when all three header bytes are really present.
            If nLen > 3 Then
                ReDim bb(1 To nLen - 3&)
                Get iFle, 4, bb
                ReadAsciiOrUnicodeNotepadFile = Utf8toUtf16(bb)
            End If
        Else ' Assume ascii.
            s = Space$(nLen)
            Get iFle, 1, s
            ReadAsciiOrUnicodeNotepadFile = s
        End If
    End If
    '
    Close iFle
    Exit Function
    '
ReadFailed:
    ' Remember the error, release the handle, then hand the error to the caller.
    lErrNum = Err.Number
    sErrSrc = Err.Source
    sErrDesc = Err.Description
    Close iFle
    Err.Raise lErrNum, sErrSrc, sErrDesc
End Function

Public Sub WriteAsciiOrUnicodeNotepadFile(sFileSpec As String, sData As String, Encoding As AsciiUnicodeEncoding)
    ' Writes sData to a text file (typically a .TXT file that Notepad can open),
    ' replacing any previous contents of that file.
    '
    ' Parameters:
    '   sFileSpec - path of the file to write.
    '   sData     - the text to store.
    '   Encoding  - AsciiEncode : sData is written with a plain string Put, no header.
    '               Utf8Encode  : header EF BB BF, then Utf16toUtf8(sData).
    '               Utf16Encode : header FF FE, then the raw bytes of sData.
    '               Any other value writes nothing and leaves an existing
    '               file's contents untouched (as before).
    ' Errors:
    '   Any run-time error raised while writing is re-raised to the caller
    '   after the file handle has been closed.
    Dim iFle As Long
    Dim bbBom() As Byte
    Dim bbData() As Byte
    Dim iBom As Integer
    Dim lErrNum As Long
    Dim sErrSrc As String
    Dim sErrDesc As String
    '
    ' Opening "For Binary" never shortens an existing file, so writing over a
    ' longer file would leave its old tail behind.  Opening it once
    ' "For Output" empties it first.
    Select Case Encoding
    Case AsciiEncode, Utf8Encode, Utf16Encode
        iFle = FreeFile
        Open sFileSpec For Output As iFle
        Close iFle
    End Select
    '
    iFle = FreeFile
    Open sFileSpec For Binary As iFle
    On Error GoTo WriteFailed   ' From here on the handle must be released on any failure.
    '
    Select Case Encoding
    Case AsciiEncode
        Put iFle, , sData
    Case Utf8Encode
        ReDim bbBom(0 To 2)
        bbBom(0) = &HEF
        bbBom(1) = &HBB
        bbBom(2) = &HBF
        Put iFle, , bbBom
        ' Skip the conversion for empty text: there is nothing to write and
        ' no byte buffer can be dimensioned for zero characters.
        If Len(sData) > 0 Then
            bbData = Utf16toUtf8(sData)
            Put iFle, , bbData
        End If
    Case Utf16Encode
        iBom = &HFEFF ' This is the Unicode header to a text file.  First byte = FF, second byte = FE.
        Put iFle, , iBom
        If Len(sData) > 0 Then
            bbData = Utf16ByteArrayFromString(sData)
            Put iFle, , bbData
        End If
    End Select
    '
    Close iFle
    Exit Sub
    '
WriteFailed:
    ' Remember the error, release the handle, then hand the error to the caller.
    lErrNum = Err.Number
    sErrSrc = Err.Source
    sErrDesc = Err.Description
    Close iFle
    Err.Raise lErrNum, sErrSrc, sErrDesc
End Sub

Public Function Utf16ByteArrayFromString(s As String) As Byte()
    ' Returns the contents of s as a byte array.  The assignment copies the
    ' string's Unicode data directly, two bytes per character, with no conversion.
    Dim bbRaw() As Byte
    '
    bbRaw = s
    Utf16ByteArrayFromString = bbRaw
End Function
 
Public Function Utf16toUtf8(s As String) As Byte()
    ' Converts a VB (UTF-16) string to UTF-8.
    '
    ' Parameters:
    '   s - the string to convert.
    ' Returns:
    '   A zero-based byte array holding the UTF-8 bytes (a byte array because
    '   UTF-8 is pretty useless to VB6 as anything else).  When s is empty, or
    '   the API reports that no bytes are required, a zero-length array
    '   (0 To -1) is returned instead of raising "Subscript out of range".
    Dim iLen As Long
    Dim bbBuf() As Byte
    '
    bbBuf = ""  ' Start with a zero-length array so there is always something valid to return.
    If Len(s) > 0 Then
        ' First call: no output buffer, just ask how many bytes are needed.
        iLen = WideCharToMultiByte(Utf8CodePage, 0, StrPtr(s), Len(s), 0, 0, 0, 0)
        If iLen > 0 Then
            ReDim bbBuf(0 To iLen - 1) ' Will be initialized as all &h00.
            ' Second call: perform the conversion into the buffer.
            iLen = WideCharToMultiByte(Utf8CodePage, 0, StrPtr(s), Len(s), VarPtr(bbBuf(0)), iLen, 0, 0)
        End If
    End If
    Utf16toUtf8 = bbBuf
End Function
 
Public Function Utf8toUtf16(bb() As Byte) As String
    ' Converts UTF-8 bytes to a VB (UTF-16) string.
    '
    ' Parameters:
    '   bb - a dimensioned byte array containing UTF-8 data; any lower bound
    '        is accepted because the address and length are taken from
    '        LBound/UBound.
    ' Returns:
    '   The decoded string.
    Dim lpSrc As Long
    Dim cbSrc As Long
    Dim cchOut As Long
    Dim sOut As String
    '
    lpSrc = VarPtr(bb(LBound(bb)))          ' Address of the first byte.
    cbSrc = UBound(bb) - LBound(bb) + 1     ' Number of bytes in the array.
    '
    ' First call: no output buffer, just ask how many characters are needed.
    cchOut = MultiByteToWideChar(Utf8CodePage, 0, lpSrc, cbSrc, 0, 0)
    sOut = String$(cchOut, 0)
    ' Second call: decode directly into the string's buffer.
    MultiByteToWideChar Utf8CodePage, 0, lpSrc, cbSrc, StrPtr(sOut), Len(sOut)
    Utf8toUtf16 = sOut
End Function

EDIT: This is in response to some of the following posts. For the above routine to correctly read Unicode files (UTF-16 and/or UTF-8), those files MUST begin with a Byte Order Mark (BOM). UTF-16 files typically DO have a BOM, and many UTF-8 files have one as well. Files written by a relatively recent version of Windows Notepad will have a BOM, but Unicode files from sources other than Notepad may not.

If you wish for a routine that reads Unicode files without the BOM header (which will primarily be UTF-8 files), you may want to consider incorporating Arnoutdv's routine (in Post #3 below) into your work. For further reading on this entire issue, the following link outlines the problems well:

http://blogs.msdn.com/b/oldnewthing/...7/2158334.aspx
Attached Files

Viewing all articles
Browse latest Browse all 1532

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>