Author Topic: ScriptBasic Character Encoding Functions  (Read 2521 times)

Offline John

  • Forum Support / SB Dev
  • Posts: 2752
    • ScriptBasic Open Source Project
ScriptBasic Character Encoding Functions
« on: April 09, 2011, 09:59:32 PM »
I have been wading through the files that make up the ScriptBasic distribution in an effort to organize the code better. I found a couple of character encoding examples that may come in handy if you need to translate strings of text from one encoding to another.

UNICODE

Code: [Select]
module unicode

declare option DeclareVars

' ---------------------------------------------
REM some fundamental constants

UNI_REPLACEMENT_CHAR = 0x0000FFFD
UNI_MAX_BMP          = 0x0000FFFF
UNI_MAX_UTF16        = 0x0010FFFF
UNI_MAX_UTF32        = 0x7FFFFFFF
UNI_MAX_LEGAL_UTF32  = 0x0010FFFF

' ---------------------------------------------
REM error codes
REM NOTE: ConvertUTF32toUTF16 below does not hand these codes back to
REM the caller; they are kept so that code referring to them still works

REM conversion successful
conversionOK    = 0
REM partial character in source, but hit end
sourceExhausted = 1
REM insuff. room in target for conversion
targetExhausted = 2
REM source sequence is illegal/malformed
sourceIllegal   = 3

' ---------------------------------------------
REM conversion type constants
strictConversion  = 0
lenientConversion = 1

REM 2^10: integer division by this value shifts right by 10 bits
halfShift = 0x400

halfBase = 0x0010000
halfMask = 0x3FF

UNI_SUR_HIGH_START  = 0xD800
UNI_SUR_HIGH_END    = 0xDBFF
UNI_SUR_LOW_START   = 0xDC00
UNI_SUR_LOW_END     = 0xDFFF

' ---------------------------------------------
' ConvertUTF32toUTF16(source, flags)
'
' Convert a string of UTF-32 code units to a string of UTF-16 code units.
'
' source  string holding the input; it is read four bytes at a time
'         with the "U4" unpack format
' flags   strictConversion or lenientConversion
'
' Every value up to UNI_MAX_BMP is written as one "U2" unit, every value
' above it (up to UNI_MAX_LEGAL_UTF32) as a high/low surrogate pair.
'
' An illegal input value is a surrogate code point (0xD800-0xDFFF) or a
' value above UNI_MAX_LEGAL_UTF32. With lenientConversion such a value is
' replaced by UNI_REPLACEMENT_CHAR. With strictConversion the function
' stops at once and no value is assigned to the function.
'
' A trailing fragment shorter than four bytes is not converted.
'
function ConvertUTF32toUTF16 (source, flags)
  local target, ch
  target = ""
  ' only loop while a complete four byte unit is available
  while len(source) >= 4
    ' fetch the next four bytes from the source
    unpack source by "U4" to ch
    ' MID counts from 1, so the unread remainder starts at byte 5
    source = mid(source,5)
    if ch <= UNI_MAX_BMP then
      ' target is a character <= 0xFFFF
      ' UTF-16 surrogate values are illegal in UTF-32
      if ch >= UNI_SUR_HIGH_START and ch <= UNI_SUR_LOW_END then
        if flags = strictConversion then
          exit function
        else
          target = target & pack("U2",UNI_REPLACEMENT_CHAR)
        end if
      else
        target = target & pack("U2",ch)
      end if
    elsif ch > UNI_MAX_LEGAL_UTF32 then
      ' value is outside of the range UTF-16 can represent
      if flags = strictConversion then
        exit function
      else
        target = target & pack("U2",UNI_REPLACEMENT_CHAR)
      end if
    else
      ' target is a character in range 0x10000 - 0x10FFFF
      ch = ch - halfBase
      ' the upper 10 bits select the high surrogate (shift right by 10) ...
      target = target & pack("U2", ((ch \ halfShift) + UNI_SUR_HIGH_START) )
      ' ... and the lower 10 bits select the low surrogate
      target = target & pack("U2", ((ch and halfMask) + UNI_SUR_LOW_START) )
    end if
  wend
  ConvertUTF32toUTF16 = target
end function

end module

UTF8 / iso-latin-1 conversion functions

Code: [Select]
module utf8

declare option DeclareVars

' ---------------------------------------------
' iso1_to_utf8(IsoString,UseBom)
'
' Convert an iso-latin-1 encoded string to UTF8
'
' IsoString  the ISO 8859-1 encoded string, one character per byte
' UseBom     optional. If this is present and is TRUE then the three
'            byte BOM (EF BB BF) is put in front of the resulting string.
'            This is generally used in text files to denote that the
'            content of the file is UTF-8 encoded.
'
' The output of the function is the UTF-8 encoded string.
'
function iso1_to_utf8(IsoString,UseBom)
  local UtfString, CharacterIndex, ActualCharacter, Code
  ' initialize the output string to contain no characters
  UtfString = ""
  ' if the Byte Order Mark is to be used, then put it in
  ' front of the output string
  if UseBom then
    UtfString = "\xEF\xBB\xBF"
  end if
  for CharacterIndex = 1 to len(IsoString)
    ActualCharacter = mid$(IsoString,CharacterIndex,1)
    Code = asc(ActualCharacter)
    if Code < 0x80 then
      ' 7 bit characters are copied without change
      UtfString = UtfString & ActualCharacter
    elsif Code < 0xC0 then
      ' 0x80-0xBF: lead byte C2 followed by the same byte
      UtfString = UtfString & "\xC2" & ActualCharacter
    else
      ' 0xC0-0xFF: lead byte C3 followed by the byte moved down by 64
      UtfString = UtfString & "\xC3" & chr(Code-64)
    end if
  next
  iso1_to_utf8 = UtfString
end function

' ---------------------------------------------
' utf8_to_iso1(UtfString,IgnoreBom)
'
' Convert an UTF8 encoded string to iso-latin-1
'
' UtfString  the UTF-8 encoded string
' IgnoreBom  optional. When it is missing or false, a BOM (EF BB BF) at
'            the start of the string is chopped off before the conversion.
'            When it is TRUE no BOM detection is done, and the three bytes,
'            if they are there, are copied to the output like any other
'            byte.
'
' Only the two byte sequences starting with C2 or C3 are decoded, these
' cover the whole ISO 8859-1 range. Every other byte is copied to the
' output without change.
'
' The output of the function is the ISO 8859-1 encoded string.
'
function utf8_to_iso1(UtfString,IgnoreBom)
  local IsoString, CharacterIndex, ActualCharacter, Code, ByteCount
  ' initialize the output string to contain no characters
  IsoString = ""
  ' if there is a bom at the start of the string then chop it off
  ' (the UTF-8 BOM starts with the byte EF, not FE)
  if not IgnoreBom and _
     asc(mid(UtfString,1,1)) = 0xEF and _
     asc(mid(UtfString,2,1)) = 0xBB and _
     asc(mid(UtfString,3,1)) = 0xBF then
     UtfString = mid(UtfString,4)
  end if
  ' Get each byte. NOTE: 'len' returns the bytes and not the character count
  ByteCount = len(UtfString)
  CharacterIndex = 1
  while CharacterIndex <= ByteCount
    ActualCharacter = mid$(UtfString,CharacterIndex,1)
    Code = asc(ActualCharacter)
    ' a lead byte is only decoded when the following byte is really there
    if Code = 0xC2 and CharacterIndex < ByteCount then
      ' C2 xx stands for the character xx itself
      CharacterIndex = CharacterIndex + 1
      IsoString = IsoString & mid$(UtfString,CharacterIndex,1)
    elsif Code = 0xC3 and CharacterIndex < ByteCount then
      ' C3 xx stands for the character xx moved up by 64
      CharacterIndex = CharacterIndex + 1
      IsoString = IsoString & chr(asc(mid$(UtfString,CharacterIndex,1))+64)
    else
      IsoString = IsoString & ActualCharacter
    end if
    CharacterIndex = CharacterIndex + 1
  wend
  utf8_to_iso1 = IsoString
end function

end module

' Demo of the utf8 module.
' NOTE(review): the literals below contain non-ASCII characters and the
' conversion functions treat every byte as one ISO 8859-1 character, so
' this presumably expects the script file itself to be saved as
' ISO 8859-1 - confirm.

' Convert a sample string to UTF-8 with a leading BOM (second argument
' is true) and write the result to test.txt using file number 1
open "test.txt" for output as 1
print#1, utf8::iso1_to_utf8("halihõõõõ",true)
close 1

' Round trip: convert to UTF-8 without BOM and back again, then print
' the literal and the converted string for comparison
print "halihõõõí\n",utf8::utf8_to_iso1(utf8::iso1_to_utf8("halihõõõí"))
« Last Edit: April 09, 2011, 10:04:45 PM by ABB »