Converting Windows-1252 to HTML NCRs in VB6 (26th April 2011)

TextStudio is useless for nearly all users and purposes. But now it converts the ambigious characters from 128 up to 159 into their Named Character Reference (NCR) form. The sample uses my recently rationalised universal syntax highlighting.

Updating my whole site to match that will take quite a while! It’s basically to help my shift from numbers to names in entities.

My actual code is, of course, dreadful. But the array is Copy and Paste friendly.

Project Structure

Declare the Array, arrWindows1252

'Windows 1252 Character Conversion
Dim arrWindows1252(128 To 159) As String 'array of ambiguous codepoints

Initialise the Array with InitWindows1252()

Sub InitWindows1252()
'Note: Codepoints taken from here: http://www.i18nguy.com/markup/ncrs.html#t128159

'Initialise array:
arrWindows1252(128) = "€"     ' €
arrWindows1252(129) = "?"          ' unassigned
arrWindows1252(130) = "‚"    ' ‚
arrWindows1252(131) = "ƒ"     ' ƒ
arrWindows1252(132) = "„"    ' „
arrWindows1252(133) = "…"   ' …
arrWindows1252(134) = "†"   ' ‡
arrWindows1252(135) = "‡"   ' ‡
arrWindows1252(136) = "ˆ"     ' ˆ
arrWindows1252(137) = "‰"   ' ‰
arrWindows1252(138) = "Š"   ' Š
arrWindows1252(139) = "‹"   ' ‹
arrWindows1252(140) = "Œ"    ' Œ
arrWindows1252(141) = "?"          ' unassigned
arrWindows1252(142) = "Ž"     ' Ž
arrWindows1252(143) = "?"          ' unassigned
arrWindows1252(144) = "?"          ' unassigned
arrWindows1252(145) = "‘"    ' ‘
arrWindows1252(146) = "’"    ' ’
arrWindows1252(147) = "“"    ' “
arrWindows1252(148) = "”"    ' ”
arrWindows1252(149) = "•"     ' •
arrWindows1252(150) = "–"    ' –
arrWindows1252(151) = "—"    ' —
arrWindows1252(152) = "˜"    ' ˜
arrWindows1252(153) = "™"    ' ™
arrWindows1252(154) = "š"   ' š
arrWindows1252(155) = "›"   ' ›
arrWindows1252(156) = "œ"    ' œ
arrWindows1252(157) = "?"          ' unassigned
arrWindows1252(158) = "ž"     ' ž
arrWindows1252(159) = "Ÿ"     ' Ÿ

End Sub

Trigger the Conversion from mnuFormatHtmlEntitiesEscape_Click()

Public Sub mnuFormatHtmlEntitiesEscape_Click(Index As Integer)

'Variables:
Dim i As Long
Dim Length As Long 'length of selected text
Dim Replaced As Boolean 'result of non-ASCII escape function'No files?
If Not IsAnyFileOpen Then Exit Sub
    
'Prepare:
Screen.MousePointer = vbHourglass
strText = ActiveForm.rtfBox.SelText

Select Case Index 'index of button pressed:
    Case 0 'Escape HTML Tokens:Case 1 'Escape Non-ASCII:
        'Prepare:
        StatusBar.Panels(1) = "Checking file..."
        
        'Run through all text in active file:
        Replaced = EscapeNonASCII(ActiveForm.Tag) 'store status of module function
    Case 2 'Escape Non-ASCII From All Files:Case 3 'Escape Selected:End Select

'Any non-ASCII characters replaced?
If Replaced Then
    StatusBar.Panels(1) = "Escaped non-ASCII characters in active file"
Else
    StatusBar.Panels(1) = "No non-ASCII characters found in active file"
End If

'Recover:
Call UpdateEdit
Call UpdateFileSizePanel
ActiveForm.rtfBox.SetFocus

'Recover:
Screen.MousePointer = vbNormal

End Sub

Do the Conversion with EscapeNonASCII()

Function EscapeNonASCII(ID As Long) As Boolean

'Variables:
Dim i As Long: i = 1
Dim Length As Long 'length of selected text
Dim Char As String * 1 'character currently being tested
Dim strText As String 'text to be replaced
Dim Completed As Boolean 'indicates replacement has finished
Dim AnySelected As Boolean 'indicates if any text is selected
'Note: Also uses arrWindows1252() array, initalised above.

'Any text is selected:
If ChildForm(ID).rtfBox.SelLength <> 0 Then 'some text is selected:
    AnySelected = True 'raise flag
    strText = ChildForm(ID).rtfBox.SelText
Else
    AnySelected = False 'lower flag
    strText = ChildForm(ID).rtfBox.Text
End If

'Run through all text in active file:
Do Until Completed 'for all characters:
    'Store current length of selected text:
    Length = Len(strText)
    
    'Entire string has been searched:
    If i >= Length Then 'entire text searched:
        Completed = True 'indicate replacement has finished
    End If
    
    'Store current character:
    Char = Mid$(strText, i, 1)
    
    'Current character is a printable ASCII character:
    Select Case Asc(Char) 'value of this character
        Case LBound(arrWindows1252) To UBound(arrWindows1252) 'ambigious characters from 128-159:
            'Replace character with corresponding array entry:
            strText = Mid$(strText, 1, i - 1) & arrWindows1252(Asc(Char)) _
                      & Mid$(strText, i + 1, Length - i)
        Case 0 To 8, 14 To 31, Is >= 159 'not a printable ASCII character:
            'Replace it with a decimal entity reference:
            strText = Mid$(strText, 1, i - 1) & "&#" & Asc(Char) & CHAR_SEMICOLON _
                      & Mid$(strText, i + 1, Length - i)
            'Note: String so far + Char (converted into a decimal entity reference) + rest of string.
            
            'Resume processing after the entity reference, which could be several digits long:
            i = i + Len(CStr("&#" & Asc(Char) & CHAR_SEMICOLON))
            
            'Function has replaced something:
            EscapeNonASCII = True 'raise flag being returned
            CStatus(ID).Saved = False 'file will be altered
        Case Else 'is a printable ASCII character:
            i = i + 1 'advance to next character
    End Select
Loop
'Note: Does not escape whitespace characters. Specifically:
'   Horizontal Tab (9)
'   Line Feed (10)
'   Vertical Tab (11)
'   Form Feed (12)
'   Carriage Return (13).

'any text is selected:
If AnySelected Then 'some text was selected:
    ChildForm(ID).rtfBox.SelText = strText 'output resulting string
Else
    With ChildForm(ID).rtfBox
        'Select All:
        i = .SelStart
        Length = .SelLength
        .SelStart = 0
        .SelLength = Len(.Text)
        
        'Replacing a selection enables native Undo:
        .SelText = strText
        
        'Finish:
        .SelStart = i
        .SelLength = Length
    End With
End If

End Function