Converting Windows-1252 to HTML NCRs in VB6 (26th April 2011)
TextStudio is useless for nearly all users and purposes. But now it converts the ambigious characters from 128
up to 159
into their Named Character Reference (NCR) form. The sample uses my recently rationalised universal syntax highlighting.
Updating my whole site to match that will take quite a while! It’s basically to help my shift from numbers to names in entities.
My actual code is, of course, dreadful. But the array is Copy and Paste friendly.
Project Structure
prjTextStudio.vbp
mdiMain.frm
mnuFormatHtmlEntitiesEscape
mnuFormatHtmlEntitiesEscape_Click()
modFormatting.bas
arrWindows1252
InitWindows1252()
EscapeNonASCII()
Declare the Array, arrWindows1252
'Windows 1252 Character Conversion
Dim arrWindows1252(128 To 159) As String 'array of ambiguous codepoints
Initialise the Array with InitWindows1252()
Sub InitWindows1252()
'Note: Codepoints taken from here: http://www.i18nguy.com/markup/ncrs.html#t128159
'Initialise array:
arrWindows1252(128) = "€" ' €
arrWindows1252(129) = "?" ' unassigned
arrWindows1252(130) = "‚" ' ‚
arrWindows1252(131) = "ƒ" ' ƒ
arrWindows1252(132) = "„" ' „
arrWindows1252(133) = "…" ' …
arrWindows1252(134) = "†" ' ‡
arrWindows1252(135) = "‡" ' ‡
arrWindows1252(136) = "ˆ" ' ˆ
arrWindows1252(137) = "‰" ' ‰
arrWindows1252(138) = "Š" ' Š
arrWindows1252(139) = "‹" ' ‹
arrWindows1252(140) = "Œ" ' Œ
arrWindows1252(141) = "?" ' unassigned
arrWindows1252(142) = "Ž" ' Ž
arrWindows1252(143) = "?" ' unassigned
arrWindows1252(144) = "?" ' unassigned
arrWindows1252(145) = "‘" ' ‘
arrWindows1252(146) = "’" ' ’
arrWindows1252(147) = "“" ' “
arrWindows1252(148) = "”" ' ”
arrWindows1252(149) = "•" ' •
arrWindows1252(150) = "–" ' –
arrWindows1252(151) = "—" ' —
arrWindows1252(152) = "˜" ' ˜
arrWindows1252(153) = "™" ' ™
arrWindows1252(154) = "š" ' š
arrWindows1252(155) = "›" ' ›
arrWindows1252(156) = "œ" ' œ
arrWindows1252(157) = "?" ' unassigned
arrWindows1252(158) = "ž" ' ž
arrWindows1252(159) = "Ÿ" ' Ÿ
End Sub
Trigger the Conversion from mnuFormatHtmlEntitiesEscape_Click()
Public Sub mnuFormatHtmlEntitiesEscape_Click(Index As Integer)
'Variables:
Dim i As Long
Dim Length As Long 'length of selected text
Dim Replaced As Boolean 'result of non-ASCII escape function
…
'No files?
If Not IsAnyFileOpen Then Exit Sub
'Prepare:
Screen.MousePointer = vbHourglass
strText = ActiveForm.rtfBox.SelText
Select Case Index 'index of button pressed:
Case 0 'Escape HTML Tokens:
…
Case 1 'Escape Non-ASCII:
'Prepare:
StatusBar.Panels(1) = "Checking file..."
'Run through all text in active file:
Replaced = EscapeNonASCII(ActiveForm.Tag) 'store status of module function
Case 2 'Escape Non-ASCII From All Files:
…
Case 3 'Escape Selected:
…
End Select
'Any non-ASCII characters replaced?
If Replaced Then
StatusBar.Panels(1) = "Escaped non-ASCII characters in active file"
Else
StatusBar.Panels(1) = "No non-ASCII characters found in active file"
End If
'Recover:
Call UpdateEdit
Call UpdateFileSizePanel
ActiveForm.rtfBox.SetFocus
'Recover:
Screen.MousePointer = vbNormal
End Sub
Do the Conversion with EscapeNonASCII()
Function EscapeNonASCII(ID As Long) As Boolean
'Variables:
Dim i As Long: i = 1
Dim Length As Long 'length of selected text
Dim Char As String * 1 'character currently being tested
Dim strText As String 'text to be replaced
Dim Completed As Boolean 'indicates replacement has finished
Dim AnySelected As Boolean 'indicates if any text is selected
'Note: Also uses arrWindows1252() array, initalised above.
'Any text is selected:
If ChildForm(ID).rtfBox.SelLength <> 0 Then 'some text is selected:
AnySelected = True 'raise flag
strText = ChildForm(ID).rtfBox.SelText
Else
AnySelected = False 'lower flag
strText = ChildForm(ID).rtfBox.Text
End If
'Run through all text in active file:
Do Until Completed 'for all characters:
'Store current length of selected text:
Length = Len(strText)
'Entire string has been searched:
If i >= Length Then 'entire text searched:
Completed = True 'indicate replacement has finished
End If
'Store current character:
Char = Mid$(strText, i, 1)
'Current character is a printable ASCII character:
Select Case Asc(Char) 'value of this character
Case LBound(arrWindows1252) To UBound(arrWindows1252) 'ambigious characters from 128-159:
'Replace character with corresponding array entry:
strText = Mid$(strText, 1, i - 1) & arrWindows1252(Asc(Char)) _
& Mid$(strText, i + 1, Length - i)
Case 0 To 8, 14 To 31, Is >= 159 'not a printable ASCII character:
'Replace it with a decimal entity reference:
strText = Mid$(strText, 1, i - 1) & "&#" & Asc(Char) & CHAR_SEMICOLON _
& Mid$(strText, i + 1, Length - i)
'Note: String so far + Char (converted into a decimal entity reference) + rest of string.
'Resume processing after the entity reference, which could be several digits long:
i = i + Len(CStr("&#" & Asc(Char) & CHAR_SEMICOLON))
'Function has replaced something:
EscapeNonASCII = True 'raise flag being returned
CStatus(ID).Saved = False 'file will be altered
Case Else 'is a printable ASCII character:
i = i + 1 'advance to next character
End Select
Loop
'Note: Does not escape whitespace characters. Specifically:
' Horizontal Tab (9)
' Line Feed (10)
' Vertical Tab (11)
' Form Feed (12)
' Carriage Return (13).
'any text is selected:
If AnySelected Then 'some text was selected:
ChildForm(ID).rtfBox.SelText = strText 'output resulting string
Else
With ChildForm(ID).rtfBox
'Select All:
i = .SelStart
Length = .SelLength
.SelStart = 0
.SelLength = Len(.Text)
'Replacing a selection enables native Undo:
.SelText = strText
'Finish:
.SelStart = i
.SelLength = Length
End With
End If
End Function