//  UTF8conv.VDM - Unicode UTF-8 to UTF-16 conversion of entire file.
//
//  By: Ian Binnie
//  Last Change: 2008-04-04
//
//----------------------------------------------------------------------------------------
//
//  Description: Translates UTF-8 to UTF-16LE (little-endian), usually used in Windows.
//
//      Converts BOM if it exists (it is an ordinary 3 byte sequence, U+FEFF)
//      Basic error checking for invalid UTF-8 file
//      4 byte sequences (Unicode characters > 0xFFFF) are converted to a
//      UTF-16 surrogate pair; malformed 4 byte sequences are counted as errors
//
//      For more technical info see rfc3629 (UTF-8) and rfc2781 (UTF-16)
//
//  Requires: VEDIT for Windows 6.13 or later.
//
//----------------------------------------------------------------------------------------
//
//  Numeric Register Usage:
//      #40   Cursor position on entry (restored if the user cancels)
//      #41   Scratch: high byte of the UTF-16 unit / code point of a 4 byte sequence
//      #42   Scratch: low byte of the UTF-16 unit / current surrogate
//      #43   Error Count
//      #105  Result of the confirmation dialog
//
//----------------------------------------------------------------------------------------

#40 = Cur_Pos                           // current position

//
//  For empty file, do nothing.
//
Begin_Of_File
if (At_EOF) {
    if (Is_Quiet) {
        Exit(0)
    } else {
        Return
    }
}

//
//  Give confirmation prompt unless run via "-x" invocation option.
//
if (!(Is_Auto_Execution && Macro_Num == 100)) {
    Alert()
    #105 = Dialog_Input_1(0,"`Confirmation`,`OK to translate entire file from UTF-8 to UTF-16?`,`[&Yes]`,`[&No]`",APP+CENTER,0,0)
    if (#105 != 1) {
        Goto_Pos(#40)
        Return
    }
}

#43 = 0

//
//  Walk the buffer from the start, rewriting each UTF-8 sequence in place as
//  UTF-16LE (low byte first).  All values are computed from the original bytes
//  BEFORE anything is overwritten, because Ins_Char moves the cursor.
//
while (!At_EOF) {
    if (Cur_Char < 0x80)                // 1 byte character
    {
        Char                            // keep the byte as the low byte ...
        Ins_Char(0)                     // ... and add a zero high byte
    }
    else
    {
        if (Cur_Char < 0xE0)            // 2 byte sequence
        {
            if (Cur_Char < 0xC2)        // invalid (stray continuation or overlong lead)
            {
                #43++
                Char                    // byte is left in place and skipped
                Continue
            }
            if ( (cc(1) & 0xC0) != 0x80 )       // invalid continuation byte
            {
                #43++                   // counted, but the pair is still converted
            }
            #41 = (Cur_Char & 0x1C) >> 2
            #42 = ((Cur_Char & 0x03) << 6) | (cc(1) & 0x3F)
            Ins_Char(#42, OVERWRITE)
            Ins_Char(#41, OVERWRITE)
        }
        else
        {
            if (Cur_Char < 0xF0)        // 3 byte sequence
            {
                if ( ((cc(1) & 0xC0) != 0x80) || ((cc(2) & 0xC0) != 0x80) )    // invalid
                {
                    #43++               // counted, but the triple is still converted
                }
                #41 = ((Cur_Char & 0x0F) << 4) | ((cc(1) & 0x3C) >> 2)
                #42 = ((cc(1) & 0x03) << 6) | (cc(2) & 0x3F)
                Ins_Char(#42, OVERWRITE)
                Ins_Char(#41, OVERWRITE)
                Del_Char(1)             // 3 input bytes become 2 output bytes
            }
            else                        // 4 byte sequence
            {
                if (Cur_Char > 0xF4)    // invalid lead byte
                {
                    #43++
                    Char
                    Continue
                }
                if ( ((cc(1) & 0xC0) != 0x80) || ((cc(2) & 0xC0) != 0x80) || ((cc(3) & 0xC0) != 0x80) )
                {
                    #43++               // bad continuation byte: skip the lead byte only
                    Char
                    Continue
                }
                // Assemble the 21 bit code point from the four bytes.
                #41 = ((Cur_Char & 0x07) << 18) | ((cc(1) & 0x3F) << 12) | ((cc(2) & 0x3F) << 6) | (cc(3) & 0x3F)
                if ( (#41 < 0x10000) || (#41 > 0x10FFFF) )     // overlong or beyond Unicode range
                {
                    #43++
                    Char
                    Continue
                }
                // Encode as a surrogate pair; 4 input bytes become 4 output bytes.
                #41 = #41 - 0x10000
                #42 = 0xD800 | (#41 >> 10)              // high (leading) surrogate
                Ins_Char((#42 & 0xFF), OVERWRITE)
                Ins_Char((#42 >> 8), OVERWRITE)
                #42 = 0xDC00 | (#41 & 0x3FF)            // low (trailing) surrogate
                Ins_Char((#42 & 0xFF), OVERWRITE)
                Ins_Char((#42 >> 8), OVERWRITE)
            }
        }
    }
}

//
//  If run via "-x" invocation option, save file and exit.
//
if (Is_Auto_Execution && Macro_Num == 100) {
    Xall(#43)
}

//
//  If there were errors, report how many
//
if (#43) {
    Update()
    Num_Str(#43, 43, LEFT)
    Dialog_Input_1(0, "`ERROR`, `This file contains |@(43) UTF-8 violations.`, `[OK]`", APP+CENTER,0,0)
}