//  UTF_ANSI.vdm - Unicode (UTF-16) to ANSI conversion of entire file.
//
//  Author:-
//      Ian Binnie and Christian Ziemski
//      ian_binnie at optusnet dot com dot au
//      ( replace "at" "dot" by the normal characters.)
//
//  31 May 2004 by Ian Binnie based on unic-asc.vdm by Christian Ziemski
//
//  modified: 24 Jun 2004 by Christian Ziemski
//      Completely new technique (idea by Ted). Single pass now!
//      Some fixes regarding register usage etc.
//      Added counter for not translated unicode characters
//
//  Last change: 25 Jun 2004 by Christian Ziemski
//      Added choice to force translation, even if it seems to be no Unicode file.
//      New format for the translation table: easier to maintain.
//      Removed the prompts for DOS VEDIT
//      Added another translation table: OEM
//
//----------------------------------------------------------------------------------------
//
//  Description: This macro converts UTF-16 to ANSI (Windows Code Page 1252)
//               or, when the current font charset is not ANSI, to OEM (Code Page 437).
//               Converts Microsoft Unicode (UTF-16LE) or little-endian files only.
//               If the Unicode file contains characters for which there is no mapping
//               the result is undefined, but an error is indicated.
//
//               See http://www.alanwood.net/demos/ansi.html
//
//  Requires:    VEDIT for Windows 6.02 or later.
//
//  Registers:   Numeric #90-#95 are saved/restored with Num_Push/Num_Pop.
//               Numeric #103-#106 and #121 and text registers 103-105 are
//               used as scratch and are not restored.
//
//----------------------------------------------------------------------------------------

// T-Reg 105 holds the name of the target charset; it is expanded into the
// dialog texts below via |@(105).
if (Font_Charset == 0) {
    Reg_Set(105,"ANSI")
} else {
    Reg_Set(105,"OEM")
}

#106 = Cur_Pos                          // remember current position for the exit paths

//
// Check that the file is recognized as Unicode UTF-16
//
Begin_Of_File
if (Cur_Char!=255 || CC(+1)!=254) {     // UTF-16LE BOM (FF FE) not found
    // This is not an error: Unicode does not require BOM.
    // Instead, each of the first 5 CR bytes found must start a UTF-16LE CRLF
    // (0D 00 0A 00). A failed search counts as a mismatch, too.
    repeat(5) {
        Search("|H0D",NOERR)
        if (! Error_Match && Match("|H0D|000|H0A|000")==0) {
            Char                        // step over the CR so the next search finds the next one
            Continue
        } else {
            Alert()
            #104 = Dialog_Input_1(121,"`ERROR - Unicode UTF-16 to |@(105)`, `This file seems to be either not Unicode\n or is a Unicode file that is not supported by VEDIT.`, `But you can force the translation anyway, with unknown results.`, `[&Quit]`,`[&Force]`",APP+CENTER,0,0)
            // Results 0 and 2 both continue with the translation.
            // NOTE(review): presumably 2 is the [&Force] button and 0 means the
            // dialog was dismissed without a button - confirm that 0 should
            // really continue rather than quit.
            if ((#104 == 0) || (#104 == 2)) { break }
            Goto_Pos(#106)
            Return
        }
    }
}

//
// Give confirmation prompt unless run via "-x" invocation option.
//
if (!(Is_Auto_Execution && Macro_Num == 100)) {
    Alert()
    #104 = Dialog_Input_1(121,"`Confirmation`,`OK to translate entire file from Unicode to |@(105)?`,`[&Yes]`,`[&No]`",APP+CENTER,0,0)
    if (#104 != 1) {                    // anything but the first button cancels
        Goto_Pos(#106)
        Return
    }
}

//
// preparation for the translation
//
Num_Push(90,95)
#90=Buf_Num                             // current buffer
#91=Buf_Switch(Buf_Free)                // working buffer for translation table

// The translation tables for characters corresponding to ANSI 0x80 - 0x9F
// or to the OEM characters 0x80 - 0xFF.
// One line per character:
//    Two hex bytes for the source UTF-16 character (low byte first)
//    One hex byte for the target ANSI/OEM character
//    An optional description
// Empty lines and leading whitespace are stripped before the table is parsed.

if (Font_Charset == 0) {                // ANSI Charset - Code Page 1252
Ins_Text("
    AC 20 80 EURO SIGN
    1A 20 82 SINGLE LOW-9 QUOTATION MARK
    92 01 83 LATIN SMALL LETTER F WITH HOOK
    1E 20 84 DOUBLE LOW-9 QUOTATION MARK
    26 20 85 HORIZONTAL ELLIPSIS
    20 20 86 DAGGER
    21 20 87 DOUBLE DAGGER
    C6 02 88 MODIFIER LETTER CIRCUMFLEX ACCENT
    30 20 89 PER MILLE SIGN
    60 01 8A LATIN CAPITAL LETTER S WITH CARON
    39 20 8B SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    52 01 8C LATIN CAPITAL LIGATURE OE
    7D 01 8E LATIN CAPITAL LETTER Z WITH CARON
    18 20 91 LEFT SINGLE QUOTATION MARK
    19 20 92 RIGHT SINGLE QUOTATION MARK
    1C 20 93 LEFT DOUBLE QUOTATION MARK
    1D 20 94 RIGHT DOUBLE QUOTATION MARK
    22 20 95 BULLET
    13 20 96 EN DASH
    14 20 97 EM DASH
    DC 02 98 SMALL TILDE
    22 21 99 TRADE MARK SIGN
    61 01 9A LATIN SMALL LETTER S WITH CARON
    3A 20 9B SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    53 01 9C LATIN SMALL LIGATURE OE
    7E 01 9E LATIN SMALL LETTER Z WITH CARON
    78 01 9F LATIN CAPITAL LETTER Y WITH DIAERESIS
")
} else {                                // OEM Charset CodePage 437
Ins_Text("
    C7 00 80
    FC 00 81
    E9 00 82
    E2 00 83
    E4 00 84
    E0 00 85
    E5 00 86
    E7 00 87
    EA 00 88
    EB 00 89
    E8 00 8A
    EF 00 8B
    EE 00 8C
    EC 00 8D
    C4 00 8E
    C5 00 8F
    C9 00 90
    E6 00 91
    C6 00 92
    F4 00 93
    F6 00 94
    F2 00 95
    FB 00 96
    F9 00 97
    FF 00 98
    D6 00 99
    DC 00 9A
    A2 00 9B
    A3 00 9C
    A5 00 9D
    A7 20 9E
    92 01 9F
    E1 00 A0
    ED 00 A1
    F3 00 A2
    FA 00 A3
    F1 00 A4
    D1 00 A5
    AA 00 A6
    BA 00 A7
    BF 00 A8
    10 23 A9
    AC 00 AA
    BD 00 AB
    BC 00 AC
    A1 00 AD
    AB 00 AE
    BB 00 AF
    91 25 B0
    92 25 B1
    93 25 B2
    02 25 B3
    24 25 B4
    61 25 B5
    62 25 B6
    56 25 B7
    55 25 B8
    63 25 B9
    51 25 BA
    57 25 BB
    5D 25 BC
    5C 25 BD
    5B 25 BE
    10 25 BF
    14 25 C0
    34 25 C1
    2C 25 C2
    1C 25 C3
    00 25 C4
    3C 25 C5
    5E 25 C6
    5F 25 C7
    5A 25 C8
    54 25 C9
    69 25 CA
    66 25 CB
    60 25 CC
    50 25 CD
    6C 25 CE
    67 25 CF
    68 25 D0
    64 25 D1
    65 25 D2
    59 25 D3
    58 25 D4
    52 25 D5
    53 25 D6
    6B 25 D7
    6A 25 D8
    18 25 D9
    0C 25 DA
    88 25 DB
    84 25 DC
    8C 25 DD
    90 25 DE
    80 25 DF
    B1 03 E0
    DF 00 E1
    93 03 E2
    C0 03 E3
    A3 03 E4
    C3 03 E5
    B5 00 E6
    C4 03 E7
    A6 03 E8
    98 03 E9
    A9 03 EA
    B4 03 EB
    1E 22 EC
    C6 03 ED
    B5 03 EE
    29 22 EF
    61 22 F0
    B1 00 F1
    65 22 F2
    64 22 F3
    20 23 F4
    21 23 F5
    F7 00 F6
    48 22 F7
    B0 00 F8
    19 22 F9
    B7 00 FA
    1A 22 FB
    7F 20 FC
    B2 00 FD
    A0 25 FE
    A0 00 FF
")
}

//
// The translation table has to be converted from text to binary format now:
// each pair of hex digits is replaced in place by the single byte it denotes.
//
Replace("|<|[|W]|N","",BEGIN+ALL+NOERR)     // remove empty lines
Replace("|<|W","",BEGIN+ALL+NOERR)          // remove leading whitespace
BoF
while ( ! At_EoF) {
    for (#95=1 ; #95<=3 ; #95++) {          // three hex values per line are converted to binary
        #92=0                               // one byte
        for (#103=1; #103>=0 ; #103--) {    // two nibbles per byte, high to low
            if ( Cur_Char >= '0' && Cur_Char <= '9' ) {
                #105 = Cur_Char - '0'
            } else {
                #104=Cur_Char&0HDF          // clear bit 5: folds a-f onto A-F
                if ( #104 >= 'A' && #104 <= 'F' ) {
                    #105 = 10 + #104 - 'A'
                } else {
                    // Malformed table entry: show the offending line and give up.
                    Alert()
                    Message("*****Bad input:\n")
                    Type(0) Type_Newline() Type(1)
                    Get_Key("Press any key to quit...")
                    // Clean up exactly like the normal exit does; otherwise the
                    // user is left inside the scratch buffer holding the
                    // half-converted table, and that buffer is never closed.
                    Buf_Quit(OK)            // discard the working buffer
                    Buf_Switch(#90)         // back to the original file (before #90 is popped)
                    Goto_Pos(#106)          // file is still unmodified: restore the cursor
                    Num_Pop(90,95)
                    return
                }
            }
            #92 = #92 + (#105 << 4*#103)    // build the byte
            Del_Char(1)                     // delete current nibble
        }
        Ins_Char(#92)                       // insert byte
        if ( (#95<=2) && (Match("|X")==0) ) { Del_Char(Chars_Matched) }   // delete intermediate whitespace
    }
    Line(1,NOERR+ERRBREAK)                  // next table entry; the description, if any, is left in place
}

//
// Now translate the Unicode codepoints using the table above
//
Buf_Switch(#90)                             // back to the original text
Begin_Of_File()
if (Match("|HFF|HFE")==0) { Del_Char(2) }   // Delete BOM

//
// Main loop to convert file
//
#121=0                                      // counter of characters that could not be translated

//
// T-Reg(104) holds the special character conversion routine.
// It is called with the cursor on the second byte of a UTF-16 character:
// the two bytes are copied to T-Reg 103 and searched for in the table buffer.
// On a hit, the byte following the match replaces the first byte of the
// character; on a miss, #121 is incremented and the first byte becomes 127.
// In both cases the cursor ends up on the second byte again.
//
// NOTE(review): the search is not anchored to the start of a table entry and
// the two bytes are used as a search pattern as-is, so a byte pair could
// presumably also match inside a description text or across entries - verify.
//
Reg_Set(104,`
    Reg_Copy_Block(103,Cur_Pos-1,Cur_Pos+1)
    Buf_Switch(#91)
    Search("|@(103)",BEGIN+ADVANCE+NOERR)
    if (Error_Match) {
        #121++
        Buf_Switch(#90)
        Char(-1)
        Ins_Char(127,OVERWRITE)
    } else {
        #105=Cur_Char
        Buf_Switch(#90)
        Char(-1)
        Ins_Char(#105,OVERWRITE)
    }
`)

//
// The main loop is speed optimized by using abbreviated commands
// and no whitespace. (Harder to read, but faster.)
//   If Font is ANSI:
//      check every second byte
//        If it's not a |H00: check full Unicode character via conversion table
//                                if found there: convert
//                                else increment #121 as warning counter
//      In any case, then delete every second byte
//   If Font is OEM:
//      check 8th bit of first byte and every second byte
//        If the bit is set or the second byte is not a |H00:
//                            check full Unicode character via conversion table
//                                if found there: convert
//                                else increment #121 as warning counter
//      In any case, then delete every second byte
//
if (Font_Charset==0) {
    while (! At_EOF) {C if(CC){call(104)} DC}
} else {
    while (! At_EOF) {C if(CC||(CC(-1)&0x80)){call(104)} DC}
}

Buf_Switch(#91)                             // working buffer with conversion table
Buf_Quit(OK)                                // close it
Buf_Switch(#90)                             // back to original buffer

Num_Pop(90,95)

// If run via "-x" invocation option, save file and exit.
if (Is_Auto_Execution && Macro_Num == 100) {
    Xall
}

// Go to end of first line and determine file type from the line ending found
// there: CR+LF -> 0, LF -> 1, CR -> 2; 64 is kept when none of them matches.
Begin_Of_File()
#103 = 64
if (Search("|{|H0D,|H0A}",NOERR)) {
    if (Match("|H0D|H0A")==0) {
        #103 = 0
    } else {
        if (Match("|H0A")==0) {
            #103 = 1
        } else {
            if (Match("|H0D")==0) {
                #103 = 2
            }
        }
    }
}
Config(F_F_TYPE,#103)

// If file size < 1 meg, restore (approximate) cursor position:
// every character shrank from two bytes to one, and the -1 accounts for the
// two-byte BOM that was deleted if one was present.
if (File_Size < 1000000) {
    Goto_Pos(#106/2 -1)
}

// check for incomplete conversion
if (#121) {
    Update()
    Num_Str(#121,103,LEFT)                  // counter as text for the dialog below
    Dialog_Input_1(104,"`ERROR - Unicode to |@(105)`, `This file contained |@(103) Unicode character(s)\nthat could not be translated to |@(105)`", APP+CENTER,0,0)
}