// UTF-ANSI.VDM - Unicode (UTF-16) to ANSI/OEM conversion of entire file.
//
// By:          Ian Binnie and Christian Ziemski
// Change:      01-Sept-2004.
// Last Change: 07-June-2008 CZ: Little enhancement regarding output of percentage
//
// Edit usage:   {EDIT, Translate, Unicode to ASCII}.
//
// Filter usage: vpw -w -s0 -q utf-ansi.vdm pathname
//
//----------------------------------------------------------------------------------------
//
// Description: Translates Unicode UTF-16 to 8-bit ASCII with the same character set -
//              ANSI (Code Page 1252) or OEM (Code Page 437) as the current VEDIT font.
//              It can handle UTF-16LE (little-endian), usually used in Windows,
//              and UTF-16BE (big-endian) files.
//
//              If the Unicode file contains characters for which there is no mapping,
//              these characters are translated to 0x7F and the number of them is
//              reported at the end.
//              Optionally they are listed in an extra window for diagnostics.
//
//              For more technical information see (for example):
//                  http://www.alanwood.net/demos/ansi.html
//              or: ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT
//
//              or: Full Unicode reference at http://www.unicode.org/charts
//
// Requires:    VEDIT for Windows 6.13 or later.
//
//----------------------------------------------------------------------------------------
//
// Numeric Register Usage:
//
//   Return (error) code (invocation parameter "-x" and/or "-q"):
//
//   #80      Exit code for small # of errors (default is 0; edit to 1, if desired);
//            all later return codes are > #80
//   #81      Exit level #80   if < #81% errors
//   #82      Exit level #80+1 if < #82% errors
//   #83      Exit level #80+2 if < #83% errors
//   #84      Quit, returning error level #85 if # errors ever > #84%
//            (Only used for "-q" quiet mode)
//   #85      Error Level for too many errors (10; see below)
//   #86      Corrupted translation tables error level; quit with error level 11
//   #88      File-size of ANSI file (1/2 of input file)
//
//   Other usage: (Regs 90-98 are pushed/popped)
//
//   #90      ID of Current Buffer
//   #91      ID of Translation Work Buffer
//   #92,#95  Compiling the Translation Table from its text source herein
//   #96      Flag for endianness via BOM
//   #97      Flag for endianness by guessing
//   #98      ID of buffer for not translated characters
//   #100     Edit position on entry
//   #103-#106 ...
//   #121     Warning count/flag
//
//----------------------------------------------------------------------------------------

#100 = Cur_Pos                      // current position (restored on every early return)

//
// For empty file, do nothing.
//
BoF
if (At_EoF) {
    if (Is_Quiet) {
        Exit(0)
    } else {
        return
    }
}

//
// Error code levels for auto-execution mode.
//
// Note: generally, there will be no interaction with other macros when run
//       "auto-execution"; thus, there is no need to save numeric registers.
//
if (Is_Auto_Execution && Macro_Num == 100) {
    #80 = 0                 // Default error level for "just a few" number of unknown chars;
                            // set to 1 for "no errors allowed";
                            // higher error levels are based on this value
    #81 = 2                 // Percentage for "just a few" number of errors
    #82 = 10                // 10%
    #83 = 15                // 15%
    #84 = 33                // Percentage to "abort" the translation process at the
                            // instance it occurs. ("Quits" the conversion,
                            // returning error level #85).
    #85 = 10                // Error level for "too many errors"
    #86 = 11                // Error level for corrupted translation table
    #88 = File_Size >> 1    // Size of target file = 1/2 input file
}

//
// Determine whether ANSI or OEM character mode.
// Text register 105 holds the character-set name shown in all dialogs.
//
if (Font_Charset == 0) {
    Reg_Set(105,"ANSI")
} else {
    Reg_Set(105,"OEM")
}

//
// Check whether this file is possibly a Unicode UTF-16 file and
// determine its byte-order (endianness) if possible
//  - via (optional) BOM
//  - or by counting 0x00-bytes at odd and even byte-positions
//
Begin_Of_File
#103=0      // endianness via BOM (1 = big, 2 = little endian)
#104=0      // endianness, guessed if no BOM present
#105=0      // counter
#106=0      // counter

if (Match("|HFE|HFF")==0) {             // big-endian BOM found
    #103=1
} else {
    if (Match("|HFF|HFE")==0) {         // little-endian BOM found
        #103=2
    } else {
        // no UTF BOM found (This is no error: Unicode does not require BOM)
        // so try to guess the endianness
        Repeat(Min(50, File_Size/2)) {      // read max. 50 characters
            if (Cur_Char==0) { #105++ }     // count 0x00 at odd byte positions
            Char
            if (Cur_Char==0) { #106++ }     // count 0x00 at even byte positions
            Char
        }
        // Cur_Pos is now the number of bytes sampled: one byte position must be
        // zero in more than 1/3 of the bytes, the other in fewer than 1/5.
        if (#105*3 > Cur_Pos && #106*5 < Cur_Pos) {
            #104 = 1                        // Big endian
        } else {
            if (#106*3 > Cur_Pos && #105*5 < Cur_Pos) {
                #104 = 2                    // Little endian
            }
        }
        if(#104==0) {                       // if not an UTF-16 file
            if (Is_Quiet) {
                Exit(12)                    // exit macro with return code 12
            } else {
                Alert()
                #105 = Dialog_Input_1(121,"`ERROR - Unicode UTF-16 to |@(105)`, `This file seems to be either not Unicode\nor is a Unicode file that is not supported by VEDIT.`, `But you can force the translation anyway, with unknown results.`, ` Force LE\tassumes a byte order used on x86 architectures etc.\n Force BE\tassumes a byte order used on many RISC architectures etc.`, `[&Quit]`,`[&Force LE]`,`[&Force BE]`",APP+CENTER,0,0)
                if (#105 < 2 ) {            // Quit or Esc
                    Goto_Pos(#100)
                    Return
                }
                if (#105 == 2) {            // Force little-endian
                    #104=2
                }
                if (#105 == 3) {            // Force big-endian
                    #104=1
                }
            }
        }
    }
}

//
// Give confirmation prompt unless run via "-x" invocation option.
//
if (!(Is_Auto_Execution && Macro_Num == 100)) {
    Alert()
    #105 = Dialog_Input_1(121,"`Confirmation`,`OK to translate entire file from Unicode to |@(105)?`,`[&Yes]`,`[&No]`",APP+CENTER,0,0)
    if (#105 != 1) {
        Goto_Pos(#100)
        Return
    }
}

//
// Prepare for the translation.
//
Num_Push(90,98)

#96=#103                    // endianness via BOM, else 0
#97=#104                    // guessed endianness if no BOM, else 0

#90=Buf_Num                 // current buffer
#91=Buf_Switch(Buf_Free)    // working buffer for translation table
Config(F_F_TYPE,0, LOCAL)   // to be safe
Config(F_OVER_MODE, "Overwrite-only mode (*) (0=Off, 1=Rec, 2=All)", 0 , LOCAL)

// The translation tables for characters corresponding to ANSI 0x80 - 0x9F
//                                              or VEDIT OEM 0x80 - 0xFF
// One line per character:
//   Two hex bytes for the source UTF-16 character in little-endian order
//   One hex byte for the target ANSI/OEM character
//   An optional description (everything after the third byte is discarded
//   when the table is compiled below)

if (Font_Charset == 0) {    // ANSI Charset - Code Page 1252
    Ins_Text("
AC 20 80 € EURO SIGN
1A 20 82 ‚ SINGLE LOW-9 QUOTATION MARK
92 01 83 ƒ LATIN SMALL LETTER F WITH HOOK
1E 20 84 „ DOUBLE LOW-9 QUOTATION MARK
26 20 85 … HORIZONTAL ELLIPSIS
20 20 86 † DAGGER
21 20 87 ‡ DOUBLE DAGGER
C6 02 88 ˆ MODIFIER LETTER CIRCUMFLEX ACCENT
30 20 89 ‰ PER MILLE SIGN
60 01 8A Š LATIN CAPITAL LETTER S WITH CARON
39 20 8B ‹ SINGLE LEFT-POINTING ANGLE QUOTATION MARK
52 01 8C Œ LATIN CAPITAL LIGATURE OE
7D 01 8E Ž LATIN CAPITAL LETTER Z WITH CARON
18 20 91 ‘ LEFT SINGLE QUOTATION MARK
19 20 92 ’ RIGHT SINGLE QUOTATION MARK
1C 20 93 “ LEFT DOUBLE QUOTATION MARK
1D 20 94 ” RIGHT DOUBLE QUOTATION MARK
22 20 95 • BULLET
13 20 96 – EN DASH
14 20 97 — EM DASH
DC 02 98 ˜ SMALL TILDE
22 21 99 ™ TRADE MARK SIGN
61 01 9A š LATIN SMALL LETTER S WITH CARON
3A 20 9B › SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
53 01 9C œ LATIN SMALL LIGATURE OE
7E 01 9E ž LATIN SMALL LETTER Z WITH CARON
78 01 9F Ÿ LATIN CAPITAL LETTER Y WITH DIAERESIS
")
} else {                    // OEM Charset CodePage 437
    Ins_Text("
C7 00 80 € Latin capital letter C with cedilla
FC 00 81 Latin small letter u with diaeresis
E9 00 82 ‚ Latin small letter e with acute
E2 00 83 ƒ Latin small letter a with circumflex
E4 00 84 „ Latin small letter a with diaeresis
E0 00 85 … Latin small letter a with grave
E5 00 86 † Latin small letter a with ring above
E7 00 87 ‡ Latin small letter c with cedilla
EA 00 88 ˆ Latin small letter e with circumflex
EB 00 89 ‰ Latin small letter e with diaeresis
E8 00 8A Š Latin small letter e with grave
EF 00 8B ‹ Latin small letter i with diaeresis
EE 00 8C Œ Latin small letter i with circumflex
EC 00 8D Latin small letter i with grave
C4 00 8E Ž Latin capital letter A with diaeresis
C5 00 8F Latin capital letter A with ring above
C9 00 90 Latin capital letter E with acute
E6 00 91 ‘ Latin small letter ae
C6 00 92 ’ Latin capital letter AE
F4 00 93 “ Latin small letter o with circumflex
F6 00 94 ” Latin small letter o with diaeresis
F2 00 95 • Latin small letter o with grave
FB 00 96 – Latin small letter u with circumflex
F9 00 97 — Latin small letter u with grave
FF 00 98 ˜ Latin small letter y with diaeresis
D6 00 99 ™ Latin capital letter O with diaeresis
DC 00 9A š Latin capital letter U with diaeresis
A2 00 9B › cent sign
A3 00 9C œ pound sign
A5 00 9D yen sign
A7 20 9E ž peseta sign
92 01 9F Ÿ Latin small letter f with hook
E1 00 A0   Latin small letter a with acute
ED 00 A1 ¡ Latin small letter i with acute
F3 00 A2 ¢ Latin small letter o with acute
FA 00 A3 £ Latin small letter u with acute
F1 00 A4 ¤ Latin small letter n with tilde
D1 00 A5 ¥ Latin capital letter N with tilde
AA 00 A6 ¦ feminine ordinal indicator
BA 00 A7 § masculine ordinal indicator
BF 00 A8 ¨ inverted question mark
10 23 A9 © reversed not sign
AC 00 AA ª not sign
BD 00 AB « vulgar fraction one half
BC 00 AC ¬ vulgar fraction one quarter
A1 00 AD ­ inverted exclamation mark
AB 00 AE ® left-pointing double angle quotation mark
BB 00 AF ¯ right-pointing double angle quotation mark
91 25 B0 ° light shade
92 25 B1 ± medium shade
93 25 B2 ² dark shade
02 25 B3 ³ box drawings light vertical
24 25 B4 ´ box drawings light vertical and left
61 25 B5 µ box drawings vertical single and left double
62 25 B6 ¶ box drawings vertical double and left single
56 25 B7 · box drawings down double and left single
55 25 B8 ¸ box drawings down single and left double
63 25 B9 ¹ box drawings double vertical and left
51 25 BA º box drawings double vertical
57 25 BB » box drawings double down and left
5D 25 BC ¼ box drawings double up and left
5C 25 BD ½ box drawings up double and left single
5B 25 BE ¾ box drawings up single and left double
10 25 BF ¿ box drawings light down and left
14 25 C0 À box drawings light up and right
34 25 C1 Á box drawings light up and horizontal
2C 25 C2 Â box drawings light down and horizontal
1C 25 C3 Ã box drawings light vertical and right
00 25 C4 Ä box drawings light horizontal
3C 25 C5 Å box drawings light vertical and horizontal
5E 25 C6 Æ box drawings vertical single and right double
5F 25 C7 Ç box drawings vertical double and right single
5A 25 C8 È box drawings double up and right
54 25 C9 É box drawings double down and right
69 25 CA Ê box drawings double up and horizontal
66 25 CB Ë box drawings double down and horizontal
60 25 CC Ì box drawings double vertical and right
50 25 CD Í box drawings double horizontal
6C 25 CE Î box drawings double vertical and horizontal
67 25 CF Ï box drawings up single and horizontal double
68 25 D0 Ð box drawings up double and horizontal single
64 25 D1 Ñ box drawings down single and horizontal double
65 25 D2 Ò box drawings down double and horizontal single
59 25 D3 Ó box drawings up double and right single
58 25 D4 Ô box drawings up single and right double
52 25 D5 Õ box drawings down single and right double
53 25 D6 Ö box drawings down double and right single
6B 25 D7 × box drawings vertical double and horizontal single
6A 25 D8 Ø box drawings vertical single and horizontal double
18 25 D9 Ù box drawings light up and left
0C 25 DA Ú box drawings light down and right
88 25 DB Û full block
84 25 DC Ü lower half block
8C 25 DD Ý left half block
90 25 DE Þ right half block
80 25 DF ß upper half block
B1 03 E0 à Greek small letter alpha
DF 00 E1 á Latin small letter sharp s
93 03 E2 â Greek capital letter Gamma
C0 03 E3 ã Greek small letter pi
A3 03 E4 ä Greek capital letter Sigma
C3 03 E5 å Greek small letter sigma
B5 00 E6 æ micro sign
C4 03 E7 ç Greek small letter tau
A6 03 E8 è Greek capital letter Phi
98 03 E9 é Greek capital letter Theta
A9 03 EA ê Greek capital letter Omega
B4 03 EB ë Greek small letter delta
1E 22 EC ì infinity
C6 03 ED í Greek small letter phi
B5 03 EE î Greek small letter epsilon
29 22 EF ï intersection
61 22 F0 ð identical to
B1 00 F1 ñ plus-minus sign
65 22 F2 ò greater-than or equal to
64 22 F3 ó less-than or equal to
20 23 F4 ô top half integral
21 23 F5 õ bottom half integral
F7 00 F6 ö division sign
48 22 F7 ÷ almost equal to
B0 00 F8 ø degree sign
19 22 F9 ù bullet operator
B7 00 FA ú middle dot
1A 22 FB û square root
7F 20 FC ü superscript Latin small letter n
B2 00 FD ý superscript two
A0 25 FE þ black square
A0 00 FF ÿ no-break space
")
}

//
// Go to end of first line and determine and set file type
// (necessary to be safe under some circumstances)
// This runs in the table work buffer: the newline style found in the table
// text just inserted selects the file type (0 = CR+LF, 1 = LF, 2 = CR).
Begin_Of_File
#103 = 0
if (Search("|{|H0D,|H0A}",NOERR)) {
    if (Match("|H0D|H0A")==0) {
        #103 = 0
    } else {
        if (Match("|H0A")==0) {
            #103 = 1
        } else {
            if (Match("|H0D")==0) {
                #103 = 2
            }
        }
    }
}
Config(F_F_TYPE,#103, LOCAL)

//
// Convert the translation table into binary.
// Afterwards each table line is: <source byte 1><source byte 2><target byte>.
//
Replace("|<|[|W]|N","",BEGIN+ALL+NOERR)     // remove empty lines
Replace("|<|W","",BEGIN+ALL+NOERR)          // remove leading whitespace
Begin_Of_File
while ( ! At_EoF) {
    for (#95=1 ; #95<=3 ; #95++) {          // three hex values per line are converted to binary
        #92=0                               // one byte
        for (#103=1; #103>=0 ; #103--) {    // two nibbles per byte, high to low
            if ( Cur_Char >= '0' && Cur_Char <= '9' ) {
                #105 = Cur_Char - '0'
            } else {
                #104=Cur_Char&0xDF          // fold lower-case a-f to upper case
                if ( #104 >= 'A' && #104 <= 'F' ) {
                    #105 = 10 + #104 - 'A'
                } else {
                    if (Is_Quiet) {
                        QALLY(#86)
                    } else {
                        Alert()
                        Message("*****Bad input:\n")
                        Type(0) Type_Newline() Type(1)
                        Get_Key("Press any key to quit...")
                        // Nothing in the user's file has been changed yet: discard
                        // the half-built table buffer and go back to the file and
                        // cursor position the macro was started from.
                        Buf_Quit(OK)
                        Buf_Switch(#90)
                        Goto_Pos(#100)
                        Num_Pop(90,98)
                        return
                    }
                }
            }
            #92 = #92 + (#105 << 4*#103)    // Build the byte
            Del_Char(1)                     // Delete current nibble
        }
        Ins_Char(#92)                       // Insert byte
        if ( (#95<=2) && (Match("|X")==0) ) { Del_Char(Chars_Matched) }   // Delete intermediate whitespace
    }
    Del_Block(Cur_Pos, EoL_Pos)             // delete rest of line
    if ((#96==1)||(#97==1)) {               // if big-endian required
        Begin_Of_Line                       // switch first two bytes
        #92=Cur_Char
        Del_Char(1)
        Char(1)
        Ins_Char(#92)
    }
    Line(1,NOERR+ERRBREAK)                  // Next character in table
}

File_Open("|(VEDIT_TEMP)\utf-ansi.err", NOMSG+NOEVENT)    // Buffer for not translated characters (for debugging etc.)
#98=Buf_Num
Del_Block(0, File_Size)             // start with an empty list of untranslated characters
Config(F_F_TYPE, 0, LOCAL)
Config(F_OVER_MODE, 0 , LOCAL)

// Now translate the Unicode codepoints using the table above

Buf_Switch(#90)                     // Back to the original text
Begin_Of_File()
Config(F_F_TYPE, 0, LOCAL)          // set file type to disable possible overwrite-only mode
Config(F_OVER_MODE, 0 , LOCAL)

if (#96) { Del_Char(2) }            // Delete optional BOM

#121=0                              // Warning flag for incomplete success of conversion

//
// T-Reg(104) holds the special character conversion routine (dependent on byte order)
// (speed optimized by using abbreviated commands)
//
// The routine copies the current 2-byte character into T-Reg 103 and searches
// for it in the table buffer (#91). On a hit, the byte following the match is the
// target character and replaces the pair. On a miss, #121 is incremented, the
// pair is logged as one line in buffer #98 and the character becomes 0x7F (127).
//
// Cursor on entry: big-endian    - on the first (high) byte of the pair
//                  little-endian - on the second (high) byte of the pair
//
// NOTE(review): the routine tests "IsQuiet" while the rest of this macro spells
//               the value "Is_Quiet" - confirm that VEDIT accepts both spellings.
//
if ((#96==1)||(#97==1)) {           // if big-endian order
Reg_Set(104,`
  RCB(103,CP,CP+2)
  BS(#91)
  S("|@(103)",BEGIN+CASE+ADVANCE+NOERR)
  if (!EM) {
    #105=CC
    BS(#90)
    IC(#105,OVERWRITE)
    DC
    return
  }
  #121++
  if (IsQuiet) {
    if ((#121*100/#88)>#84){QALLY(#85)}
  }
  BS(#98)
  RI(103) IN
  BS(#90)
  IC(127,OVERWRITE) DC
`)
} else {                            // little-endian order
Reg_Set(104,`
  RCB(103,CP-1,CP+1)
  BS(#91)
  S("|@(103)",BEGIN+CASE+ADVANCE+NOERR)
  if (!EM) {
    #105=CC
    BS(#90)
    C(-1)
    IC(#105,OVERWRITE)
    return
  }
  #121++
  if (IsQuiet) {
    if ((#121*100/#88)>#84){QALLY(#85)}
  }
  BS(#98)
  RI(103) IN
  BS(#90)
  C(-1)
  IC(127,OVERWRITE)
`)
}

//
// The main loop is speed optimized by using abbreviated commands
// and no whitespace. (Harder to read, but faster.)
//   If Font is ANSI:
//     check the high byte of every character
//       If it's not a |H00: check full Unicode character via conversion table
//         if found there: convert
//         else increment #121 as warning counter
//     In any case, then delete the high byte
//   If Font is OEM:
//     check the high byte and the 8th bit of the low byte
//       If the high byte is not |H00 or the low byte is >= |H80:
//         check full Unicode character via conversion table
//         if found there: convert
//         else increment #121 as warning counter
//     In any case, then delete the high byte
//
// Position of the low byte relative to the cursor when it is tested:
//   little-endian: the loop has already stepped onto the high byte -> CC(-1)
//   big-endian:    the cursor is still on the high byte            -> CC(1)
//
if ((#96==1)||(#97==1)) {           // if big-endian file
    if (Font_Charset==0) {
        while (! At_EOF) {if(CC){call(104)}else{DC C}}
    } else {
        while (! At_EOF) {if(CC||(CC(1)&0x80)){call(104)}else{DC C}}
    }
} else {                            // if little-endian file
    if (Font_Charset==0) {
        while (! At_EOF) {C if(CC){call(104)} DC}
    } else {
        while (! At_EOF) {C if(CC||(CC(-1)&0x80)){call(104)} DC}
    }
}

Buf_Switch(#91)                     // working buffer with conversion table
Buf_Quit(OK)                        // close it

//
// Turn the raw list of untranslated characters into a readable hex listing,
// or remove the file if every character could be translated.
//
Buf_Switch(#98)                     // buffer with not translated characters
if (File_Size > 0) {
    BoF
    Ins_Text("The following (UTF-16")
    if ((#96==1)||(#97==1)) {       // if big-endian file
        Ins_Text("BE")
    } else {
        Ins_Text("LE")
    }
    Ins_Text(") characters couldn't be translated to ")
    Reg_Ins(105)
    Ins_Text(":")
    Ins_Newline(1)
    // convert the UTF-byte-pairs to hex...
    while (!At_EOF) {
        for (#95=1 ; #95<=2 ; #95++) {          // 2 values per line are converted to hex
            #105 = (Cur_Char & 0xf0) >> 4       // high nibble
            #106 = Cur_Char & 0xf               // low nibble
            if(#105 > 9) {
                #105 = 'A' + (#105 - 10)
            } else {
                #105 += '0'
            }
            Ins_Char(#105,OVERWRITE)            // replaces the raw byte itself
            if(#106 > 9) {
                #106 = 'A' + (#106 - 10)
            } else {
                #106 += '0'
            }
            Ins_Char(#106)
            Ins_Char(32)
        }
        Line(1)
    }
    File_Save(NOMSG)
} else {
    File_Delete("|(VEDIT_TEMP)\utf-ansi.err", OK+NOERR)
}
Buf_Quit(OK)

Buf_Switch(#90)                     // back to original buffer

Num_Pop(90,98)                      // restore used registers

// #104 = % (percentage) of unknown characters, relative to the converted size.
// A file that contained nothing but a BOM is empty at this point; skip the
// division then (there cannot be any unknown characters either).
#105 = File_Size
if (#105 > 0) {
    #104 = #121*100/#105
} else {
    #104 = 0
}

//
// If run via "-x" invocation option, save file and exit.
// Determine return code based on percentage of unknown characters
//
// NOTE(review): the register description in the file header gives #81..#83 as
//               upper bounds ("< n% errors"), whereas the comparisons below treat
//               them as lower bounds ("> n%") and return 0 for any error rate up
//               to #81%, regardless of #80. Confirm which mapping is intended.
//
if (Is_Auto_Execution && Macro_Num == 100) {
    #103 = 0
    if (#121>0) {
        if (#104>#81) { #103=#80 }
        if (#104>#82) { #103=#80+1 }
        if (#104>#83) { #103=#80+2 }
    }
    Xall(#103)
}

//
// Go to end of first line and determine file type
// (0 = CR+LF, 1 = LF, 2 = CR; 64 is used when the file has no newline at all)
//
Begin_Of_File
#103 = 64
if (Search("|{|H0D,|H0A}",NOERR)) {
    if (Match("|H0D|H0A")==0) {
        #103 = 0
    } else {
        if (Match("|H0A")==0) {
            #103 = 1
        } else {
            if (Match("|H0D")==0) {
                #103 = 2
            }
        }
    }
}
Config(F_F_TYPE,#103, LOCAL)

//
// If file size < 1 meg, restore (approximate) cursor position.
//
// The converted text is half as long as the UTF-16 input, so the entry
// position (#100) is halved to land near the same character.
//
if (File_Size < 1000000) {
    Goto_Pos(#100/2 -1)
}

//
// If there were unknown Unicode characters, report how many and percentage
// and optionally show them.
//
//   T-Reg 103 = number of untranslated characters (text)
//   T-Reg 106 = their percentage (text), shown as "< 1" below one percent
//
if (#121 != 0) {
    Update()
    if (#104 < 1) {
        Reg_Set(106, "< 1")
    } else {
        Num_Str(#104, 106, LEFT)
    }
    Num_Str(#121, 103, LEFT)
    #103=Dialog_Input_1(104, "`ERROR - Unicode to |@(105)`, `This file contained |@(103) Unicode character(s) (|@(106)%) \nthat could not be translated to |@(105).`, `[OK]`,`[Show characters]`", APP+CENTER,0,0)
    if (#103==2) {
        // second button: open the hex listing of the untranslated characters
        File_Open("|(VEDIT_TEMP)\utf-ansi.err")
    }
}

// Done.