// UTF-ANSI.VDM - Unicode (UTF-16) to ANSI/OEM conversion of entire file
//
// by Ian Binnie and Christian Ziemski
//    ian_binnie at optusnet dot com dot au
//    (replace "at" and "dot" by the normal characters.)
//
// Edit usage:   {Misc, Load/execute user macro, utf-ansi.vdm}
//
// Filter usage: vpw -w -s0 -q utf-ansi.vdm pathname
//
// 31 May 2004 by Ian Binnie
//              based on unic-asc.vdm by Christian Ziemski
//
// modified:    24 Jun 2004 by Christian Ziemski
//              Completely new technique (idea by Ted). Single pass now!
//              Some fixes regarding register usage etc.
//              Added counter for not translated unicode characters
//
// modified:    25 Jun 2004 by Christian Ziemski
//              Added choice to force translation, even if it seems to be no Unicode file.
//              New format for the translation table: easier to maintain.
//              Removed the prompts for DOS VEDIT
//              Added another translation table: OEM
//
// modified:    06 Jul 2004 by Thomas C. Burt
//              "Quiet" mode and error codes for .BAT processing
//
// modified:    12 Jul 2004 by Christian Ziemski
//              Reimplemented checking for BOM
//              Added returncode 12 if in non-interactive mode and not an UTF-16 file
//              Added translation from big-endian UTF-16
//
// last change: 13 Jul 2004 by Christian Ziemski
//              fixed translation: Search(...,CASE)
//
// To do:       How to guess endianness when no BOM?
//              What about line-end != CR/LF when checking UTF format?
//
//----------------------------------------------------------------------------------------
//
// Description: This macro converts UTF-16 to ANSI (Windows Code Page 1252)
//              or to VEDIT OEM.
//              Converts Microsoft Unicode (UTF-16LE) (little-endian) files
//              and UTF-16BE (big-endian) files.
//              If the Unicode file contains characters for which there is no mapping
//              the result is undefined, but an error is indicated.
//
// See:         http://www.alanwood.net/demos/ansi.html
// Or:          ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT
//
// Requires:    VEDIT for Windows 6.02 or later.
//
//----------------------------------------------------------------------------------------
//
// Numeric Register Usage:
//
//   Return (error) code (invocation parameter "-x" and/or "-q"):
//
//   #80  Exit code for small # of errors (default is 0; edit to 1, if desired;
//        all later return codes are > #80
//   #81  Exit level #80   if < #81% errors
//   #82  Exit level #80+1 if < #82% errors
//   #83  Exit level #80+2 if < #83% errors
//   #84  Quit, returning error level #85 if # errors ever > #84%
//        (Only used for "-q" quiet mode)
//   #85  Error Level for too many errors (10; see below)
//   #86  Corrupted translation tables error level; quit with error level 11
//   #88  File-size of ANSI file (1/2 of input file)
//
//   Other usage: (Regs 90-97 are pushed/popped)
//
//   #90      ID of Current Buffer
//   #91      ID of Translation Work Buffer
//   #92,#95  Compiling the Translation Table from its text source herein
//   #96      flag for endianness via BOM       (0 = no BOM, 1 = big-endian, 2 = little-endian)
//   #97      flag for guessed/forced endianness (0 = BOM found, 1 = big-endian, 2 = little-endian)
//   #103-#105  scratch
//   #106     Edit position on entry
//   #121     Warning count/flag
//
// Text Register Usage:
//
//   103      Unicode character currently looked up / count for the final report
//   104      Per-character conversion routine / input register of the final dialog
//   105      Name of the target character set ("ANSI" or "OEM")
//   106      Percentage for the final report
//   121      Input register of the prompts
//

#106 = Cur_Pos                  // current position

//
// For empty file, do nothing.
//
BoF
if (At_EoF) {
    if (Is_Quiet) { Exit(0) } else { return }
}

//
// Error code levels for auto-execution mode.
//
// Note: generally, there will be no interaction with other macros when run
//       "auto-execution"; thus, there is no need to save numeric registers.
//
// NOTE(review): #84 and #88 are only initialised here, but the conversion routine
//       in T-Reg 104 divides by #88 whenever quiet mode is active. Confirm that
//       quiet mode is never used without auto-execution.
//
if (Is_Auto_Execution && Macro_Num == 100) {
    #80 = 0                     // Default error level for "just a few" number of unknown chars;
                                // set to 1 for "no errors allowed";
                                // higher error levels are based on this value
    #81 = 2                     // Percentage for "just a few" number of errors
    #82 = 10                    // 10%
    #83 = 15                    // 15%
    #84 = 33                    // Percentage to "abort" the translation process at the
                                // instance it occurs. ("Quits" the conversion,
                                // returning error level #85).
    #85 = 10                    // Error level for "too many errors"
    #86 = 11                    // Error level for corrupted translation table
    #88 = File_Size >> 1        // Size of target file = 1/2 input file
}

//
// Determine whether ANSI or OEM character mode.
//
if (Font_Charset == 0) {
    Reg_Set(105,"ANSI")
} else {
    Reg_Set(105,"OEM")
}

//
// Determine the file's byte-order (endianness) if possible
//   - via (optional) BOM
// Else check that this is possibly a Unicode UTF-16 file
//   - at least 5 lines with UTF-16LE CR/LF
//
Begin_Of_File
#104=0                                          // endianness to guess
if (match("|HFE|HFF")==0) {                     // big-endian BOM found
    #103=1
} else {
    if (match("|HFF|HFE")==0) {                 // little-endian BOM found
        #103=2
    } else {                                    // no UTF BOM found (This is no error: Unicode does not require BOM)
        #103=0                                  // No BOM
        #104=2                                  // so assume little-endian (usually used on x86 architecture)
                                                // To do: try to guess the endianness
                                                // To do: what with UNIX line ends (only LF) etc.?
        for (#105=1; #105<6; #105++) {          // Check at least 5 lines with UTF-16 CR/LF
            Search("|H0D",NOERR+ERRBREAK)
            if(Match("|H0D|H00|H0A|H00", ADVANCE)!=0) {     // not CR/LF in little-endian format
                break
            }
        }
        if(#105<6) {                            // if not at least 5/6 lines with correct line ends
            if (Is_Quiet) {
                Exit(12)                        // exit macro with return code 12
            } else {
                Alert()
                #105 = Dialog_Input_1(121,"`ERROR - Unicode UTF-16 to |@(105)`,
                    `This file seems to be either not Unicode\nor is a Unicode file that is not supported by VEDIT.`,
                    `But you can force the translation anyway, with unknown results.`,
                    ` Force LE\tassumes a byte order used on x86 architectures etc.\n Force BE\tassumes a byte order used on many RISC architectures etc.`,
                    `[&Quit]`,`[&Force LE]`,`[&Force BE]`",APP+CENTER,0,0)
                if (#105 < 2 ) {                // Quit or Esc
                    Goto_Pos(#106)
                    Return
                }
                if (#105 == 2) {                // Force little-endian
                    #104=2
                }
                if (#105 == 3) {                // Force big-endian
                    #104=1
                }
            }
        }
    }
}

//
// Give confirmation prompt unless run via "-x" invocation option.
//
if (!(Is_Auto_Execution && Macro_Num == 100)) {
    Alert()
    #105 = Dialog_Input_1(121,"`Confirmation`,`OK to translate entire file from Unicode to |@(105)?`,`[&Yes]`,`[&No]`",APP+CENTER,0,0)
    if (#105 != 1) {
        Goto_Pos(#106)
        Return
    }
}

//
// Prepare for the translation.
//
Num_Push(90,97)
#96=#103                        // endianness via BOM, else 0
#97=#104                        // guessed endianness if no BOM, else 0

#90=Buf_Num                     // current buffer
#91=Buf_Switch(Buf_Free)        // working buffer for translation table

// The translation tables for characters corresponding to ANSI 0x80 - 0x9F
//                                               or VEDIT OEM 0x80 - 0xFF
// One line per character:
//   Two hex bytes for the source UTF-16 character in little-endian order
//   One hex byte for the target ANSI/OEM character
//   An optional description
//   (Everything after the third hex byte is deleted when the table is compiled.)

if (Font_Charset == 0) {        // ANSI Charset - Code Page 1252
Ins_Text("
AC 20 80 € EURO SIGN
1A 20 82 ‚ SINGLE LOW-9 QUOTATION MARK
92 01 83 ƒ LATIN SMALL LETTER F WITH HOOK
1E 20 84 „ DOUBLE LOW-9 QUOTATION MARK
26 20 85 … HORIZONTAL ELLIPSIS
20 20 86 † DAGGER
21 20 87 ‡ DOUBLE DAGGER
C6 02 88 ˆ MODIFIER LETTER CIRCUMFLEX ACCENT
30 20 89 ‰ PER MILLE SIGN
60 01 8A Š LATIN CAPITAL LETTER S WITH CARON
39 20 8B ‹ SINGLE LEFT-POINTING ANGLE QUOTATION MARK
52 01 8C Œ LATIN CAPITAL LIGATURE OE
7D 01 8E Ž LATIN CAPITAL LETTER Z WITH CARON
18 20 91 ‘ LEFT SINGLE QUOTATION MARK
19 20 92 ’ RIGHT SINGLE QUOTATION MARK
1C 20 93 “ LEFT DOUBLE QUOTATION MARK
1D 20 94 ” RIGHT DOUBLE QUOTATION MARK
22 20 95 • BULLET
13 20 96 – EN DASH
14 20 97 — EM DASH
DC 02 98 ˜ SMALL TILDE
22 21 99 ™ TRADE MARK SIGN
61 01 9A š LATIN SMALL LETTER S WITH CARON
3A 20 9B › SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
53 01 9C œ LATIN SMALL LIGATURE OE
7E 01 9E ž LATIN SMALL LETTER Z WITH CARON
78 01 9F Ÿ LATIN CAPITAL LETTER Y WITH DIAERESIS
")
} else {                        // OEM Charset CodePage 437
Ins_Text("
C7 00 80 € Latin capital letter C with cedilla
FC 00 81 Latin small letter u with diaeresis
E9 00 82 ‚ Latin small letter e with acute
E2 00 83 ƒ Latin small letter a with circumflex
E4 00 84 „ Latin small letter a with diaeresis
E0 00 85 … Latin small letter a with grave
E5 00 86 † Latin small letter a with ring above
E7 00 87 ‡ Latin small letter c with cedilla
EA 00 88 ˆ Latin small letter e with circumflex
EB 00 89 ‰ Latin small letter e with diaeresis
E8 00 8A Š Latin small letter e with grave
EF 00 8B ‹ Latin small letter i with diaeresis
EE 00 8C Œ Latin small letter i with circumflex
EC 00 8D Latin small letter i with grave
C4 00 8E Ž Latin capital letter A with diaeresis
C5 00 8F Latin capital letter A with ring above
C9 00 90 Latin capital letter E with acute
E6 00 91 ‘ Latin small letter ae
C6 00 92 ’ Latin capital letter AE
F4 00 93 “ Latin small letter o with circumflex
F6 00 94 ” Latin small letter o with diaeresis
F2 00 95 • Latin small letter o with grave
FB 00 96 – Latin small letter u with circumflex
F9 00 97 — Latin small letter u with grave
FF 00 98 ˜ Latin small letter y with diaeresis
D6 00 99 ™ Latin capital letter O with diaeresis
DC 00 9A š Latin capital letter U with diaeresis
A2 00 9B › cent sign
A3 00 9C œ pound sign
A5 00 9D yen sign
A7 20 9E ž peseta sign
92 01 9F Ÿ Latin small letter f with hook
E1 00 A0   Latin small letter a with acute
ED 00 A1 ¡ Latin small letter i with acute
F3 00 A2 ¢ Latin small letter o with acute
FA 00 A3 £ Latin small letter u with acute
F1 00 A4 ¤ Latin small letter n with tilde
D1 00 A5 ¥ Latin capital letter N with tilde
AA 00 A6 ¦ feminine ordinal indicator
BA 00 A7 § masculine ordinal indicator
BF 00 A8 ¨ inverted question mark
10 23 A9 © reversed not sign
AC 00 AA ª not sign
BD 00 AB « vulgar fraction one half
BC 00 AC ¬ vulgar fraction one quarter
A1 00 AD ­ inverted exclamation mark
AB 00 AE ® left-pointing double angle quotation mark
BB 00 AF ¯ right-pointing double angle quotation mark
91 25 B0 ° light shade
92 25 B1 ± medium shade
93 25 B2 ² dark shade
02 25 B3 ³ box drawings light vertical
24 25 B4 ´ box drawings light vertical and left
61 25 B5 µ box drawings vertical single and left double
62 25 B6 ¶ box drawings vertical double and left single
56 25 B7 · box drawings down double and left single
55 25 B8 ¸ box drawings down single and left double
63 25 B9 ¹ box drawings double vertical and left
51 25 BA º box drawings double vertical
57 25 BB » box drawings double down and left
5D 25 BC ¼ box drawings double up and left
5C 25 BD ½ box drawings up double and left single
5B 25 BE ¾ box drawings up single and left double
10 25 BF ¿ box drawings light down and left
14 25 C0 À box drawings light up and right
34 25 C1 Á box drawings light up and horizontal
2C 25 C2 Â box drawings light down and horizontal
1C 25 C3 Ã box drawings light vertical and right
00 25 C4 Ä box drawings light horizontal
3C 25 C5 Å box drawings light vertical and horizontal
5E 25 C6 Æ box drawings vertical single and right double
5F 25 C7 Ç box drawings vertical double and right single
5A 25 C8 È box drawings double up and right
54 25 C9 É box drawings double down and right
69 25 CA Ê box drawings double up and horizontal
66 25 CB Ë box drawings double down and horizontal
60 25 CC Ì box drawings double vertical and right
50 25 CD Í box drawings double horizontal
6C 25 CE Î box drawings double vertical and horizontal
67 25 CF Ï box drawings up single and horizontal double
68 25 D0 Ð box drawings up double and horizontal single
64 25 D1 Ñ box drawings down single and horizontal double
65 25 D2 Ò box drawings down double and horizontal single
59 25 D3 Ó box drawings up double and right single
58 25 D4 Ô box drawings up single and right double
52 25 D5 Õ box drawings down single and right double
53 25 D6 Ö box drawings down double and right single
6B 25 D7 × box drawings vertical double and horizontal single
6A 25 D8 Ø box drawings vertical single and horizontal double
18 25 D9 Ù box drawings light up and left
0C 25 DA Ú box drawings light down and right
88 25 DB Û full block
84 25 DC Ü lower half block
8C 25 DD Ý left half block
90 25 DE Þ right half block
80 25 DF ß upper half block
B1 03 E0 à Greek small letter alpha
DF 00 E1 á Latin small letter sharp s
93 03 E2 â Greek capital letter Gamma
C0 03 E3 ã Greek small letter pi
A3 03 E4 ä Greek capital letter Sigma
C3 03 E5 å Greek small letter sigma
B5 00 E6 æ micro sign
C4 03 E7 ç Greek small letter tau
A6 03 E8 è Greek capital letter Phi
98 03 E9 é Greek capital letter Theta
A9 03 EA ê Greek capital letter Omega
B4 03 EB ë Greek small letter delta
1E 22 EC ì infinity
C6 03 ED í Greek small letter phi
B5 03 EE î Greek small letter epsilon
29 22 EF ï intersection
61 22 F0 ð identical to
B1 00 F1 ñ plus-minus sign
65 22 F2 ò greater-than or equal to
64 22 F3 ó less-than or equal to
20 23 F4 ô top half integral
21 23 F5 õ bottom half integral
F7 00 F6 ö division sign
48 22 F7 ÷ almost equal to
B0 00 F8 ø degree sign
19 22 F9 ù bullet operator
B7 00 FA ú middle dot
1A 22 FB û square root
7F 20 FC ü superscript Latin small letter n
B2 00 FD ý superscript two
A0 25 FE þ black square
A0 00 FF ÿ no-break space
")
}

//
// To be safe (otherwise table conversion below may fail):
// Go to end of first line and determine file type.
//
Begin_Of_File()
#103 = 0
if (Search("|{|H0D,|H0A}",NOERR)) {
    if (Match("|H0D|H0A")==0) {
        #103 = 0
    } else {
        if (Match("|H0A")==0) {
            #103 = 1
        } else {
            if (Match("|H0D")==0) {
                #103 = 2
            }
        }
    }
}
Config(F_F_TYPE,#103, LOCAL)

//
// Convert the translation table into binary.
// Each text line "LL HH TT description" becomes three raw bytes plus the line end.
//
Replace("|<|[|W]|N","",BEGIN+ALL+NOERR)         // remove empty lines
Replace("|<|W","",BEGIN+ALL+NOERR)              // remove leading whitespace
BoF
while ( ! At_EoF) {
    for (#95=1 ; #95<=3 ; #95++) {              // three hex values per line are converted to binary
        #92=0                                   // one byte
        for (#103=1; #103>=0 ; #103--) {        // two nibbles per byte, high to low
            if ( Cur_Char >= '0' && Cur_Char <= '9' ) {
                #105 = Cur_Char - '0'
            } else {
                #104=Cur_Char&0xDF              // fold lower-case hex digits to upper case
                if ( #104 >= 'A' && #104 <= 'F' ) {
                    #105 = 10 + #104 - 'A'
                } else {
                    if (Is_Quiet) {
                        QALLY(#86)
                    } else {
                        Alert()
                        Message("*****Bad input:\n")
                        Type(0)
                        Type_Newline()
                        Type(1)
                        Get_Key("Press any key to quit...")
                        // Abort cleanly: drop the half-built table buffer and go back to the
                        // (still unmodified) text buffer while #90 is still valid,
                        // instead of leaving the user inside the scratch buffer.
                        Buf_Quit(OK)
                        Buf_Switch(#90)
                        Goto_Pos(#106)
                        Num_Pop(90,97)
                        return
                    }
                }
            }
            #92 = #92 + (#105 << 4*#103)        // Build the byte
            Del_Char(1)                         // Delete current nibble
        }
        Ins_Char(#92)                           // Insert byte
        if ( (#95<=2) && (Match("|X")==0) ) {   // Delete intermediate whitespace
            Del_Char(Chars_Matched)
        }
    }
    Del_Block(Cur_Pos, EoL_Pos)                 // delete rest of line (glyph and description)

    if ((#96==1)||(#97==1)) {                   // if big-endian required
        BOL                                     // switch first two bytes
        #92=Cur_Char
        Del_Char(1)
        Char(1)
        Ins_Char(#92)
    }
    Line(1,NOERR+ERRBREAK)                      // Next character
}

// Now translate the Unicode codepoints using the table above

Buf_Switch(#90)                 // Back to the original text
Begin_Of_File()
Config(F_F_TYPE,0, LOCAL)       // set file type to disable possible overwrite-only mode

if (#96) {                      // Delete optional BOM
    Del_Char(2)
}

#121=0                          // Warning flag for incomplete success of conversion

//
// T-Reg(104) holds the special character conversion routine (dependent on byte order)
// (speed optimized by using abbreviated commands)
//
// Big-endian:    called with the cursor on the first (high) byte of the character.
// Little-endian: called with the cursor on the second (high) byte of the character;
//                the caller deletes that byte afterwards.
// The 2-byte character is looked up in the table buffer; if found it is replaced by the
// byte following the match, otherwise by 127 and #121 is incremented.
//
if ((#96==1)||(#97==1)) {       // if big-endian order
    Reg_Set(104,`
        RCB(103,CP,CP+2)
        BS(#91)
        S("|@(103)",BEGIN+CASE+ADVANCE+NOERR)
        if (!EM) {
            #105=CC
            BS(#90)
            IC(#105,OVERWRITE)
            DC
            return
        }
        #121++
        if (IsQuiet) {
            if ((#121*100/#88)>#84){QALLY(#85)}
        }
        BS(#90)
        IC(127,OVERWRITE)
        DC
    `)
} else {                        // little-endian order
    Reg_Set(104,`
        RCB(103,CP-1,CP+1)
        BS(#91)
        S("|@(103)",BEGIN+CASE+ADVANCE+NOERR)
        if (!EM) {
            #105=CC
            BS(#90)
            C(-1)
            IC(#105,OVERWRITE)
            return
        }
        #121++
        if (IsQuiet) {
            if ((#121*100/#88)>#84){QALLY(#85)}
        }
        BS(#90)
        C(-1)
        IC(127,OVERWRITE)
    `)
}

//
// The main loop is speed optimized by using abbreviated commands
// and no whitespace. (Harder to read, but faster.)
// If Font is ANSI:
//    check the high byte of every character
//    If it's not a |H00: check full Unicode character via conversion table
//                        if found there: convert
//                        else increment #121 as warning counter
//    In any case, then delete the high byte
// If Font is OEM:
//    check 8th bit of the low byte and the high byte of every character
//    If bit 8 is set or the high byte is not a |H00:
//                        check full Unicode character via conversion table
//                        if found there: convert
//                        else increment #121 as warning counter
//    In any case, then delete the high byte
//
// Big-endian:    the cursor sits on the high byte, so the low byte is CC(1).
// Little-endian: the cursor has been advanced to the high byte, so the low byte is CC(-1).
//
if ((#96==1)||(#97==1)) {       // if big-endian file
    if (Font_Charset==0) {
        while (! At_EOF) {if(CC){call(104)}else{DC C}}
    } else {
        while (! At_EOF) {if(CC||(CC(1)&0x80)){call(104)}else{DC C}}
    }
} else {                        // if little-endian file
    if (Font_Charset==0) {
        while (! At_EOF) {C if(CC){call(104)} DC}
    } else {
        while (! At_EOF) {C if(CC||(CC(-1)&0x80)){call(104)} DC}
    }
}

Buf_Switch(#91)                 // working buffer with conversion table
Buf_Quit(OK)                    // close it
Buf_Switch(#90)                 // back to original buffer
Num_Pop(90,97)

#105 = File_Size
if (#105 > 0) {
    #104 = #121*100/#105        // #104 = % (percentage) of unknown characters
} else {
    #104 = 0                    // nothing left after conversion (e.g. input was only a BOM)
}

//
// If run via "-x" invocation option, save file and exit.
// Determine return code based on percentage of unknown characters
//
// NOTE(review): as coded, the exit level is 0 up to #81%, #80 above #81%,
//       #80+1 above #82% and #80+2 above #83%. The register notes at the top
//       describe "#80 if < #81%" etc. - confirm which ladder batch files rely on.
//
if (Is_Auto_Execution && Macro_Num == 100) {
    #103 = 0
    if (#121>0) {
        if (#104>#81) { #103=#80 }
        if (#104>#82) { #103=#80+1 }
        if (#104>#83) { #103=#80+2 }
    }
    Xall(#103)
}

//
// Go to end of first line and determine file type
//
Begin_Of_File()
#103 = 64
if (Search("|{|H0D,|H0A}",NOERR)) {
    if (Match("|H0D|H0A")==0) {
        #103 = 0
    } else {
        if (Match("|H0A")==0) {
            #103 = 1
        } else {
            if (Match("|H0D")==0) {
                #103 = 2
            }
        }
    }
}
Config(F_F_TYPE,#103, LOCAL)

//
// If file size < 1 meg, restore (approximate) cursor position.
//
if (File_Size < 1000000) {
    Goto_Pos(#106/2 -1)
}

//
// If unknown Unicode characters, report how many and percentage
//
if (#121) {
    Update()
    Num_Str(#121,103,LEFT)
    Num_Str(#104,106,LEFT)
    Dialog_Input_1(104,"`ERROR - Unicode to |@(105)`,
        `This file contained |@(103) Unicode character(s) (|@(106)%) \nthat could not be translated to |@(105).`",
        APP+CENTER,0,0)
}