// ANSI-UTF.VDM - ANSI/OEM to Unicode (UTF-16 or UTF-8) conversion of entire file
//
// By:          Ian Binnie and Christian Ziemski
// Last Change: 02-Mar-2009.
//
// Edit usage:   {EDIT, Translate, ASCII to Unicode}.
//
// Filter usage: vpw -w -s0 -q ansi-utf.vdm pathname
//
//----------------------------------------------------------------------------------------
//
// Description: Translates 8-bit ASCII to UTF-16 or UTF-8 Unicode.
//              Assumes the ASCII has the same character set ANSI (Code Page 1252) or
//              OEM (Code Page 437) as the current VEDIT font.
//              Characters for which there is no mapping (0x7F, 0x81, 0x8D, 0x8F, 0x90, 0x9D)
//              appear unchanged as pseudo UTF characters.
//
//              See http://www.alanwood.net/demos/ansi.html
//
// Requires:    VEDIT for Windows 6.13 or later.
//
//----------------------------------------------------------------------------------------
//
// Registers used:
//   #121       Convert to: 1=UTF-16LE, 2=UTF-8
//              (temporarily also used for the "already Unicode" BOM check)
//   #122       Option: 1=insert BOM
//   #123       Option: 1=convert &#num; into Unicode
//   #103-#106  scratch values; #106 holds the original cursor position
//   #90-#95    saved/restored with Num_Push/Num_Pop (#90 = text buffer, #91 = table buffer)
//   #41-#44    saved/restored inside the UTF16_TO_UTF8 subroutine
//   T-Reg 103-106  scratch text registers (105 = "ANSI"/"OEM", 104/106 = helper routines)
//
//----------------------------------------------------------------------------------------

if (Font_Charset == 0) {
    Reg_Set(105,"ANSI")
} else {
    Reg_Set(105,"OEM")
}

#106 = Cur_Pos                              // #106 current position

// Check that the file is not already UTF.
#121 = 0
Begin_Of_File
if (Match("|HFE|HFF")==0) { #121=1 }        // big-endian BOM found
if (Match("|HFF|HFE")==0) { #121=2 }        // little-endian BOM found
if (Match("|hEF|hBB|hBF")==0) { #121=3 }    // UTF-8 BOM found
if (#121) {                                 // Error if UTF BOM found
    Alert()
    if (Is_Quiet) { Return }
    #104 = Dialog_Input_1(123,"`ERROR - |@(105) to Unicode`, `This file appears to already be Unicode\nand will not be converted.`",APP+CENTER,0,0)
    Goto_Pos(#106)
    Return
}

// Show main dialog unless run via "-x" invocation option.
#121=1                                      // "Convert to" radio button
#122=1                                      // insert BOM
#123=0                                      // convert HTML codes
if (!(Is_Auto_Execution && Macro_Num == 100)) {
    // Check for HTML numeric codes; pre-select the option if any are present
    if (Search("&#[0-9]+;", REGEXP+NOERR+LOCAL)) { #123 = 1 }
    if (OS_TYPE==1) {
        #104 = Dialog_Input_1(121,"`|@(105) to Unicode`,
            `OK to translate entire file from |@(105) to Unicode?`,
            `Convert to: `,
            `.g.h.l()UTF-16, &Little Endian `,
            `()UTF-&8`,
            `[]Insert &Byte Order Marker`,
            `[]Convert &HTML numeric codes into Unicode`,
            `[&OK]`,`[Cancel]`",APP+CENTER,0,0)
        if (#104 != 1) {
            Goto_Pos(#106)
            Return
        }
    }
}

// preparation for the translation
Num_Push(90,95)
#90=Buf_Num                                 // current buffer
#91=Buf_Switch(Buf_Free)                    // working buffer for translation table

// The translation tables for characters corresponding to ANSI 0x80 - 0x9F
// or VEDIT OEM 0x80 - 0xFF
// One line per character:
//   Two hex bytes for the target UTF-16 character (low byte first)
//   One hex byte for the source ANSI/OEM character
//   An optional description (discarded when the table is converted to binary)

if (Font_Charset == 0) {                    // ANSI Charset - Code Page 1252
    Ins_Text("
AC 20 80 EURO SIGN
1A 20 82 SINGLE LOW-9 QUOTATION MARK
92 01 83 LATIN SMALL LETTER F WITH HOOK
1E 20 84 DOUBLE LOW-9 QUOTATION MARK
26 20 85 HORIZONTAL ELLIPSIS
20 20 86 DAGGER
21 20 87 DOUBLE DAGGER
C6 02 88 MODIFIER LETTER CIRCUMFLEX ACCENT
30 20 89 PER MILLE SIGN
60 01 8A LATIN CAPITAL LETTER S WITH CARON
39 20 8B SINGLE LEFT-POINTING ANGLE QUOTATION MARK
52 01 8C LATIN CAPITAL LIGATURE OE
7D 01 8E LATIN CAPITAL LETTER Z WITH CARON
18 20 91 LEFT SINGLE QUOTATION MARK
19 20 92 RIGHT SINGLE QUOTATION MARK
1C 20 93 LEFT DOUBLE QUOTATION MARK
1D 20 94 RIGHT DOUBLE QUOTATION MARK
22 20 95 BULLET
13 20 96 EN DASH
14 20 97 EM DASH
DC 02 98 SMALL TILDE
22 21 99 TRADE MARK SIGN
61 01 9A LATIN SMALL LETTER S WITH CARON
3A 20 9B SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
53 01 9C LATIN SMALL LIGATURE OE
7E 01 9E LATIN SMALL LETTER Z WITH CARON
78 01 9F LATIN CAPITAL LETTER Y WITH DIAERESIS
")
} else {                                    // OEM Charset CodePage 437
    Ins_Text("
C7 00 80 Latin capital letter C with cedilla
FC 00 81 Latin small letter u with diaeresis
E9 00 82 Latin small letter e with acute
E2 00 83 Latin small letter a with circumflex
E4 00 84 Latin small letter a with diaeresis
E0 00 85 Latin small letter a with grave
E5 00 86 Latin small letter a with ring above
E7 00 87 Latin small letter c with cedilla
EA 00 88 Latin small letter e with circumflex
EB 00 89 Latin small letter e with diaeresis
E8 00 8A Latin small letter e with grave
EF 00 8B Latin small letter i with diaeresis
EE 00 8C Latin small letter i with circumflex
EC 00 8D Latin small letter i with grave
C4 00 8E Latin capital letter A with diaeresis
C5 00 8F Latin capital letter A with ring above
C9 00 90 Latin capital letter E with acute
E6 00 91 Latin small letter ae
C6 00 92 Latin capital letter AE
F4 00 93 Latin small letter o with circumflex
F6 00 94 Latin small letter o with diaeresis
F2 00 95 Latin small letter o with grave
FB 00 96 Latin small letter u with circumflex
F9 00 97 Latin small letter u with grave
FF 00 98 Latin small letter y with diaeresis
D6 00 99 Latin capital letter O with diaeresis
DC 00 9A Latin capital letter U with diaeresis
A2 00 9B cent sign
A3 00 9C pound sign
A5 00 9D yen sign
A7 20 9E peseta sign
92 01 9F Latin small letter f with hook
E1 00 A0 Latin small letter a with acute
ED 00 A1 Latin small letter i with acute
F3 00 A2 Latin small letter o with acute
FA 00 A3 Latin small letter u with acute
F1 00 A4 Latin small letter n with tilde
D1 00 A5 Latin capital letter N with tilde
AA 00 A6 feminine ordinal indicator
BA 00 A7 masculine ordinal indicator
BF 00 A8 inverted question mark
10 23 A9 reversed not sign
AC 00 AA not sign
BD 00 AB vulgar fraction one half
BC 00 AC vulgar fraction one quarter
A1 00 AD inverted exclamation mark
AB 00 AE left-pointing double angle quotation mark
BB 00 AF right-pointing double angle quotation mark
91 25 B0 light shade
92 25 B1 medium shade
93 25 B2 dark shade
02 25 B3 box drawings light vertical
24 25 B4 box drawings light vertical and left
61 25 B5 box drawings vertical single and left double
62 25 B6 box drawings vertical double and left single
56 25 B7 box drawings down double and left single
55 25 B8 box drawings down single and left double
63 25 B9 box drawings double vertical and left
51 25 BA box drawings double vertical
57 25 BB box drawings double down and left
5D 25 BC box drawings double up and left
5C 25 BD box drawings up double and left single
5B 25 BE box drawings up single and left double
10 25 BF box drawings light down and left
14 25 C0 box drawings light up and right
34 25 C1 box drawings light up and horizontal
2C 25 C2 box drawings light down and horizontal
1C 25 C3 box drawings light vertical and right
00 25 C4 box drawings light horizontal
3C 25 C5 box drawings light vertical and horizontal
5E 25 C6 box drawings vertical single and right double
5F 25 C7 box drawings vertical double and right single
5A 25 C8 box drawings double up and right
54 25 C9 box drawings double down and right
69 25 CA box drawings double up and horizontal
66 25 CB box drawings double down and horizontal
60 25 CC box drawings double vertical and right
50 25 CD box drawings double horizontal
6C 25 CE box drawings double vertical and horizontal
67 25 CF box drawings up single and horizontal double
68 25 D0 box drawings up double and horizontal single
64 25 D1 box drawings down single and horizontal double
65 25 D2 box drawings down double and horizontal single
59 25 D3 box drawings up double and right single
58 25 D4 box drawings up single and right double
52 25 D5 box drawings down single and right double
53 25 D6 box drawings down double and right single
6B 25 D7 box drawings vertical double and horizontal single
6A 25 D8 box drawings vertical single and horizontal double
18 25 D9 box drawings light up and left
0C 25 DA box drawings light down and right
88 25 DB full block
84 25 DC lower half block
8C 25 DD left half block
90 25 DE right half block
80 25 DF upper half block
B1 03 E0 Greek small letter alpha
DF 00 E1 Latin small letter sharp s
93 03 E2 Greek capital letter Gamma
C0 03 E3 Greek small letter pi
A3 03 E4 Greek capital letter Sigma
C3 03 E5 Greek small letter sigma
B5 00 E6 micro sign
C4 03 E7 Greek small letter tau
A6 03 E8 Greek capital letter Phi
98 03 E9 Greek capital letter Theta
A9 03 EA Greek capital letter Omega
B4 03 EB Greek small letter delta
1E 22 EC infinity
C6 03 ED Greek small letter phi
B5 03 EE Greek small letter epsilon
29 22 EF intersection
61 22 F0 identical to
B1 00 F1 plus-minus sign
65 22 F2 greater-than or equal to
64 22 F3 less-than or equal to
20 23 F4 top half integral
21 23 F5 bottom half integral
F7 00 F6 division sign
48 22 F7 almost equal to
B0 00 F8 degree sign
19 22 F9 bullet operator
B7 00 FA middle dot
1A 22 FB square root
7F 20 FC superscript Latin small letter n
B2 00 FD superscript two
A0 25 FE black square
A0 00 FF no-break space
")
}

// To be safe (otherwise the macro may fail):
// Go to the end of the first line of the table buffer and determine which
// newline sequence the inserted text uses, then set the buffer's file type
// accordingly so that the line-oriented commands below work.
Begin_Of_File()
#103 = 0
if (Search("|{|H0D,|H0A}",NOERR)) {
    if (Match("|H0D|H0A")==0) {
        #103 = 0                            // CR+LF
    } else {
        if (Match("|H0A")==0) {
            #103 = 1                        // LF only
        } else {
            if (Match("|H0D")==0) {
                #103 = 2                    // CR only
            }
        }
    }
}
Config(F_F_TYPE, #103, LOCAL)

//
// The translation table has to be converted from text to binary format now:
// each line becomes three raw bytes (UTF-16 low byte, high byte, source byte).
//
Replace("|<|[|W]|N","",BEGIN+ALL+NOERR)     // remove empty lines
Replace("|<|W","",BEGIN+ALL+NOERR)          // remove leading whitespace
BoF
while ( ! At_EoF) {
    for (#95=1 ; #95<=3 ; #95++) {          // three hex values per line are converted to binary
        #92=0                               // one byte
        for (#103=1; #103>=0 ; #103--) {    // two nibbles per byte, high to low
            if ( Cur_Char >= '0' && Cur_Char <= '9' ) {
                #105 = Cur_Char - '0'
            } else {
                #104=Cur_Char&0HDF          // fold lower case to upper case
                if ( #104 >= 'A' && #104 <= 'F' ) {
                    #105 = 10 + #104 - 'A'
                } else {
                    Alert()
                    Message("*****Bad input:\n")
                    Type(0)
                    Type_Newline()
                    Type(1)
                    Get_Key("Press any key to quit...")
                    // Clean up before bailing out: discard the working buffer and
                    // return to the user's text instead of leaving them in the table.
                    Buf_Quit(OK)
                    Buf_Switch(#90)
                    Goto_Pos(#106)
                    Num_Pop(90,95)
                    return
                }
            }
            #92 = #92 + (#105 << (4*#103))  // build the byte
            Del_Char(1)                     // delete current nibble
        }
        Ins_Char(#92)                       // insert byte
        if ( (#95<=2) && (Match("|X")==0) ) {
            Del_Char(Chars_Matched)         // delete intermediate whitespace
        }
    }
    Del_Block(Cur_Pos, EoL_Pos)             // delete rest of line (the description)
    Line(1,NOERR+ERRBREAK)                  // next line
}

// Now translate to Unicode codepoints using the table above

Buf_Switch(#90)                             // back to the original text
Config(F_F_TYPE,0, LOCAL)                   // set file type to disable possible overwrite-only mode
Begin_Of_File()

if (#122) {                                 // insert BOM?
    Ins_Char(255)
    Ins_Char(254)                           // UTF-16LE BOM
}

// T-Reg(104) holds the special character conversion routine
// (speed optimized by using abbreviated commands).
// It copies the current character, looks it up as the last byte of a line
// in the table buffer and replaces it with the two bytes preceding the hit;
// characters not in the table are kept and followed by a zero high byte.
//
Reg_Set(104,`
    RCB(103,Cur_Pos,Cur_Pos+1)
    BS(#91)
    S("|@(103)|>",BEGIN+CASE+NOERR)
    if (Error_Match) {
        BS(#90)
        C IC(0)
    } else {
        RCB(103,CP-2,CP)
        BS(#90)
        DC(1)
        RI(103)
    }
`)

// T-Reg(106) holds the routine to convert HTML numeric codes into Unicode
// Numeric codes are in format &#ddd; where ddd is 1 to n decimal digits.
// Does not convert hex values nor HTML entities.
// NOTE(review): values above 65535 do not fit into the two bytes written
// here (no surrogate pairs are generated) - confirm whether that matters.
//
Reg_Set(106,`
    while(Match("&#[0-9]+;",REGEXP)==0) {
        DC(2)
        #93=NE(SIMPLE)
        DC(CMAT+1)
        IC(#93&255)
        IC(#93>>8)
    }
`)

//
// The main loop
// (speed optimized by using abbreviated commands and no whitespace)
//
// call(106) may consume the last characters of the file, so the character
// conversion that follows it is guarded against running at end of file.
//
if (#123) {                                 // Convert &#num; into Unicode option selected
    if (Font_Charset==0) {                  // ANSI
        while (! At_EOF) {
            if(CC=='&'){call(106)}
            if(! At_EOF){if(CC<0x80||CC>0x9F){C IC(0)}else{call(104)}}
        }
    } else {                                // OEM
        while (! At_EOF) {
            if(CC=='&'){call(106)}
            if(! At_EOF){if(CC<0x80){C IC(0)}else{call(104)}}
        }
    }
} else {                                    // do not convert &#num;
    if (Font_Charset==0) {                  // ANSI
        while (! At_EOF) {if(CC<0x80||CC>0x9F){C IC(0)}else{call(104)}}
    } else {                                // OEM
        while (! At_EOF) {if(CC<0x80){C IC(0)}else{call(104)}}
    }
}

Buf_Switch(#91)                             // working buffer with conversion table
Buf_Quit(OK)                                // close it
Buf_Switch(#90)                             // back to original buffer

// If UTF-8 selected, convert UTF-16 to UTF-8
if (#121 == 2) {
    Call("UTF16_TO_UTF8")
}

Num_Pop(90,95)

// If run via "-x" invocation option, save file and exit.
//
if (Is_Auto_Execution && Macro_Num == 100) {
    Xall
}

if (#121 < 2) {                             // UTF-16 only (#121: 1=UTF-16LE, 2=UTF-8)
    // File type of "1" works best with UTF-16
    Config(F_F_TYPE, 1, LOCAL)
}

// If file size < 1 meg, restore (approximate) cursor position.
if (File_Size < 1000000) {
    if (#121 < 2) {
        #106 = #106*2 + #122*2              // UTF-16: two bytes per character, 2-byte BOM
    } else {
        #106 = #106 + #122*3                // UTF-8: 3-byte BOM; multi-byte chars not counted
    }
    Goto_Pos(#106)
}

Return

/////////////////////////////////////////////////////////////////
//
// Convert UTF-16LE into UTF-8
//
// Walks the current buffer two bytes at a time and rewrites every 16-bit
// code unit in place as a 1, 2 or 3 byte UTF-8 sequence. A UTF-16LE BOM
// (FF FE) therefore turns into the UTF-8 BOM (EF BB BF).
// The value read is built from two bytes and so never exceeds 0xFFFF;
// surrogate pairs are not combined.
// Uses #41-#44 as scratch registers (saved and restored here).
//
:UTF16_TO_UTF8:
Num_Push(41,44)
BOF
while ( ! At_EOF) {
    #44 = cur_char(1) * 256 + cur_char()    // combine low byte at CP and high byte at CP+1
    if (#44 < 0x80) {                       // 1 byte character
        Char                                // keep the low byte ...
        Del_Char(1)                         // ... and drop the zero high byte
    } else {                                // multi byte sequence...
        #41 = (#44 & 0x3F) | 0x80           // last (continuation) byte
        if (#44 < 0x800) {                  // 2 byte sequence
            #42 = ((#44 >> 6) & 0x1F) | 0xC0
        } else {                            // 3 byte sequence
            #42 = ((#44 >> 6) & 0x3F) | 0x80
            #43 = ((#44 >> 12) & 0x0F) | 0xE0
            Ins_Char(#43)                   // lead byte needs one extra byte of room
        }
        Ins_Char(#42, OVERWRITE)            // reuse the two original bytes
        Ins_Char(#41, OVERWRITE)
    }
}
Num_Pop(41,44)
Return