//
//  UTF-8.vdm       UTF-8 <-> ISO8859 translator              Ch.Ziemski
//                                                            22.07.2005
//                                                            24.07.2005
//
//  Character       Bit pattern ISO-10646       Representation in UTF-8
//-----------------------------------------------------------------------
//  lowercase "ä"   00000000 11100100           11000011 10100100
//                           E4                    C3       A4
//                          228                   195      164
//
//äöü
//
//  Registers used throughout this macro:
//      #90     buffer number of the text being translated
//      #91     buffer number of the translation table UTF-8.dat
//      @95     detected file type, shown in the main dialog
//      #103    dialog result / "ok" flag returned by the Ask4... subroutines
//      #104-#106, @104-@106   scratch values (see the individual subroutines)
//
//  Table row layout as written by Ask4UTF8 / Ask4ISO:
//      <ISO char><UTF-8 byte 1><UTF-8 byte 2> <hex ISO> <hex byte 1> <hex byte 2>
//

#90=Buf_Num
File_Open("|(USER_MACRO)\UTF-8.dat")
#91=Buf_Num
Buf_Switch(#90)

Reg_Set(95, " ? ")

// Main dialog loop: runs until the dialog is closed (0) or [Cancel] (5) is pressed.
repeat (ALL) {
    #103=Dialog_Input_1(104,"`UTF-8 Translation`, `This ISO <-> UTF-8 translator is still very experimental! So use with care!`, `For now it only supports German Umlauts, but can be enhanced by filling the table UTF-8.dat with more values.`, ``, `The button [Table] below will help with that. But for now it's not ready to be executed.`, ` `, `(Hmm, what about codepages and fonts?)`, ` `, ` `, `The File seems to be: |@(95)`, `[Check]`,`[->UTF-8]`,`[->ISO]`,`[Table]`,`[Cancel]`",SET+APP+CENTER,0,0)
    if (#103==0 || #103==5) { break }
    if (#103==1) { Call("CHECK_TYPE") }
    if (#103==2) { Call("ISO_UTF8") }
    if (#103==3) { Call("UTF8_ISO") }
    if (#103==4) { Call("TABLE") }
    Buf_Switch(#90)
    Update
}
return

//----------------------------------------------------------
//
// CHECK_TYPE: guess the encoding of the text in buffer #90 and store the
// result in @95. Only the first character with the high bit set is looked at.
//
:CHECK_TYPE:                        // to do: more reliable!!!
Buf_Switch(#90)                     // the original text
BoF
Reg_Set(95, "ISO ")                 // default unless proven otherwise below
Search("|G", NOERR)
if (Error_Match) {                  // no high-bit character at all
    Reg_Set(95, "7-Bit ASCII")
    return
}
#103=Cur_Char
#104=Cur_Char(1)
if ( ((#103 & 0xE0) == 0xC0 ) && ((#104 & 0xC0) == 0x80) ) {   // 110x xxxx 10xx xxxx
    Reg_Set(95, "UTF-8")
}
return

//----------------------------------------------------------
//
// TABLE: placeholder. Shows a message and returns; everything after the
// first "return" is an unfinished draft that is never executed.
//
:TABLE:
Dialog_Input_1(104,"`UTF-8 Translation`, `Code not yet ready to be run ...`, `[&Ok]`",SET+APP+CENTER,0,0)
return

// 0. Check translation table
//    for binary contents, if not there: try to check hex and create binary
Buf_Switch(#91)                             // the table
Replace("|W|>","", BEGIN+ALL+NOERR)         // remove trailing whitespace
BoF
repeat(ALL) {
    Search("|<|G", ADVANCE+NOERR+ERRBREAK)  // search for all 8-bit character at BoL
    if (Match("|G|!|G")==0) {               // incomplete UTF-8 code! Error!
        // error
    }
    #103=0                                  // flag for existence of binary UTF-8 values
    if (Match("|G|G")==0) {                 // UTF-8 code already there
        #103=1
        Char(2)
    }
    if (Match("[\s\t]+[^0-9a-fA-F][^0-9a-fA-F]", REGEXP)==0) {     // no hex values there
        if (#103==1) {
            // get 8859 and UTF-8 from binary
            // write (insert) hex of 8859 and UTF-8
        } else {
            // get 8859 from binary
            // write (insert) hex of 8859
            // UTF-8 values still need to be done manually
        }
        continue
    }
    if (Match("[\s\t]+[0-9a-fA-F][0-9a-fA-F][\s\t]+[0-9a-fA-F][0-9a-fA-F][\s\t]+[0-9a-fA-F][0-9a-fA-F]", REGEXP)==0) {     // all 3 hex strings there
        if (#103==0) {
            // write UTF-8 binary
        }
        continue
    }
    // Match() is tested against 0 for "matched" everywhere else in this file;
    // this test used "!=0", which ran the branch when the pattern did NOT match.
    if (Match("[\s\t]+[0-9a-fA-F][0-9a-fA-F][\s\t]+[0-9a-fA-F][0-9a-fA-F][\s\t]+[^0-9a-fA-F][^0-9a-fA-F]", REGEXP)==0) {   // if 2 hex strings only
        // get 8859 from binary
        // write (insert) hex of 8859
        if (#103==0) {
            // write UTF-8 binary
        }
        continue
    }
    if (Match("[\s\t]+[0-9a-fA-F][0-9a-fA-F][\s\t]+[^0-9a-fA-F][^0-9a-fA-F]", REGEXP)==0) {    // only one hex string (8859)
        if (#103==1) {
            // get UTF-8 from binary
            // write hex of UTF-8
        } else {
            // UTF-8 values still need to be done manually
        }
    }
}
return

//------------------------------------------------------------------------
// 1. from 8859 to UTF-8, per table
//
// ISO_UTF8: replace every high-bit character in buffer #90 by the two
// UTF-8 bytes that follow it in its table row. Characters missing from the
// table (or rows without UTF-8 bytes) are asked for via Ask4UTF8; skipped
// characters are left unchanged.
//
:ISO_UTF8:
Buf_Switch(#90)
BoF
Repeat (ALL) {
    Search("|G", NOERR+ERRBREAK)            // only for characters with high-bit set
    Update
    #105=Cur_Char
    Reg_Copy_Block(105, Cur_Pos, Cur_Pos+1)
    // Hex value of CurChar. Built up front because Ask4UTF8 displays @106 and
    // is called from two places below; previously @106 was only set when the
    // character was missing from the table, so the second call showed a stale value.
    ItoA(#105, 106, LEFT+HEX+NOMSG+NOCR)

    Buf_Switch(#91)                         // the table
    #103=1                                  // start value in case char is found (no Error_Match below)
    Search("|<|@(105)", BEGIN+CASE+ADVANCE+NOERR)
    if (Error_Match) {
        Call("Ask4UTF8")                    // ret: #103=1 if value has been typed in
    }
    while (#103==1) {                       // no real loop, but allow the Call in the else part
        Buf_Switch(#91)                     // the table
        if (Match("|G|G")==0) {             // if there are two high-bit-set bytes following (assumed to be the UTF-8 code)
            #104=Cur_Char
            #105=Cur_Char(1)
            Buf_Switch(#90)                 // the original text
            Ins_Char(#104, OVERWRITE)       // replace the original 8859 character with two bytes UTF-8
            Ins_Char(#105)
            break
        } else {                            // no UTF-8 bytes in table yet
            Call("Ask4UTF8")
        }
    }
    Buf_Switch(#90)                         // the original text
    if (#103==0) { Char(1) }                // skipped: step over the untranslated char. to do: better way
}
return

:Ask4UTF8:
//
// in:  @105  8859 char
//      #105  numeric value of the 8859 char (written as hex into the table row)
//      @106  8859 char as hex string
//
// out: #103  = 1: o.k    =0 : canceled or no input done
//      #104  value of UTF-8 (by user input)    (unused)
//      @104  hex string of UTF-8               (unused)
//      Cursor at searched (new) position
//
// NOTE(review): Del_Char(Reg_Size(104)+2) below uses the length of the input
// as typed. The two Replace_Block calls may already have shortened the text
// in the buffer (whitespace or "0x" removed), so more characters than the
// hex text may be deleted - confirm and fix if rows with existing hex
// columns are to be completed this way.
//
#103=Dialog_Input_1(104,"`UTF-8 Translation`, `Unknown ISO-character: |@(105) (0x|@(106))`, `??2 Bytes UTF-8 in hex:`, `[&Translate]`,`[Skip]`",SET+APP+CENTER,0,0)
if (Reg_Size(104)==0) {
    #103=0                                  // needed also as return flag
}
if (#103 == 1) {                            // if [ok] and input there
    BoL
    if (Match(@105)==0) {                   // if char already there, but no UTF-8 bytes following yet
        Char(1)
    } else {
        EoF
        if (! At_EoL) { Ins_NewLine(1) }
        Reg_Ins(105)                        // 8859 character
    }
    // build the binary values of UTF-8 bytes
    Reg_Ins(104, BEGIN)                     // hex of UTF-8 (as by user input)
    Save_Pos()
    Replace_Block("|W", "", Cur_Pos, Cur_Pos+1+Reg_Size(104), NOERR+ALL)
    Replace_Block("0x", "", Cur_Pos, Cur_Pos+1+Reg_Size(104), NOERR+ALL)
    Restore_Pos()
    Ins_Text("0x")
    Char(-2)
    #104=Num_Eval(104)                      // => the numerical value of 2-byte UTF-8
    Del_Char(Reg_Size(104)+2)
    Ins_Char((#104 & 0xFF00)>>8, LEFT+NOCR) // insert binary UTF-8 bytes
    Ins_Char(#104 & 0x00FF, LEFT+NOCR)
    Ins_Text(" ")
    Num_Ins(#105 , HEX+NOMSG+NOCR)          // hex value of 8859 char
    Ins_Text(" ")
    Num_Ins((#104 & 0xFF00)>>8 , HEX+NOMSG+NOCR)    // hex values of UTF-8 char
    Ins_Text(" ")
    Num_Ins(#104 & 0x00FF , HEX+NOMSG+NOCR)
    Ins_Text(" ")
    Ins_NewLine(1)
    Line(-1)
    Char(1)                                 // behind the 8859 char, in front of the UTF-8 bytes
    File_Save(NOMSG)
} else {
    #103=0
}
return

//--------------------------------------------------------------------------
// 2. from UTF-8 to 8859, also per table, restricted to two byte codes
//
// UTF8_ISO: replace every two-byte UTF-8 sequence in buffer #90 by the
// character at the beginning of its table row. Unknown sequences are asked
// for via Ask4ISO; skipped sequences are left unchanged.
//
:UTF8_ISO:                                  // to do: more reliable!
Buf_Switch(#90)                             // the original text
BoF
Repeat (ALL) {
    Search("|G", NOERR+ERRBREAK)            // only for characters with high-bit set
    #104=Cur_Char
    #105=Cur_Char(1)
    if ( ((#104 & 0xE0) == 0xC0 ) && ((#105 & 0xC0) == 0x80) ) {   // 110x xxxx 10xx xxxx
        Reg_Copy_Block(105, Cur_Pos, Cur_Pos+2)     // binary UTF-8
        #103=1
        Buf_Switch(#91)                     // the table
        // CASE added: the UTF-8 bytes are binary data and must be matched
        // exactly, as the corresponding table search in ISO_UTF8 already does.
        Search("|<|G|@(105)", BEGIN+CASE+ADVANCE+NOERR)
        if (Error_Match) {
            ItoA(#104, 106, LEFT+HEX+NOMSG+NOCR)            // Hex values of UTF-8
            ItoA(#105, 106, LEFT+HEX+NOMSG+NOCR+APPEND)
            Call("Ask4ISO")                 // ret: #103=1 if value has been typed in and now in table
        }
        if (#103==1) {
            BoL
            #106=Cur_Char                   // first char of the table row = 8859 char
            Buf_Switch(#90)
            Ins_Char(#106, OVERWRITE)       // overwrite first UTF-8 byte ...
            Del_Char(1)                     // ... and remove the second one
        } else {
            Buf_Switch(#90)
            Char(2)                         // 1 or 2 ?
        }
    } else {
        // error! or bigger UTF-8 character
        Char(1)
    }
}
return

:Ask4ISO:
//
// in:  @105  UTF-8 chars
//      @106  UTF-8 chars as hex string
//
// out: #103  = 1: o.k    =0 : canceled, no input done or invalid input
//      #104  value of ISO char (by user input)     (unused)
//      @104  user input: ISO char or its hex string (unused)
//      #105, #106  overwritten with the two UTF-8 bytes of the new row
//      Cursor at the beginning of the new table row (if #103 = 1)
//
// On success the row "<ISO char><UTF-8 bytes> <hex ISO> <hex b1> <hex b2>"
// is appended to the table and the table is saved. On invalid input the
// partly built row is removed again.
//
#103=Dialog_Input_1(104,"`UTF-8 Translation`, `Unknown UTF-8-character: |@(105) (0x|@(106))`, `??1 Byte ISO as char or in hex:`, `[&Translate]`,`[Skip]`",SET+APP+CENTER,0,0)
if (Reg_Size(104)==0) {
    #103=0                                  // needed also as return flag
}
if (#103 == 1) {                            // if [ok] and input there
    EoF
    if (! At_EoL) { Ins_NewLine(1) }
    Reg_Ins(105)                            // UTF-8 characters
    BoL
    // build the binary value of ISO char
    Reg_Ins(104, BEGIN)                     // user input, cursor stays in front of it
    #106=Reg_Size(104)                      // number of input chars still in front of the UTF-8 bytes

    if (Match("|G")==0) {
        // input is the ISO character itself: keep its first char only
        #104=Cur_Char
        Char(1)
        if (#106 > 1) { Del_Char(#106-1) }
    } else {
        if (Match("0x")==0) {               // drop an optional "0x" prefix
            Del_Char(2)
            #106=#106-2
        }
        if (Match("[0-9a-fA-F][0-9a-fA-F]", REGEXP)==0) {
            Ins_Text("0x")
            Char(-2)
            #104=Num_Eval(104)              // => the numerical value of the ISO char
            Del_Char(#106+2)                // remove exactly the hex text incl. the "0x" just added
            #104=#104 & 0x00FF              // ISO 8859 characters are single bytes
            Ins_Char(#104, LEFT+NOCR)       // insert binary ISO char
        } else {
            // error invalid input: remove what is left of it and the pending UTF-8 bytes
            Del_Char(#106+2)
            #103=0
        }
    }

    if (#103 == 1) {
        // cursor is now behind the ISO char, in front of the two UTF-8 bytes
        #106=Cur_Char
        #105=Cur_Char(1)
        Char(2)
        Ins_Text(" ")
        Num_Ins(#104 , HEX+NOMSG+NOCR)      // hex value of 8859 char
        Ins_Text(" ")
        Num_Ins(#106 , HEX+NOMSG+NOCR)      // hex values of UTF-8 char
        Ins_Text(" ")
        Num_Ins(#105 , HEX+NOMSG+NOCR)
        Ins_Text(" ")
        Ins_NewLine(1)
        Line(-1)                            // back to the new row; the caller reads its first char
        File_Save(NOMSG)
    }
} else {
    #103=0
}
return
//--------------------------------------------------------------------------