// UTF-CHECK.VDM - check a file for Unicode (UTF-16) and its endianness (byte order)
//
// Last change: 24 Jul 2004 by Christian Ziemski
//
//
// To do: more tests and optimization
//
//----------------------------------------------------------------------------------------
//
// Requires: VEDIT for Windows 6.02 or later.
//
//----------------------------------------------------------------------------------------
//
// Numeric Register Usage:
//
//   #100   best guess so far: number of correct line ends found
//   #101   best guess so far: endianness
//   #121   best guess so far: file type
//   #103   endianness via BOM (0=no BOM, 1=big endian, 2=little endian)
//   #104   guessed endianness  (0=unknown, 1=big endian, 2=little endian)
//   #105   counter of correct line ends found in the current try
//   #106   guessed file type   (-1=unknown, 0=CR/LF, 1=LF, 2=CR)
//
// Text Register Usage:
//
//   103    result as text for displaying
//
//----------------------------------------------------------------------------------------

// Check whether this file is possibly a Unicode UTF-16 file and
// determine its byte order (endianness) if possible
//  - via the (optional) BOM
//  - or via at least 5 lines with UTF-16 line ends
//
// Without a BOM three tries are made, in this order: CR/LF, LF alone, CR alone.
// The try that found the most correct line ends is reported; on a tie the
// earlier (more specific) try wins.
//
Save_Pos()
Begin_Of_File

#104=0                                      // endianness to guess
#106=-1                                     // file type not guessed yet

if (match("|HFE|HFF")==0) {                 // big-endian BOM
    #103=1
} else {
    if (match("|HFF|HFE")==0) {             // little-endian BOM
        #103=2
    } else {
        // No UTF BOM found (this is no error: Unicode does not require a BOM),
        // so try to guess the endianness (not foolproof!)
        #103=0                              // no BOM

        // ---- first try: CR/LF line ends ----
        #106=0                              // guessed file type CR/LF
        for (#105=0; #105<5; #105++) {      // check at least 5 lines with UTF-16 CR/LF
            Search("|H0D",NOERR+ERRBREAK)   // ERRBREAK: leave the loop if no further CR exists
            if (Cur_Pos&1) {                // at odd byte position?
                Char(-1)                    // back one byte to be on even position (character boundary)
                if (Match("|H00|H0D|H00|H0A", ADVANCE)==0) {    // CR/LF in big-endian format
                    if (#104==2) {          // but was already guessed to be little-endian!
                        break
                    }
                    #104=1                  // big-endian
                } else {
                    break
                }
            } else {                        // at even byte position
                if (Match("|H0D|H00|H0A|H00", ADVANCE)==0) {    // CR/LF in little-endian format
                    if (#104==1) {          // but was already guessed to be big-endian!
                        break
                    }
                    #104=2                  // little-endian
                } else {
                    break
                }
            }
        }

        // The first try is the initial best guess.
        #100=#105
        #101=#104
        #121=#106

        if (#105<5) {
            // Not enough correct CR/LF lines found. Try it with LFs alone (UNIX format).
            // ---- second try: LF line ends ----
            Begin_Of_File
            #104=0
            #106=1                          // guessed file type LF
            for (#105=0; #105<5; #105++) {  // check at least 5 lines with UTF-16 LF
                Search("|H0A",NOERR+ERRBREAK)
                if (Cur_Pos&1) {            // at odd byte position?
                    Char(-1)                // back one byte to be on even position (character boundary)
                    if (Match("|H00|H0A", ADVANCE)==0) {        // LF in big-endian format
                        if (#104==2) {      // but was already guessed to be little-endian!
                            break
                        }
                        #104=1              // big-endian
                    } else {
                        break
                    }
                } else {                    // at even byte position
                    if (Match("|H0A|H00", ADVANCE)==0) {        // LF in little-endian format
                        if (#104==1) {      // but was already guessed to be big-endian!
                            break
                        }
                        #104=2              // little-endian
                    } else {
                        break
                    }
                }
            }
        }

        // Remember the second try only if it is strictly better than the first.
        if (#105 > #100) {
            #100=#105
            #101=#104
            #121=#106
        }

        if (#105<5) {
            // Still not enough correct lines found. Try it with CRs alone (Mac format).
            // ---- third try: CR line ends ----
            Begin_Of_File
            #104=0
            #106=2                          // guessed file type CR
            for (#105=0; #105<5; #105++) {  // check at least 5 lines with UTF-16 CR
                Search("|H0D",NOERR+ERRBREAK)
                if (Cur_Pos&1) {            // at odd byte position?
                    Char(-1)                // back one byte to be on even position (character boundary)
                    if (Match("|H00|H0D", ADVANCE)==0) {        // CR in big-endian format
                        if (#104==2) {      // but was already guessed to be little-endian!
                            break
                        }
                        #104=1              // big-endian
                    } else {
                        break
                    }
                } else {                    // at even byte position
                    if (Match("|H0D|H00", ADVANCE)==0) {        // CR in little-endian format
                        if (#104==1) {      // but was already guessed to be big-endian!
                            break
                        }
                        #104=2              // little-endian
                    } else {
                        break
                    }
                }
            }
        }

        // Take the best of the three tries.
        // A later try replaces the remembered best guess only when it found strictly
        // more correct line ends. Previously a tie let the last try win, so a short
        // CR/LF file (fewer than 5 lines) was reported as "(CR)" because the CR-only
        // try matches the CR of every CR/LF pair as well.
        if (#105 > #100) {
            #100=#105
            #101=#104
            #121=#106
        }
        #105=#100
        #104=#101
        #106=#121
    }
}

// ---- display the result ----

Reg_Set(103, "The file is not an UTF-16 file")      // default; every branch below replaces it
if (#103==1) {
    Reg_Set(103, "The file is big-endian per BOM")
} else {
    if (#103==2) {
        Reg_Set(103, "The file is little-endian per BOM")
    } else {
        if (#105<5) {                       // not at least 5 lines with correct line ends
            if (#105==0) {
                Reg_Set(103, "No lines with a correct line-end. The file seems to be ")
            } else {
                Reg_Set(103, "Only ")
                Num_Str(#105, 103, LEFT+NOCR+APPEND)
                Reg_Set(103, " lines with a correct line-end. Unsure. The file seems to be ", APPEND)
            }
        } else {
            Reg_Set(103, "The file seems to be ")
        }

        if (#104==1) {
            Reg_Set(103, "big-endian ", APPEND)
        } else {
            if (#104==2) {
                Reg_Set(103, "little-endian ", APPEND)
            } else {
                Reg_Set(103, "not UTF-16 ", APPEND)
                #106=-1                     // no endianness guessed: do not report a line-end type
            }
        }

        if (#106==0) {
            Reg_Set(103, " (CR/LF)", APPEND)
        } else {
            if (#106==1) {
                Reg_Set(103, " (LF)", APPEND)
            } else {
                if (#106==2) {
                    Reg_Set(103, " (CR)", APPEND)
                }
            }
        }
    }
}

// Show the text collected in text register 103 in a dialog with an [OK] button.
Dialog_Input_1(121,"`Unicode Check`, `|@(103)`, `[OK]`",APP+CENTER,0,0)

Restore_Pos()