// UTF-ANSI.VDM - Unicode (UTF-16) to ANSI/OEM conversion of entire file.
//
//		  By: Ian Binnie and Christian Ziemski
//		       Change: 01-Sept-2004.
//		  Last Change: 07-June-2008  CZ: Little enhancement regarding output of percentage
//
// Edit usage: {EDIT, Translate, Unicode to ASCII}.
//
// Filter usage: vpw -w -s0 -q utf-ansi.vdm pathname
//
//----------------------------------------------------------------------------------------
//
// Description:	Translates Unicode UTF-16 to 8-bit ASCII with the same character set -
//		ANSI (Code Page 1252) or OEM (Code Page 437) as the current VEDIT font.
//		It can handle UTF-16LE (little-endian), usually used in Windows,
//		and UTF-16BE (big-endian) files.
//
//		If the Unicode file contains characters for which there is no mapping
//		these characters are translated to 0x7F and the number of them is
//		reported at end.
//		Optionally they are listed in an extra window for diagnostics.
//
//	For more technical information see (for example):
//		http://www.alanwood.net/demos/ansi.html
//	    or:	ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT
//
//          or: Full Unicode reference at http://www.unicode.org/charts
//
// Requires:	VEDIT for Windows 6.13 or later.
//
//----------------------------------------------------------------------------------------
//
// Numeric Register Usage:
//
//   Return (error) code (invocation parameter "-x" and/or "-q"):
//
//   #80	Exit code for small # of errors (default is 0; edit to 1, if desired);
//		 all later return codes are based on #80
//   #81	Exit level #80	 if < #81% errors
//   #82	Exit level #80+1 if < #82% errors
//   #83	Exit level #80+2 if < #83% errors
//   #84	Quit, returning error level #85 if # errors ever > #84%
//		(Only used for "-q" quiet mode)
//   #85	Error Level for too many errors (10; see below)
//   #86	Corrupted translation tables error level; quit with error level 11
//   #88	File-size of ANSI file (1/2 of input file)
//
//   Other usage: (Regs 90-98 are pushed/popped)
//
//   #90	ID of Current Buffer
//   #91	ID of Translation Work Buffer
//   #92,#95	Compiling the Translation Table from its text source herein
//   #96	Flag for endianness via BOM
//   #97	Flag for endianness by guessing
//   #98	ID of buffer for not translated characters
//   #100	Edit position on entry
//   #103-#106	...
//   #121	Warning count/flag
//
//----------------------------------------------------------------------------------------

#100 = Cur_Pos		// remember the edit position on entry so it can be restored later

//
//	An empty file needs no translation: leave immediately.
//
Begin_Of_File
if (At_EoF) {
  if (!Is_Quiet) {
    return		// interactive use: just hand control back
  } else {
    Exit(0)		// quiet mode: leave with return code 0
  }
}
//
//	Exit-code configuration; only set up when started in auto-execution mode.
//
// Note: in auto-execution mode there is generally no other macro whose
//	 numeric registers could be disturbed, so #80..#88 are not saved.
//
if (Is_Auto_Execution && Macro_Num == 100) {
  #88 = File_Size / 2	// Size of the 8-bit target file = 1/2 of the UTF-16 input

  #80 = 0	// Base error level for "just a few" unknown characters;
		//  set to 1 for "no errors allowed";
		//  the higher error levels are derived from this value
  #81 = 2	// Percentage still regarded as "just a few" errors
  #82 = 10	// 10%
  #83 = 15	// 15%
  #84 = 33	// Percentage at which the translation is aborted the moment
		//  it is exceeded ("quits" the conversion,
		//  returning error level #85)
  #85 = 10	// Error level for "too many errors"
  #86 = 11	// Error level for a corrupted translation table
}
//
//	Put the name of the target character set into T-Reg 105:
//	"ANSI" when Font_Charset is 0, "OEM" otherwise.
//
Reg_Set(105,"OEM")
if (Font_Charset == 0) {
  Reg_Set(105,"ANSI")
}
//
// Find out whether this could be a Unicode UTF-16 file and, if so,
// which byte order (endianness) it uses:
// - by an (optional) byte order mark (BOM) at the start of the file,
// - otherwise by counting 0x00 bytes at odd and at even byte positions.
//

Begin_Of_File

#103=0				// byte order taken from the BOM (1 = big, 2 = little endian)
#104=0				// byte order guessed when there is no BOM (same coding)
#105=0				// 0x00 counter: first byte of each pair
#106=0				// 0x00 counter: second byte of each pair

if (Match("|HFE|HFF")==0) { #103=1 }	// FE FF = big-endian BOM
if (Match("|HFF|HFE")==0) { #103=2 }	// FF FE = little-endian BOM

if (#103==0) {			// No BOM found (no error: Unicode does not require one),
				// so try to guess the byte order

  Repeat(Min(50, File_Size/2)) {	// sample at most 50 two-byte characters
    if (Cur_Char==0) { #105++ }		// 0x00 at an odd byte position
    Char
    if (Cur_Char==0) { #106++ }		// 0x00 at an even byte position
    Char
  }

  // The two tests below cannot both succeed, so they are independent.
  if (#105*3 > Cur_Pos && #106*5 < Cur_Pos) { #104 = 1 }	// Big endian
  if (#106*3 > Cur_Pos && #105*5 < Cur_Pos) { #104 = 2 }	// Little endian

  if (#104==0) {		// does not look like a UTF-16 file
    if (!Is_Quiet) {
      Alert()
      #105 = Dialog_Input_1(121,"`ERROR - Unicode UTF-16 to |@(105)`,
		`This file seems to be either not Unicode\nor is a Unicode file that is not supported by VEDIT.`,
		`But you can force the translation anyway, with unknown results.`,
		`  Force LE\tassumes a byte order used on x86 architectures etc.\n  Force BE\tassumes a byte order used on many RISC architectures etc.`,
		`[&Quit]`,`[&Force LE]`,`[&Force BE]`",APP+CENTER,0,0)
      if (#105 < 2) {		// Quit or Esc: restore the cursor and stop
        Goto_Pos(#100)
        Return
      }
      if (#105 == 2) { #104=2 }	// Force little-endian
      if (#105 == 3) { #104=1 }	// Force big-endian
    } else {
      Exit(12)			// quiet mode: leave with return code 12
    }
  }
}

//
//	Ask for confirmation, except when started in auto-execution mode.
//
if ((!Is_Auto_Execution) || (Macro_Num != 100)) {
  Alert()
  #105 = Dialog_Input_1(121,"`Confirmation`,`OK to translate entire file from Unicode to |@(105)?`,`[&Yes]`,`[&No]`",APP+CENTER,0,0)
  if (#105 != 1) {		// anything other than "Yes": restore the cursor and stop
    Goto_Pos(#100)
    Return
  }
}

//
//	Prepare for the translation.
//
//	Registers #90..#98 are saved here and restored by the matching
//	Num_Pop(90,98) calls further down, so the order of the statements
//	below matters: the detection results in #103/#104 are copied into the
//	saved range before those scratch registers get reused.
//
Num_Push(90,98)

#96=#103		    // byte order found via BOM (1 = big, 2 = little), else 0
#97=#104		    // byte order guessed/forced when there was no BOM, else 0

#90=Buf_Num		    // buffer holding the file being translated
#91=Buf_Switch(Buf_Free)    // switch to a free buffer: work buffer for the translation table
Config(F_F_TYPE,0, LOCAL) // to be safe
// NOTE(review): the call below passes a descriptive string as second argument,
// unlike the later Config(F_OVER_MODE, 0 , LOCAL) calls - confirm that this
// form really sets overwrite-only mode to 0 for the work buffer.
Config(F_OVER_MODE, "Overwrite-only mode (*) (0=Off, 1=Rec, 2=All)", 0 , LOCAL)

//	The translation tables for characters corresponding to ANSI 0x80 - 0x9F
//						      or VEDIT OEM  0x80 - 0xFF
//	The table matching the current font is inserted as text into the
//	work buffer. One line per character:
//	    Two hex bytes for the source UTF-16 character in little-endian order
//	    One hex byte for the target ANSI/OEM character
//	    An optional sample glyph and description
//	Only the three hex values are used; the rest of each line is deleted
//	again when the table is converted to binary.

if (Font_Charset == 0) {       // ANSI Charset - Code Page 1252
  Ins_Text("
  AC 20 80  € EURO SIGN
  1A 20 82  ‚ SINGLE LOW-9 QUOTATION MARK
  92 01 83  ƒ LATIN SMALL LETTER F WITH HOOK
  1E 20 84  „ DOUBLE LOW-9 QUOTATION MARK
  26 20 85  … HORIZONTAL ELLIPSIS
  20 20 86  † DAGGER
  21 20 87  ‡ DOUBLE DAGGER
  C6 02 88  ˆ MODIFIER LETTER CIRCUMFLEX ACCENT
  30 20 89  ‰ PER MILLE SIGN
  60 01 8A  Š LATIN CAPITAL LETTER S WITH CARON
  39 20 8B  ‹ SINGLE LEFT-POINTING ANGLE QUOTATION MARK
  52 01 8C  Œ LATIN CAPITAL LIGATURE OE
  7D 01 8E  Ž LATIN CAPITAL LETTER Z WITH CARON
  18 20 91  ‘ LEFT SINGLE QUOTATION MARK
  19 20 92  ’ RIGHT SINGLE QUOTATION MARK
  1C 20 93  “ LEFT DOUBLE QUOTATION MARK
  1D 20 94  ” RIGHT DOUBLE QUOTATION MARK
  22 20 95  • BULLET
  13 20 96  – EN DASH
  14 20 97  — EM DASH
  DC 02 98  ˜ SMALL TILDE
  22 21 99  ™ TRADE MARK SIGN
  61 01 9A  š LATIN SMALL LETTER S WITH CARON
  3A 20 9B  › SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
  53 01 9C  œ LATIN SMALL LIGATURE OE
  7E 01 9E  ž LATIN SMALL LETTER Z WITH CARON
  78 01 9F  Ÿ LATIN CAPITAL LETTER Y WITH DIAERESIS
  ")
} else {			// OEM Charset CodePage 437
  Ins_Text("
  C7 00 80  € Latin capital letter C with cedilla
  FC 00 81   Latin small letter u with diaeresis
  E9 00 82  ‚ Latin small letter e with acute
  E2 00 83  ƒ Latin small letter a with circumflex
  E4 00 84  „ Latin small letter a with diaeresis
  E0 00 85  … Latin small letter a with grave
  E5 00 86  † Latin small letter a with ring above
  E7 00 87  ‡ Latin small letter c with cedilla
  EA 00 88  ˆ Latin small letter e with circumflex
  EB 00 89  ‰ Latin small letter e with diaeresis
  E8 00 8A  Š Latin small letter e with grave
  EF 00 8B  ‹ Latin small letter i with diaeresis
  EE 00 8C  Œ Latin small letter i with circumflex
  EC 00 8D   Latin small letter i with grave
  C4 00 8E  Ž Latin capital letter A with diaeresis
  C5 00 8F   Latin capital letter A with ring above
  C9 00 90   Latin capital letter E with acute
  E6 00 91  ‘ Latin small letter ae
  C6 00 92  ’ Latin capital letter AE
  F4 00 93  “ Latin small letter o with circumflex
  F6 00 94  ” Latin small letter o with diaeresis
  F2 00 95  • Latin small letter o with grave
  FB 00 96  – Latin small letter u with circumflex
  F9 00 97  — Latin small letter u with grave
  FF 00 98  ˜ Latin small letter y with diaeresis
  D6 00 99  ™ Latin capital letter O with diaeresis
  DC 00 9A  š Latin capital letter U with diaeresis
  A2 00 9B  › cent sign
  A3 00 9C  œ pound sign
  A5 00 9D   yen sign
  A7 20 9E  ž peseta sign
  92 01 9F  Ÿ Latin small letter f with hook
  E1 00 A0    Latin small letter a with acute
  ED 00 A1  ¡ Latin small letter i with acute
  F3 00 A2  ¢ Latin small letter o with acute
  FA 00 A3  £ Latin small letter u with acute
  F1 00 A4  ¤ Latin small letter n with tilde
  D1 00 A5  ¥ Latin capital letter N with tilde
  AA 00 A6  ¦ feminine ordinal indicator
  BA 00 A7  § masculine ordinal indicator
  BF 00 A8  ¨ inverted question mark
  10 23 A9  © reversed not sign
  AC 00 AA  ª not sign
  BD 00 AB  « vulgar fraction one half
  BC 00 AC  ¬ vulgar fraction one quarter
  A1 00 AD  ­ inverted exclamation mark
  AB 00 AE  ® left-pointing double angle quotation mark
  BB 00 AF  ¯ right-pointing double angle quotation mark
  91 25 B0  ° light shade
  92 25 B1  ± medium shade
  93 25 B2  ² dark shade
  02 25 B3  ³ box drawings light vertical
  24 25 B4  ´ box drawings light vertical and left
  61 25 B5  µ box drawings vertical single and left double
  62 25 B6  ¶ box drawings vertical double and left single
  56 25 B7  · box drawings down double and left single
  55 25 B8  ¸ box drawings down single and left double
  63 25 B9  ¹ box drawings double vertical and left
  51 25 BA  º box drawings double vertical
  57 25 BB  » box drawings double down and left
  5D 25 BC  ¼ box drawings double up and left
  5C 25 BD  ½ box drawings up double and left single
  5B 25 BE  ¾ box drawings up single and left double
  10 25 BF  ¿ box drawings light down and left
  14 25 C0  À box drawings light up and right
  34 25 C1  Á box drawings light up and horizontal
  2C 25 C2  Â box drawings light down and horizontal
  1C 25 C3  Ã box drawings light vertical and right
  00 25 C4  Ä box drawings light horizontal
  3C 25 C5  Å box drawings light vertical and horizontal
  5E 25 C6  Æ box drawings vertical single and right double
  5F 25 C7  Ç box drawings vertical double and right single
  5A 25 C8  È box drawings double up and right
  54 25 C9  É box drawings double down and right
  69 25 CA  Ê box drawings double up and horizontal
  66 25 CB  Ë box drawings double down and horizontal
  60 25 CC  Ì box drawings double vertical and right
  50 25 CD  Í box drawings double horizontal
  6C 25 CE  Î box drawings double vertical and horizontal
  67 25 CF  Ï box drawings up single and horizontal double
  68 25 D0  Ð box drawings up double and horizontal single
  64 25 D1  Ñ box drawings down single and horizontal double
  65 25 D2  Ò box drawings down double and horizontal single
  59 25 D3  Ó box drawings up double and right single
  58 25 D4  Ô box drawings up single and right double
  52 25 D5  Õ box drawings down single and right double
  53 25 D6  Ö box drawings down double and right single
  6B 25 D7  × box drawings vertical double and horizontal single
  6A 25 D8  Ø box drawings vertical single and horizontal double
  18 25 D9  Ù box drawings light up and left
  0C 25 DA  Ú box drawings light down and right
  88 25 DB  Û full block
  84 25 DC  Ü lower half block
  8C 25 DD  Ý left half block
  90 25 DE  Þ right half block
  80 25 DF  ß upper half block
  B1 03 E0  à Greek small letter alpha
  DF 00 E1  á Latin small letter sharp s
  93 03 E2  â Greek capital letter Gamma
  C0 03 E3  ã Greek small letter pi
  A3 03 E4  ä Greek capital letter Sigma
  C3 03 E5  å Greek small letter sigma
  B5 00 E6  æ micro sign
  C4 03 E7  ç Greek small letter tau
  A6 03 E8  è Greek capital letter Phi
  98 03 E9  é Greek capital letter Theta
  A9 03 EA  ê Greek capital letter Omega
  B4 03 EB  ë Greek small letter delta
  1E 22 EC  ì infinity
  C6 03 ED  í Greek small letter phi
  B5 03 EE  î Greek small letter epsilon
  29 22 EF  ï intersection
  61 22 F0  ð identical to
  B1 00 F1  ñ plus-minus sign
  65 22 F2  ò greater-than or equal to
  64 22 F3  ó less-than or equal to
  20 23 F4  ô top half integral
  21 23 F5  õ bottom half integral
  F7 00 F6  ö division sign
  48 22 F7  ÷ almost equal to
  B0 00 F8  ø degree sign
  19 22 F9  ù bullet operator
  B7 00 FA  ú middle dot
  1A 22 FB  û square root
  7F 20 FC  ü superscript Latin small letter n
  B2 00 FD  ý superscript two
  A0 25 FE  þ black square
  A0 00 FF  ÿ no-break space
  ")
}

//
// Look at the first line break in the work buffer and set the buffer's
// file type accordingly (necessary to be safe under some circumstances):
//   CR+LF -> 0,  LF only -> 1,  CR only -> 2,  no line break at all -> 0
//
Begin_Of_File
#103 = 0
if (Search("|{|H0D,|H0A}",NOERR)) {
  // The cursor is on the first CR or LF. The tests run from the least
  // to the most specific one, so CR+LF overrides a plain CR.
  if (Match("|H0D")==0)     { #103 = 2 }
  if (Match("|H0A")==0)     { #103 = 1 }
  if (Match("|H0D|H0A")==0) { #103 = 0 }
}
Config(F_F_TYPE,#103, LOCAL)

//
//	Convert the translation table into binary.
//
//	Each text line "ss ss tt description" is turned into the three bytes
//	ss ss tt. For big-endian input the two source bytes are swapped, so the
//	table can be searched with the byte pairs exactly as they appear in the
//	file.
//
//	On a malformed hex digit: quiet mode quits with error level #86;
//	interactive mode shows the offending line, closes the work buffer,
//	returns to the original buffer and position, restores #90..#98 and
//	leaves the macro.
//
Replace("|<|[|W]|N","",BEGIN+ALL+NOERR)		// remove empty lines
Replace("|<|W","",BEGIN+ALL+NOERR)		// remove leading whitespace

Begin_Of_File
while ( ! At_EoF) {
  for (#95=1 ; #95<=3 ; #95++) {	// three hex values per line are converted to binary
    #92=0				// the byte being assembled
    for (#103=1; #103>=0 ; #103--) {	// two nibbles per byte, high to low
      if ( Cur_Char >= '0' && Cur_Char <= '9' ) {
        #105 = Cur_Char - '0'
      } else {
        #104=Cur_Char&0xDF		// clear bit 5: folds a-f to A-F
        if ( #104 >= 'A' && #104 <= 'F' ) {
          #105 = 10 + #104 - 'A'
        } else {			// not a hex digit: the table text is corrupted
          if (Is_Quiet) {
            QALLY(#86)
          } else {
            Alert()
            Message("*****Bad input:\n")
            Type(0)
            Type_Newline()
            Type(1)
            Get_Key("Press any key to quit...")
            Buf_Quit(OK)		// discard the half-converted table buffer ...
            Buf_Switch(#90)		// ... go back to the file being translated ...
            Goto_Pos(#100)		// ... at the position it had on entry
            Num_Pop(90,98)
            return
          }
        }
      }
      #92 = #92 + (#105 << 4*#103)	// Build the byte
      Del_Char(1)			// Delete current nibble
    }
    Ins_Char(#92)			// Insert byte
    if ( (#95<=2) && (Match("|X")==0) ) { Del_Char(Chars_Matched) }  // Delete intermediate whitespace
  }
  Del_Block(Cur_Pos, EoL_Pos)		// delete rest of line (sample glyph and description)

  if ((#96==1)||(#97==1)) {		// if big-endian required
    Begin_Of_Line			//   switch first two bytes
    #92=Cur_Char
    Del_Char(1)
    Char(1)
    Ins_Char(#92)
  }
  Line(1,NOERR+ERRBREAK)		// Next character in table
}

// Open (and empty) the log file that collects the byte pairs which could
// not be translated; its buffer number is kept in #98.
File_Open("|(VEDIT_TEMP)\utf-ansi.err", NOMSG+NOEVENT) // Buffer for not translated characters (for debugging etc.)
#98=Buf_Num
Del_Block(0, File_Size)		// discard whatever the file contained before

// These two settings apply to the log buffer, which is the current buffer here.
Config(F_F_TYPE, 0, LOCAL)
Config(F_OVER_MODE, 0 , LOCAL)

// Now translate the Unicode codepoints using the table above

Buf_Switch(#90)			// Back to the original text
Begin_Of_File()
Config(F_F_TYPE, 0, LOCAL)	// set file type to disable possible overwrite-only mode
Config(F_OVER_MODE, 0 , LOCAL)

if (#96) { Del_Char(2) }	// Delete optional BOM (#96 is non-zero only if a BOM was found)

#121=0			// Warning flag for incomplete success of conversion; counts untranslatable characters

//
// T-Reg(104) holds the special character conversion routine (dependent on byte order)
// (speed optimized by using abbreviated commands)
//
// The routine is stored as text and run with call(104) from the main loop
// for every byte pair that cannot simply be reduced to its low byte.
// Both variants do the same thing:
//   - copy the current two-byte character into T-Reg 103,
//   - search for it from the start of the table buffer (#91),
//   - if found, take the byte following the match as the replacement and
//     overwrite the character in the text buffer (#90) with it,
//   - otherwise count it in #121, append the byte pair plus a newline to
//     the log buffer (#98) and put 127 (0x7F) into the text instead.
// They differ only in where the cursor is relative to the pair: the
// big-endian variant is entered on the first byte of the pair and deletes
// the second byte itself; the little-endian variant is entered on the
// second byte and leaves that byte to be deleted by the main loop.
//
// In quiet mode the routine quits with error level #85 as soon as the
// share of untranslatable characters (relative to #88) exceeds #84 percent.
// #88 is only initialised in auto-execution mode.
// NOTE(review): the routine text spells the quiet test "IsQuiet" while the
// rest of the macro uses "Is_Quiet" - confirm both spellings are accepted.
//
if ((#96==1)||(#97==1)) {	 // if big-endian order
Reg_Set(104,`
RCB(103,CP,CP+2)
BS(#91)
S("|@(103)",BEGIN+CASE+ADVANCE+NOERR)
if (!EM) {
  #105=CC
  BS(#90)
  IC(#105,OVERWRITE)
  DC
  return
}
#121++
if (IsQuiet) {
  if ((#121*100/#88)>#84){QALLY(#85)}
}
BS(#98)
RI(103)
IN
BS(#90)
IC(127,OVERWRITE)
DC
`)
} else {			// little-endian order
Reg_Set(104,`
RCB(103,CP-1,CP+1)
BS(#91)
S("|@(103)",BEGIN+CASE+ADVANCE+NOERR)
if (!EM) {
  #105=CC
  BS(#90)
  C(-1)
  IC(#105,OVERWRITE)
  return
}
#121++
if (IsQuiet) {
  if ((#121*100/#88)>#84){QALLY(#85)}
}
BS(#98)
RI(103)
IN
BS(#90)
C(-1)
IC(127,OVERWRITE)
`)
}

//
// The main loop is speed optimized by using abbreviated commands
// and no whitespace. (Harder to read, but faster.)
//
// Every UTF-16 character occupies two bytes: a low byte and a high byte.
// If Font is ANSI:
//   check the high byte of every pair
//     If it's not a |H00: check full Unicode character via conversion table
//	  if found there: convert
//	  else increment #121 as warning counter
//     In any case, the high byte is deleted so only one byte remains
// If Font is OEM:
//   check the high byte of every pair and the 8th bit of its low byte
//     If the high byte is not |H00 or the low byte is >= |H80:
//	  check full Unicode character via conversion table
//	  if found there: convert
//	  else increment #121 as warning counter
//     In any case, the high byte is deleted so only one byte remains
//
// Cursor position when a pair is tested:
//   big-endian:    on the high byte (first byte), the low byte is at offset +1
//   little-endian: on the high byte (second byte, after stepping over the
//		    low byte), the low byte is at offset -1
// The big-endian OEM loop therefore has to test CC(1); testing CC(-1) there
// would look at the previously written output byte instead of the low byte
// of the current character.
//

if ((#96==1)||(#97==1)) {		// if big-endian file
  if (Font_Charset==0) {
    while (! At_EOF) {if(CC){call(104)}else{DC C}}
  } else {
    while (! At_EOF) {if(CC||(CC(1)&0x80)){call(104)}else{DC C}}
  }
} else {				// if little-endian file
  if (Font_Charset==0) {
    while (! At_EOF) {C if(CC){call(104)} DC}
  } else {
    while (! At_EOF) {C if(CC||(CC(-1)&0x80)){call(104)} DC}
  }
}

Buf_Switch(#91)		// the conversion table is no longer needed ...
Buf_Quit(OK)		// ... so close its buffer
Buf_Switch(#98)		// on to the log of not translated characters
if (File_Size == 0) {
  // Nothing was logged: remove the (empty) log file again.
  File_Delete("|(VEDIT_TEMP)\utf-ansi.err", OK+NOERR)
} else {
  // Put a heading in front of the logged byte pairs.
  Begin_Of_File
  Ins_Text("The following (UTF-16")
  if ((#96!=1)&&(#97!=1)) {	// neither BOM nor guess said big-endian
    Ins_Text("LE")
  } else {
    Ins_Text("BE")
  }
  Ins_Text(") characters couldn't be translated to ")
  Reg_Ins(105)
  Ins_Text(":")
  Ins_Newline(1)
  // Each following line holds one raw byte pair; rewrite it as two
  // hex values, each followed by a space.
  while (!At_EOF) {
    for (#95=1 ; #95<=2 ; #95++) {
      #105 = (Cur_Char & 0xf0) >> 4	// high nibble
      #106 = Cur_Char & 0xf		// low nibble
      if (#105 <= 9) { #105 += '0' }
      else	     { #105 = #105 + 'A' - 10 }
      Ins_Char(#105,OVERWRITE)		// replaces the raw byte
      if (#106 <= 9) { #106 += '0' }
      else	     { #106 = #106 + 'A' - 10 }
      Ins_Char(#106)
      Ins_Char(32)
    }
    Line(1)
  }
  File_Save(NOMSG)
}
Buf_Quit(OK)
Buf_Switch(#90)		// back to the translated text

Num_Pop(90,98)		// restore the registers saved by Num_Push(90,98)

//
// #104 = % (percentage) of unknown characters, relative to the size of the
// translated file. The file can be empty here (e.g. the input consisted of
// a BOM only); in that case the percentage is 0 instead of dividing by zero.
//
#105 = File_Size
#104 = 0
if (#105 > 0) {
  #104 = #121*100/#105
}

//
// If run via "-x" invocation option, save file and exit.
// Determine return code based on percentage of unknown characters:
//   no errors, or at most #81 percent  -> 0
//   more than #81 percent		-> #80
//   more than #82 percent		-> #80+1
//   more than #83 percent		-> #80+2
// NOTE(review): the register description at the top of the file reads as if
// "fewer than #81 percent" should already return #80 - confirm which mapping
// is intended before changing the values callers rely on.
//
if (Is_Auto_Execution && Macro_Num == 100) {
  #103 = 0
  if (#121>0) {
    if (#104>#81) { #103=#80 }
    if (#104>#82) { #103=#80+1 }
    if (#104>#83) { #103=#80+2 }
  }
  Xall(#103)
}

//
// Look at the first line break of the translated text and set the file
// type accordingly:
//   CR+LF -> 0,  LF only -> 1,  CR only -> 2,  no line break at all -> 64
//
Begin_Of_File
#103 = 64
if (Search("|{|H0D,|H0A}",NOERR)) {
  // The cursor is on the first CR or LF. The tests run from the least
  // to the most specific one, so CR+LF overrides a plain CR.
  if (Match("|H0D")==0)     { #103 = 2 }
  if (Match("|H0A")==0)     { #103 = 1 }
  if (Match("|H0D|H0A")==0) { #103 = 0 }
}
Config(F_F_TYPE,#103, LOCAL)

//
// If file size < 1 meg, restore (approximate) cursor position.
// The translated file is half as long as the original, so the position
// saved on entry (#100) is halved. For an entry position of 0 or 1 the
// formula would give -1, so the cursor goes to the start of the file instead.
//
if (File_Size < 1000000) {
  if (#100 > 1) {
    Goto_Pos(#100/2 -1)
  } else {
    Goto_Pos(0)
  }
}

//
// When some Unicode characters had no mapping: show their number and
// percentage, and offer to open the log that lists them.
//
if (#121 != 0) {
  Update()
  Num_Str(#121, 103, LEFT)		// T-Reg 103: number of unknown characters
  if (#104 < 1) {
    Reg_Set(106, "< 1")			// T-Reg 106: percentage, rounded-down 0 shown as "< 1"
  } else {
    Num_Str(#104, 106, LEFT)
  }
  #103=Dialog_Input_1(104, "`ERROR - Unicode to |@(105)`,
  `This file contained |@(103) Unicode character(s)  (|@(106)%) \nthat could not be translated to |@(105).`,
  `[OK]`,`[Show characters]`",
  APP+CENTER,0,0)
  if (#103==2) {			// "Show characters"
    File_Open("|(VEDIT_TEMP)\utf-ansi.err")
  }
}

// Done.