# UTF-8 functions for Thompson AWK
# (c) Markus Gnam 2017 Version 2017-09-17
# encoding: UTF-8

######################################################################
# UTF-8 functions, derived from Kragen Sitaker's basic count algorithm
# http://www.daemonology.net/blog/2008-06-05-faster-utf8-strlen.html
# "My UTF-8 C Version" http://canonical.org/~kragen/strlen-utf8.html
# Count the UTF-8 characters of my_string, or the elements of an array.
#
#   my_string       string to measure; if an array is passed instead, the
#                   number of its elements is returned
#   starting_index  optional first BYTE index to inspect (default 1)
#   ending_index    optional last BYTE index to inspect (default: last byte)
#
# Returns the number of bytes in the inspected range that are not UTF-8
# continuation bytes (10xxxxxx), i.e. the number of characters that START
# inside the range. An empty string or an empty range yields 0.
function utf8_length(my_string, starting_index, ending_index) {
  local i, byte_length, arg_count
  local count = 0

  arg_count = argcount()
  if (typeof(argval(1)) == "array")
    return length(argval(1))

  byte_length = length(my_string)
  if (arg_count < 2) starting_index = 1
  if (arg_count < 3) ending_index = byte_length

  # Force numeric values so that indices handed over as strings
  # (e.g. taken from substr()) are compared as numbers, not as text.
  starting_index = starting_index + 0
  ending_index = ending_index + 0

  # Stay inside the string: never ask ord() for a byte that does not exist.
  if (starting_index < 1) starting_index = 1
  if (ending_index > byte_length) ending_index = byte_length

  for (i = starting_index; i <= ending_index; ++i) {
    # Every byte except a continuation byte starts a new character.
    if (and(ord(my_string, i), 0xC0) != 0x80)
      ++count
  }

  return count
}

# Translate a character position into a byte index.
#
#   my_string  UTF-8 encoded string
#   position   1-based character position
#
# Returns the byte index at which character number `position` starts.
# A position of 1 or less is returned unchanged. If the string holds fewer
# characters than `position`, the result is length(my_string) + 1.
function utf8_position(my_string, position) {
  local byte_index, total_bytes, seen = 0

  # Nothing to convert for the first character (or anything before it).
  if (position <= 1)
    return position

  total_bytes = length(my_string)
  byte_index = 1
  while (byte_index <= total_bytes) {
    # A byte that is not a continuation byte opens the next character.
    if (and(ord(my_string, byte_index), 0xC0) != 0x80) {
      ++seen
      if (seen == position)
        return byte_index
    }
    ++byte_index
  }

  # Ran off the end: byte_index is now one past the last byte.
  return byte_index
}

# Character-based counterpart of substr().
#
#   my_string       UTF-8 encoded string
#   start_position  1-based character position of the first character
#   my_length       optional number of characters to return
#
# Called with exactly three arguments it returns at most my_length
# characters (an empty string when my_length is 0); otherwise it returns
# everything from start_position to the end of the string.
function utf8_substr(my_string, start_position, my_length) {
  local first_byte, next_byte, total_bytes, seen = 0

  first_byte = utf8_position(my_string, start_position)

  # No length given: hand back the whole tail.
  if (argcount() != 3)
    return substr(my_string, first_byte)

  if (my_length == 0)
    return ""

  # Walk forward until one character more than requested has started;
  # next_byte then points at the first byte that is NOT part of the result.
  total_bytes = length(my_string)
  next_byte = first_byte
  while (next_byte <= total_bytes) {
    if (and(ord(my_string, next_byte), 0xC0) != 0x80)
      ++seen
    if (seen > my_length)
      break
    ++next_byte
  }

  return substr(my_string, first_byte, next_byte - first_byte)
}

# Character-based counterpart of index().
#
#   my_string       UTF-8 encoded string to search in
#   string2         string to search for
#   start_position  optional character position at which the search starts
#                   (used only when exactly three arguments are passed)
#
# Returns the character position of the hit reported by index(); a result
# of 1 or less from index() is passed through unchanged.
function utf8_index(my_string, string2, start_position) {
  local first_byte, hit = 0

  first_byte = (argcount() == 3) ? utf8_position(my_string, start_position) : 1
  hit = index(my_string, string2, first_byte)

  # Byte indices above 1 have to be converted to a character count.
  if (hit > 1)
    return utf8_length(my_string, 1, hit)

  return hit
}

# Character-based counterpart of rindex().
#
#   my_string     UTF-8 encoded string to search in
#   string2       string to search for
#   end_position  optional character position handed to rindex() as byte
#                 index (used only when exactly three arguments are passed);
#                 without it the byte length of my_string is used
#
# Returns the character position of the hit reported by rindex(); a result
# of 1 or less from rindex() is passed through unchanged.
function utf8_rindex(my_string, string2, end_position) {
  local last_byte, hit = 0

  last_byte = (argcount() == 3) ? utf8_position(my_string, end_position) : length(my_string)
  hit = rindex(my_string, string2, last_byte)

  # Byte indices above 1 have to be converted to a character count.
  if (hit > 1)
    return utf8_length(my_string, 1, hit)

  return hit
}

# Character-based counterpart of match().
#
#   my_string       UTF-8 encoded string to search in
#   pattern         regular expression
#   start_position  optional character position at which the search starts
#   pstart,plength  optional arrays passed on to match(); presumably filled
#                   there with byte start/length of the match and of each
#                   sub-expression -- TODO confirm against the TAWK manual
#
# Returns the character position of the match, or 0 if there is none.
# RSTART and RLENGTH are set to character-based values (both 0 when nothing
# matched). When all five arguments are given, pstart[0]/plength[0] receive
# the character-based position and length of the match, and every element
# with an index above 0 is converted from byte units to character units.
function utf8_match(my_string, pattern, start_position, pstart, plength) {
  local i, j, k, result_integer, utf8_rstart, utf8_rlength, arg_count = 0
  # Scratch values for the conversion loop; declared local so that they
  # do not leak into (or clobber) global variables of the same name.
  local pstart_k, plength_k = 0

  arg_count = argcount()
  if (arg_count >= 3)
    i = utf8_position(my_string, start_position)
  else
    i = 1

  j = match(my_string, pattern, i, pstart, plength)
  if (j == 0) {
    result_integer = 0
    utf8_rstart  = 0
    utf8_rlength = 0
  }
  else {
    # RLENGTH still holds the byte length set by the built-in match().
    result_integer = utf8_length(my_string, 1, j)
    utf8_rstart = result_integer
    utf8_rlength = utf8_length(my_string, j, j+RLENGTH-1)
  }

  if (arg_count == 5) {
    pstart[0] = utf8_rstart
    plength[0] = utf8_rlength
    for (k in pstart) {
      if (k > 0) {
        # Compute both values before overwriting: the length conversion
        # still needs the original byte-based pstart[k].
        pstart_k = utf8_length(my_string, 1, pstart[k])
        plength_k = utf8_length(my_string, pstart[k], pstart[k]+plength[k]-1)
        pstart[k] = pstart_k
        plength[k] = plength_k
      }
    }
  }

  RSTART = utf8_rstart
  RLENGTH = utf8_rlength
  return result_integer
}

# Split my_string into an array of its UTF-8 characters,
# like split(my_string, CHAR_ARRAY, "") does for bytes.
#
#   my_string   UTF-8 encoded string
#   CHAR_ARRAY  array that receives the characters at indices 1..n
#
# CHAR_ARRAY is emptied first, so no elements of an earlier call survive.
# An empty string produces an empty array. Continuation bytes that appear
# before the first start byte (malformed input) are kept as part of the
# first element instead of being dropped.
# Returns CHAR_ARRAY.
function utf8_splitchars(my_string, CHAR_ARRAY) {
  local i, k, total_bytes, char_start, char_count = 0

  for (k in CHAR_ARRAY)
    delete CHAR_ARRAY[k]

  total_bytes = length(my_string)
  char_start = 1
  # i looks one byte ahead: a character ends where the next one starts
  # or where the string ends (i == total_bytes + 1).
  for (i = 2; i <= total_bytes + 1; ++i) {
    if (i > total_bytes || and(ord(my_string, i), 0xC0) != 0x80) {
      CHAR_ARRAY[++char_count] = substr(my_string, char_start, i - char_start)
      char_start = i
    }
  }

  return CHAR_ARRAY
}
    
# Translate single characters, similar to tr(1), but UTF-8 aware.
#
#   my_string        UTF-8 encoded string to translate
#   from_characters  characters to look for
#   to_characters    replacement characters; the n-th character of
#                    from_characters is replaced by the n-th character of
#                    to_characters, or removed if there is no n-th one
#
# Every character of my_string is translated at most once, so swapping
# ("ab" -> "ba") works. Characters are compared literally: regex
# metacharacters in from_characters and "&" or "\" in to_characters have
# no special meaning. If a character occurs more than once in
# from_characters, its first occurrence wins.
# Returns the translated string.
function utf8_translate(my_string, from_characters, to_characters) {
  local i, total, chunk_start, chunk
  local result_string = ""
  local FROM_ARRAY, TO_ARRAY, MAP

  utf8_splitchars(from_characters, FROM_ARRAY)
  utf8_splitchars(to_characters, TO_ARRAY)

  # Build the lookup table in index order (for-in order is unspecified).
  total = length(FROM_ARRAY)
  for (i = 1; i <= total; ++i) {
    if (!(FROM_ARRAY[i] in MAP))
      MAP[FROM_ARRAY[i]] = (i in TO_ARRAY) ? TO_ARRAY[i] : ""
  }

  # Walk my_string character by character; i looks one byte ahead, a
  # character ends where the next one starts or where the string ends.
  total = length(my_string)
  chunk_start = 1
  for (i = 2; i <= total + 1; ++i) {
    if (i > total || and(ord(my_string, i), 0xC0) != 0x80) {
      chunk = substr(my_string, chunk_start, i - chunk_start)
      if (chunk in MAP)
        result_string = result_string MAP[chunk]
      else
        result_string = result_string chunk
      chunk_start = i
    }
  }

  return result_string
}

# Private helper of utf8_sprintf: convert a %s width/precision that is
# meant in characters into the byte count sprintf() needs for `text`.
#
#   text    the argument that will be formatted
#   number  width or precision as written in the format
#
# Returns `number` unchanged when text has as many bytes as characters.
# Otherwise: if text has at most `number` characters, `number` grows by the
# surplus bytes of text; if text is longer, the result is the byte length
# of its first `number` characters, so a cut never lands inside a character.
function utf8_sprintf_adjust(text, number) {
  local byte_length, char_length = 0

  byte_length = length(text)
  char_length = utf8_length(text)
  if (byte_length <= char_length)
    return number

  # Force a numeric comparison; number may arrive as text from substr().
  number = number + 0
  if (char_length <= number)
    return number + (byte_length - char_length)

  return utf8_position(text, number + 1) - 1
}

# sprintf() replacement that treats the width and precision of %s
# conversions as character counts instead of byte counts.
#
#   format       sprintf format string
#   arg1..arg10  values to format; further arguments are accepted and
#                read through argval()
#
# Every number inside a conversion that ends in <digit>s is rewritten with
# utf8_sprintf_adjust() for the matching argument; width and precision are
# adjusted independently of each other. "%%" is copied through and does
# not consume an argument. Text between conversions is never modified.
# Called with only a format, the result is sprintf(format).
# Returns the formatted string.
function utf8_sprintf(format, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10) {
  local arg_count, number, nstart, nlength, prstart, prlength, i, j, k = 0
  local new_format, pformat, new_pformat, result_string = ""
  local PSTART, PLENGTH, SARGS

  arg_count = argcount()
  if (arg_count < 2)
    return sprintf(format)

  # Most often: Two equal numbers with preceding "-" and "s", e.g. sprintf("%-13.13s", str):
  # ("$1" is presumably a back-reference to the first group -- TODO confirm.)
  if ( (arg_count == 2) && (match(format, /^%-([0-9]+).$1s$/, 1, PSTART, PLENGTH)) ) {
    number = utf8_sprintf_adjust(arg1, substr(format, PSTART[1], PLENGTH[1]))
    # Width and precision are the same number here, so both get the same value.
    gsub(/[0-9]+/, number, format)
    result_string = sprintf(format, arg1)
  }
  # Treat all other cases:
  else {
    i = 1            # argval() index of the argument consumed last (1 = format)
    j = 1            # character position in format up to which text was copied
    new_format = ""
    # One match is either "%%" or one conversion: "%", anything that is
    # neither "%" nor a letter, then one letter.
    while (utf8_match(format, /%(%|[^%a-zA-Z]*[a-zA-Z])/, j)) {
      prstart = RSTART
      prlength = RLENGTH
      pformat = utf8_substr(format, prstart, prlength)
      new_pformat = pformat
      if (pformat != "%%") {
        ++i
        if ((i <= arg_count) && (pformat ~ /[0-9]s$/)) {
          # Rebuild the conversion piece by piece from the ORIGINAL text, so
          # that a number changing its digit count cannot shift later positions.
          new_pformat = ""
          k = 1
          while (utf8_match(pformat, /[0-9]+/, k)) {
            nstart = RSTART
            nlength = RLENGTH
            number = utf8_sprintf_adjust(argval(i), utf8_substr(pformat, nstart, nlength))
            new_pformat = new_pformat utf8_substr(pformat, k, nstart - k) number
            k = nstart + nlength
          }
          new_pformat = new_pformat utf8_substr(pformat, k)
        }
      }
      # Copy the untouched text in front of the conversion, then the conversion.
      new_format = new_format utf8_substr(format, j, prstart - j) new_pformat
      j = prstart + prlength
    }
    new_format = new_format utf8_substr(format, j)

    if (arg_count == 2)
      result_string = sprintf(new_format, arg1)
    else if  (arg_count == 3)
      result_string = sprintf(new_format, arg1, arg2)
    else if  (arg_count == 4)
      result_string = sprintf(new_format, arg1, arg2, arg3)
    else if  (arg_count == 5)
      result_string = sprintf(new_format, arg1, arg2, arg3, arg4)
    else if  (arg_count == 6)
      result_string = sprintf(new_format, arg1, arg2, arg3, arg4, arg5)
    else if  (arg_count == 7)
      result_string = sprintf(new_format, arg1, arg2, arg3, arg4, arg5, arg6)
    else if  (arg_count == 8)
      result_string = sprintf(new_format, arg1, arg2, arg3, arg4, arg5, arg6, arg7)
    else if  (arg_count == 9)
      result_string = sprintf(new_format, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8)
    else if  (arg_count == 10)
      result_string = sprintf(new_format, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9)
    else if  (arg_count == 11)
      result_string = sprintf(new_format, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10)
    else if (arg_count > 11) {
      # More arguments than named parameters: collect them and call indirectly.
      SARGS[1] = new_format
      for (i = 2; i <= arg_count; i++) {
        SARGS[i] = argval(i)
      }
      result_string = calla("sprintf", SARGS)
    }
  }

  return result_string
}

# printf() replacement that treats the width and precision of %s
# conversions as character counts (see utf8_sprintf).
#
#   format       printf format string
#   arg1..arg10  values to format; further arguments are accepted and
#                read through argval()
#
# The text is built by utf8_sprintf() and then written with a fixed "%s"
# format, so percent signs in the finished text are printed literally and
# never interpreted a second time. Called with only a format, the format
# is run through sprintf() directly because there is no argument to adjust.
function utf8_printf(format, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10) {
  local i, arg_count = 0
  local result_string = ""
  local SARGS

  arg_count = argcount()
  if (arg_count == 1)
    result_string = sprintf(format)
  else if (arg_count == 2)
    result_string = utf8_sprintf(format, arg1)
  else if  (arg_count == 3)
    result_string = utf8_sprintf(format, arg1, arg2)
  else if  (arg_count == 4)
    result_string = utf8_sprintf(format, arg1, arg2, arg3)
  else if  (arg_count == 5)
    result_string = utf8_sprintf(format, arg1, arg2, arg3, arg4)
  else if  (arg_count == 6)
    result_string = utf8_sprintf(format, arg1, arg2, arg3, arg4, arg5)
  else if  (arg_count == 7)
    result_string = utf8_sprintf(format, arg1, arg2, arg3, arg4, arg5, arg6)
  else if  (arg_count == 8)
    result_string = utf8_sprintf(format, arg1, arg2, arg3, arg4, arg5, arg6, arg7)
  else if  (arg_count == 9)
    result_string = utf8_sprintf(format, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8)
  else if  (arg_count == 10)
    result_string = utf8_sprintf(format, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9)
  else if  (arg_count == 11)
    result_string = utf8_sprintf(format, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10)
  else if (arg_count > 11) {
    # More arguments than named parameters: collect them and call indirectly.
    SARGS[1] = format
    for (i = 2; i <= arg_count; i++) {
      SARGS[i] = argval(i)
    }
    result_string = calla("utf8_sprintf", SARGS)
  }

  printf("%s", result_string)
}

# Replace a run of characters inside a string.
#
#   string         UTF-8 encoded string
#   substr_pos     character position of the first character to replace
#   substr_length  number of characters to replace
#   replace_with   text inserted in place of the removed characters
#
# Returns the text in front of substr_pos, then replace_with, then the
# text that follows the replaced run.
function utf8_replace_substring(string, substr_pos, substr_length, replace_with) {
  local head, tail = ""

  head = utf8_substr(string, 1, substr_pos - 1)
  tail = utf8_substr(string, substr_pos + substr_length)

  return head replace_with tail
}
######################################################################