# UTF-8 functions for Thompson AWK
# (c) Markus Gnam 2017 Version 2017-09-17
# encoding: UTF-8

######################################################################
# UTF-8 functions, derived from Kragen Sitaker's basic count algorithm
# http://www.daemonology.net/blog/2008-06-05-faster-utf8-strlen.html
# "My UTF-8 C Version" http://canonical.org/~kragen/strlen-utf8.html
#
# All functions below identify the first byte of a character with the
# test  and(byte, 0xC0) != 0x80  (i.e. "not a UTF-8 continuation byte").

# Count the characters of my_string between the byte indexes
# starting_index and ending_index (both inclusive, both optional:
# they default to 1 and length(my_string)).
# If the first argument is an array, its element count is returned.
function utf8_length(my_string, starting_index, ending_index) {
    local i, arg_count
    local count = 0    # explicit 0 so an empty string/range yields 0

    arg_count = argcount()
    if (arg_count < 2)
        starting_index = 1
    if (arg_count < 3)
        ending_index = length(my_string)

    if (typeof(argval(1)) == "array") {
        count = length(argval(1))
    } else {
        for (i = starting_index; i <= ending_index; ++i) {
            if (and(ord(my_string, i), 0xC0) != 0x80)
                ++count
        }
    }
    return count
}

# Return the byte index at which character number `position` starts.
# Positions <= 1 are returned unchanged; if the string has fewer than
# `position` characters the result is length(my_string) + 1.
function utf8_position(my_string, position) {
    local i, count = 0

    if (position <= 1) {
        i = position
    } else {
        for (i = 1; i <= length(my_string); ++i) {
            if (and(ord(my_string, i), 0xC0) != 0x80) {
                ++count
                if (count == position)
                    break
            }
        }
    }
    return i
}

# Character-based substr(): return my_length characters of my_string
# starting at character start_position, or the rest of the string when
# my_length is not passed.
function utf8_substr(my_string, start_position, my_length) {
    local i, j, count = 0
    local result_string = ""

    i = utf8_position(my_string, start_position)
    if (argcount() == 3) {
        if (my_length == 0) {
            result_string = ""
        } else {
            # Advance j to the first byte of the character that follows
            # the requested ones (or past the end of the string).
            for (j = i; j <= length(my_string); ++j) {
                if (and(ord(my_string, j), 0xC0) != 0x80) {
                    ++count
                }
                if (count > my_length) {
                    break
                }
            }
            result_string = substr(my_string, i, j-i)
        }
    } else {
        result_string = substr(my_string, i)
    }
    return result_string
}

# Character-based index(): character position of the first occurrence
# of string2 in my_string, searching from character start_position
# (optional, default 1). Returns 0 when string2 is not found.
function utf8_index(my_string, string2, start_position) {
    local i, j, result_integer = 0

    if (argcount() == 3)
        i = utf8_position(my_string, start_position)
    else
        i = 1

    j = index(my_string, string2, i)
    if (j <= 1)
        result_integer = j
    else
        result_integer = utf8_length(my_string, 1, j)
    return result_integer
}

# Character-based rindex(): like utf8_index(), but uses rindex() and an
# optional end_position (default: the byte length of my_string).
function utf8_rindex(my_string, string2, end_position) {
    local i, j, result_integer = 0

    if (argcount() == 3)
        i = utf8_position(my_string, end_position)
    else
        i = length(my_string)

    j = rindex(my_string, string2, i)
    if (j <= 1)
        result_integer = j
    else
        result_integer = utf8_length(my_string, 1, j)
    return result_integer
}

# Character-based match(): returns the character position of the match
# (0 if none) and sets RSTART / RLENGTH to character-based values.
# start_position is optional (default 1). When all five arguments are
# passed, pstart[] / plength[] are converted to character units too,
# with element 0 describing the whole match.
function utf8_match(my_string, pattern, start_position, pstart, plength) {
    local i, j, k, result_integer, utf8_rstart, utf8_rlength, arg_count = 0
    local pstart_k, plength_k    # were undeclared and leaked as globals

    arg_count = argcount()
    if (arg_count >= 3)
        i = utf8_position(my_string, start_position)
    else
        i = 1

    j = match(my_string, pattern, i, pstart, plength)
    if (j == 0) {
        result_integer = 0
        utf8_rstart = 0
        utf8_rlength = 0
    } else {
        result_integer = utf8_length(my_string, 1, j)
        utf8_rstart = result_integer
        utf8_rlength = utf8_length(my_string, j, j+RLENGTH-1)
    }

    if (arg_count == 5) {
        pstart[0] = utf8_rstart
        plength[0] = utf8_rlength
        for (k in pstart) {
            if (k > 0) {
                # Compute both values before overwriting pstart[k]:
                # plength_k still needs the byte-based start.
                pstart_k = utf8_length(my_string, 1, pstart[k])
                plength_k = utf8_length(my_string, pstart[k], pstart[k]+plength[k]-1)
                pstart[k] = pstart_k
                plength[k] = plength_k
            }
        }
    }

    RSTART = utf8_rstart
    RLENGTH = utf8_rlength
    return result_integer
}

# Split my_string into an array of its characters
# like split(my_string, CHAR_ARRAY, "") for bytes.
# Elements are stored in CHAR_ARRAY[1], CHAR_ARRAY[2], ...; an empty
# string stores no element. CHAR_ARRAY is also returned.
function utf8_splitchars(my_string, CHAR_ARRAY) {
    local i
    local j = 0
    local old_character_position = 0

    # An empty string has no characters (previously one empty element
    # was stored).
    if (length(my_string) == 0)
        return CHAR_ARRAY

    for (i = 1; i <= length(my_string); ++i) {
        if (and(ord(my_string, i), 0xC0) != 0x80) {
            # A new character starts here: emit the previous one.
            if (old_character_position > 0)
                CHAR_ARRAY[++j] = substr(my_string, old_character_position, i-old_character_position)
            old_character_position = i
        }
    }
    # Emit the last character (i is length(my_string) + 1 here).
    CHAR_ARRAY[++j] = substr(my_string, old_character_position, i-old_character_position)
    return CHAR_ARRAY
}

# Replace every character of my_string that occurs in from_characters
# with the character at the same position in to_characters. Characters
# of from_characters without a counterpart in to_characters are deleted.
#
# The mapping is applied literally and in a single pass over my_string:
# characters such as "." or "^" are not treated as regular expressions,
# and a replaced character is never translated a second time (so
# swapping "ab" -> "ba" works). If a character occurs more than once in
# from_characters, its first occurrence wins.
function utf8_translate(my_string, from_characters, to_characters) {
    local i, n, char_start, character
    local result_string = ""
    local FROM_ARRAY, TO_ARRAY, MAP

    utf8_splitchars(from_characters, FROM_ARRAY)
    utf8_splitchars(to_characters, TO_ARRAY)

    for (i = 1; i in FROM_ARRAY; ++i) {
        if (FROM_ARRAY[i] == "" || (FROM_ARRAY[i] in MAP))
            continue
        if (i in TO_ARRAY)
            MAP[FROM_ARRAY[i]] = TO_ARRAY[i]
        else
            MAP[FROM_ARRAY[i]] = ""
    }

    # Walk my_string character by character. A character ends where the
    # next non-continuation byte starts, or at the end of the string.
    n = length(my_string)
    char_start = 1
    for (i = 2; i <= n + 1; ++i) {
        if (i <= n && and(ord(my_string, i), 0xC0) == 0x80)
            continue
        character = substr(my_string, char_start, i - char_start)
        if (character in MAP)
            result_string = result_string MAP[character]
        else
            result_string = result_string character
        char_start = i
    }
    return result_string
}

# sprintf() whose %s widths and precisions count characters rather
# than bytes. Every number in a conversion that contains <digit>s is
# enlarged by the number of extra bytes the corresponding argument
# needs, then the built-in sprintf() is called with the new format.
# NOTE(review): a "conversion" here is everything matched by /%[^%]+/,
# so digits in literal text that directly follows a %..s conversion are
# adjusted as well - confirm whether callers depend on that.
function utf8_sprintf(format, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10) {
    local arg_count, number, length_argval, utf8_length_argval, prstart, prlength, i, j, k = 0
    local copied_to, piece_copied_to
    local new_format, pformat, new_pformat, result_string = ""
    local PSTART, PLENGTH, SARGS

    arg_count = argcount()

    # A format without arguments needs no adjustment (previously this
    # case returned an empty string).
    if (arg_count == 1)
        return sprintf(format)

    # Most often: Two equal numbers with preceding "-" and "s", e.g. sprintf("%-13.13s", str):
    if ( (arg_count == 2) && (match(format, /^%-([0-9]+).$1s$/, 1, PSTART, PLENGTH)) ) {
        number = substr(format, PSTART[1], PLENGTH[1])
        length_argval = length(arg1)
        utf8_length_argval = utf8_length(arg1)
        if (length_argval > utf8_length_argval) {
            number = number + 0    # compare as a number, not as a string
            if (utf8_length_argval <= number)
                # Padding case: add the surplus bytes of the whole string.
                number = number + (length_argval - utf8_length_argval)
            else
                # Truncation case: bytes occupied by the first `number`
                # characters, so the cut falls on a character boundary.
                number = utf8_position(arg1, number + 1) - 1
            gsub(/[0-9]+/, number, format)
        }
        result_string = sprintf(format, arg1)
    }
    # Treat all other cases:
    else {
        i = 1
        j = 1
        # The new format is assembled by concatenation; positions found
        # in `format` must not be applied to an already modified copy,
        # because a number may have gained a digit (e.g. 9 -> 10).
        new_format = ""
        copied_to = 1
        while (utf8_match(format, /%[^%]+/, j)) {
            ++i    # argval(i) is the argument of this conversion
            prstart = RSTART
            prlength = RLENGTH
            pformat = utf8_substr(format, prstart, prlength)
            new_pformat = pformat
            if (pformat ~ /[0-9]s/) {
                new_pformat = ""
                piece_copied_to = 1
                k = 1
                while (utf8_match(pformat, /[0-9]+/, k)) {
                    number = utf8_substr(pformat, RSTART, RLENGTH)
                    length_argval = length(argval(i))
                    utf8_length_argval = utf8_length(argval(i))
                    if (length_argval > utf8_length_argval) {
                        number = number + 0    # compare as a number
                        if (utf8_length_argval <= number)
                            number = number + (length_argval - utf8_length_argval)
                        else
                            number = utf8_position(argval(i), number + 1) - 1
                    }
                    new_pformat = new_pformat utf8_substr(pformat, piece_copied_to, RSTART - piece_copied_to) number
                    piece_copied_to = RSTART + RLENGTH
                    k = piece_copied_to
                }
                new_pformat = new_pformat utf8_substr(pformat, piece_copied_to)
            }
            new_format = new_format utf8_substr(format, copied_to, prstart - copied_to) new_pformat
            copied_to = prstart + prlength
            j = copied_to
        }
        new_format = new_format utf8_substr(format, copied_to)

        if (arg_count == 2)
            result_string = sprintf(new_format, arg1)
        else if (arg_count == 3)
            result_string = sprintf(new_format, arg1, arg2)
        else if (arg_count == 4)
            result_string = sprintf(new_format, arg1, arg2, arg3)
        else if (arg_count == 5)
            result_string = sprintf(new_format, arg1, arg2, arg3, arg4)
        else if (arg_count == 6)
            result_string = sprintf(new_format, arg1, arg2, arg3, arg4, arg5)
        else if (arg_count == 7)
            result_string = sprintf(new_format, arg1, arg2, arg3, arg4, arg5, arg6)
        else if (arg_count == 8)
            result_string = sprintf(new_format, arg1, arg2, arg3, arg4, arg5, arg6, arg7)
        else if (arg_count == 9)
            result_string = sprintf(new_format, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8)
        else if (arg_count == 10)
            result_string = sprintf(new_format, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9)
        else if (arg_count == 11)
            result_string = sprintf(new_format, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10)
        else if (arg_count > 11) {
            # More arguments than named parameters: pass them on as an array.
            SARGS[1] = new_format
            for (i = 2; i <= arg_count; i++) {
                SARGS[i] = argval(i)
            }
            result_string = calla("sprintf", SARGS)
        }
    }
    return result_string
}

# printf() counterpart of utf8_sprintf(): formats the arguments with
# utf8_sprintf() and prints the result.
function utf8_printf(format, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10) {
    local i, arg_count = 0    # i was undeclared and leaked as a global
    local result_string = ""
    local SARGS

    arg_count = argcount()
    if (arg_count == 1)
        result_string = utf8_sprintf(format)
    else if (arg_count == 2)
        result_string = utf8_sprintf(format, arg1)
    else if (arg_count == 3)
        result_string = utf8_sprintf(format, arg1, arg2)
    else if (arg_count == 4)
        result_string = utf8_sprintf(format, arg1, arg2, arg3)
    else if (arg_count == 5)
        result_string = utf8_sprintf(format, arg1, arg2, arg3, arg4)
    else if (arg_count == 6)
        result_string = utf8_sprintf(format, arg1, arg2, arg3, arg4, arg5)
    else if (arg_count == 7)
        result_string = utf8_sprintf(format, arg1, arg2, arg3, arg4, arg5, arg6)
    else if (arg_count == 8)
        result_string = utf8_sprintf(format, arg1, arg2, arg3, arg4, arg5, arg6, arg7)
    else if (arg_count == 9)
        result_string = utf8_sprintf(format, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8)
    else if (arg_count == 10)
        result_string = utf8_sprintf(format, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9)
    else if (arg_count == 11)
        result_string = utf8_sprintf(format, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10)
    else if (arg_count > 11) {
        SARGS[1] = format
        for (i = 2; i <= arg_count; i++) {
            SARGS[i] = argval(i)
        }
        result_string = calla("utf8_sprintf", SARGS)
    }

    # The text is already formatted: print it through "%s" so that a
    # "%" contained in it is not interpreted as a conversion again.
    printf("%s", result_string)
}

# Return `string` with the substr_length characters starting at
# character substr_pos replaced by replace_with.
function utf8_replace_substring(string, substr_pos, substr_length, replace_with) {
    return utf8_substr(string,1,substr_pos-1) replace_with utf8_substr(string,substr_pos+substr_length)
}

######################################################################